//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the number of bits to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::pair(TypeIdx,
                     LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::pair(TypeIdx, LLT::scalarOrVector(
                                  ElementCount::getFixed(NewNumElts), EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::pair(
        TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
373 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) && 374 (Size <= 32 || isRegisterSize(Size)) && 375 !isRegisterVectorElementType(Ty.getElementType()); 376 } 377 378 /// Return true if we should legalize a load by widening an odd sized memory 379 /// access up to the alignment. Note this case when the memory access itself 380 /// changes, not the size of the result register. 381 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, 382 uint64_t AlignInBits, unsigned AddrSpace, 383 unsigned Opcode) { 384 unsigned SizeInBits = MemoryTy.getSizeInBits(); 385 // We don't want to widen cases that are naturally legal. 386 if (isPowerOf2_32(SizeInBits)) 387 return false; 388 389 // If we have 96-bit memory operations, we shouldn't touch them. Note we may 390 // end up widening these for a scalar load during RegBankSelect, since there 391 // aren't 96-bit scalar loads. 392 if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) 393 return false; 394 395 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) 396 return false; 397 398 // A load is known dereferenceable up to the alignment, so it's legal to widen 399 // to it. 400 // 401 // TODO: Could check dereferenceable for less aligned cases. 402 unsigned RoundedSize = NextPowerOf2(SizeInBits); 403 if (AlignInBits < RoundedSize) 404 return false; 405 406 // Do not widen if it would introduce a slow unaligned load. 407 const SITargetLowering *TLI = ST.getTargetLowering(); 408 unsigned Fast = 0; 409 return TLI->allowsMisalignedMemoryAccessesImpl( 410 RoundedSize, AddrSpace, Align(AlignInBits / 8), 411 MachineMemOperand::MOLoad, &Fast) && 412 Fast; 413 } 414 415 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, 416 unsigned Opcode) { 417 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) 418 return false; 419 420 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy, 421 Query.MMODescrs[0].AlignInBits, 422 Query.Types[1].getAddressSpace(), Opcode); 423 } 424 425 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 426 const GCNTargetMachine &TM) 427 : ST(ST_) { 428 using namespace TargetOpcode; 429 430 auto GetAddrSpacePtr = [&TM](unsigned AS) { 431 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 432 }; 433 434 const LLT S1 = LLT::scalar(1); 435 const LLT S8 = LLT::scalar(8); 436 const LLT S16 = LLT::scalar(16); 437 const LLT S32 = LLT::scalar(32); 438 const LLT S64 = LLT::scalar(64); 439 const LLT S128 = LLT::scalar(128); 440 const LLT S256 = LLT::scalar(256); 441 const LLT S512 = LLT::scalar(512); 442 const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 443 444 const LLT V2S8 = LLT::fixed_vector(2, 8); 445 const LLT V2S16 = LLT::fixed_vector(2, 16); 446 const LLT V4S16 = LLT::fixed_vector(4, 16); 447 448 const LLT V2S32 = LLT::fixed_vector(2, 32); 449 const LLT V3S32 = LLT::fixed_vector(3, 32); 450 const LLT V4S32 = LLT::fixed_vector(4, 32); 451 const LLT V5S32 = LLT::fixed_vector(5, 32); 452 const LLT V6S32 = LLT::fixed_vector(6, 32); 453 const LLT V7S32 = LLT::fixed_vector(7, 32); 454 const LLT V8S32 = LLT::fixed_vector(8, 32); 455 const LLT V9S32 = LLT::fixed_vector(9, 32); 456 const LLT V10S32 = LLT::fixed_vector(10, 32); 457 const LLT V11S32 = LLT::fixed_vector(11, 32); 458 const LLT V12S32 = LLT::fixed_vector(12, 32); 459 const LLT V13S32 = LLT::fixed_vector(13, 32); 460 const LLT V14S32 = LLT::fixed_vector(14, 32); 461 const LLT V15S32 = LLT::fixed_vector(15, 32); 462 const LLT V16S32 = LLT::fixed_vector(16, 32); 463 const LLT V32S32 = 
LLT::fixed_vector(32, 32); 464 465 const LLT V2S64 = LLT::fixed_vector(2, 64); 466 const LLT V3S64 = LLT::fixed_vector(3, 64); 467 const LLT V4S64 = LLT::fixed_vector(4, 64); 468 const LLT V5S64 = LLT::fixed_vector(5, 64); 469 const LLT V6S64 = LLT::fixed_vector(6, 64); 470 const LLT V7S64 = LLT::fixed_vector(7, 64); 471 const LLT V8S64 = LLT::fixed_vector(8, 64); 472 const LLT V16S64 = LLT::fixed_vector(16, 64); 473 474 std::initializer_list<LLT> AllS32Vectors = 475 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 476 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 477 std::initializer_list<LLT> AllS64Vectors = 478 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 479 480 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 481 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 482 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 483 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 484 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 485 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 486 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 487 488 const LLT CodePtr = FlatPtr; 489 490 const std::initializer_list<LLT> AddrSpaces64 = { 491 GlobalPtr, ConstantPtr, FlatPtr 492 }; 493 494 const std::initializer_list<LLT> AddrSpaces32 = { 495 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 496 }; 497 498 const std::initializer_list<LLT> FPTypesBase = { 499 S32, S64 500 }; 501 502 const std::initializer_list<LLT> FPTypes16 = { 503 S32, S64, S16 504 }; 505 506 const std::initializer_list<LLT> FPTypesPK16 = { 507 S32, S64, S16, V2S16 508 }; 509 510 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 511 512 // s1 for VCC branches, s32 for SCC branches. 513 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32}); 514 515 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 516 // elements for v3s16 517 getActionDefinitionsBuilder(G_PHI) 518 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 519 .legalFor(AllS32Vectors) 520 .legalFor(AllS64Vectors) 521 .legalFor(AddrSpaces64) 522 .legalFor(AddrSpaces32) 523 .legalIf(isPointer(0)) 524 .clampScalar(0, S16, S256) 525 .widenScalarToNextPow2(0, 32) 526 .clampMaxNumElements(0, S32, 16) 527 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 528 .scalarize(0); 529 530 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 531 // Full set of gfx9 features. 
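    // On these subtargets integer add/sub/mul and the saturating variants are
    // legal for s32, s16 and v2s16; the clamps below widen, narrow, or
    // scalarize everything else to those types.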
532 getActionDefinitionsBuilder({G_ADD, G_SUB}) 533 .legalFor({S32, S16, V2S16}) 534 .clampMaxNumElementsStrict(0, S16, 2) 535 .scalarize(0) 536 .minScalar(0, S16) 537 .widenScalarToNextMultipleOf(0, 32) 538 .maxScalar(0, S32); 539 540 getActionDefinitionsBuilder(G_MUL) 541 .legalFor({S32, S16, V2S16}) 542 .clampMaxNumElementsStrict(0, S16, 2) 543 .scalarize(0) 544 .minScalar(0, S16) 545 .widenScalarToNextMultipleOf(0, 32) 546 .custom(); 547 assert(ST.hasMad64_32()); 548 549 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 550 .legalFor({S32, S16, V2S16}) // Clamp modifier 551 .minScalarOrElt(0, S16) 552 .clampMaxNumElementsStrict(0, S16, 2) 553 .scalarize(0) 554 .widenScalarToNextPow2(0, 32) 555 .lower(); 556 } else if (ST.has16BitInsts()) { 557 getActionDefinitionsBuilder({G_ADD, G_SUB}) 558 .legalFor({S32, S16}) 559 .minScalar(0, S16) 560 .widenScalarToNextMultipleOf(0, 32) 561 .maxScalar(0, S32) 562 .scalarize(0); 563 564 getActionDefinitionsBuilder(G_MUL) 565 .legalFor({S32, S16}) 566 .scalarize(0) 567 .minScalar(0, S16) 568 .widenScalarToNextMultipleOf(0, 32) 569 .custom(); 570 assert(ST.hasMad64_32()); 571 572 // Technically the saturating operations require clamp bit support, but this 573 // was introduced at the same time as 16-bit operations. 574 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 575 .legalFor({S32, S16}) // Clamp modifier 576 .minScalar(0, S16) 577 .scalarize(0) 578 .widenScalarToNextPow2(0, 16) 579 .lower(); 580 581 // We're just lowering this, but it helps get a better result to try to 582 // coerce to the desired type first. 583 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 584 .minScalar(0, S16) 585 .scalarize(0) 586 .lower(); 587 } else { 588 getActionDefinitionsBuilder({G_ADD, G_SUB}) 589 .legalFor({S32}) 590 .widenScalarToNextMultipleOf(0, 32) 591 .clampScalar(0, S32, S32) 592 .scalarize(0); 593 594 auto &Mul = getActionDefinitionsBuilder(G_MUL) 595 .legalFor({S32}) 596 .scalarize(0) 597 .minScalar(0, S32) 598 .widenScalarToNextMultipleOf(0, 32); 599 600 if (ST.hasMad64_32()) 601 Mul.custom(); 602 else 603 Mul.maxScalar(0, S32); 604 605 if (ST.hasIntClamp()) { 606 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 607 .legalFor({S32}) // Clamp modifier. 608 .scalarize(0) 609 .minScalarOrElt(0, S32) 610 .lower(); 611 } else { 612 // Clamp bit support was added in VI, along with 16-bit operations. 613 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 614 .minScalar(0, S32) 615 .scalarize(0) 616 .lower(); 617 } 618 619 // FIXME: DAG expansion gets better results. The widening uses the smaller 620 // range values and goes for the min/max lowering directly. 621 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 622 .minScalar(0, S32) 623 .scalarize(0) 624 .lower(); 625 } 626 627 getActionDefinitionsBuilder( 628 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) 629 .customFor({S32, S64}) 630 .clampScalar(0, S32, S64) 631 .widenScalarToNextPow2(0, 32) 632 .scalarize(0); 633 634 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 635 .legalFor({S32}) 636 .maxScalar(0, S32); 637 638 if (ST.hasVOP3PInsts()) { 639 Mulh 640 .clampMaxNumElements(0, S8, 2) 641 .lowerFor({V2S8}); 642 } 643 644 Mulh 645 .scalarize(0) 646 .lower(); 647 648 // Report legal for any types we can handle anywhere. For the cases only legal 649 // on the SALU, RegBankSelect will be able to re-legalize. 
650 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 651 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 652 .clampScalar(0, S32, S64) 653 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 654 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 655 .widenScalarToNextPow2(0) 656 .scalarize(0); 657 658 getActionDefinitionsBuilder( 659 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 660 .legalFor({{S32, S1}, {S32, S32}}) 661 .clampScalar(0, S32, S32) 662 .scalarize(0); 663 664 getActionDefinitionsBuilder(G_BITCAST) 665 // Don't worry about the size constraint. 666 .legalIf(all(isRegisterType(0), isRegisterType(1))) 667 .lower(); 668 669 670 getActionDefinitionsBuilder(G_CONSTANT) 671 .legalFor({S1, S32, S64, S16, GlobalPtr, 672 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 673 .legalIf(isPointer(0)) 674 .clampScalar(0, S32, S64) 675 .widenScalarToNextPow2(0); 676 677 getActionDefinitionsBuilder(G_FCONSTANT) 678 .legalFor({S32, S64, S16}) 679 .clampScalar(0, S16, S64); 680 681 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 682 .legalIf(isRegisterType(0)) 683 // s1 and s16 are special cases because they have legal operations on 684 // them, but don't really occupy registers in the normal way. 685 .legalFor({S1, S16}) 686 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 687 .clampScalarOrElt(0, S32, MaxScalar) 688 .widenScalarToNextPow2(0, 32) 689 .clampMaxNumElements(0, S32, 16); 690 691 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr}); 692 693 // If the amount is divergent, we have to do a wave reduction to get the 694 // maximum value, so this is expanded during RegBankSelect. 695 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 696 .legalFor({{PrivatePtr, S32}}); 697 698 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 699 .customIf(typeIsNot(0, PrivatePtr)); 700 701 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr}); 702 703 auto &FPOpActions = getActionDefinitionsBuilder( 704 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE, 705 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA}) 706 .legalFor({S32, S64}); 707 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 708 .customFor({S32, S64}); 709 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 710 .customFor({S32, S64}); 711 712 if (ST.has16BitInsts()) { 713 if (ST.hasVOP3PInsts()) 714 FPOpActions.legalFor({S16, V2S16}); 715 else 716 FPOpActions.legalFor({S16}); 717 718 TrigActions.customFor({S16}); 719 FDIVActions.customFor({S16}); 720 } 721 722 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 723 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 724 725 if (ST.hasVOP3PInsts()) { 726 MinNumMaxNum.customFor(FPTypesPK16) 727 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 728 .clampMaxNumElements(0, S16, 2) 729 .clampScalar(0, S16, S64) 730 .scalarize(0); 731 } else if (ST.has16BitInsts()) { 732 MinNumMaxNum.customFor(FPTypes16) 733 .clampScalar(0, S16, S64) 734 .scalarize(0); 735 } else { 736 MinNumMaxNum.customFor(FPTypesBase) 737 .clampScalar(0, S32, S64) 738 .scalarize(0); 739 } 740 741 if (ST.hasVOP3PInsts()) 742 FPOpActions.clampMaxNumElementsStrict(0, S16, 2); 743 744 FPOpActions 745 .scalarize(0) 746 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 747 748 TrigActions 749 .scalarize(0) 750 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 751 752 FDIVActions 753 .scalarize(0) 754 .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 755 756 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 757 .legalFor(FPTypesPK16) 758 .clampMaxNumElementsStrict(0, S16, 2) 759 .scalarize(0) 760 .clampScalar(0, S16, S64); 761 762 if (ST.has16BitInsts()) { 763 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 764 .legalFor({S32, S64, S16}) 765 .scalarize(0) 766 .clampScalar(0, S16, S64); 767 } else { 768 getActionDefinitionsBuilder(G_FSQRT) 769 .legalFor({S32, S64}) 770 .scalarize(0) 771 .clampScalar(0, S32, S64); 772 773 if (ST.hasFractBug()) { 774 getActionDefinitionsBuilder(G_FFLOOR) 775 .customFor({S64}) 776 .legalFor({S32, S64}) 777 .scalarize(0) 778 .clampScalar(0, S32, S64); 779 } else { 780 getActionDefinitionsBuilder(G_FFLOOR) 781 .legalFor({S32, S64}) 782 .scalarize(0) 783 .clampScalar(0, S32, S64); 784 } 785 } 786 787 getActionDefinitionsBuilder(G_FPTRUNC) 788 .legalFor({{S32, S64}, {S16, S32}}) 789 .scalarize(0) 790 .lower(); 791 792 getActionDefinitionsBuilder(G_FPEXT) 793 .legalFor({{S64, S32}, {S32, S16}}) 794 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 795 .scalarize(0); 796 797 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB}); 798 if (ST.has16BitInsts()) { 799 FSubActions 800 // Use actual fsub instruction 801 .legalFor({S32, S16}) 802 // Must use fadd + fneg 803 .lowerFor({S64, V2S16}); 804 } else { 805 FSubActions 806 // Use actual fsub instruction 807 .legalFor({S32}) 808 // Must use fadd + fneg 809 .lowerFor({S64, S16, V2S16}); 810 } 811 812 FSubActions 813 .scalarize(0) 814 .clampScalar(0, S32, S64); 815 816 // Whether this is legal depends on the floating point mode for the function. 817 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 818 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 819 FMad.customFor({S32, S16}); 820 else if (ST.hasMadMacF32Insts()) 821 FMad.customFor({S32}); 822 else if (ST.hasMadF16()) 823 FMad.customFor({S16}); 824 FMad.scalarize(0) 825 .lower(); 826 827 auto &FRem = getActionDefinitionsBuilder(G_FREM); 828 if (ST.has16BitInsts()) { 829 FRem.customFor({S16, S32, S64}); 830 } else { 831 FRem.minScalar(0, S32) 832 .customFor({S32, S64}); 833 } 834 FRem.scalarize(0); 835 836 // TODO: Do we need to clamp maximum bitwidth? 837 getActionDefinitionsBuilder(G_TRUNC) 838 .legalIf(isScalar(0)) 839 .legalFor({{V2S16, V2S32}}) 840 .clampMaxNumElements(0, S16, 2) 841 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 842 // situations (like an invalid implicit use), we don't want to infinite loop 843 // in the legalizer. 844 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 845 .alwaysLegal(); 846 847 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 848 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 849 {S32, S1}, {S64, S1}, {S16, S1}}) 850 .scalarize(0) 851 .clampScalar(0, S32, S64) 852 .widenScalarToNextPow2(1, 32); 853 854 // TODO: Split s1->s64 during regbankselect for VALU. 
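  // Conversions from 32-bit integer sources are legal; s1 sources are lowered
  // and 64-bit sources take the custom expansion. The clamp/widen rules below
  // normalize the remaining widths to these cases.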
855 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 856 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 857 .lowerIf(typeIs(1, S1)) 858 .customFor({{S32, S64}, {S64, S64}}); 859 if (ST.has16BitInsts()) 860 IToFP.legalFor({{S16, S16}}); 861 IToFP.clampScalar(1, S32, S64) 862 .minScalar(0, S32) 863 .scalarize(0) 864 .widenScalarToNextPow2(1); 865 866 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 867 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 868 .customFor({{S64, S32}, {S64, S64}}) 869 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 870 if (ST.has16BitInsts()) 871 FPToI.legalFor({{S16, S16}}); 872 else 873 FPToI.minScalar(1, S32); 874 875 FPToI.minScalar(0, S32) 876 .widenScalarToNextPow2(0, 32) 877 .scalarize(0) 878 .lower(); 879 880 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND) 881 .customFor({S16, S32}) 882 .scalarize(0) 883 .lower(); 884 885 // Lower roundeven into G_FRINT 886 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 887 .scalarize(0) 888 .lower(); 889 890 if (ST.has16BitInsts()) { 891 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 892 .legalFor({S16, S32, S64}) 893 .clampScalar(0, S16, S64) 894 .scalarize(0); 895 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 896 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 897 .legalFor({S32, S64}) 898 .clampScalar(0, S32, S64) 899 .scalarize(0); 900 } else { 901 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 902 .legalFor({S32}) 903 .customFor({S64}) 904 .clampScalar(0, S32, S64) 905 .scalarize(0); 906 } 907 908 getActionDefinitionsBuilder(G_PTR_ADD) 909 .legalIf(all(isPointer(0), sameSize(0, 1))) 910 .scalarize(0) 911 .scalarSameSizeAs(1, 0); 912 913 getActionDefinitionsBuilder(G_PTRMASK) 914 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 915 .scalarSameSizeAs(1, 0) 916 .scalarize(0); 917 918 auto &CmpBuilder = 919 getActionDefinitionsBuilder(G_ICMP) 920 // The compare output type differs based on the register bank of the output, 921 // so make both s1 and s32 legal. 922 // 923 // Scalar compares producing output in scc will be promoted to s32, as that 924 // is the allocatable register type that will be needed for the copy from 925 // scc. This will be promoted during RegBankSelect, and we assume something 926 // before that won't try to use s32 result types. 927 // 928 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 929 // bank. 930 .legalForCartesianProduct( 931 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 932 .legalForCartesianProduct( 933 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 934 if (ST.has16BitInsts()) { 935 CmpBuilder.legalFor({{S1, S16}}); 936 } 937 938 CmpBuilder 939 .widenScalarToNextPow2(1) 940 .clampScalar(1, S32, S64) 941 .scalarize(0) 942 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 943 944 getActionDefinitionsBuilder(G_FCMP) 945 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 946 .widenScalarToNextPow2(1) 947 .clampScalar(1, S32, S64) 948 .scalarize(0); 949 950 // FIXME: fpow has a selection pattern that should move to custom lowering. 
951 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 952 if (ST.has16BitInsts()) 953 Exp2Ops.legalFor({S32, S16}); 954 else 955 Exp2Ops.legalFor({S32}); 956 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 957 Exp2Ops.scalarize(0); 958 959 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 960 if (ST.has16BitInsts()) 961 ExpOps.customFor({{S32}, {S16}}); 962 else 963 ExpOps.customFor({S32}); 964 ExpOps.clampScalar(0, MinScalarFPTy, S32) 965 .scalarize(0); 966 967 getActionDefinitionsBuilder(G_FPOWI) 968 .clampScalar(0, MinScalarFPTy, S32) 969 .lower(); 970 971 // The 64-bit versions produce 32-bit results, but only on the SALU. 972 getActionDefinitionsBuilder(G_CTPOP) 973 .legalFor({{S32, S32}, {S32, S64}}) 974 .clampScalar(0, S32, S32) 975 .widenScalarToNextPow2(1, 32) 976 .clampScalar(1, S32, S64) 977 .scalarize(0) 978 .widenScalarToNextPow2(0, 32); 979 980 // If no 16 bit instr is available, lower into different instructions. 981 if (ST.has16BitInsts()) 982 getActionDefinitionsBuilder(G_IS_FPCLASS) 983 .legalForCartesianProduct({S1}, FPTypes16) 984 .widenScalarToNextPow2(1) 985 .scalarize(0) 986 .lower(); 987 else 988 getActionDefinitionsBuilder(G_IS_FPCLASS) 989 .legalForCartesianProduct({S1}, FPTypesBase) 990 .lowerFor({S1, S16}) 991 .widenScalarToNextPow2(1) 992 .scalarize(0) 993 .lower(); 994 995 // The hardware instructions return a different result on 0 than the generic 996 // instructions expect. The hardware produces -1, but these produce the 997 // bitwidth. 998 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 999 .scalarize(0) 1000 .clampScalar(0, S32, S32) 1001 .clampScalar(1, S32, S64) 1002 .widenScalarToNextPow2(0, 32) 1003 .widenScalarToNextPow2(1, 32) 1004 .custom(); 1005 1006 // The 64-bit versions produce 32-bit results, but only on the SALU. 1007 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 1008 .legalFor({{S32, S32}, {S32, S64}}) 1009 .clampScalar(0, S32, S32) 1010 .clampScalar(1, S32, S64) 1011 .scalarize(0) 1012 .widenScalarToNextPow2(0, 32) 1013 .widenScalarToNextPow2(1, 32); 1014 1015 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1016 // RegBankSelect. 1017 getActionDefinitionsBuilder(G_BITREVERSE) 1018 .legalFor({S32, S64}) 1019 .clampScalar(0, S32, S64) 1020 .scalarize(0) 1021 .widenScalarToNextPow2(0); 1022 1023 if (ST.has16BitInsts()) { 1024 getActionDefinitionsBuilder(G_BSWAP) 1025 .legalFor({S16, S32, V2S16}) 1026 .clampMaxNumElementsStrict(0, S16, 2) 1027 // FIXME: Fixing non-power-of-2 before clamp is workaround for 1028 // narrowScalar limitation. 1029 .widenScalarToNextPow2(0) 1030 .clampScalar(0, S16, S32) 1031 .scalarize(0); 1032 1033 if (ST.hasVOP3PInsts()) { 1034 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1035 .legalFor({S32, S16, V2S16}) 1036 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1037 .clampMaxNumElements(0, S16, 2) 1038 .minScalar(0, S16) 1039 .widenScalarToNextPow2(0) 1040 .scalarize(0) 1041 .lower(); 1042 } else { 1043 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1044 .legalFor({S32, S16}) 1045 .widenScalarToNextPow2(0) 1046 .minScalar(0, S16) 1047 .scalarize(0) 1048 .lower(); 1049 } 1050 } else { 1051 // TODO: Should have same legality without v_perm_b32 1052 getActionDefinitionsBuilder(G_BSWAP) 1053 .legalFor({S32}) 1054 .lowerIf(scalarNarrowerThan(0, 32)) 1055 // FIXME: Fixing non-power-of-2 before clamp is workaround for 1056 // narrowScalar limitation. 
1057 .widenScalarToNextPow2(0) 1058 .maxScalar(0, S32) 1059 .scalarize(0) 1060 .lower(); 1061 1062 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1063 .legalFor({S32}) 1064 .minScalar(0, S32) 1065 .widenScalarToNextPow2(0) 1066 .scalarize(0) 1067 .lower(); 1068 } 1069 1070 getActionDefinitionsBuilder(G_INTTOPTR) 1071 // List the common cases 1072 .legalForCartesianProduct(AddrSpaces64, {S64}) 1073 .legalForCartesianProduct(AddrSpaces32, {S32}) 1074 .scalarize(0) 1075 // Accept any address space as long as the size matches 1076 .legalIf(sameSize(0, 1)) 1077 .widenScalarIf(smallerThan(1, 0), 1078 [](const LegalityQuery &Query) { 1079 return std::pair( 1080 1, LLT::scalar(Query.Types[0].getSizeInBits())); 1081 }) 1082 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) { 1083 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1084 }); 1085 1086 getActionDefinitionsBuilder(G_PTRTOINT) 1087 // List the common cases 1088 .legalForCartesianProduct(AddrSpaces64, {S64}) 1089 .legalForCartesianProduct(AddrSpaces32, {S32}) 1090 .scalarize(0) 1091 // Accept any address space as long as the size matches 1092 .legalIf(sameSize(0, 1)) 1093 .widenScalarIf(smallerThan(0, 1), 1094 [](const LegalityQuery &Query) { 1095 return std::pair( 1096 0, LLT::scalar(Query.Types[1].getSizeInBits())); 1097 }) 1098 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) { 1099 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1100 }); 1101 1102 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1103 .scalarize(0) 1104 .custom(); 1105 1106 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1107 bool IsLoad) -> bool { 1108 const LLT DstTy = Query.Types[0]; 1109 1110 // Split vector extloads. 1111 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1112 1113 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1114 return true; 1115 1116 const LLT PtrTy = Query.Types[1]; 1117 unsigned AS = PtrTy.getAddressSpace(); 1118 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 1119 return true; 1120 1121 // Catch weird sized loads that don't evenly divide into the access sizes 1122 // TODO: May be able to widen depending on alignment etc. 1123 unsigned NumRegs = (MemSize + 31) / 32; 1124 if (NumRegs == 3) { 1125 if (!ST.hasDwordx3LoadStores()) 1126 return true; 1127 } else { 1128 // If the alignment allows, these should have been widened. 1129 if (!isPowerOf2_32(NumRegs)) 1130 return true; 1131 } 1132 1133 return false; 1134 }; 1135 1136 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; 1137 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1138 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1139 1140 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1141 // LDS 1142 // TODO: Unsupported flat for SI. 1143 1144 for (unsigned Op : {G_LOAD, G_STORE}) { 1145 const bool IsStore = Op == G_STORE; 1146 1147 auto &Actions = getActionDefinitionsBuilder(Op); 1148 // Explicitly list some common cases. 1149 // TODO: Does this help compile time at all? 
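    // Each tuple below is {result type, pointer type, memory type, minimum
    // alignment in bits}; an alignment of 0 adds no extra restriction.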
1150 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32}, 1151 {V2S32, GlobalPtr, V2S32, GlobalAlign32}, 1152 {V4S32, GlobalPtr, V4S32, GlobalAlign32}, 1153 {S64, GlobalPtr, S64, GlobalAlign32}, 1154 {V2S64, GlobalPtr, V2S64, GlobalAlign32}, 1155 {V2S16, GlobalPtr, V2S16, GlobalAlign32}, 1156 {S32, GlobalPtr, S8, GlobalAlign8}, 1157 {S32, GlobalPtr, S16, GlobalAlign16}, 1158 1159 {S32, LocalPtr, S32, 32}, 1160 {S64, LocalPtr, S64, 32}, 1161 {V2S32, LocalPtr, V2S32, 32}, 1162 {S32, LocalPtr, S8, 8}, 1163 {S32, LocalPtr, S16, 16}, 1164 {V2S16, LocalPtr, S32, 32}, 1165 1166 {S32, PrivatePtr, S32, 32}, 1167 {S32, PrivatePtr, S8, 8}, 1168 {S32, PrivatePtr, S16, 16}, 1169 {V2S16, PrivatePtr, S32, 32}, 1170 1171 {S32, ConstantPtr, S32, GlobalAlign32}, 1172 {V2S32, ConstantPtr, V2S32, GlobalAlign32}, 1173 {V4S32, ConstantPtr, V4S32, GlobalAlign32}, 1174 {S64, ConstantPtr, S64, GlobalAlign32}, 1175 {V2S32, ConstantPtr, V2S32, GlobalAlign32}}); 1176 Actions.legalIf( 1177 [=](const LegalityQuery &Query) -> bool { 1178 return isLoadStoreLegal(ST, Query); 1179 }); 1180 1181 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1182 // 64-bits. 1183 // 1184 // TODO: Should generalize bitcast action into coerce, which will also cover 1185 // inserting addrspacecasts. 1186 Actions.customIf(typeIs(1, Constant32Ptr)); 1187 1188 // Turn any illegal element vectors into something easier to deal 1189 // with. These will ultimately produce 32-bit scalar shifts to extract the 1190 // parts anyway. 1191 // 1192 // For odd 16-bit element vectors, prefer to split those into pieces with 1193 // 16-bit vector parts. 1194 Actions.bitcastIf( 1195 [=](const LegalityQuery &Query) -> bool { 1196 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1197 Query.MMODescrs[0].MemoryTy); 1198 }, bitcastToRegisterType(0)); 1199 1200 if (!IsStore) { 1201 // Widen suitably aligned loads by loading extra bytes. The standard 1202 // legalization actions can't properly express widening memory operands. 1203 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1204 return shouldWidenLoad(ST, Query, G_LOAD); 1205 }); 1206 } 1207 1208 // FIXME: load/store narrowing should be moved to lower action 1209 Actions 1210 .narrowScalarIf( 1211 [=](const LegalityQuery &Query) -> bool { 1212 return !Query.Types[0].isVector() && 1213 needToSplitMemOp(Query, Op == G_LOAD); 1214 }, 1215 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1216 const LLT DstTy = Query.Types[0]; 1217 const LLT PtrTy = Query.Types[1]; 1218 1219 const unsigned DstSize = DstTy.getSizeInBits(); 1220 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1221 1222 // Split extloads. 
1223 if (DstSize > MemSize) 1224 return std::pair(0, LLT::scalar(MemSize)); 1225 1226 unsigned MaxSize = maxSizeForAddrSpace(ST, 1227 PtrTy.getAddressSpace(), 1228 Op == G_LOAD); 1229 if (MemSize > MaxSize) 1230 return std::pair(0, LLT::scalar(MaxSize)); 1231 1232 uint64_t Align = Query.MMODescrs[0].AlignInBits; 1233 return std::pair(0, LLT::scalar(Align)); 1234 }) 1235 .fewerElementsIf( 1236 [=](const LegalityQuery &Query) -> bool { 1237 return Query.Types[0].isVector() && 1238 needToSplitMemOp(Query, Op == G_LOAD); 1239 }, 1240 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1241 const LLT DstTy = Query.Types[0]; 1242 const LLT PtrTy = Query.Types[1]; 1243 1244 LLT EltTy = DstTy.getElementType(); 1245 unsigned MaxSize = maxSizeForAddrSpace(ST, 1246 PtrTy.getAddressSpace(), 1247 Op == G_LOAD); 1248 1249 // FIXME: Handle widened to power of 2 results better. This ends 1250 // up scalarizing. 1251 // FIXME: 3 element stores scalarized on SI 1252 1253 // Split if it's too large for the address space. 1254 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1255 if (MemSize > MaxSize) { 1256 unsigned NumElts = DstTy.getNumElements(); 1257 unsigned EltSize = EltTy.getSizeInBits(); 1258 1259 if (MaxSize % EltSize == 0) { 1260 return std::pair( 1261 0, LLT::scalarOrVector( 1262 ElementCount::getFixed(MaxSize / EltSize), EltTy)); 1263 } 1264 1265 unsigned NumPieces = MemSize / MaxSize; 1266 1267 // FIXME: Refine when odd breakdowns handled 1268 // The scalars will need to be re-legalized. 1269 if (NumPieces == 1 || NumPieces >= NumElts || 1270 NumElts % NumPieces != 0) 1271 return std::pair(0, EltTy); 1272 1273 return std::pair(0, 1274 LLT::fixed_vector(NumElts / NumPieces, EltTy)); 1275 } 1276 1277 // FIXME: We could probably handle weird extending loads better. 1278 if (DstTy.getSizeInBits() > MemSize) 1279 return std::pair(0, EltTy); 1280 1281 unsigned EltSize = EltTy.getSizeInBits(); 1282 unsigned DstSize = DstTy.getSizeInBits(); 1283 if (!isPowerOf2_32(DstSize)) { 1284 // We're probably decomposing an odd sized store. Try to split 1285 // to the widest type. TODO: Account for alignment. As-is it 1286 // should be OK, since the new parts will be further legalized. 1287 unsigned FloorSize = PowerOf2Floor(DstSize); 1288 return std::pair( 1289 0, LLT::scalarOrVector( 1290 ElementCount::getFixed(FloorSize / EltSize), EltTy)); 1291 } 1292 1293 // May need relegalization for the scalars. 1294 return std::pair(0, EltTy); 1295 }) 1296 .minScalar(0, S32) 1297 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) 1298 .widenScalarToNextPow2(0) 1299 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1300 .lower(); 1301 } 1302 1303 // FIXME: Unaligned accesses not lowered. 1304 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1305 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8}, 1306 {S32, GlobalPtr, S16, 2 * 8}, 1307 {S32, LocalPtr, S8, 8}, 1308 {S32, LocalPtr, S16, 16}, 1309 {S32, PrivatePtr, S8, 8}, 1310 {S32, PrivatePtr, S16, 16}, 1311 {S32, ConstantPtr, S8, 8}, 1312 {S32, ConstantPtr, S16, 2 * 8}}) 1313 .legalIf( 1314 [=](const LegalityQuery &Query) -> bool { 1315 return isLoadStoreLegal(ST, Query); 1316 }); 1317 1318 if (ST.hasFlatAddressSpace()) { 1319 ExtLoads.legalForTypesWithMemDesc( 1320 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}}); 1321 } 1322 1323 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1324 // 64-bits. 
1325 // 1326 // TODO: Should generalize bitcast action into coerce, which will also cover 1327 // inserting addrspacecasts. 1328 ExtLoads.customIf(typeIs(1, Constant32Ptr)); 1329 1330 ExtLoads.clampScalar(0, S32, S32) 1331 .widenScalarToNextPow2(0) 1332 .lower(); 1333 1334 auto &Atomics = getActionDefinitionsBuilder( 1335 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1336 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1337 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1338 G_ATOMICRMW_UMIN}) 1339 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1340 {S64, GlobalPtr}, {S64, LocalPtr}, 1341 {S32, RegionPtr}, {S64, RegionPtr}}); 1342 if (ST.hasFlatAddressSpace()) { 1343 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1344 } 1345 1346 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1347 if (ST.hasLDSFPAtomicAdd()) { 1348 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1349 if (ST.hasGFX90AInsts()) 1350 Atomic.legalFor({{S64, LocalPtr}}); 1351 if (ST.hasGFX940Insts()) 1352 Atomic.legalFor({{V2S16, LocalPtr}}); 1353 } 1354 if (ST.hasAtomicFaddInsts()) 1355 Atomic.legalFor({{S32, GlobalPtr}}); 1356 if (ST.hasFlatAtomicFaddF32Inst()) 1357 Atomic.legalFor({{S32, FlatPtr}}); 1358 1359 if (ST.hasGFX90AInsts()) { 1360 // These are legal with some caveats, and should have undergone expansion in 1361 // the IR in most situations 1362 // TODO: Move atomic expansion into legalizer 1363 Atomic.legalFor({ 1364 {S32, GlobalPtr}, 1365 {S64, GlobalPtr}, 1366 {S64, FlatPtr} 1367 }); 1368 } 1369 1370 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1371 // demarshalling 1372 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1373 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1374 {S32, FlatPtr}, {S64, FlatPtr}}) 1375 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1376 {S32, RegionPtr}, {S64, RegionPtr}}); 1377 // TODO: Pointer types, any 32-bit or 64-bit vector 1378 1379 // Condition should be s32 for scalar, s1 for vector. 1380 getActionDefinitionsBuilder(G_SELECT) 1381 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr, 1382 LocalPtr, FlatPtr, PrivatePtr, 1383 LLT::fixed_vector(2, LocalPtr), 1384 LLT::fixed_vector(2, PrivatePtr)}, 1385 {S1, S32}) 1386 .clampScalar(0, S16, S64) 1387 .scalarize(1) 1388 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1389 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1390 .clampMaxNumElements(0, S32, 2) 1391 .clampMaxNumElements(0, LocalPtr, 2) 1392 .clampMaxNumElements(0, PrivatePtr, 2) 1393 .scalarize(0) 1394 .widenScalarToNextPow2(0) 1395 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1396 1397 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1398 // be more flexible with the shift amount type. 1399 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1400 .legalFor({{S32, S32}, {S64, S32}}); 1401 if (ST.has16BitInsts()) { 1402 if (ST.hasVOP3PInsts()) { 1403 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1404 .clampMaxNumElements(0, S16, 2); 1405 } else 1406 Shifts.legalFor({{S16, S16}}); 1407 1408 // TODO: Support 16-bit shift amounts for all types 1409 Shifts.widenScalarIf( 1410 [=](const LegalityQuery &Query) { 1411 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1412 // 32-bit amount. 
1413 const LLT ValTy = Query.Types[0]; 1414 const LLT AmountTy = Query.Types[1]; 1415 return ValTy.getSizeInBits() <= 16 && 1416 AmountTy.getSizeInBits() < 16; 1417 }, changeTo(1, S16)); 1418 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1419 Shifts.clampScalar(1, S32, S32); 1420 Shifts.widenScalarToNextPow2(0, 16); 1421 Shifts.clampScalar(0, S16, S64); 1422 1423 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1424 .minScalar(0, S16) 1425 .scalarize(0) 1426 .lower(); 1427 } else { 1428 // Make sure we legalize the shift amount type first, as the general 1429 // expansion for the shifted type will produce much worse code if it hasn't 1430 // been truncated already. 1431 Shifts.clampScalar(1, S32, S32); 1432 Shifts.widenScalarToNextPow2(0, 32); 1433 Shifts.clampScalar(0, S32, S64); 1434 1435 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1436 .minScalar(0, S32) 1437 .scalarize(0) 1438 .lower(); 1439 } 1440 Shifts.scalarize(0); 1441 1442 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1443 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1444 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1445 unsigned IdxTypeIdx = 2; 1446 1447 getActionDefinitionsBuilder(Op) 1448 .customIf([=](const LegalityQuery &Query) { 1449 const LLT EltTy = Query.Types[EltTypeIdx]; 1450 const LLT VecTy = Query.Types[VecTypeIdx]; 1451 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1452 const unsigned EltSize = EltTy.getSizeInBits(); 1453 return (EltSize == 32 || EltSize == 64) && 1454 VecTy.getSizeInBits() % 32 == 0 && 1455 VecTy.getSizeInBits() <= MaxRegisterSize && 1456 IdxTy.getSizeInBits() == 32; 1457 }) 1458 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1459 bitcastToVectorElement32(VecTypeIdx)) 1460 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1461 .bitcastIf( 1462 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1463 [=](const LegalityQuery &Query) { 1464 // For > 64-bit element types, try to turn this into a 64-bit 1465 // element vector since we may be able to do better indexing 1466 // if this is scalar. If not, fall back to 32. 1467 const LLT EltTy = Query.Types[EltTypeIdx]; 1468 const LLT VecTy = Query.Types[VecTypeIdx]; 1469 const unsigned DstEltSize = EltTy.getSizeInBits(); 1470 const unsigned VecSize = VecTy.getSizeInBits(); 1471 1472 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1473 return std::pair( 1474 VecTypeIdx, 1475 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize)); 1476 }) 1477 .clampScalar(EltTypeIdx, S32, S64) 1478 .clampScalar(VecTypeIdx, S32, S64) 1479 .clampScalar(IdxTypeIdx, S32, S32) 1480 .clampMaxNumElements(VecTypeIdx, S32, 32) 1481 // TODO: Clamp elements for 64-bit vectors? 1482 // It should only be necessary with variable indexes. 1483 // As a last resort, lower to the stack 1484 .lower(); 1485 } 1486 1487 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1488 .unsupportedIf([=](const LegalityQuery &Query) { 1489 const LLT &EltTy = Query.Types[1].getElementType(); 1490 return Query.Types[0] != EltTy; 1491 }); 1492 1493 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1494 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1495 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1496 1497 // FIXME: Doesn't handle extract of illegal sizes. 1498 getActionDefinitionsBuilder(Op) 1499 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1500 .lowerIf([=](const LegalityQuery &Query) { 1501 // Sub-vector(or single element) insert and extract. 
1502 // TODO: verify immediate offset here since lower only works with 1503 // whole elements. 1504 const LLT BigTy = Query.Types[BigTyIdx]; 1505 return BigTy.isVector(); 1506 }) 1507 // FIXME: Multiples of 16 should not be legal. 1508 .legalIf([=](const LegalityQuery &Query) { 1509 const LLT BigTy = Query.Types[BigTyIdx]; 1510 const LLT LitTy = Query.Types[LitTyIdx]; 1511 return (BigTy.getSizeInBits() % 32 == 0) && 1512 (LitTy.getSizeInBits() % 16 == 0); 1513 }) 1514 .widenScalarIf( 1515 [=](const LegalityQuery &Query) { 1516 const LLT BigTy = Query.Types[BigTyIdx]; 1517 return (BigTy.getScalarSizeInBits() < 16); 1518 }, 1519 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1520 .widenScalarIf( 1521 [=](const LegalityQuery &Query) { 1522 const LLT LitTy = Query.Types[LitTyIdx]; 1523 return (LitTy.getScalarSizeInBits() < 16); 1524 }, 1525 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1526 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1527 .widenScalarToNextPow2(BigTyIdx, 32); 1528 1529 } 1530 1531 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 1532 .legalForCartesianProduct(AllS32Vectors, {S32}) 1533 .legalForCartesianProduct(AllS64Vectors, {S64}) 1534 .clampNumElements(0, V16S32, V32S32) 1535 .clampNumElements(0, V2S64, V16S64) 1536 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 1537 1538 if (ST.hasScalarPackInsts()) { 1539 BuildVector 1540 // FIXME: Should probably widen s1 vectors straight to s32 1541 .minScalarOrElt(0, S16) 1542 .minScalar(1, S16); 1543 1544 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1545 .legalFor({V2S16, S32}) 1546 .lower(); 1547 } else { 1548 BuildVector.customFor({V2S16, S16}); 1549 BuildVector.minScalarOrElt(0, S32); 1550 1551 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1552 .customFor({V2S16, S32}) 1553 .lower(); 1554 } 1555 1556 BuildVector.legalIf(isRegisterType(0)); 1557 1558 // FIXME: Clamp maximum size 1559 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1560 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1561 .clampMaxNumElements(0, S32, 32) 1562 .clampMaxNumElements(1, S16, 2) // TODO: Make 4? 1563 .clampMaxNumElements(0, S16, 64); 1564 1565 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1566 1567 // Merge/Unmerge 1568 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1569 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1570 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1571 1572 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1573 const LLT Ty = Query.Types[TypeIdx]; 1574 if (Ty.isVector()) { 1575 const LLT &EltTy = Ty.getElementType(); 1576 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1577 return true; 1578 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1579 return true; 1580 } 1581 return false; 1582 }; 1583 1584 auto &Builder = getActionDefinitionsBuilder(Op) 1585 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1586 .lowerFor({{S16, V2S16}}) 1587 .lowerIf([=](const LegalityQuery &Query) { 1588 const LLT BigTy = Query.Types[BigTyIdx]; 1589 return BigTy.getSizeInBits() == 32; 1590 }) 1591 // Try to widen to s16 first for small types. 
1592 // TODO: Only do this on targets with legal s16 shifts 1593 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1594 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1595 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1596 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1597 elementTypeIs(1, S16)), 1598 changeTo(1, V2S16)) 1599 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1600 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1601 // valid. 1602 .clampScalar(LitTyIdx, S32, S512) 1603 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1604 // Break up vectors with weird elements into scalars 1605 .fewerElementsIf( 1606 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1607 scalarize(0)) 1608 .fewerElementsIf( 1609 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1610 scalarize(1)) 1611 .clampScalar(BigTyIdx, S32, MaxScalar); 1612 1613 if (Op == G_MERGE_VALUES) { 1614 Builder.widenScalarIf( 1615 // TODO: Use 16-bit shifts if legal for 8-bit values? 1616 [=](const LegalityQuery &Query) { 1617 const LLT Ty = Query.Types[LitTyIdx]; 1618 return Ty.getSizeInBits() < 32; 1619 }, 1620 changeTo(LitTyIdx, S32)); 1621 } 1622 1623 Builder.widenScalarIf( 1624 [=](const LegalityQuery &Query) { 1625 const LLT Ty = Query.Types[BigTyIdx]; 1626 return !isPowerOf2_32(Ty.getSizeInBits()) && 1627 Ty.getSizeInBits() % 16 != 0; 1628 }, 1629 [=](const LegalityQuery &Query) { 1630 // Pick the next power of 2, or a multiple of 64 over 128. 1631 // Whichever is smaller. 1632 const LLT &Ty = Query.Types[BigTyIdx]; 1633 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1634 if (NewSizeInBits >= 256) { 1635 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1636 if (RoundedTo < NewSizeInBits) 1637 NewSizeInBits = RoundedTo; 1638 } 1639 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1640 }) 1641 // Any vectors left are the wrong size. Scalarize them. 1642 .scalarize(0) 1643 .scalarize(1); 1644 } 1645 1646 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1647 // RegBankSelect. 1648 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1649 .legalFor({{S32}, {S64}}); 1650 1651 if (ST.hasVOP3PInsts()) { 1652 SextInReg.lowerFor({{V2S16}}) 1653 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1654 // get more vector shift opportunities, since we'll get those when 1655 // expanded. 1656 .clampMaxNumElementsStrict(0, S16, 2); 1657 } else if (ST.has16BitInsts()) { 1658 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1659 } else { 1660 // Prefer to promote to s32 before lowering if we don't have 16-bit 1661 // shifts. This avoid a lot of intermediate truncate and extend operations. 1662 SextInReg.lowerFor({{S32}, {S64}}); 1663 } 1664 1665 SextInReg 1666 .scalarize(0) 1667 .clampScalar(0, S32, S64) 1668 .lower(); 1669 1670 getActionDefinitionsBuilder({G_ROTR, G_ROTL}) 1671 .scalarize(0) 1672 .lower(); 1673 1674 // TODO: Only Try to form v2s16 with legal packed instructions. 
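  // A 32-bit funnel shift right is a single native instruction
  // (v_alignbit_b32); v2s16 and all other types are lowered after clamping and
  // scalarizing.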
1675 getActionDefinitionsBuilder(G_FSHR) 1676 .legalFor({{S32, S32}}) 1677 .lowerFor({{V2S16, V2S16}}) 1678 .clampMaxNumElementsStrict(0, S16, 2) 1679 .scalarize(0) 1680 .lower(); 1681 1682 if (ST.hasVOP3PInsts()) { 1683 getActionDefinitionsBuilder(G_FSHL) 1684 .lowerFor({{V2S16, V2S16}}) 1685 .clampMaxNumElementsStrict(0, S16, 2) 1686 .scalarize(0) 1687 .lower(); 1688 } else { 1689 getActionDefinitionsBuilder(G_FSHL) 1690 .scalarize(0) 1691 .lower(); 1692 } 1693 1694 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1695 .legalFor({S64}); 1696 1697 getActionDefinitionsBuilder(G_FENCE) 1698 .alwaysLegal(); 1699 1700 getActionDefinitionsBuilder({G_SMULO, G_UMULO}) 1701 .scalarize(0) 1702 .minScalar(0, S32) 1703 .lower(); 1704 1705 getActionDefinitionsBuilder({G_SBFX, G_UBFX}) 1706 .legalFor({{S32, S32}, {S64, S32}}) 1707 .clampScalar(1, S32, S32) 1708 .clampScalar(0, S32, S64) 1709 .widenScalarToNextPow2(0) 1710 .scalarize(0); 1711 1712 getActionDefinitionsBuilder({ 1713 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1714 G_FCOPYSIGN, 1715 1716 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1717 G_ATOMICRMW_NAND, 1718 G_ATOMICRMW_FSUB, 1719 G_READ_REGISTER, 1720 G_WRITE_REGISTER, 1721 1722 G_SADDO, G_SSUBO, 1723 1724 // TODO: Implement 1725 G_FMINIMUM, G_FMAXIMUM}).lower(); 1726 1727 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET}) 1728 .lower(); 1729 1730 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1731 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1732 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1733 .unsupported(); 1734 1735 getLegacyLegalizerInfo().computeTables(); 1736 verify(*ST.getInstrInfo()); 1737 } 1738 1739 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1740 MachineInstr &MI) const { 1741 MachineIRBuilder &B = Helper.MIRBuilder; 1742 MachineRegisterInfo &MRI = *B.getMRI(); 1743 1744 switch (MI.getOpcode()) { 1745 case TargetOpcode::G_ADDRSPACE_CAST: 1746 return legalizeAddrSpaceCast(MI, MRI, B); 1747 case TargetOpcode::G_FRINT: 1748 return legalizeFrint(MI, MRI, B); 1749 case TargetOpcode::G_FCEIL: 1750 return legalizeFceil(MI, MRI, B); 1751 case TargetOpcode::G_FREM: 1752 return legalizeFrem(MI, MRI, B); 1753 case TargetOpcode::G_INTRINSIC_TRUNC: 1754 return legalizeIntrinsicTrunc(MI, MRI, B); 1755 case TargetOpcode::G_SITOFP: 1756 return legalizeITOFP(MI, MRI, B, true); 1757 case TargetOpcode::G_UITOFP: 1758 return legalizeITOFP(MI, MRI, B, false); 1759 case TargetOpcode::G_FPTOSI: 1760 return legalizeFPTOI(MI, MRI, B, true); 1761 case TargetOpcode::G_FPTOUI: 1762 return legalizeFPTOI(MI, MRI, B, false); 1763 case TargetOpcode::G_FMINNUM: 1764 case TargetOpcode::G_FMAXNUM: 1765 case TargetOpcode::G_FMINNUM_IEEE: 1766 case TargetOpcode::G_FMAXNUM_IEEE: 1767 return legalizeMinNumMaxNum(Helper, MI); 1768 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1769 return legalizeExtractVectorElt(MI, MRI, B); 1770 case TargetOpcode::G_INSERT_VECTOR_ELT: 1771 return legalizeInsertVectorElt(MI, MRI, B); 1772 case TargetOpcode::G_FSIN: 1773 case TargetOpcode::G_FCOS: 1774 return legalizeSinCos(MI, MRI, B); 1775 case TargetOpcode::G_GLOBAL_VALUE: 1776 return legalizeGlobalValue(MI, MRI, B); 1777 case TargetOpcode::G_LOAD: 1778 case TargetOpcode::G_SEXTLOAD: 1779 case TargetOpcode::G_ZEXTLOAD: 1780 return legalizeLoad(Helper, MI); 1781 case TargetOpcode::G_FMAD: 1782 return legalizeFMad(MI, MRI, B); 1783 case TargetOpcode::G_FDIV: 1784 return legalizeFDIV(MI, MRI, B); 1785 case TargetOpcode::G_UDIV: 1786 case TargetOpcode::G_UREM: 1787 case 
TargetOpcode::G_UDIVREM: 1788 return legalizeUnsignedDIV_REM(MI, MRI, B); 1789 case TargetOpcode::G_SDIV: 1790 case TargetOpcode::G_SREM: 1791 case TargetOpcode::G_SDIVREM: 1792 return legalizeSignedDIV_REM(MI, MRI, B); 1793 case TargetOpcode::G_ATOMIC_CMPXCHG: 1794 return legalizeAtomicCmpXChg(MI, MRI, B); 1795 case TargetOpcode::G_FLOG: 1796 return legalizeFlog(MI, B, numbers::ln2f); 1797 case TargetOpcode::G_FLOG10: 1798 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1799 case TargetOpcode::G_FEXP: 1800 return legalizeFExp(MI, B); 1801 case TargetOpcode::G_FPOW: 1802 return legalizeFPow(MI, B); 1803 case TargetOpcode::G_FFLOOR: 1804 return legalizeFFloor(MI, MRI, B); 1805 case TargetOpcode::G_BUILD_VECTOR: 1806 case TargetOpcode::G_BUILD_VECTOR_TRUNC: 1807 return legalizeBuildVector(MI, MRI, B); 1808 case TargetOpcode::G_MUL: 1809 return legalizeMul(Helper, MI); 1810 case TargetOpcode::G_CTLZ: 1811 case TargetOpcode::G_CTTZ: 1812 return legalizeCTLZ_CTTZ(MI, MRI, B); 1813 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND: 1814 return legalizeFPTruncRound(MI, B); 1815 default: 1816 return false; 1817 } 1818 1819 llvm_unreachable("expected switch to return"); 1820 } 1821 1822 Register AMDGPULegalizerInfo::getSegmentAperture( 1823 unsigned AS, 1824 MachineRegisterInfo &MRI, 1825 MachineIRBuilder &B) const { 1826 MachineFunction &MF = B.getMF(); 1827 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1828 const LLT S32 = LLT::scalar(32); 1829 const LLT S64 = LLT::scalar(64); 1830 1831 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1832 1833 if (ST.hasApertureRegs()) { 1834 // Note: this register is somewhat broken. When used as a 32-bit operand, 1835 // it only returns zeroes. The real value is in the upper 32 bits. 1836 // Thus, we must emit extract the high 32 bits. 1837 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS) 1838 ? AMDGPU::SRC_SHARED_BASE 1839 : AMDGPU::SRC_PRIVATE_BASE; 1840 // FIXME: It would be more natural to emit a COPY here, but then copy 1841 // coalescing would kick in and it would think it's okay to use the "HI" 1842 // subregister (instead of extracting the HI 32 bits) which is an artificial 1843 // (unusable) register. 1844 // Register TableGen definitions would need an overhaul to get rid of the 1845 // artificial "HI" aperture registers and prevent this kind of issue from 1846 // happening. 1847 Register Dst = MRI.createGenericVirtualRegister(S64); 1848 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass); 1849 B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)}); 1850 return B.buildUnmerge(S32, Dst).getReg(1); 1851 } 1852 1853 // TODO: can we be smarter about machine pointer info? 1854 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1855 Register LoadAddr = MRI.createGenericVirtualRegister( 1856 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1857 // For code object version 5, private_base and shared_base are passed through 1858 // implicit kernargs. 1859 if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { 1860 AMDGPUTargetLowering::ImplicitParameter Param = 1861 AS == AMDGPUAS::LOCAL_ADDRESS ? 
AMDGPUTargetLowering::SHARED_BASE 1862 : AMDGPUTargetLowering::PRIVATE_BASE; 1863 uint64_t Offset = 1864 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); 1865 1866 Register KernargPtrReg = MRI.createGenericVirtualRegister( 1867 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1868 1869 if (!loadInputValue(KernargPtrReg, B, 1870 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 1871 return Register(); 1872 1873 MachineMemOperand *MMO = MF.getMachineMemOperand( 1874 PtrInfo, 1875 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1876 MachineMemOperand::MOInvariant, 1877 LLT::scalar(32), commonAlignment(Align(64), Offset)); 1878 1879 // Pointer address 1880 B.buildPtrAdd(LoadAddr, KernargPtrReg, 1881 B.buildConstant(LLT::scalar(64), Offset).getReg(0)); 1882 // Load address 1883 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1884 } 1885 1886 Register QueuePtr = MRI.createGenericVirtualRegister( 1887 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1888 1889 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1890 return Register(); 1891 1892 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1893 // private_segment_aperture_base_hi. 1894 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1895 1896 MachineMemOperand *MMO = MF.getMachineMemOperand( 1897 PtrInfo, 1898 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1899 MachineMemOperand::MOInvariant, 1900 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 1901 1902 B.buildPtrAdd(LoadAddr, QueuePtr, 1903 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0)); 1904 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1905 } 1906 1907 /// Return true if the value is a known valid address, such that a null check is 1908 /// not necessary. 1909 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, 1910 const AMDGPUTargetMachine &TM, unsigned AddrSpace) { 1911 MachineInstr *Def = MRI.getVRegDef(Val); 1912 switch (Def->getOpcode()) { 1913 case AMDGPU::G_FRAME_INDEX: 1914 case AMDGPU::G_GLOBAL_VALUE: 1915 case AMDGPU::G_BLOCK_ADDR: 1916 return true; 1917 case AMDGPU::G_CONSTANT: { 1918 const ConstantInt *CI = Def->getOperand(1).getCImm(); 1919 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace); 1920 } 1921 default: 1922 return false; 1923 } 1924 1925 return false; 1926 } 1927 1928 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1929 MachineInstr &MI, MachineRegisterInfo &MRI, 1930 MachineIRBuilder &B) const { 1931 MachineFunction &MF = B.getMF(); 1932 1933 const LLT S32 = LLT::scalar(32); 1934 Register Dst = MI.getOperand(0).getReg(); 1935 Register Src = MI.getOperand(1).getReg(); 1936 1937 LLT DstTy = MRI.getType(Dst); 1938 LLT SrcTy = MRI.getType(Src); 1939 unsigned DestAS = DstTy.getAddressSpace(); 1940 unsigned SrcAS = SrcTy.getAddressSpace(); 1941 1942 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1943 // vector element. 1944 assert(!DstTy.isVector()); 1945 1946 const AMDGPUTargetMachine &TM 1947 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1948 1949 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1950 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1951 return true; 1952 } 1953 1954 if (SrcAS == AMDGPUAS::FLAT_ADDRESS && 1955 (DestAS == AMDGPUAS::LOCAL_ADDRESS || 1956 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) { 1957 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 1958 // Extract low 32-bits of the pointer. 
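      // For a pointer known to be non-null, the cast to LDS/private really is
      // just this truncation: the segment offset lives in the low 32 bits of
      // the flat address. The compare-and-select path below only exists to
      // remap the flat null value onto the segment's null value.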
1959 B.buildExtract(Dst, Src, 0); 1960 MI.eraseFromParent(); 1961 return true; 1962 } 1963 1964 unsigned NullVal = TM.getNullPointerValue(DestAS); 1965 1966 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1967 auto FlatNull = B.buildConstant(SrcTy, 0); 1968 1969 // Extract low 32-bits of the pointer. 1970 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1971 1972 auto CmpRes = 1973 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1974 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1975 1976 MI.eraseFromParent(); 1977 return true; 1978 } 1979 1980 if (DestAS == AMDGPUAS::FLAT_ADDRESS && 1981 (SrcAS == AMDGPUAS::LOCAL_ADDRESS || 1982 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) { 1983 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1984 if (!ApertureReg.isValid()) 1985 return false; 1986 1987 // Coerce the type of the low half of the result so we can use merge_values. 1988 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1989 1990 // TODO: Should we allow mismatched types but matching sizes in merges to 1991 // avoid the ptrtoint? 1992 auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg}); 1993 1994 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 1995 B.buildCopy(Dst, BuildPtr); 1996 MI.eraseFromParent(); 1997 return true; 1998 } 1999 2000 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 2001 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 2002 2003 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, 2004 SegmentNull.getReg(0)); 2005 2006 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 2007 2008 MI.eraseFromParent(); 2009 return true; 2010 } 2011 2012 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 2013 SrcTy.getSizeInBits() == 64) { 2014 // Truncate. 2015 B.buildExtract(Dst, Src, 0); 2016 MI.eraseFromParent(); 2017 return true; 2018 } 2019 2020 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 2021 DstTy.getSizeInBits() == 64) { 2022 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 2023 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 2024 auto PtrLo = B.buildPtrToInt(S32, Src); 2025 auto HighAddr = B.buildConstant(S32, AddrHiVal); 2026 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr}); 2027 MI.eraseFromParent(); 2028 return true; 2029 } 2030 2031 DiagnosticInfoUnsupported InvalidAddrSpaceCast( 2032 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc()); 2033 2034 LLVMContext &Ctx = MF.getFunction().getContext(); 2035 Ctx.diagnose(InvalidAddrSpaceCast); 2036 B.buildUndef(Dst); 2037 MI.eraseFromParent(); 2038 return true; 2039 } 2040 2041 bool AMDGPULegalizerInfo::legalizeFrint( 2042 MachineInstr &MI, MachineRegisterInfo &MRI, 2043 MachineIRBuilder &B) const { 2044 Register Src = MI.getOperand(1).getReg(); 2045 LLT Ty = MRI.getType(Src); 2046 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 2047 2048 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 2049 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 2050 2051 auto C1 = B.buildFConstant(Ty, C1Val); 2052 auto CopySign = B.buildFCopysign(Ty, C1, Src); 2053 2054 // TODO: Should this propagate fast-math-flags? 
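  // The add/sub pair below is the usual round-to-nearest trick: doubles with
  // magnitude >= 2^52 have no fraction bits, so (src + copysign(2^52, src)) -
  // copysign(2^52, src) rounds src to the nearest integer whenever
  // |src| < 2^52. Inputs with |src| > 0x1.fffffffffffffp+51 are already
  // integral and are passed through by the final select.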
2055 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 2056 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 2057 2058 auto C2 = B.buildFConstant(Ty, C2Val); 2059 auto Fabs = B.buildFAbs(Ty, Src); 2060 2061 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 2062 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 2063 MI.eraseFromParent(); 2064 return true; 2065 } 2066 2067 bool AMDGPULegalizerInfo::legalizeFceil( 2068 MachineInstr &MI, MachineRegisterInfo &MRI, 2069 MachineIRBuilder &B) const { 2070 2071 const LLT S1 = LLT::scalar(1); 2072 const LLT S64 = LLT::scalar(64); 2073 2074 Register Src = MI.getOperand(1).getReg(); 2075 assert(MRI.getType(Src) == S64); 2076 2077 // result = trunc(src) 2078 // if (src > 0.0 && src != result) 2079 // result += 1.0 2080 2081 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 2082 2083 const auto Zero = B.buildFConstant(S64, 0.0); 2084 const auto One = B.buildFConstant(S64, 1.0); 2085 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 2086 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 2087 auto And = B.buildAnd(S1, Lt0, NeTrunc); 2088 auto Add = B.buildSelect(S64, And, One, Zero); 2089 2090 // TODO: Should this propagate fast-math-flags? 2091 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 2092 MI.eraseFromParent(); 2093 return true; 2094 } 2095 2096 bool AMDGPULegalizerInfo::legalizeFrem( 2097 MachineInstr &MI, MachineRegisterInfo &MRI, 2098 MachineIRBuilder &B) const { 2099 Register DstReg = MI.getOperand(0).getReg(); 2100 Register Src0Reg = MI.getOperand(1).getReg(); 2101 Register Src1Reg = MI.getOperand(2).getReg(); 2102 auto Flags = MI.getFlags(); 2103 LLT Ty = MRI.getType(DstReg); 2104 2105 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 2106 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 2107 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 2108 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 2109 MI.eraseFromParent(); 2110 return true; 2111 } 2112 2113 static MachineInstrBuilder extractF64Exponent(Register Hi, 2114 MachineIRBuilder &B) { 2115 const unsigned FractBits = 52; 2116 const unsigned ExpBits = 11; 2117 LLT S32 = LLT::scalar(32); 2118 2119 auto Const0 = B.buildConstant(S32, FractBits - 32); 2120 auto Const1 = B.buildConstant(S32, ExpBits); 2121 2122 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 2123 .addUse(Hi) 2124 .addUse(Const0.getReg(0)) 2125 .addUse(Const1.getReg(0)); 2126 2127 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 2128 } 2129 2130 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 2131 MachineInstr &MI, MachineRegisterInfo &MRI, 2132 MachineIRBuilder &B) const { 2133 const LLT S1 = LLT::scalar(1); 2134 const LLT S32 = LLT::scalar(32); 2135 const LLT S64 = LLT::scalar(64); 2136 2137 Register Src = MI.getOperand(1).getReg(); 2138 assert(MRI.getType(Src) == S64); 2139 2140 // TODO: Should this use extract since the low half is unused? 2141 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2142 Register Hi = Unmerge.getReg(1); 2143 2144 // Extract the upper half, since this is where we will find the sign and 2145 // exponent. 2146 auto Exp = extractF64Exponent(Hi, B); 2147 2148 const unsigned FractBits = 52; 2149 2150 // Extract the sign bit. 2151 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 2152 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 2153 2154 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 2155 2156 const auto Zero32 = B.buildConstant(S32, 0); 2157 2158 // Extend back to 64-bits. 
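  // The sign-only value is the result when the exponent is negative (|src| <
  // 1.0 truncates to +/-0.0). Otherwise the shifted mask below clears the
  // fraction bits that lie below the binary point, and exponents above 51
  // mean the value is already integral, so src is returned unchanged.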
2159 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit}); 2160 2161 auto Shr = B.buildAShr(S64, FractMask, Exp); 2162 auto Not = B.buildNot(S64, Shr); 2163 auto Tmp0 = B.buildAnd(S64, Src, Not); 2164 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2165 2166 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2167 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2168 2169 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2170 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2171 MI.eraseFromParent(); 2172 return true; 2173 } 2174 2175 bool AMDGPULegalizerInfo::legalizeITOFP( 2176 MachineInstr &MI, MachineRegisterInfo &MRI, 2177 MachineIRBuilder &B, bool Signed) const { 2178 2179 Register Dst = MI.getOperand(0).getReg(); 2180 Register Src = MI.getOperand(1).getReg(); 2181 2182 const LLT S64 = LLT::scalar(64); 2183 const LLT S32 = LLT::scalar(32); 2184 2185 assert(MRI.getType(Src) == S64); 2186 2187 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2188 auto ThirtyTwo = B.buildConstant(S32, 32); 2189 2190 if (MRI.getType(Dst) == S64) { 2191 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1)) 2192 : B.buildUITOFP(S64, Unmerge.getReg(1)); 2193 2194 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2195 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2196 .addUse(CvtHi.getReg(0)) 2197 .addUse(ThirtyTwo.getReg(0)); 2198 2199 // TODO: Should this propagate fast-math-flags? 2200 B.buildFAdd(Dst, LdExp, CvtLo); 2201 MI.eraseFromParent(); 2202 return true; 2203 } 2204 2205 assert(MRI.getType(Dst) == S32); 2206 2207 auto One = B.buildConstant(S32, 1); 2208 2209 MachineInstrBuilder ShAmt; 2210 if (Signed) { 2211 auto ThirtyOne = B.buildConstant(S32, 31); 2212 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); 2213 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); 2214 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); 2215 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}, 2216 /*HasSideEffects=*/false) 2217 .addUse(Unmerge.getReg(1)); 2218 auto LS2 = B.buildSub(S32, LS, One); 2219 ShAmt = B.buildUMin(S32, LS2, MaxShAmt); 2220 } else 2221 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); 2222 auto Norm = B.buildShl(S64, Src, ShAmt); 2223 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); 2224 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); 2225 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); 2226 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); 2227 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); 2228 B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst}, 2229 /*HasSideEffects=*/false) 2230 .addUse(FVal.getReg(0)) 2231 .addUse(Scale.getReg(0)); 2232 MI.eraseFromParent(); 2233 return true; 2234 } 2235 2236 // TODO: Copied from DAG implementation. Verify logic and document how this 2237 // actually works. 
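// Rough worked example of the expansion below, for x = 2^40 + 7.9:
//   tf  = trunc(x)          = 2^40 + 7
//   hif = floor(tf * 2^-32) = 256
//   lof = tf - hif * 2^32   = 7
//   hi  = fptoi(hif) = 256,  lo = fptoi(lof) = 7  =>  (256 << 32) | 7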
2238 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2239 MachineRegisterInfo &MRI, 2240 MachineIRBuilder &B, 2241 bool Signed) const { 2242 2243 Register Dst = MI.getOperand(0).getReg(); 2244 Register Src = MI.getOperand(1).getReg(); 2245 2246 const LLT S64 = LLT::scalar(64); 2247 const LLT S32 = LLT::scalar(32); 2248 2249 const LLT SrcLT = MRI.getType(Src); 2250 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2251 2252 unsigned Flags = MI.getFlags(); 2253 2254 // The basic idea of converting a floating point number into a pair of 32-bit 2255 // integers is illustrated as follows: 2256 // 2257 // tf := trunc(val); 2258 // hif := floor(tf * 2^-32); 2259 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2260 // hi := fptoi(hif); 2261 // lo := fptoi(lof); 2262 // 2263 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2264 MachineInstrBuilder Sign; 2265 if (Signed && SrcLT == S32) { 2266 // However, a 32-bit floating point number has only 23 bits mantissa and 2267 // it's not enough to hold all the significant bits of `lof` if val is 2268 // negative. To avoid the loss of precision, We need to take the absolute 2269 // value after truncating and flip the result back based on the original 2270 // signedness. 2271 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2272 Trunc = B.buildFAbs(S32, Trunc, Flags); 2273 } 2274 MachineInstrBuilder K0, K1; 2275 if (SrcLT == S64) { 2276 K0 = B.buildFConstant(S64, 2277 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2278 K1 = B.buildFConstant(S64, 2279 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2280 } else { 2281 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); 2282 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); 2283 } 2284 2285 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2286 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2287 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2288 2289 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2290 : B.buildFPTOUI(S32, FloorMul); 2291 auto Lo = B.buildFPTOUI(S32, Fma); 2292 2293 if (Signed && SrcLT == S32) { 2294 // Flip the result based on the signedness, which is either all 0s or 1s. 2295 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign}); 2296 // r := xor({lo, hi}, sign) - sign; 2297 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign), 2298 Sign); 2299 } else 2300 B.buildMergeLikeInstr(Dst, {Lo, Hi}); 2301 MI.eraseFromParent(); 2302 2303 return true; 2304 } 2305 2306 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2307 MachineInstr &MI) const { 2308 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2309 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2310 2311 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2312 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2313 2314 // With ieee_mode disabled, the instructions have the correct behavior 2315 // already for G_FMINNUM/G_FMAXNUM 2316 if (!MFI->getMode().IEEE) 2317 return !IsIEEEOp; 2318 2319 if (IsIEEEOp) 2320 return true; 2321 2322 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2323 } 2324 2325 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2326 MachineInstr &MI, MachineRegisterInfo &MRI, 2327 MachineIRBuilder &B) const { 2328 // TODO: Should move some of this into LegalizerHelper. 
2329 2330 // TODO: Promote dynamic indexing of s16 to s32 2331 2332 // FIXME: Artifact combiner probably should have replaced the truncated 2333 // constant before this, so we shouldn't need 2334 // getIConstantVRegValWithLookThrough. 2335 std::optional<ValueAndVReg> MaybeIdxVal = 2336 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2337 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2338 return true; 2339 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue(); 2340 2341 Register Dst = MI.getOperand(0).getReg(); 2342 Register Vec = MI.getOperand(1).getReg(); 2343 2344 LLT VecTy = MRI.getType(Vec); 2345 LLT EltTy = VecTy.getElementType(); 2346 assert(EltTy == MRI.getType(Dst)); 2347 2348 if (IdxVal < VecTy.getNumElements()) { 2349 auto Unmerge = B.buildUnmerge(EltTy, Vec); 2350 B.buildCopy(Dst, Unmerge.getReg(IdxVal)); 2351 } else { 2352 B.buildUndef(Dst); 2353 } 2354 2355 MI.eraseFromParent(); 2356 return true; 2357 } 2358 2359 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2360 MachineInstr &MI, MachineRegisterInfo &MRI, 2361 MachineIRBuilder &B) const { 2362 // TODO: Should move some of this into LegalizerHelper. 2363 2364 // TODO: Promote dynamic indexing of s16 to s32 2365 2366 // FIXME: Artifact combiner probably should have replaced the truncated 2367 // constant before this, so we shouldn't need 2368 // getIConstantVRegValWithLookThrough. 2369 std::optional<ValueAndVReg> MaybeIdxVal = 2370 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2371 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2372 return true; 2373 2374 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue(); 2375 Register Dst = MI.getOperand(0).getReg(); 2376 Register Vec = MI.getOperand(1).getReg(); 2377 Register Ins = MI.getOperand(2).getReg(); 2378 2379 LLT VecTy = MRI.getType(Vec); 2380 LLT EltTy = VecTy.getElementType(); 2381 assert(EltTy == MRI.getType(Ins)); 2382 (void)Ins; 2383 2384 unsigned NumElts = VecTy.getNumElements(); 2385 if (IdxVal < NumElts) { 2386 SmallVector<Register, 8> SrcRegs; 2387 for (unsigned i = 0; i < NumElts; ++i) 2388 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy)); 2389 B.buildUnmerge(SrcRegs, Vec); 2390 2391 SrcRegs[IdxVal] = MI.getOperand(2).getReg(); 2392 B.buildMergeLikeInstr(Dst, SrcRegs); 2393 } else { 2394 B.buildUndef(Dst); 2395 } 2396 2397 MI.eraseFromParent(); 2398 return true; 2399 } 2400 2401 bool AMDGPULegalizerInfo::legalizeSinCos( 2402 MachineInstr &MI, MachineRegisterInfo &MRI, 2403 MachineIRBuilder &B) const { 2404 2405 Register DstReg = MI.getOperand(0).getReg(); 2406 Register SrcReg = MI.getOperand(1).getReg(); 2407 LLT Ty = MRI.getType(DstReg); 2408 unsigned Flags = MI.getFlags(); 2409 2410 Register TrigVal; 2411 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2412 if (ST.hasTrigReducedRange()) { 2413 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2414 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2415 .addUse(MulVal.getReg(0)) 2416 .setMIFlags(Flags).getReg(0); 2417 } else 2418 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2419 2420 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 
2421 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2422 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg), false) 2423 .addUse(TrigVal) 2424 .setMIFlags(Flags); 2425 MI.eraseFromParent(); 2426 return true; 2427 } 2428 2429 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2430 MachineIRBuilder &B, 2431 const GlobalValue *GV, 2432 int64_t Offset, 2433 unsigned GAFlags) const { 2434 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2435 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2436 // to the following code sequence: 2437 // 2438 // For constant address space: 2439 // s_getpc_b64 s[0:1] 2440 // s_add_u32 s0, s0, $symbol 2441 // s_addc_u32 s1, s1, 0 2442 // 2443 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2444 // a fixup or relocation is emitted to replace $symbol with a literal 2445 // constant, which is a pc-relative offset from the encoding of the $symbol 2446 // operand to the global variable. 2447 // 2448 // For global address space: 2449 // s_getpc_b64 s[0:1] 2450 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2451 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2452 // 2453 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2454 // fixups or relocations are emitted to replace $symbol@*@lo and 2455 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2456 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2457 // operand to the global variable. 2458 // 2459 // What we want here is an offset from the value returned by s_getpc 2460 // (which is the address of the s_add_u32 instruction) to the global 2461 // variable, but since the encoding of $symbol starts 4 bytes after the start 2462 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2463 // small. This requires us to add 4 to the global variable offset in order to 2464 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2465 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2466 // instruction. 2467 2468 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2469 2470 Register PCReg = PtrTy.getSizeInBits() != 32 ? 
DstReg : 2471 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2472 2473 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2474 .addDef(PCReg); 2475 2476 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2477 if (GAFlags == SIInstrInfo::MO_NONE) 2478 MIB.addImm(0); 2479 else 2480 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2481 2482 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2483 2484 if (PtrTy.getSizeInBits() == 32) 2485 B.buildExtract(DstReg, PCReg, 0); 2486 return true; 2487 } 2488 2489 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2490 MachineInstr &MI, MachineRegisterInfo &MRI, 2491 MachineIRBuilder &B) const { 2492 Register DstReg = MI.getOperand(0).getReg(); 2493 LLT Ty = MRI.getType(DstReg); 2494 unsigned AS = Ty.getAddressSpace(); 2495 2496 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2497 MachineFunction &MF = B.getMF(); 2498 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2499 2500 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2501 if (!MFI->isModuleEntryFunction() && 2502 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2503 const Function &Fn = MF.getFunction(); 2504 DiagnosticInfoUnsupported BadLDSDecl( 2505 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2506 DS_Warning); 2507 Fn.getContext().diagnose(BadLDSDecl); 2508 2509 // We currently don't have a way to correctly allocate LDS objects that 2510 // aren't directly associated with a kernel. We do force inlining of 2511 // functions that use local objects. However, if these dead functions are 2512 // not eliminated, we don't want a compile time error. Just emit a warning 2513 // and a trap, since there should be no callable path here. 2514 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2515 B.buildUndef(DstReg); 2516 MI.eraseFromParent(); 2517 return true; 2518 } 2519 2520 // TODO: We could emit code to handle the initialization somewhere. 2521 // We ignore the initializer for now and legalize it to allow selection. 2522 // The initializer will anyway get errored out during assembly emission. 2523 const SITargetLowering *TLI = ST.getTargetLowering(); 2524 if (!TLI->shouldUseLDSConstAddress(GV)) { 2525 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2526 return true; // Leave in place; 2527 } 2528 2529 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2530 Type *Ty = GV->getValueType(); 2531 // HIP uses an unsized array `extern __shared__ T s[]` or similar 2532 // zero-sized type in other languages to declare the dynamic shared 2533 // memory which size is not known at the compile time. They will be 2534 // allocated by the runtime and placed directly after the static 2535 // allocated ones. They all share the same offset. 2536 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2537 // Adjust alignment for that dynamic shared memory array. 
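        // The dynamic array starts right after the statically allocated LDS,
        // so its address is just the total static LDS size:
        // amdgcn.groupstaticsize materializes that size and the integer is
        // reinterpreted as the LDS pointer below.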
2538 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2539 LLT S32 = LLT::scalar(32); 2540 auto Sz = 2541 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2542 B.buildIntToPtr(DstReg, Sz); 2543 MI.eraseFromParent(); 2544 return true; 2545 } 2546 } 2547 2548 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), 2549 *cast<GlobalVariable>(GV))); 2550 MI.eraseFromParent(); 2551 return true; 2552 } 2553 2554 const SITargetLowering *TLI = ST.getTargetLowering(); 2555 2556 if (TLI->shouldEmitFixup(GV)) { 2557 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2558 MI.eraseFromParent(); 2559 return true; 2560 } 2561 2562 if (TLI->shouldEmitPCReloc(GV)) { 2563 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2564 MI.eraseFromParent(); 2565 return true; 2566 } 2567 2568 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2569 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2570 2571 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty; 2572 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2573 MachinePointerInfo::getGOT(MF), 2574 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2575 MachineMemOperand::MOInvariant, 2576 LoadTy, Align(8)); 2577 2578 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2579 2580 if (Ty.getSizeInBits() == 32) { 2581 // Truncate if this is a 32-bit constant address. 2582 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2583 B.buildExtract(DstReg, Load, 0); 2584 } else 2585 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2586 2587 MI.eraseFromParent(); 2588 return true; 2589 } 2590 2591 static LLT widenToNextPowerOf2(LLT Ty) { 2592 if (Ty.isVector()) 2593 return Ty.changeElementCount( 2594 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2595 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2596 } 2597 2598 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2599 MachineInstr &MI) const { 2600 MachineIRBuilder &B = Helper.MIRBuilder; 2601 MachineRegisterInfo &MRI = *B.getMRI(); 2602 GISelChangeObserver &Observer = Helper.Observer; 2603 2604 Register PtrReg = MI.getOperand(1).getReg(); 2605 LLT PtrTy = MRI.getType(PtrReg); 2606 unsigned AddrSpace = PtrTy.getAddressSpace(); 2607 2608 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2609 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2610 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2611 Observer.changingInstr(MI); 2612 MI.getOperand(1).setReg(Cast.getReg(0)); 2613 Observer.changedInstr(MI); 2614 return true; 2615 } 2616 2617 if (MI.getOpcode() != AMDGPU::G_LOAD) 2618 return false; 2619 2620 Register ValReg = MI.getOperand(0).getReg(); 2621 LLT ValTy = MRI.getType(ValReg); 2622 2623 MachineMemOperand *MMO = *MI.memoperands_begin(); 2624 const unsigned ValSize = ValTy.getSizeInBits(); 2625 const LLT MemTy = MMO->getMemoryType(); 2626 const Align MemAlign = MMO->getAlign(); 2627 const unsigned MemSize = MemTy.getSizeInBits(); 2628 const uint64_t AlignInBits = 8 * MemAlign.value(); 2629 2630 // Widen non-power-of-2 loads to the alignment if needed 2631 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 2632 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2633 2634 // This was already the correct extending load result type, so just adjust 2635 // the memory type. 
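    // (e.g. an extending load whose result is already s128 but whose memory
    // type is s96 only needs its MMO widened to 128 bits; no new instructions
    // are required)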
2636 if (WideMemSize == ValSize) { 2637 MachineFunction &MF = B.getMF(); 2638 2639 MachineMemOperand *WideMMO = 2640 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2641 Observer.changingInstr(MI); 2642 MI.setMemRefs(MF, {WideMMO}); 2643 Observer.changedInstr(MI); 2644 return true; 2645 } 2646 2647 // Don't bother handling edge case that should probably never be produced. 2648 if (ValSize > WideMemSize) 2649 return false; 2650 2651 LLT WideTy = widenToNextPowerOf2(ValTy); 2652 2653 Register WideLoad; 2654 if (!WideTy.isVector()) { 2655 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2656 B.buildTrunc(ValReg, WideLoad).getReg(0); 2657 } else { 2658 // Extract the subvector. 2659 2660 if (isRegisterType(ValTy)) { 2661 // If this a case where G_EXTRACT is legal, use it. 2662 // (e.g. <3 x s32> -> <4 x s32>) 2663 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2664 B.buildExtract(ValReg, WideLoad, 0); 2665 } else { 2666 // For cases where the widened type isn't a nice register value, unmerge 2667 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 2668 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2669 B.buildDeleteTrailingVectorElements(ValReg, WideLoad); 2670 } 2671 } 2672 2673 MI.eraseFromParent(); 2674 return true; 2675 } 2676 2677 return false; 2678 } 2679 2680 bool AMDGPULegalizerInfo::legalizeFMad( 2681 MachineInstr &MI, MachineRegisterInfo &MRI, 2682 MachineIRBuilder &B) const { 2683 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2684 assert(Ty.isScalar()); 2685 2686 MachineFunction &MF = B.getMF(); 2687 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2688 2689 // TODO: Always legal with future ftz flag. 2690 // FIXME: Do we need just output? 2691 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2692 return true; 2693 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2694 return true; 2695 2696 MachineIRBuilder HelperBuilder(MI); 2697 GISelObserverWrapper DummyObserver; 2698 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2699 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2700 } 2701 2702 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2703 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2704 Register DstReg = MI.getOperand(0).getReg(); 2705 Register PtrReg = MI.getOperand(1).getReg(); 2706 Register CmpVal = MI.getOperand(2).getReg(); 2707 Register NewVal = MI.getOperand(3).getReg(); 2708 2709 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2710 "this should not have been custom lowered"); 2711 2712 LLT ValTy = MRI.getType(CmpVal); 2713 LLT VecTy = LLT::fixed_vector(2, ValTy); 2714 2715 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2716 2717 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2718 .addDef(DstReg) 2719 .addUse(PtrReg) 2720 .addUse(PackedVal) 2721 .setMemRefs(MI.memoperands()); 2722 2723 MI.eraseFromParent(); 2724 return true; 2725 } 2726 2727 bool AMDGPULegalizerInfo::legalizeFlog( 2728 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2729 Register Dst = MI.getOperand(0).getReg(); 2730 Register Src = MI.getOperand(1).getReg(); 2731 LLT Ty = B.getMRI()->getType(Dst); 2732 unsigned Flags = MI.getFlags(); 2733 2734 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2735 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2736 2737 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2738 
MI.eraseFromParent(); 2739 return true; 2740 } 2741 2742 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2743 MachineIRBuilder &B) const { 2744 Register Dst = MI.getOperand(0).getReg(); 2745 Register Src = MI.getOperand(1).getReg(); 2746 unsigned Flags = MI.getFlags(); 2747 LLT Ty = B.getMRI()->getType(Dst); 2748 2749 auto K = B.buildFConstant(Ty, numbers::log2e); 2750 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2751 B.buildFExp2(Dst, Mul, Flags); 2752 MI.eraseFromParent(); 2753 return true; 2754 } 2755 2756 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2757 MachineIRBuilder &B) const { 2758 Register Dst = MI.getOperand(0).getReg(); 2759 Register Src0 = MI.getOperand(1).getReg(); 2760 Register Src1 = MI.getOperand(2).getReg(); 2761 unsigned Flags = MI.getFlags(); 2762 LLT Ty = B.getMRI()->getType(Dst); 2763 const LLT S16 = LLT::scalar(16); 2764 const LLT S32 = LLT::scalar(32); 2765 2766 if (Ty == S32) { 2767 auto Log = B.buildFLog2(S32, Src0, Flags); 2768 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2769 .addUse(Log.getReg(0)) 2770 .addUse(Src1) 2771 .setMIFlags(Flags); 2772 B.buildFExp2(Dst, Mul, Flags); 2773 } else if (Ty == S16) { 2774 // There's no f16 fmul_legacy, so we need to convert for it. 2775 auto Log = B.buildFLog2(S16, Src0, Flags); 2776 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2777 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2778 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2779 .addUse(Ext0.getReg(0)) 2780 .addUse(Ext1.getReg(0)) 2781 .setMIFlags(Flags); 2782 2783 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2784 } else 2785 return false; 2786 2787 MI.eraseFromParent(); 2788 return true; 2789 } 2790 2791 // Find a source register, ignoring any possible source modifiers. 2792 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2793 Register ModSrc = OrigSrc; 2794 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2795 ModSrc = SrcFNeg->getOperand(1).getReg(); 2796 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2797 ModSrc = SrcFAbs->getOperand(1).getReg(); 2798 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2799 ModSrc = SrcFAbs->getOperand(1).getReg(); 2800 return ModSrc; 2801 } 2802 2803 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2804 MachineRegisterInfo &MRI, 2805 MachineIRBuilder &B) const { 2806 2807 const LLT S1 = LLT::scalar(1); 2808 const LLT S64 = LLT::scalar(64); 2809 Register Dst = MI.getOperand(0).getReg(); 2810 Register OrigSrc = MI.getOperand(1).getReg(); 2811 unsigned Flags = MI.getFlags(); 2812 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2813 "this should not have been custom lowered"); 2814 2815 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2816 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2817 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2818 // V_FRACT bug is: 2819 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2820 // 2821 // Convert floor(x) to (x - fract(x)) 2822 2823 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2824 .addUse(OrigSrc) 2825 .setMIFlags(Flags); 2826 2827 // Give source modifier matching some assistance before obscuring a foldable 2828 // pattern. 2829 2830 // TODO: We can avoid the neg on the fract? The input sign to fract 2831 // shouldn't matter? 
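  // The constant built just below is the largest double strictly less than
  // 1.0 (0x1.fffffffffffffp-1). Clamping the intrinsic's result with it
  // guarantees the workaround never yields a fract of exactly 1.0, which
  // would keep x - fract(x) from producing the exact integer result.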
2832 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2833 2834 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2835 2836 Register Min = MRI.createGenericVirtualRegister(S64); 2837 2838 // We don't need to concern ourselves with the snan handling difference, so 2839 // use the one which will directly select. 2840 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2841 if (MFI->getMode().IEEE) 2842 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2843 else 2844 B.buildFMinNum(Min, Fract, Const, Flags); 2845 2846 Register CorrectedFract = Min; 2847 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2848 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2849 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2850 } 2851 2852 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2853 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2854 2855 MI.eraseFromParent(); 2856 return true; 2857 } 2858 2859 // Turn an illegal packed v2s16 build vector into bit operations. 2860 // TODO: This should probably be a bitcast action in LegalizerHelper. 2861 bool AMDGPULegalizerInfo::legalizeBuildVector( 2862 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2863 Register Dst = MI.getOperand(0).getReg(); 2864 const LLT S32 = LLT::scalar(32); 2865 const LLT S16 = LLT::scalar(16); 2866 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 2867 2868 Register Src0 = MI.getOperand(1).getReg(); 2869 Register Src1 = MI.getOperand(2).getReg(); 2870 2871 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) { 2872 assert(MRI.getType(Src0) == S32); 2873 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0); 2874 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0); 2875 } 2876 2877 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1}); 2878 B.buildBitcast(Dst, Merge); 2879 2880 MI.eraseFromParent(); 2881 return true; 2882 } 2883 2884 // Build a big integer multiply or multiply-add using MAD_64_32 instructions. 2885 // 2886 // Source and accumulation registers must all be 32-bits. 2887 // 2888 // TODO: When the multiply is uniform, we should produce a code sequence 2889 // that is better suited to instruction selection on the SALU. Instead of 2890 // the outer loop going over parts of the result, the outer loop should go 2891 // over parts of one of the factors. This should result in instruction 2892 // selection that makes full use of S_ADDC_U32 instructions. 2893 void AMDGPULegalizerInfo::buildMultiply( 2894 LegalizerHelper &Helper, MutableArrayRef<Register> Accum, 2895 ArrayRef<Register> Src0, ArrayRef<Register> Src1, 2896 bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const { 2897 // Use (possibly empty) vectors of S1 registers to represent the set of 2898 // carries from one pair of positions to the next. 2899 using Carry = SmallVector<Register, 2>; 2900 2901 MachineIRBuilder &B = Helper.MIRBuilder; 2902 2903 const LLT S1 = LLT::scalar(1); 2904 const LLT S32 = LLT::scalar(32); 2905 const LLT S64 = LLT::scalar(64); 2906 2907 Register Zero32; 2908 Register Zero64; 2909 2910 auto getZero32 = [&]() -> Register { 2911 if (!Zero32) 2912 Zero32 = B.buildConstant(S32, 0).getReg(0); 2913 return Zero32; 2914 }; 2915 auto getZero64 = [&]() -> Register { 2916 if (!Zero64) 2917 Zero64 = B.buildConstant(S64, 0).getReg(0); 2918 return Zero64; 2919 }; 2920 2921 // Merge the given carries into the 32-bit LocalAccum, which is modified 2922 // in-place. 
2923 // 2924 // Returns the carry-out, which is a single S1 register or null. 2925 auto mergeCarry = 2926 [&](Register &LocalAccum, const Carry &CarryIn) -> Register { 2927 if (CarryIn.empty()) 2928 return Register(); 2929 2930 bool HaveCarryOut = true; 2931 Register CarryAccum; 2932 if (CarryIn.size() == 1) { 2933 if (!LocalAccum) { 2934 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); 2935 return Register(); 2936 } 2937 2938 CarryAccum = getZero32(); 2939 } else { 2940 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); 2941 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) { 2942 CarryAccum = 2943 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i]) 2944 .getReg(0); 2945 } 2946 2947 if (!LocalAccum) { 2948 LocalAccum = getZero32(); 2949 HaveCarryOut = false; 2950 } 2951 } 2952 2953 auto Add = 2954 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back()); 2955 LocalAccum = Add.getReg(0); 2956 return HaveCarryOut ? Add.getReg(1) : Register(); 2957 }; 2958 2959 // Build a multiply-add chain to compute 2960 // 2961 // LocalAccum + (partial products at DstIndex) 2962 // + (opportunistic subset of CarryIn) 2963 // 2964 // LocalAccum is an array of one or two 32-bit registers that are updated 2965 // in-place. The incoming registers may be null. 2966 // 2967 // In some edge cases, carry-ins can be consumed "for free". In that case, 2968 // the consumed carry bits are removed from CarryIn in-place. 2969 auto buildMadChain = 2970 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn) 2971 -> Carry { 2972 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) || 2973 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1)); 2974 2975 Carry CarryOut; 2976 unsigned j0 = 0; 2977 2978 // Use plain 32-bit multiplication for the most significant part of the 2979 // result by default. 2980 if (LocalAccum.size() == 1 && 2981 (!UsePartialMad64_32 || !CarryIn.empty())) { 2982 do { 2983 unsigned j1 = DstIndex - j0; 2984 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]); 2985 if (!LocalAccum[0]) { 2986 LocalAccum[0] = Mul.getReg(0); 2987 } else { 2988 if (CarryIn.empty()) { 2989 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0); 2990 } else { 2991 LocalAccum[0] = 2992 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back()) 2993 .getReg(0); 2994 CarryIn.pop_back(); 2995 } 2996 } 2997 ++j0; 2998 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty())); 2999 } 3000 3001 // Build full 64-bit multiplies. 
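      // Each G_AMDGPU_MAD_U64_U32 computes Src0[j0] * Src1[j1] + Tmp as a
      // full 64-bit value plus a carry-out bit. Walking the rest of the
      // anti-diagonal (j0 + j1 == DstIndex) folds every remaining partial
      // product for this destination column into Tmp; the carry-outs are
      // collected in CarryOut for the next column.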
3002 if (j0 <= DstIndex) { 3003 bool HaveSmallAccum = false; 3004 Register Tmp; 3005 3006 if (LocalAccum[0]) { 3007 if (LocalAccum.size() == 1) { 3008 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0); 3009 HaveSmallAccum = true; 3010 } else if (LocalAccum[1]) { 3011 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0); 3012 HaveSmallAccum = false; 3013 } else { 3014 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0); 3015 HaveSmallAccum = true; 3016 } 3017 } else { 3018 assert(LocalAccum.size() == 1 || !LocalAccum[1]); 3019 Tmp = getZero64(); 3020 HaveSmallAccum = true; 3021 } 3022 3023 do { 3024 unsigned j1 = DstIndex - j0; 3025 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1}, 3026 {Src0[j0], Src1[j1], Tmp}); 3027 Tmp = Mad.getReg(0); 3028 if (!HaveSmallAccum) 3029 CarryOut.push_back(Mad.getReg(1)); 3030 HaveSmallAccum = false; 3031 ++j0; 3032 } while (j0 <= DstIndex); 3033 3034 auto Unmerge = B.buildUnmerge(S32, Tmp); 3035 LocalAccum[0] = Unmerge.getReg(0); 3036 if (LocalAccum.size() > 1) 3037 LocalAccum[1] = Unmerge.getReg(1); 3038 } 3039 3040 return CarryOut; 3041 }; 3042 3043 // Outer multiply loop, iterating over destination parts from least 3044 // significant to most significant parts. 3045 // 3046 // The columns of the following diagram correspond to the destination parts 3047 // affected by one iteration of the outer loop (ignoring boundary 3048 // conditions). 3049 // 3050 // Dest index relative to 2 * i: 1 0 -1 3051 // ------ 3052 // Carries from previous iteration: e o 3053 // Even-aligned partial product sum: E E . 3054 // Odd-aligned partial product sum: O O 3055 // 3056 // 'o' is OddCarry, 'e' is EvenCarry. 3057 // EE and OO are computed from partial products via buildMadChain and use 3058 // accumulation where possible and appropriate. 3059 // 3060 Register SeparateOddCarry; 3061 Carry EvenCarry; 3062 Carry OddCarry; 3063 3064 for (unsigned i = 0; i <= Accum.size() / 2; ++i) { 3065 Carry OddCarryIn = std::move(OddCarry); 3066 Carry EvenCarryIn = std::move(EvenCarry); 3067 OddCarry.clear(); 3068 EvenCarry.clear(); 3069 3070 // Partial products at offset 2 * i. 3071 if (2 * i < Accum.size()) { 3072 auto LocalAccum = Accum.drop_front(2 * i).take_front(2); 3073 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn); 3074 } 3075 3076 // Partial products at offset 2 * i - 1. 3077 if (i > 0) { 3078 if (!SeparateOddAlignedProducts) { 3079 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2); 3080 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn); 3081 } else { 3082 bool IsHighest = 2 * i >= Accum.size(); 3083 Register SeparateOddOut[2]; 3084 auto LocalAccum = MutableArrayRef(SeparateOddOut) 3085 .take_front(IsHighest ? 
1 : 2); 3086 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn); 3087 3088 MachineInstr *Lo; 3089 3090 if (i == 1) { 3091 if (!IsHighest) 3092 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]); 3093 else 3094 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]); 3095 } else { 3096 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0], 3097 SeparateOddCarry); 3098 } 3099 Accum[2 * i - 1] = Lo->getOperand(0).getReg(); 3100 3101 if (!IsHighest) { 3102 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1], 3103 Lo->getOperand(1).getReg()); 3104 Accum[2 * i] = Hi.getReg(0); 3105 SeparateOddCarry = Hi.getReg(1); 3106 } 3107 } 3108 } 3109 3110 // Add in the carries from the previous iteration 3111 if (i > 0) { 3112 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn)) 3113 EvenCarryIn.push_back(CarryOut); 3114 3115 if (2 * i < Accum.size()) { 3116 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn)) 3117 OddCarry.push_back(CarryOut); 3118 } 3119 } 3120 } 3121 } 3122 3123 // Custom narrowing of wide multiplies using wide multiply-add instructions. 3124 // 3125 // TODO: If the multiply is followed by an addition, we should attempt to 3126 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities. 3127 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper, 3128 MachineInstr &MI) const { 3129 assert(ST.hasMad64_32()); 3130 assert(MI.getOpcode() == TargetOpcode::G_MUL); 3131 3132 MachineIRBuilder &B = Helper.MIRBuilder; 3133 MachineRegisterInfo &MRI = *B.getMRI(); 3134 3135 Register DstReg = MI.getOperand(0).getReg(); 3136 Register Src0 = MI.getOperand(1).getReg(); 3137 Register Src1 = MI.getOperand(2).getReg(); 3138 3139 LLT Ty = MRI.getType(DstReg); 3140 assert(Ty.isScalar()); 3141 3142 unsigned Size = Ty.getSizeInBits(); 3143 unsigned NumParts = Size / 32; 3144 assert((Size % 32) == 0); 3145 assert(NumParts >= 2); 3146 3147 // Whether to use MAD_64_32 for partial products whose high half is 3148 // discarded. This avoids some ADD instructions but risks false dependency 3149 // stalls on some subtargets in some cases. 3150 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10; 3151 3152 // Whether to compute odd-aligned partial products separately. This is 3153 // advisable on subtargets where the accumulator of MAD_64_32 must be placed 3154 // in an even-aligned VGPR. 3155 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops(); 3156 3157 LLT S32 = LLT::scalar(32); 3158 SmallVector<Register, 2> Src0Parts, Src1Parts; 3159 for (unsigned i = 0; i < NumParts; ++i) { 3160 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32)); 3161 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32)); 3162 } 3163 B.buildUnmerge(Src0Parts, Src0); 3164 B.buildUnmerge(Src1Parts, Src1); 3165 3166 SmallVector<Register, 2> AccumRegs(NumParts); 3167 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32, 3168 SeparateOddAlignedProducts); 3169 3170 B.buildMergeLikeInstr(DstReg, AccumRegs); 3171 MI.eraseFromParent(); 3172 return true; 3173 3174 } 3175 3176 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to 3177 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input 3178 // case with a single min instruction instead of a compare+select. 
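// The ffbh/ffbl instructions return -1 (all ones) for a zero input, so
// umin(ffbh(x), bitwidth) produces the required bitwidth result for x == 0
// while leaving every other result (at most bitwidth - 1) unchanged.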
3179 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, 3180 MachineRegisterInfo &MRI, 3181 MachineIRBuilder &B) const { 3182 Register Dst = MI.getOperand(0).getReg(); 3183 Register Src = MI.getOperand(1).getReg(); 3184 LLT DstTy = MRI.getType(Dst); 3185 LLT SrcTy = MRI.getType(Src); 3186 3187 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ 3188 ? AMDGPU::G_AMDGPU_FFBH_U32 3189 : AMDGPU::G_AMDGPU_FFBL_B32; 3190 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); 3191 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); 3192 3193 MI.eraseFromParent(); 3194 return true; 3195 } 3196 3197 // Check that this is a G_XOR x, -1 3198 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 3199 if (MI.getOpcode() != TargetOpcode::G_XOR) 3200 return false; 3201 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 3202 return ConstVal && *ConstVal == -1; 3203 } 3204 3205 // Return the use branch instruction, otherwise null if the usage is invalid. 3206 static MachineInstr * 3207 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 3208 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 3209 Register CondDef = MI.getOperand(0).getReg(); 3210 if (!MRI.hasOneNonDBGUse(CondDef)) 3211 return nullptr; 3212 3213 MachineBasicBlock *Parent = MI.getParent(); 3214 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 3215 3216 if (isNot(MRI, *UseMI)) { 3217 Register NegatedCond = UseMI->getOperand(0).getReg(); 3218 if (!MRI.hasOneNonDBGUse(NegatedCond)) 3219 return nullptr; 3220 3221 // We're deleting the def of this value, so we need to remove it. 3222 eraseInstr(*UseMI, MRI); 3223 3224 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 3225 Negated = true; 3226 } 3227 3228 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 3229 return nullptr; 3230 3231 // Make sure the cond br is followed by a G_BR, or is the last instruction. 3232 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 3233 if (Next == Parent->end()) { 3234 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 3235 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 3236 return nullptr; 3237 UncondBrTarget = &*NextMBB; 3238 } else { 3239 if (Next->getOpcode() != AMDGPU::G_BR) 3240 return nullptr; 3241 Br = &*Next; 3242 UncondBrTarget = Br->getOperand(0).getMBB(); 3243 } 3244 3245 return UseMI; 3246 } 3247 3248 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 3249 const ArgDescriptor *Arg, 3250 const TargetRegisterClass *ArgRC, 3251 LLT ArgTy) const { 3252 MCRegister SrcReg = Arg->getRegister(); 3253 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 3254 assert(DstReg.isVirtual() && "Virtual register expected"); 3255 3256 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, 3257 *ArgRC, B.getDebugLoc(), ArgTy); 3258 if (Arg->isMasked()) { 3259 // TODO: Should we try to emit this once in the entry block? 3260 const LLT S32 = LLT::scalar(32); 3261 const unsigned Mask = Arg->getMask(); 3262 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 3263 3264 Register AndMaskSrc = LiveIn; 3265 3266 // TODO: Avoid clearing the high bits if we know workitem id y/z are always 3267 // 0. 
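    // For example, with packed work-item IDs all three IDs share one 32-bit
    // input register (X, Y and Z in successive 10-bit fields), and the shift
    // and mask below recover the requested component.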
3268 if (Shift != 0) { 3269 auto ShiftAmt = B.buildConstant(S32, Shift); 3270 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 3271 } 3272 3273 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 3274 } else { 3275 B.buildCopy(DstReg, LiveIn); 3276 } 3277 3278 return true; 3279 } 3280 3281 bool AMDGPULegalizerInfo::loadInputValue( 3282 Register DstReg, MachineIRBuilder &B, 3283 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 3284 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3285 const ArgDescriptor *Arg; 3286 const TargetRegisterClass *ArgRC; 3287 LLT ArgTy; 3288 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 3289 3290 if (!Arg) { 3291 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { 3292 // The intrinsic may appear when we have a 0 sized kernarg segment, in which 3293 // case the pointer argument may be missing and we use null. 3294 B.buildConstant(DstReg, 0); 3295 return true; 3296 } 3297 3298 // It's undefined behavior if a function marked with the amdgpu-no-* 3299 // attributes uses the corresponding intrinsic. 3300 B.buildUndef(DstReg); 3301 return true; 3302 } 3303 3304 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 3305 return false; // TODO: Handle these 3306 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 3307 } 3308 3309 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 3310 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 3311 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 3312 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 3313 return false; 3314 3315 MI.eraseFromParent(); 3316 return true; 3317 } 3318 3319 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, 3320 int64_t C) { 3321 B.buildConstant(MI.getOperand(0).getReg(), C); 3322 MI.eraseFromParent(); 3323 return true; 3324 } 3325 3326 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic( 3327 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 3328 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 3329 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim); 3330 if (MaxID == 0) 3331 return replaceWithConstant(B, MI, 0); 3332 3333 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3334 const ArgDescriptor *Arg; 3335 const TargetRegisterClass *ArgRC; 3336 LLT ArgTy; 3337 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 3338 3339 Register DstReg = MI.getOperand(0).getReg(); 3340 if (!Arg) { 3341 // It's undefined behavior if a function marked with the amdgpu-no-* 3342 // attributes uses the corresponding intrinsic. 3343 B.buildUndef(DstReg); 3344 MI.eraseFromParent(); 3345 return true; 3346 } 3347 3348 if (Arg->isMasked()) { 3349 // Don't bother inserting AssertZext for packed IDs since we're emitting the 3350 // masking operations anyway. 3351 // 3352 // TODO: We could assert the top bit is 0 for the source copy. 
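    // The masked path below relies on loadInputValue emitting the shift/and
    // sequence; the unmasked path instead loads the raw register and asserts
    // that the high bits implied by MaxID are zero so later combines can make
    // use of that fact.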
3353 if (!loadInputValue(DstReg, B, ArgType)) 3354 return false; 3355 } else { 3356 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); 3357 if (!loadInputValue(TmpReg, B, ArgType)) 3358 return false; 3359 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID)); 3360 } 3361 3362 MI.eraseFromParent(); 3363 return true; 3364 } 3365 3366 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B, 3367 int64_t Offset) const { 3368 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 3369 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy); 3370 3371 // TODO: If we passed in the base kernel offset we could have a better 3372 // alignment than 4, but we don't really need it. 3373 if (!loadInputValue(KernArgReg, B, 3374 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3375 llvm_unreachable("failed to find kernarg segment ptr"); 3376 3377 auto COffset = B.buildConstant(LLT::scalar(64), Offset); 3378 // TODO: Should get nuw 3379 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0); 3380 } 3381 3382 /// Legalize a value that's loaded from kernel arguments. This is only used by 3383 /// legacy intrinsics. 3384 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI, 3385 MachineIRBuilder &B, 3386 uint64_t Offset, 3387 Align Alignment) const { 3388 Register DstReg = MI.getOperand(0).getReg(); 3389 3390 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) && 3391 "unexpected kernarg parameter type"); 3392 3393 Register Ptr = getKernargParameterPtr(B, Offset); 3394 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 3395 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4), 3396 MachineMemOperand::MODereferenceable | 3397 MachineMemOperand::MOInvariant); 3398 MI.eraseFromParent(); 3399 return true; 3400 } 3401 3402 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 3403 MachineRegisterInfo &MRI, 3404 MachineIRBuilder &B) const { 3405 Register Dst = MI.getOperand(0).getReg(); 3406 LLT DstTy = MRI.getType(Dst); 3407 LLT S16 = LLT::scalar(16); 3408 LLT S32 = LLT::scalar(32); 3409 LLT S64 = LLT::scalar(64); 3410 3411 if (DstTy == S16) 3412 return legalizeFDIV16(MI, MRI, B); 3413 if (DstTy == S32) 3414 return legalizeFDIV32(MI, MRI, B); 3415 if (DstTy == S64) 3416 return legalizeFDIV64(MI, MRI, B); 3417 3418 return false; 3419 } 3420 3421 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 3422 Register DstDivReg, 3423 Register DstRemReg, 3424 Register X, 3425 Register Y) const { 3426 const LLT S1 = LLT::scalar(1); 3427 const LLT S32 = LLT::scalar(32); 3428 3429 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 3430 // algorithm used here. 3431 3432 // Initial estimate of inv(y). 3433 auto FloatY = B.buildUITOFP(S32, Y); 3434 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 3435 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 3436 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 3437 auto Z = B.buildFPTOUI(S32, ScaledY); 3438 3439 // One round of UNR. 3440 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 3441 auto NegYZ = B.buildMul(S32, NegY, Z); 3442 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 3443 3444 // Quotient/remainder estimate. 3445 auto Q = B.buildUMulH(S32, X, Z); 3446 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 3447 3448 // First quotient/remainder refinement. 
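  // Each refinement step conditionally applies Q += 1 and R -= Y when R >= Y.
  // Two such steps are emitted below, which is enough to correct an initial
  // reciprocal-based estimate that is low by up to 2.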
3449 auto One = B.buildConstant(S32, 1); 3450 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 3451 if (DstDivReg) 3452 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 3453 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 3454 3455 // Second quotient/remainder refinement. 3456 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 3457 if (DstDivReg) 3458 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 3459 3460 if (DstRemReg) 3461 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 3462 } 3463 3464 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 3465 // 3466 // Return lo, hi of result 3467 // 3468 // %cvt.lo = G_UITOFP Val.lo 3469 // %cvt.hi = G_UITOFP Val.hi 3470 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 3471 // %rcp = G_AMDGPU_RCP_IFLAG %mad 3472 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 3473 // %mul2 = G_FMUL %mul1, 2**(-32) 3474 // %trunc = G_INTRINSIC_TRUNC %mul2 3475 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 3476 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 3477 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 3478 Register Val) { 3479 const LLT S32 = LLT::scalar(32); 3480 auto Unmerge = B.buildUnmerge(S32, Val); 3481 3482 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 3483 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 3484 3485 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 3486 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 3487 3488 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 3489 auto Mul1 = 3490 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 3491 3492 // 2**(-32) 3493 auto Mul2 = 3494 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 3495 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 3496 3497 // -(2**32) 3498 auto Mad2 = B.buildFMAD(S32, Trunc, 3499 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 3500 3501 auto ResultLo = B.buildFPTOUI(S32, Mad2); 3502 auto ResultHi = B.buildFPTOUI(S32, Trunc); 3503 3504 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 3505 } 3506 3507 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 3508 Register DstDivReg, 3509 Register DstRemReg, 3510 Register Numer, 3511 Register Denom) const { 3512 const LLT S32 = LLT::scalar(32); 3513 const LLT S64 = LLT::scalar(64); 3514 const LLT S1 = LLT::scalar(1); 3515 Register RcpLo, RcpHi; 3516 3517 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 3518 3519 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi}); 3520 3521 auto Zero64 = B.buildConstant(S64, 0); 3522 auto NegDenom = B.buildSub(S64, Zero64, Denom); 3523 3524 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 3525 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 3526 3527 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 3528 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 3529 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 3530 3531 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 3532 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 3533 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi}); 3534 3535 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 3536 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 3537 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 3538 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 3539 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 3540 3541 auto Zero32 = B.buildConstant(S32, 0); 3542 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 3543 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, 
Add2_Lo.getReg(1)); 3544 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi}); 3545 3546 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 3547 Register NumerLo = UnmergeNumer.getReg(0); 3548 Register NumerHi = UnmergeNumer.getReg(1); 3549 3550 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 3551 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 3552 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 3553 Register Mul3_Lo = UnmergeMul3.getReg(0); 3554 Register Mul3_Hi = UnmergeMul3.getReg(1); 3555 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 3556 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 3557 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 3558 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi}); 3559 3560 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 3561 Register DenomLo = UnmergeDenom.getReg(0); 3562 Register DenomHi = UnmergeDenom.getReg(1); 3563 3564 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 3565 auto C1 = B.buildSExt(S32, CmpHi); 3566 3567 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 3568 auto C2 = B.buildSExt(S32, CmpLo); 3569 3570 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 3571 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 3572 3573 // TODO: Here and below portions of the code can be enclosed into if/endif. 3574 // Currently control flow is unconditional and we have 4 selects after 3575 // potential endif to substitute PHIs. 3576 3577 // if C3 != 0 ... 3578 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3579 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3580 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3581 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi}); 3582 3583 auto One64 = B.buildConstant(S64, 1); 3584 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3585 3586 auto C4 = 3587 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3588 auto C5 = 3589 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3590 auto C6 = B.buildSelect( 3591 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3592 3593 // if (C6 != 0) 3594 auto Add4 = B.buildAdd(S64, Add3, One64); 3595 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3596 3597 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3598 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3599 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi}); 3600 3601 // endif C6 3602 // endif C3 3603 3604 if (DstDivReg) { 3605 auto Sel1 = B.buildSelect( 3606 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3607 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3608 Sel1, MulHi3); 3609 } 3610 3611 if (DstRemReg) { 3612 auto Sel2 = B.buildSelect( 3613 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3614 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3615 Sel2, Sub1); 3616 } 3617 } 3618 3619 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3620 MachineRegisterInfo &MRI, 3621 MachineIRBuilder &B) const { 3622 Register DstDivReg, DstRemReg; 3623 switch (MI.getOpcode()) { 3624 default: 3625 llvm_unreachable("Unexpected opcode!"); 3626 case AMDGPU::G_UDIV: { 3627 DstDivReg = MI.getOperand(0).getReg(); 3628 break; 3629 } 3630 case AMDGPU::G_UREM: { 3631 DstRemReg = MI.getOperand(0).getReg(); 3632 break; 3633 } 3634 case AMDGPU::G_UDIVREM: { 3635 DstDivReg = 
MI.getOperand(0).getReg(); 3636 DstRemReg = MI.getOperand(1).getReg(); 3637 break; 3638 } 3639 } 3640 3641 const LLT S64 = LLT::scalar(64); 3642 const LLT S32 = LLT::scalar(32); 3643 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3644 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3645 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3646 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3647 3648 if (Ty == S32) 3649 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3650 else if (Ty == S64) 3651 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3652 else 3653 return false; 3654 3655 MI.eraseFromParent(); 3656 return true; 3657 } 3658 3659 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3660 MachineRegisterInfo &MRI, 3661 MachineIRBuilder &B) const { 3662 const LLT S64 = LLT::scalar(64); 3663 const LLT S32 = LLT::scalar(32); 3664 3665 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3666 if (Ty != S32 && Ty != S64) 3667 return false; 3668 3669 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3670 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3671 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3672 3673 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3674 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3675 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3676 3677 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3678 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3679 3680 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3681 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3682 3683 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3684 switch (MI.getOpcode()) { 3685 default: 3686 llvm_unreachable("Unexpected opcode!"); 3687 case AMDGPU::G_SDIV: { 3688 DstDivReg = MI.getOperand(0).getReg(); 3689 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3690 break; 3691 } 3692 case AMDGPU::G_SREM: { 3693 DstRemReg = MI.getOperand(0).getReg(); 3694 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3695 break; 3696 } 3697 case AMDGPU::G_SDIVREM: { 3698 DstDivReg = MI.getOperand(0).getReg(); 3699 DstRemReg = MI.getOperand(1).getReg(); 3700 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3701 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3702 break; 3703 } 3704 } 3705 3706 if (Ty == S32) 3707 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3708 else 3709 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3710 3711 if (DstDivReg) { 3712 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3713 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3714 B.buildSub(DstDivReg, SignXor, Sign); 3715 } 3716 3717 if (DstRemReg) { 3718 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3719 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3720 B.buildSub(DstRemReg, SignXor, Sign); 3721 } 3722 3723 MI.eraseFromParent(); 3724 return true; 3725 } 3726 3727 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3728 MachineRegisterInfo &MRI, 3729 MachineIRBuilder &B) const { 3730 Register Res = MI.getOperand(0).getReg(); 3731 Register LHS = MI.getOperand(1).getReg(); 3732 Register RHS = MI.getOperand(2).getReg(); 3733 uint16_t Flags = MI.getFlags(); 3734 LLT ResTy = MRI.getType(Res); 3735 3736 const MachineFunction &MF = B.getMF(); 3737 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3738 MI.getFlag(MachineInstr::FmAfn); 3739 3740 if (!AllowInaccurateRcp) 3741 return false; 3742 3743 if (auto CLHS = 
getConstantFPVRegVal(LHS, MRI)) { 3744 // 1 / x -> RCP(x) 3745 if (CLHS->isExactlyValue(1.0)) { 3746 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3747 .addUse(RHS) 3748 .setMIFlags(Flags); 3749 3750 MI.eraseFromParent(); 3751 return true; 3752 } 3753 3754 // -1 / x -> RCP( FNEG(x) ) 3755 if (CLHS->isExactlyValue(-1.0)) { 3756 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3757 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3758 .addUse(FNeg.getReg(0)) 3759 .setMIFlags(Flags); 3760 3761 MI.eraseFromParent(); 3762 return true; 3763 } 3764 } 3765 3766 // x / y -> x * (1.0 / y) 3767 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3768 .addUse(RHS) 3769 .setMIFlags(Flags); 3770 B.buildFMul(Res, LHS, RCP, Flags); 3771 3772 MI.eraseFromParent(); 3773 return true; 3774 } 3775 3776 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3777 MachineRegisterInfo &MRI, 3778 MachineIRBuilder &B) const { 3779 Register Res = MI.getOperand(0).getReg(); 3780 Register X = MI.getOperand(1).getReg(); 3781 Register Y = MI.getOperand(2).getReg(); 3782 uint16_t Flags = MI.getFlags(); 3783 LLT ResTy = MRI.getType(Res); 3784 3785 const MachineFunction &MF = B.getMF(); 3786 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3787 MI.getFlag(MachineInstr::FmAfn); 3788 3789 if (!AllowInaccurateRcp) 3790 return false; 3791 3792 auto NegY = B.buildFNeg(ResTy, Y); 3793 auto One = B.buildFConstant(ResTy, 1.0); 3794 3795 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3796 .addUse(Y) 3797 .setMIFlags(Flags); 3798 3799 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3800 R = B.buildFMA(ResTy, Tmp0, R, R); 3801 3802 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3803 R = B.buildFMA(ResTy, Tmp1, R, R); 3804 3805 auto Ret = B.buildFMul(ResTy, X, R); 3806 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3807 3808 B.buildFMA(Res, Tmp2, R, Ret); 3809 MI.eraseFromParent(); 3810 return true; 3811 } 3812 3813 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3814 MachineRegisterInfo &MRI, 3815 MachineIRBuilder &B) const { 3816 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3817 return true; 3818 3819 Register Res = MI.getOperand(0).getReg(); 3820 Register LHS = MI.getOperand(1).getReg(); 3821 Register RHS = MI.getOperand(2).getReg(); 3822 3823 uint16_t Flags = MI.getFlags(); 3824 3825 LLT S16 = LLT::scalar(16); 3826 LLT S32 = LLT::scalar(32); 3827 3828 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3829 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3830 3831 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3832 .addUse(RHSExt.getReg(0)) 3833 .setMIFlags(Flags); 3834 3835 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3836 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3837 3838 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3839 .addUse(RDst.getReg(0)) 3840 .addUse(RHS) 3841 .addUse(LHS) 3842 .setMIFlags(Flags); 3843 3844 MI.eraseFromParent(); 3845 return true; 3846 } 3847 3848 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3849 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3850 static void toggleSPDenormMode(bool Enable, 3851 MachineIRBuilder &B, 3852 const GCNSubtarget &ST, 3853 AMDGPU::SIModeRegisterDefaults Mode) { 3854 // Set SP denorm mode to this value. 3855 unsigned SPDenormMode = 3856 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3857 3858 if (ST.hasDenormModeInst()) { 3859 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 
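    // The 4-bit FP_DENORM field packs the FP32 mode in bits [1:0] and the
    // FP64/FP16 mode in bits [3:2], hence the shift by 2 when rebuilding the
    // immediate for S_DENORM_MODE.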
3860 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3861 3862 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3863 B.buildInstr(AMDGPU::S_DENORM_MODE) 3864 .addImm(NewDenormModeValue); 3865 3866 } else { 3867 // Select FP32 bit field in mode register. 3868 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3869 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3870 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3871 3872 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3873 .addImm(SPDenormMode) 3874 .addImm(SPDenormModeBitField); 3875 } 3876 } 3877 3878 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3879 MachineRegisterInfo &MRI, 3880 MachineIRBuilder &B) const { 3881 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3882 return true; 3883 3884 Register Res = MI.getOperand(0).getReg(); 3885 Register LHS = MI.getOperand(1).getReg(); 3886 Register RHS = MI.getOperand(2).getReg(); 3887 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3888 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3889 3890 uint16_t Flags = MI.getFlags(); 3891 3892 LLT S32 = LLT::scalar(32); 3893 LLT S1 = LLT::scalar(1); 3894 3895 auto One = B.buildFConstant(S32, 1.0f); 3896 3897 auto DenominatorScaled = 3898 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3899 .addUse(LHS) 3900 .addUse(RHS) 3901 .addImm(0) 3902 .setMIFlags(Flags); 3903 auto NumeratorScaled = 3904 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3905 .addUse(LHS) 3906 .addUse(RHS) 3907 .addImm(1) 3908 .setMIFlags(Flags); 3909 3910 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3911 .addUse(DenominatorScaled.getReg(0)) 3912 .setMIFlags(Flags); 3913 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3914 3915 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3916 // aren't modeled as reading it. 
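  // The scaled operands and FMA intermediates below can be denormal even for
  // normal inputs, so FP32 denormals are temporarily enabled around the core
  // sequence when the function's default mode flushes them.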
3917 if (!Mode.allFP32Denormals()) 3918 toggleSPDenormMode(true, B, ST, Mode); 3919 3920 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3921 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3922 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3923 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3924 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3925 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3926 3927 if (!Mode.allFP32Denormals()) 3928 toggleSPDenormMode(false, B, ST, Mode); 3929 3930 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3931 .addUse(Fma4.getReg(0)) 3932 .addUse(Fma1.getReg(0)) 3933 .addUse(Fma3.getReg(0)) 3934 .addUse(NumeratorScaled.getReg(1)) 3935 .setMIFlags(Flags); 3936 3937 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3938 .addUse(Fmas.getReg(0)) 3939 .addUse(RHS) 3940 .addUse(LHS) 3941 .setMIFlags(Flags); 3942 3943 MI.eraseFromParent(); 3944 return true; 3945 } 3946 3947 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3948 MachineRegisterInfo &MRI, 3949 MachineIRBuilder &B) const { 3950 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3951 return true; 3952 3953 Register Res = MI.getOperand(0).getReg(); 3954 Register LHS = MI.getOperand(1).getReg(); 3955 Register RHS = MI.getOperand(2).getReg(); 3956 3957 uint16_t Flags = MI.getFlags(); 3958 3959 LLT S64 = LLT::scalar(64); 3960 LLT S1 = LLT::scalar(1); 3961 3962 auto One = B.buildFConstant(S64, 1.0); 3963 3964 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3965 .addUse(LHS) 3966 .addUse(RHS) 3967 .addImm(0) 3968 .setMIFlags(Flags); 3969 3970 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3971 3972 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3973 .addUse(DivScale0.getReg(0)) 3974 .setMIFlags(Flags); 3975 3976 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3977 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3978 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3979 3980 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3981 .addUse(LHS) 3982 .addUse(RHS) 3983 .addImm(1) 3984 .setMIFlags(Flags); 3985 3986 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3987 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3988 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3989 3990 Register Scale; 3991 if (!ST.hasUsableDivScaleConditionOutput()) { 3992 // Workaround a hardware bug on SI where the condition output from div_scale 3993 // is not usable. 
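    // Reconstruct the missing condition bit from the data: compare the high
    // halves of the scaled values against the original numerator/denominator
    // and combine the results with an xor, recovering the information the VCC
    // output of div_scale would normally provide.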
3994 3995 LLT S32 = LLT::scalar(32); 3996 3997 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3998 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3999 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 4000 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 4001 4002 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 4003 Scale1Unmerge.getReg(1)); 4004 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 4005 Scale0Unmerge.getReg(1)); 4006 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 4007 } else { 4008 Scale = DivScale1.getReg(1); 4009 } 4010 4011 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 4012 .addUse(Fma4.getReg(0)) 4013 .addUse(Fma3.getReg(0)) 4014 .addUse(Mul.getReg(0)) 4015 .addUse(Scale) 4016 .setMIFlags(Flags); 4017 4018 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res), false) 4019 .addUse(Fmas.getReg(0)) 4020 .addUse(RHS) 4021 .addUse(LHS) 4022 .setMIFlags(Flags); 4023 4024 MI.eraseFromParent(); 4025 return true; 4026 } 4027 4028 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 4029 MachineRegisterInfo &MRI, 4030 MachineIRBuilder &B) const { 4031 Register Res = MI.getOperand(0).getReg(); 4032 Register LHS = MI.getOperand(2).getReg(); 4033 Register RHS = MI.getOperand(3).getReg(); 4034 uint16_t Flags = MI.getFlags(); 4035 4036 LLT S32 = LLT::scalar(32); 4037 LLT S1 = LLT::scalar(1); 4038 4039 auto Abs = B.buildFAbs(S32, RHS, Flags); 4040 const APFloat C0Val(1.0f); 4041 4042 auto C0 = B.buildConstant(S32, 0x6f800000); 4043 auto C1 = B.buildConstant(S32, 0x2f800000); 4044 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 4045 4046 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 4047 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 4048 4049 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 4050 4051 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 4052 .addUse(Mul0.getReg(0)) 4053 .setMIFlags(Flags); 4054 4055 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 4056 4057 B.buildFMul(Res, Sel, Mul1, Flags); 4058 4059 MI.eraseFromParent(); 4060 return true; 4061 } 4062 4063 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 4064 // FIXME: Why do we handle this one but not other removed instructions? 4065 // 4066 // Reciprocal square root. The clamp prevents infinite results, clamping 4067 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 4068 // +-max_float. 4069 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 4070 MachineRegisterInfo &MRI, 4071 MachineIRBuilder &B) const { 4072 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 4073 return true; 4074 4075 Register Dst = MI.getOperand(0).getReg(); 4076 Register Src = MI.getOperand(2).getReg(); 4077 auto Flags = MI.getFlags(); 4078 4079 LLT Ty = MRI.getType(Dst); 4080 4081 const fltSemantics *FltSemantics; 4082 if (Ty == LLT::scalar(32)) 4083 FltSemantics = &APFloat::IEEEsingle(); 4084 else if (Ty == LLT::scalar(64)) 4085 FltSemantics = &APFloat::IEEEdouble(); 4086 else 4087 return false; 4088 4089 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 4090 .addUse(Src) 4091 .setMIFlags(Flags); 4092 4093 // We don't need to concern ourselves with the snan handling difference, since 4094 // the rsq quieted (or not) so use the one which will directly select. 
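  // The clamp below is a min with +max_float followed by a max with
  // -max_float, e.g. rsq(+0.0) = +inf becomes +max_float and
  // rsq(-0.0) = -inf becomes -max_float.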
4095 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4096 const bool UseIEEE = MFI->getMode().IEEE; 4097 4098 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 4099 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 4100 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 4101 4102 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 4103 4104 if (UseIEEE) 4105 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 4106 else 4107 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 4108 MI.eraseFromParent(); 4109 return true; 4110 } 4111 4112 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 4113 switch (IID) { 4114 case Intrinsic::amdgcn_ds_fadd: 4115 return AMDGPU::G_ATOMICRMW_FADD; 4116 case Intrinsic::amdgcn_ds_fmin: 4117 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 4118 case Intrinsic::amdgcn_ds_fmax: 4119 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 4120 default: 4121 llvm_unreachable("not a DS FP intrinsic"); 4122 } 4123 } 4124 4125 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 4126 MachineInstr &MI, 4127 Intrinsic::ID IID) const { 4128 GISelChangeObserver &Observer = Helper.Observer; 4129 Observer.changingInstr(MI); 4130 4131 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 4132 4133 // The remaining operands were used to set fields in the MemOperand on 4134 // construction. 4135 for (int I = 6; I > 3; --I) 4136 MI.removeOperand(I); 4137 4138 MI.removeOperand(1); // Remove the intrinsic ID. 4139 Observer.changedInstr(MI); 4140 return true; 4141 } 4142 4143 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 4144 MachineRegisterInfo &MRI, 4145 MachineIRBuilder &B) const { 4146 uint64_t Offset = 4147 ST.getTargetLowering()->getImplicitParameterOffset( 4148 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 4149 LLT DstTy = MRI.getType(DstReg); 4150 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 4151 4152 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 4153 if (!loadInputValue(KernargPtrReg, B, 4154 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 4155 return false; 4156 4157 // FIXME: This should be nuw 4158 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 4159 return true; 4160 } 4161 4162 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 4163 MachineRegisterInfo &MRI, 4164 MachineIRBuilder &B) const { 4165 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4166 if (!MFI->isEntryFunction()) { 4167 return legalizePreloadedArgIntrin(MI, MRI, B, 4168 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 4169 } 4170 4171 Register DstReg = MI.getOperand(0).getReg(); 4172 if (!getImplicitArgPtr(DstReg, MRI, B)) 4173 return false; 4174 4175 MI.eraseFromParent(); 4176 return true; 4177 } 4178 4179 bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg, 4180 MachineRegisterInfo &MRI, 4181 MachineIRBuilder &B) const { 4182 Function &F = B.getMF().getFunction(); 4183 std::optional<uint32_t> KnownSize = 4184 AMDGPUMachineFunction::getLDSKernelIdMetadata(F); 4185 if (KnownSize.has_value()) 4186 B.buildConstant(DstReg, *KnownSize); 4187 return false; 4188 } 4189 4190 bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI, 4191 MachineRegisterInfo &MRI, 4192 MachineIRBuilder &B) const { 4193 4194 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4195 if (!MFI->isEntryFunction()) { 4196 return legalizePreloadedArgIntrin(MI, MRI, B, 4197 
AMDGPUFunctionArgInfo::LDS_KERNEL_ID); 4198 } 4199 4200 Register DstReg = MI.getOperand(0).getReg(); 4201 if (!getLDSKernelId(DstReg, MRI, B)) 4202 return false; 4203 4204 MI.eraseFromParent(); 4205 return true; 4206 } 4207 4208 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 4209 MachineRegisterInfo &MRI, 4210 MachineIRBuilder &B, 4211 unsigned AddrSpace) const { 4212 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 4213 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 4214 Register Hi32 = Unmerge.getReg(1); 4215 4216 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 4217 MI.eraseFromParent(); 4218 return true; 4219 } 4220 4221 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 4222 // offset (the offset that is included in bounds checking and swizzling, to be 4223 // split between the instruction's voffset and immoffset fields) and soffset 4224 // (the offset that is excluded from bounds checking and swizzling, to go in 4225 // the instruction's soffset field). This function takes the first kind of 4226 // offset and figures out how to split it between voffset and immoffset. 4227 std::pair<Register, unsigned> 4228 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 4229 Register OrigOffset) const { 4230 const unsigned MaxImm = 4095; 4231 Register BaseReg; 4232 unsigned ImmOffset; 4233 const LLT S32 = LLT::scalar(32); 4234 MachineRegisterInfo &MRI = *B.getMRI(); 4235 4236 std::tie(BaseReg, ImmOffset) = 4237 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 4238 4239 // If BaseReg is a pointer, convert it to int. 4240 if (MRI.getType(BaseReg).isPointer()) 4241 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 4242 4243 // If the immediate value is too big for the immoffset field, put the value 4244 // and -4096 into the immoffset field so that the value that is copied/added 4245 // for the voffset field is a multiple of 4096, and it stands more chance 4246 // of being CSEd with the copy/add for another similar load/store. 4247 // However, do not do that rounding down to a multiple of 4096 if that is a 4248 // negative number, as it appears to be illegal to have a negative offset 4249 // in the vgpr, even if adding the immediate offset makes it positive. 4250 unsigned Overflow = ImmOffset & ~MaxImm; 4251 ImmOffset -= Overflow; 4252 if ((int32_t)Overflow < 0) { 4253 Overflow += ImmOffset; 4254 ImmOffset = 0; 4255 } 4256 4257 if (Overflow != 0) { 4258 if (!BaseReg) { 4259 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 4260 } else { 4261 auto OverflowVal = B.buildConstant(S32, Overflow); 4262 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 4263 } 4264 } 4265 4266 if (!BaseReg) 4267 BaseReg = B.buildConstant(S32, 0).getReg(0); 4268 4269 return std::pair(BaseReg, ImmOffset); 4270 } 4271 4272 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic. 
4273 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, 4274 Register VOffset, Register SOffset, 4275 unsigned ImmOffset, Register VIndex, 4276 MachineRegisterInfo &MRI) const { 4277 std::optional<ValueAndVReg> MaybeVOffsetVal = 4278 getIConstantVRegValWithLookThrough(VOffset, MRI); 4279 std::optional<ValueAndVReg> MaybeSOffsetVal = 4280 getIConstantVRegValWithLookThrough(SOffset, MRI); 4281 std::optional<ValueAndVReg> MaybeVIndexVal = 4282 getIConstantVRegValWithLookThrough(VIndex, MRI); 4283 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant, 4284 // update the MMO with that offset. The stride is unknown so we can only do 4285 // this if VIndex is constant 0. 4286 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && 4287 MaybeVIndexVal->Value == 0) { 4288 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + 4289 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; 4290 MMO->setOffset(TotalOffset); 4291 } else { 4292 // We don't have a constant combined offset to use in the MMO. Give up. 4293 MMO->setValue((Value *)nullptr); 4294 } 4295 } 4296 4297 /// Handle register layout difference for f16 images for some subtargets. 4298 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 4299 MachineRegisterInfo &MRI, 4300 Register Reg, 4301 bool ImageStore) const { 4302 const LLT S16 = LLT::scalar(16); 4303 const LLT S32 = LLT::scalar(32); 4304 LLT StoreVT = MRI.getType(Reg); 4305 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 4306 4307 if (ST.hasUnpackedD16VMem()) { 4308 auto Unmerge = B.buildUnmerge(S16, Reg); 4309 4310 SmallVector<Register, 4> WideRegs; 4311 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 4312 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 4313 4314 int NumElts = StoreVT.getNumElements(); 4315 4316 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 4317 .getReg(0); 4318 } 4319 4320 if (ImageStore && ST.hasImageStoreD16Bug()) { 4321 if (StoreVT.getNumElements() == 2) { 4322 SmallVector<Register, 4> PackedRegs; 4323 Reg = B.buildBitcast(S32, Reg).getReg(0); 4324 PackedRegs.push_back(Reg); 4325 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 4326 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 4327 .getReg(0); 4328 } 4329 4330 if (StoreVT.getNumElements() == 3) { 4331 SmallVector<Register, 4> PackedRegs; 4332 auto Unmerge = B.buildUnmerge(S16, Reg); 4333 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 4334 PackedRegs.push_back(Unmerge.getReg(I)); 4335 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 4336 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 4337 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 4338 } 4339 4340 if (StoreVT.getNumElements() == 4) { 4341 SmallVector<Register, 4> PackedRegs; 4342 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 4343 auto Unmerge = B.buildUnmerge(S32, Reg); 4344 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 4345 PackedRegs.push_back(Unmerge.getReg(I)); 4346 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 4347 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 4348 .getReg(0); 4349 } 4350 4351 llvm_unreachable("invalid data type"); 4352 } 4353 4354 if (StoreVT == LLT::fixed_vector(3, S16)) { 4355 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) 4356 .getReg(0); 4357 } 4358 return Reg; 4359 } 4360 4361 Register AMDGPULegalizerInfo::fixStoreSourceType( 
4362 MachineIRBuilder &B, Register VData, bool IsFormat) const { 4363 MachineRegisterInfo *MRI = B.getMRI(); 4364 LLT Ty = MRI->getType(VData); 4365 4366 const LLT S16 = LLT::scalar(16); 4367 4368 // Fixup illegal register types for i8 stores. 4369 if (Ty == LLT::scalar(8) || Ty == S16) { 4370 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 4371 return AnyExt; 4372 } 4373 4374 if (Ty.isVector()) { 4375 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 4376 if (IsFormat) 4377 return handleD16VData(B, *MRI, VData); 4378 } 4379 } 4380 4381 return VData; 4382 } 4383 4384 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 4385 MachineRegisterInfo &MRI, 4386 MachineIRBuilder &B, 4387 bool IsTyped, 4388 bool IsFormat) const { 4389 Register VData = MI.getOperand(1).getReg(); 4390 LLT Ty = MRI.getType(VData); 4391 LLT EltTy = Ty.getScalarType(); 4392 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 4393 const LLT S32 = LLT::scalar(32); 4394 4395 VData = fixStoreSourceType(B, VData, IsFormat); 4396 Register RSrc = MI.getOperand(2).getReg(); 4397 4398 MachineMemOperand *MMO = *MI.memoperands_begin(); 4399 const int MemSize = MMO->getSize(); 4400 4401 unsigned ImmOffset; 4402 4403 // The typed intrinsics add an immediate after the registers. 4404 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 4405 4406 // The struct intrinsic variants add one additional operand over raw. 4407 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4408 Register VIndex; 4409 int OpOffset = 0; 4410 if (HasVIndex) { 4411 VIndex = MI.getOperand(3).getReg(); 4412 OpOffset = 1; 4413 } else { 4414 VIndex = B.buildConstant(S32, 0).getReg(0); 4415 } 4416 4417 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 4418 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 4419 4420 unsigned Format = 0; 4421 if (IsTyped) { 4422 Format = MI.getOperand(5 + OpOffset).getImm(); 4423 ++OpOffset; 4424 } 4425 4426 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 4427 4428 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4429 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 4430 4431 unsigned Opc; 4432 if (IsTyped) { 4433 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 4434 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 4435 } else if (IsFormat) { 4436 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 4437 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 4438 } else { 4439 switch (MemSize) { 4440 case 1: 4441 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 4442 break; 4443 case 2: 4444 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 4445 break; 4446 default: 4447 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 4448 break; 4449 } 4450 } 4451 4452 auto MIB = B.buildInstr(Opc) 4453 .addUse(VData) // vdata 4454 .addUse(RSrc) // rsrc 4455 .addUse(VIndex) // vindex 4456 .addUse(VOffset) // voffset 4457 .addUse(SOffset) // soffset 4458 .addImm(ImmOffset); // offset(imm) 4459 4460 if (IsTyped) 4461 MIB.addImm(Format); 4462 4463 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4464 .addImm(HasVIndex ? 
-1 : 0) // idxen(imm) 4465 .addMemOperand(MMO); 4466 4467 MI.eraseFromParent(); 4468 return true; 4469 } 4470 4471 static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, 4472 Register VIndex, Register VOffset, Register SOffset, 4473 unsigned ImmOffset, unsigned Format, 4474 unsigned AuxiliaryData, MachineMemOperand *MMO, 4475 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) { 4476 auto MIB = B.buildInstr(Opc) 4477 .addDef(LoadDstReg) // vdata 4478 .addUse(RSrc) // rsrc 4479 .addUse(VIndex) // vindex 4480 .addUse(VOffset) // voffset 4481 .addUse(SOffset) // soffset 4482 .addImm(ImmOffset); // offset(imm) 4483 4484 if (IsTyped) 4485 MIB.addImm(Format); 4486 4487 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4488 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4489 .addMemOperand(MMO); 4490 } 4491 4492 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 4493 MachineRegisterInfo &MRI, 4494 MachineIRBuilder &B, 4495 bool IsFormat, 4496 bool IsTyped) const { 4497 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 4498 MachineMemOperand *MMO = *MI.memoperands_begin(); 4499 const LLT MemTy = MMO->getMemoryType(); 4500 const LLT S32 = LLT::scalar(32); 4501 4502 Register Dst = MI.getOperand(0).getReg(); 4503 4504 Register StatusDst; 4505 int OpOffset = 0; 4506 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2); 4507 bool IsTFE = MI.getNumExplicitDefs() == 2; 4508 if (IsTFE) { 4509 StatusDst = MI.getOperand(1).getReg(); 4510 ++OpOffset; 4511 } 4512 4513 Register RSrc = MI.getOperand(2 + OpOffset).getReg(); 4514 4515 // The typed intrinsics add an immediate after the registers. 4516 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 4517 4518 // The struct intrinsic variants add one additional operand over raw. 4519 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset; 4520 Register VIndex; 4521 if (HasVIndex) { 4522 VIndex = MI.getOperand(3 + OpOffset).getReg(); 4523 ++OpOffset; 4524 } else { 4525 VIndex = B.buildConstant(S32, 0).getReg(0); 4526 } 4527 4528 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 4529 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 4530 4531 unsigned Format = 0; 4532 if (IsTyped) { 4533 Format = MI.getOperand(5 + OpOffset).getImm(); 4534 ++OpOffset; 4535 } 4536 4537 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 4538 unsigned ImmOffset; 4539 4540 LLT Ty = MRI.getType(Dst); 4541 LLT EltTy = Ty.getScalarType(); 4542 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 4543 const bool Unpacked = ST.hasUnpackedD16VMem(); 4544 4545 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4546 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 4547 4548 unsigned Opc; 4549 4550 // TODO: Support TFE for typed and narrow loads. 4551 if (IsTyped) { 4552 if (IsTFE) 4553 return false; 4554 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 4555 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 4556 } else if (IsFormat) { 4557 if (IsD16) { 4558 if (IsTFE) 4559 return false; 4560 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16; 4561 } else { 4562 Opc = IsTFE ? 
AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE 4563 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 4564 } 4565 } else { 4566 if (IsTFE) 4567 return false; 4568 switch (MemTy.getSizeInBits()) { 4569 case 8: 4570 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 4571 break; 4572 case 16: 4573 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 4574 break; 4575 default: 4576 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 4577 break; 4578 } 4579 } 4580 4581 if (IsTFE) { 4582 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32); 4583 unsigned NumLoadDWords = NumValueDWords + 1; 4584 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32); 4585 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy); 4586 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, 4587 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); 4588 if (NumValueDWords == 1) { 4589 B.buildUnmerge({Dst, StatusDst}, LoadDstReg); 4590 } else { 4591 SmallVector<Register, 5> LoadElts; 4592 for (unsigned I = 0; I != NumValueDWords; ++I) 4593 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32)); 4594 LoadElts.push_back(StatusDst); 4595 B.buildUnmerge(LoadElts, LoadDstReg); 4596 LoadElts.truncate(NumValueDWords); 4597 B.buildMergeLikeInstr(Dst, LoadElts); 4598 } 4599 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) || 4600 (IsD16 && !Ty.isVector())) { 4601 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 4602 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, 4603 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); 4604 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 4605 B.buildTrunc(Dst, LoadDstReg); 4606 } else if (Unpacked && IsD16 && Ty.isVector()) { 4607 LLT UnpackedTy = Ty.changeElementSize(32); 4608 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 4609 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, 4610 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); 4611 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 4612 // FIXME: G_TRUNC should work, but legalization currently fails 4613 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 4614 SmallVector<Register, 4> Repack; 4615 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 4616 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 4617 B.buildMergeLikeInstr(Dst, Repack); 4618 } else { 4619 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format, 4620 AuxiliaryData, MMO, IsTyped, HasVIndex, B); 4621 } 4622 4623 MI.eraseFromParent(); 4624 return true; 4625 } 4626 4627 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 4628 MachineIRBuilder &B, 4629 bool IsInc) const { 4630 unsigned Opc = IsInc ? 
AMDGPU::G_AMDGPU_ATOMIC_INC : 4631 AMDGPU::G_AMDGPU_ATOMIC_DEC; 4632 B.buildInstr(Opc) 4633 .addDef(MI.getOperand(0).getReg()) 4634 .addUse(MI.getOperand(2).getReg()) 4635 .addUse(MI.getOperand(3).getReg()) 4636 .cloneMemRefs(MI); 4637 MI.eraseFromParent(); 4638 return true; 4639 } 4640 4641 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 4642 switch (IntrID) { 4643 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4644 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4645 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 4646 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4647 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4648 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 4649 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4650 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4651 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 4652 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4653 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4654 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 4655 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4656 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4657 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4658 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4659 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4660 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4661 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4662 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4663 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4664 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4665 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4666 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4667 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4668 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4669 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4670 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4671 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4672 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4673 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4674 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4675 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4676 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4677 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4678 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4679 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4680 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4681 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4682 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4683 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4684 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4685 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4686 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4687 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4688 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4689 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4690 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4691 default: 4692 llvm_unreachable("unhandled atomic opcode"); 4693 } 4694 } 4695 4696 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4697 MachineIRBuilder &B, 4698 Intrinsic::ID IID) const { 4699 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4700 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4701 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4702 4703 Register Dst; 4704 4705 int OpOffset = 0; 4706 if (HasReturn) { 4707 // A few FP atomics do not support return values. 
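    // When there is no def, every source operand index shifts down by one,
    // which is what OpOffset = -1 accounts for in the else branch below.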
4708 Dst = MI.getOperand(0).getReg(); 4709 } else { 4710 OpOffset = -1; 4711 } 4712 4713 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4714 Register CmpVal; 4715 4716 if (IsCmpSwap) { 4717 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4718 ++OpOffset; 4719 } 4720 4721 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4722 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4723 4724 // The struct intrinsic variants add one additional operand over raw. 4725 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4726 Register VIndex; 4727 if (HasVIndex) { 4728 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4729 ++OpOffset; 4730 } else { 4731 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4732 } 4733 4734 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4735 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4736 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4737 4738 MachineMemOperand *MMO = *MI.memoperands_begin(); 4739 4740 unsigned ImmOffset; 4741 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4742 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); 4743 4744 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4745 4746 if (HasReturn) 4747 MIB.addDef(Dst); 4748 4749 MIB.addUse(VData); // vdata 4750 4751 if (IsCmpSwap) 4752 MIB.addReg(CmpVal); 4753 4754 MIB.addUse(RSrc) // rsrc 4755 .addUse(VIndex) // vindex 4756 .addUse(VOffset) // voffset 4757 .addUse(SOffset) // soffset 4758 .addImm(ImmOffset) // offset(imm) 4759 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4760 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4761 .addMemOperand(MMO); 4762 4763 MI.eraseFromParent(); 4764 return true; 4765 } 4766 4767 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4768 /// vector with s16 typed elements. 4769 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4770 SmallVectorImpl<Register> &PackedAddrs, 4771 unsigned ArgOffset, 4772 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4773 bool IsA16, bool IsG16) { 4774 const LLT S16 = LLT::scalar(16); 4775 const LLT V2S16 = LLT::fixed_vector(2, 16); 4776 auto EndIdx = Intr->VAddrEnd; 4777 4778 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4779 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4780 if (!SrcOp.isReg()) 4781 continue; // _L to _LZ may have eliminated this. 4782 4783 Register AddrReg = SrcOp.getReg(); 4784 4785 if ((I < Intr->GradientStart) || 4786 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4787 (I >= Intr->CoordStart && !IsA16)) { 4788 if ((I < Intr->GradientStart) && IsA16 && 4789 (B.getMRI()->getType(AddrReg) == S16)) { 4790 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument"); 4791 // Special handling of bias when A16 is on. Bias is of type half but 4792 // occupies full 32-bit. 4793 PackedAddrs.push_back( 4794 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4795 .getReg(0)); 4796 } else { 4797 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) && 4798 "Bias needs to be converted to 16 bit in A16 mode"); 4799 // Handle any gradient or coordinate operands that should not be packed 4800 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4801 PackedAddrs.push_back(AddrReg); 4802 } 4803 } else { 4804 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4805 // derivatives dx/dh and dx/dv are packed with undef. 
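      // For example, 2D coordinates (u, v) in A16 mode become a single
      // <2 x s16> build_vector; a trailing unpaired component is instead
      // padded with an undef half.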
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}

/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);
  (void)S32;
  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    auto VAddr =
        B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, loads/stores with 16-bit element data need to
/// be rewritten to use the low half of 32-bit registers, or directly use a
/// packed layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *Intr) const {

  const MachineFunction &MF = *MI.getMF();
  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::fixed_vector(2, 16);

  unsigned DMask = 0;
  Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
  LLT Ty = MRI->getType(VData);

  // Check for 16-bit addresses and pack them if so.
  LLT GradTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
  LLT AddrTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
  const bool IsG16 = GradTy == S16;
  const bool IsA16 = AddrTy == S16;
  const bool IsD16 = Ty.getScalarType() == S16;

  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
    if (BaseOpcode->Gather4) {
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = llvm::popcount(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  Observer.changingInstr(MI);
  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });

  const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
  const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
                                    : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
  unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;

  // Track that we legalized this.
  MI.setDesc(B.getTII().get(NewOpcode));

  // We expect to get an error flag since TFE is on and dmask is 0. Force dmask
  // to be at least 1, otherwise the instruction will fail.
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VData0);

    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
    if (Ty.isVector())
      return false;

    if (BaseOpcode->AtomicX2) {
      Register VData1 = MI.getOperand(3).getReg();
      // The two values are packed in one register.
      LLT PackedTy = LLT::fixed_vector(2, Ty);
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
    }
  }

  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;

  // Rewrite the addressing register layout before doing anything else.
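  //
  // Sketch (illustrative, not from the source): with A16 enabled, a 2D sample
  // with s16 coordinates has each pair of 16-bit address components packed
  // into one <2 x s16> vaddr dword by packImage16bitOpsToDwords, e.g.
  //   %uv:_(<2 x s16>) = G_BUILD_VECTOR %u:_(s16), %v:_(s16)
  // and any trailing odd component is padded with G_IMPLICIT_DEF.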
4959 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4960 // 16 bit gradients are supported, but are tied to the A16 control 4961 // so both gradients and addresses must be 16 bit 4962 return false; 4963 } 4964 4965 if (IsA16 && !ST.hasA16()) { 4966 // A16 not supported 4967 return false; 4968 } 4969 4970 if (IsA16 || IsG16) { 4971 if (Intr->NumVAddrs > 1) { 4972 SmallVector<Register, 4> PackedRegs; 4973 4974 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4975 IsG16); 4976 4977 // See also below in the non-a16 branch 4978 const bool UseNSA = ST.hasNSAEncoding() && 4979 PackedRegs.size() >= ST.getNSAThreshold(MF) && 4980 PackedRegs.size() <= ST.getNSAMaxSize(); 4981 4982 if (!UseNSA && PackedRegs.size() > 1) { 4983 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 4984 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4985 PackedRegs[0] = Concat.getReg(0); 4986 PackedRegs.resize(1); 4987 } 4988 4989 const unsigned NumPacked = PackedRegs.size(); 4990 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4991 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4992 if (!SrcOp.isReg()) { 4993 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4994 continue; 4995 } 4996 4997 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4998 4999 if (I - Intr->VAddrStart < NumPacked) 5000 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 5001 else 5002 SrcOp.setReg(AMDGPU::NoRegister); 5003 } 5004 } 5005 } else { 5006 // If the register allocator cannot place the address registers contiguously 5007 // without introducing moves, then using the non-sequential address encoding 5008 // is always preferable, since it saves VALU instructions and is usually a 5009 // wash in terms of code size or even better. 5010 // 5011 // However, we currently have no way of hinting to the register allocator 5012 // that MIMG addresses should be placed contiguously when it is possible to 5013 // do so, so force non-NSA for the common 2-address case as a heuristic. 5014 // 5015 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 5016 // allocation when possible. 5017 // 5018 // TODO: we can actually allow partial NSA where the final register is a 5019 // contiguous set of the remaining addresses. 5020 // This could help where there are more addresses than supported. 5021 const bool UseNSA = ST.hasNSAEncoding() && 5022 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) && 5023 CorrectedNumVAddrs <= ST.getNSAMaxSize(); 5024 5025 if (!UseNSA && Intr->NumVAddrs > 1) 5026 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 5027 Intr->NumVAddrs); 5028 } 5029 5030 int Flags = 0; 5031 if (IsA16) 5032 Flags |= 1; 5033 if (IsG16) 5034 Flags |= 2; 5035 MI.addOperand(MachineOperand::CreateImm(Flags)); 5036 5037 if (BaseOpcode->Store) { // No TFE for stores? 5038 // TODO: Handle dmask trim 5039 if (!Ty.isVector() || !IsD16) 5040 return true; 5041 5042 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 5043 if (RepackedReg != VData) { 5044 MI.getOperand(1).setReg(RepackedReg); 5045 } 5046 5047 return true; 5048 } 5049 5050 Register DstReg = MI.getOperand(0).getReg(); 5051 const LLT EltTy = Ty.getScalarType(); 5052 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 5053 5054 // Confirm that the return type is large enough for the dmask specified 5055 if (NumElts < DMaskLanes) 5056 return false; 5057 5058 if (NumElts > 4 || DMaskLanes > 4) 5059 return false; 5060 5061 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 
1 : DMaskLanes; 5062 const LLT AdjustedTy = 5063 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); 5064 5065 // The raw dword aligned data component of the load. The only legal cases 5066 // where this matters should be when using the packed D16 format, for 5067 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 5068 LLT RoundedTy; 5069 5070 // S32 vector to cover all data, plus TFE result element. 5071 LLT TFETy; 5072 5073 // Register type to use for each loaded component. Will be S32 or V2S16. 5074 LLT RegTy; 5075 5076 if (IsD16 && ST.hasUnpackedD16VMem()) { 5077 RoundedTy = 5078 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32); 5079 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); 5080 RegTy = S32; 5081 } else { 5082 unsigned EltSize = EltTy.getSizeInBits(); 5083 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 5084 unsigned RoundedSize = 32 * RoundedElts; 5085 RoundedTy = LLT::scalarOrVector( 5086 ElementCount::getFixed(RoundedSize / EltSize), EltSize); 5087 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); 5088 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 5089 } 5090 5091 // The return type does not need adjustment. 5092 // TODO: Should we change s16 case to s32 or <2 x s16>? 5093 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 5094 return true; 5095 5096 Register Dst1Reg; 5097 5098 // Insert after the instruction. 5099 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 5100 5101 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 5102 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 5103 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 5104 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 5105 5106 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 5107 5108 MI.getOperand(0).setReg(NewResultReg); 5109 5110 // In the IR, TFE is supposed to be used with a 2 element struct return 5111 // type. The instruction really returns these two values in one contiguous 5112 // register, with one additional dword beyond the loaded data. Rewrite the 5113 // return type to use a single register result. 5114 5115 if (IsTFE) { 5116 Dst1Reg = MI.getOperand(1).getReg(); 5117 if (MRI->getType(Dst1Reg) != S32) 5118 return false; 5119 5120 // TODO: Make sure the TFE operand bit is set. 5121 MI.removeOperand(1); 5122 5123 // Handle the easy case that requires no repack instructions. 5124 if (Ty == S32) { 5125 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 5126 return true; 5127 } 5128 } 5129 5130 // Now figure out how to copy the new result register back into the old 5131 // result. 5132 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 5133 5134 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 5135 5136 if (ResultNumRegs == 1) { 5137 assert(!IsTFE); 5138 ResultRegs[0] = NewResultReg; 5139 } else { 5140 // We have to repack into a new vector of some kind. 5141 for (int I = 0; I != NumDataRegs; ++I) 5142 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 5143 B.buildUnmerge(ResultRegs, NewResultReg); 5144 5145 // Drop the final TFE element to get the data part. The TFE result is 5146 // directly written to the right place already. 5147 if (IsTFE) 5148 ResultRegs.resize(NumDataRegs); 5149 } 5150 5151 // For an s16 scalar result, we form an s32 result with a truncate regardless 5152 // of packed vs. unpacked. 
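  // For example (illustrative), a D16 load of a single half on an unpacked
  // subtarget produces one s32 result register, and the original s16
  // destination is recovered with
  //   %dst:_(s16) = G_TRUNC %r0:_(s32)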
5153 if (IsD16 && !Ty.isVector()) { 5154 B.buildTrunc(DstReg, ResultRegs[0]); 5155 return true; 5156 } 5157 5158 // Avoid a build/concat_vector of 1 entry. 5159 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 5160 B.buildBitcast(DstReg, ResultRegs[0]); 5161 return true; 5162 } 5163 5164 assert(Ty.isVector()); 5165 5166 if (IsD16) { 5167 // For packed D16 results with TFE enabled, all the data components are 5168 // S32. Cast back to the expected type. 5169 // 5170 // TODO: We don't really need to use load s32 elements. We would only need one 5171 // cast for the TFE result if a multiple of v2s16 was used. 5172 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 5173 for (Register &Reg : ResultRegs) 5174 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 5175 } else if (ST.hasUnpackedD16VMem()) { 5176 for (Register &Reg : ResultRegs) 5177 Reg = B.buildTrunc(S16, Reg).getReg(0); 5178 } 5179 } 5180 5181 auto padWithUndef = [&](LLT Ty, int NumElts) { 5182 if (NumElts == 0) 5183 return; 5184 Register Undef = B.buildUndef(Ty).getReg(0); 5185 for (int I = 0; I != NumElts; ++I) 5186 ResultRegs.push_back(Undef); 5187 }; 5188 5189 // Pad out any elements eliminated due to the dmask. 5190 LLT ResTy = MRI->getType(ResultRegs[0]); 5191 if (!ResTy.isVector()) { 5192 padWithUndef(ResTy, NumElts - ResultRegs.size()); 5193 B.buildBuildVector(DstReg, ResultRegs); 5194 return true; 5195 } 5196 5197 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 5198 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 5199 5200 // Deal with the one annoying legal case. 5201 const LLT V3S16 = LLT::fixed_vector(3, 16); 5202 if (Ty == V3S16) { 5203 if (IsTFE) { 5204 if (ResultRegs.size() == 1) { 5205 NewResultReg = ResultRegs[0]; 5206 } else if (ResultRegs.size() == 2) { 5207 LLT V4S16 = LLT::fixed_vector(4, 16); 5208 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); 5209 } else { 5210 return false; 5211 } 5212 } 5213 5214 if (MRI->getType(DstReg).getNumElements() < 5215 MRI->getType(NewResultReg).getNumElements()) { 5216 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg); 5217 } else { 5218 B.buildPadVectorWithUndefElements(DstReg, NewResultReg); 5219 } 5220 return true; 5221 } 5222 5223 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 5224 B.buildConcatVectors(DstReg, ResultRegs); 5225 return true; 5226 } 5227 5228 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 5229 LegalizerHelper &Helper, MachineInstr &MI) const { 5230 MachineIRBuilder &B = Helper.MIRBuilder; 5231 GISelChangeObserver &Observer = Helper.Observer; 5232 5233 Register Dst = MI.getOperand(0).getReg(); 5234 LLT Ty = B.getMRI()->getType(Dst); 5235 unsigned Size = Ty.getSizeInBits(); 5236 MachineFunction &MF = B.getMF(); 5237 5238 Observer.changingInstr(MI); 5239 5240 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 5241 Ty = getBitcastRegisterType(Ty); 5242 Helper.bitcastDst(MI, Ty, 0); 5243 Dst = MI.getOperand(0).getReg(); 5244 B.setInsertPt(B.getMBB(), MI); 5245 } 5246 5247 // FIXME: We don't really need this intermediate instruction. The intrinsic 5248 // should be fixed to have a memory operand. Since it's readnone, we're not 5249 // allowed to add one. 5250 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 5251 MI.removeOperand(1); // Remove intrinsic ID 5252 5253 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 5254 // TODO: Should this use datalayout alignment? 
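  //
  // Synthesize a memory operand here: the size is the result size rounded up
  // to bytes, with a conservative 4-byte alignment, and the access is marked
  // MOLoad | MODereferenceable | MOInvariant to match the readnone intrinsic
  // semantics noted above.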
5255 const unsigned MemSize = (Size + 7) / 8; 5256 const Align MemAlign(4); 5257 MachineMemOperand *MMO = MF.getMachineMemOperand( 5258 MachinePointerInfo(), 5259 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 5260 MachineMemOperand::MOInvariant, 5261 MemSize, MemAlign); 5262 MI.addMemOperand(MF, MMO); 5263 5264 // There are no 96-bit result scalar loads, but widening to 128-bit should 5265 // always be legal. We may need to restore this to a 96-bit result if it turns 5266 // out this needs to be converted to a vector load during RegBankSelect. 5267 if (!isPowerOf2_32(Size)) { 5268 if (Ty.isVector()) 5269 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 5270 else 5271 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 5272 } 5273 5274 Observer.changedInstr(MI); 5275 return true; 5276 } 5277 5278 // TODO: Move to selection 5279 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 5280 MachineRegisterInfo &MRI, 5281 MachineIRBuilder &B) const { 5282 if (!ST.isTrapHandlerEnabled() || 5283 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 5284 return legalizeTrapEndpgm(MI, MRI, B); 5285 5286 if (std::optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 5287 switch (*HsaAbiVer) { 5288 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 5289 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 5290 return legalizeTrapHsaQueuePtr(MI, MRI, B); 5291 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 5292 case ELF::ELFABIVERSION_AMDGPU_HSA_V5: 5293 return ST.supportsGetDoorbellID() ? 5294 legalizeTrapHsa(MI, MRI, B) : 5295 legalizeTrapHsaQueuePtr(MI, MRI, B); 5296 } 5297 } 5298 5299 llvm_unreachable("Unknown trap handler"); 5300 } 5301 5302 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 5303 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 5304 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 5305 MI.eraseFromParent(); 5306 return true; 5307 } 5308 5309 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 5310 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 5311 MachineFunction &MF = B.getMF(); 5312 const LLT S64 = LLT::scalar(64); 5313 5314 Register SGPR01(AMDGPU::SGPR0_SGPR1); 5315 // For code object version 5, queue_ptr is passed through implicit kernarg. 5316 if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { 5317 AMDGPUTargetLowering::ImplicitParameter Param = 5318 AMDGPUTargetLowering::QUEUE_PTR; 5319 uint64_t Offset = 5320 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); 5321 5322 Register KernargPtrReg = MRI.createGenericVirtualRegister( 5323 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 5324 5325 if (!loadInputValue(KernargPtrReg, B, 5326 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 5327 return false; 5328 5329 // TODO: can we be smarter about machine pointer info? 
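    //
    // Sketch of the sequence built below (illustrative): the queue pointer is
    // read from the implicit kernarg segment and handed to the trap handler
    // in SGPR0_SGPR1:
    //   %kernarg      = KERNARG_SEGMENT_PTR input
    //   %addr         = G_PTR_ADD %kernarg, <queue_ptr offset>
    //   %queue:_(s64) = G_LOAD %addr
    //   $sgpr0_sgpr1  = COPY %queue
    //   S_TRAP LLVMAMDHSATrap, implicit $sgpr0_sgpr1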
5330 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 5331 MachineMemOperand *MMO = MF.getMachineMemOperand( 5332 PtrInfo, 5333 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 5334 MachineMemOperand::MOInvariant, 5335 LLT::scalar(64), commonAlignment(Align(64), Offset)); 5336 5337 // Pointer address 5338 Register LoadAddr = MRI.createGenericVirtualRegister( 5339 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 5340 B.buildPtrAdd(LoadAddr, KernargPtrReg, 5341 B.buildConstant(LLT::scalar(64), Offset).getReg(0)); 5342 // Load address 5343 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0); 5344 B.buildCopy(SGPR01, Temp); 5345 B.buildInstr(AMDGPU::S_TRAP) 5346 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 5347 .addReg(SGPR01, RegState::Implicit); 5348 MI.eraseFromParent(); 5349 return true; 5350 } 5351 5352 // Pass queue pointer to trap handler as input, and insert trap instruction 5353 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 5354 Register LiveIn = 5355 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 5356 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 5357 return false; 5358 5359 B.buildCopy(SGPR01, LiveIn); 5360 B.buildInstr(AMDGPU::S_TRAP) 5361 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 5362 .addReg(SGPR01, RegState::Implicit); 5363 5364 MI.eraseFromParent(); 5365 return true; 5366 } 5367 5368 bool AMDGPULegalizerInfo::legalizeTrapHsa( 5369 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 5370 B.buildInstr(AMDGPU::S_TRAP) 5371 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 5372 MI.eraseFromParent(); 5373 return true; 5374 } 5375 5376 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 5377 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 5378 // Is non-HSA path or trap-handler disabled? 
Then, report a warning 5379 // accordingly 5380 if (!ST.isTrapHandlerEnabled() || 5381 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 5382 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 5383 "debugtrap handler not supported", 5384 MI.getDebugLoc(), DS_Warning); 5385 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 5386 Ctx.diagnose(NoTrap); 5387 } else { 5388 // Insert debug-trap instruction 5389 B.buildInstr(AMDGPU::S_TRAP) 5390 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 5391 } 5392 5393 MI.eraseFromParent(); 5394 return true; 5395 } 5396 5397 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 5398 MachineIRBuilder &B) const { 5399 MachineRegisterInfo &MRI = *B.getMRI(); 5400 const LLT S16 = LLT::scalar(16); 5401 const LLT S32 = LLT::scalar(32); 5402 const LLT V2S16 = LLT::fixed_vector(2, 16); 5403 const LLT V3S32 = LLT::fixed_vector(3, 32); 5404 5405 Register DstReg = MI.getOperand(0).getReg(); 5406 Register NodePtr = MI.getOperand(2).getReg(); 5407 Register RayExtent = MI.getOperand(3).getReg(); 5408 Register RayOrigin = MI.getOperand(4).getReg(); 5409 Register RayDir = MI.getOperand(5).getReg(); 5410 Register RayInvDir = MI.getOperand(6).getReg(); 5411 Register TDescr = MI.getOperand(7).getReg(); 5412 5413 if (!ST.hasGFX10_AEncoding()) { 5414 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 5415 "intrinsic not supported on subtarget", 5416 MI.getDebugLoc()); 5417 B.getMF().getFunction().getContext().diagnose(BadIntrin); 5418 return false; 5419 } 5420 5421 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST); 5422 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 5423 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 5424 const unsigned NumVDataDwords = 4; 5425 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); 5426 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords; 5427 const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize(); 5428 const unsigned BaseOpcodes[2][2] = { 5429 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, 5430 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, 5431 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; 5432 int Opcode; 5433 if (UseNSA) { 5434 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 5435 IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA 5436 : AMDGPU::MIMGEncGfx10NSA, 5437 NumVDataDwords, NumVAddrDwords); 5438 } else { 5439 Opcode = AMDGPU::getMIMGOpcode( 5440 BaseOpcodes[Is64][IsA16], 5441 IsGFX11Plus ? 
AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default, 5442 NumVDataDwords, NumVAddrDwords); 5443 } 5444 assert(Opcode != -1); 5445 5446 SmallVector<Register, 12> Ops; 5447 if (UseNSA && IsGFX11Plus) { 5448 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) { 5449 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 5450 auto Merged = B.buildMergeLikeInstr( 5451 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)}); 5452 Ops.push_back(Merged.getReg(0)); 5453 }; 5454 5455 Ops.push_back(NodePtr); 5456 Ops.push_back(RayExtent); 5457 packLanes(RayOrigin); 5458 5459 if (IsA16) { 5460 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 5461 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 5462 auto MergedDir = B.buildMergeLikeInstr( 5463 V3S32, 5464 {B.buildBitcast( 5465 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0), 5466 UnmergeRayDir.getReg(0)})) 5467 .getReg(0), 5468 B.buildBitcast( 5469 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1), 5470 UnmergeRayDir.getReg(1)})) 5471 .getReg(0), 5472 B.buildBitcast( 5473 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2), 5474 UnmergeRayDir.getReg(2)})) 5475 .getReg(0)}); 5476 Ops.push_back(MergedDir.getReg(0)); 5477 } else { 5478 packLanes(RayDir); 5479 packLanes(RayInvDir); 5480 } 5481 } else { 5482 if (Is64) { 5483 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 5484 Ops.push_back(Unmerge.getReg(0)); 5485 Ops.push_back(Unmerge.getReg(1)); 5486 } else { 5487 Ops.push_back(NodePtr); 5488 } 5489 Ops.push_back(RayExtent); 5490 5491 auto packLanes = [&Ops, &S32, &B](Register Src) { 5492 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 5493 Ops.push_back(Unmerge.getReg(0)); 5494 Ops.push_back(Unmerge.getReg(1)); 5495 Ops.push_back(Unmerge.getReg(2)); 5496 }; 5497 5498 packLanes(RayOrigin); 5499 if (IsA16) { 5500 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 5501 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 5502 Register R1 = MRI.createGenericVirtualRegister(S32); 5503 Register R2 = MRI.createGenericVirtualRegister(S32); 5504 Register R3 = MRI.createGenericVirtualRegister(S32); 5505 B.buildMergeLikeInstr(R1, 5506 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 5507 B.buildMergeLikeInstr( 5508 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 5509 B.buildMergeLikeInstr( 5510 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 5511 Ops.push_back(R1); 5512 Ops.push_back(R2); 5513 Ops.push_back(R3); 5514 } else { 5515 packLanes(RayDir); 5516 packLanes(RayInvDir); 5517 } 5518 } 5519 5520 if (!UseNSA) { 5521 // Build a single vector containing all the operands so far prepared. 5522 LLT OpTy = LLT::fixed_vector(Ops.size(), 32); 5523 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0); 5524 Ops.clear(); 5525 Ops.push_back(MergedOps); 5526 } 5527 5528 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 5529 .addDef(DstReg) 5530 .addImm(Opcode); 5531 5532 for (Register R : Ops) { 5533 MIB.addUse(R); 5534 } 5535 5536 MIB.addUse(TDescr) 5537 .addImm(IsA16 ? 
1 : 0) 5538 .cloneMemRefs(MI); 5539 5540 MI.eraseFromParent(); 5541 return true; 5542 } 5543 5544 bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI, 5545 MachineIRBuilder &B) const { 5546 unsigned Opc; 5547 int RoundMode = MI.getOperand(2).getImm(); 5548 5549 if (RoundMode == (int)RoundingMode::TowardPositive) 5550 Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD; 5551 else if (RoundMode == (int)RoundingMode::TowardNegative) 5552 Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD; 5553 else 5554 return false; 5555 5556 B.buildInstr(Opc) 5557 .addDef(MI.getOperand(0).getReg()) 5558 .addUse(MI.getOperand(1).getReg()); 5559 5560 MI.eraseFromParent(); 5561 5562 return true; 5563 } 5564 5565 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 5566 MachineInstr &MI) const { 5567 MachineIRBuilder &B = Helper.MIRBuilder; 5568 MachineRegisterInfo &MRI = *B.getMRI(); 5569 5570 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 5571 auto IntrID = MI.getIntrinsicID(); 5572 switch (IntrID) { 5573 case Intrinsic::amdgcn_if: 5574 case Intrinsic::amdgcn_else: { 5575 MachineInstr *Br = nullptr; 5576 MachineBasicBlock *UncondBrTarget = nullptr; 5577 bool Negated = false; 5578 if (MachineInstr *BrCond = 5579 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 5580 const SIRegisterInfo *TRI 5581 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 5582 5583 Register Def = MI.getOperand(1).getReg(); 5584 Register Use = MI.getOperand(3).getReg(); 5585 5586 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 5587 5588 if (Negated) 5589 std::swap(CondBrTarget, UncondBrTarget); 5590 5591 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 5592 if (IntrID == Intrinsic::amdgcn_if) { 5593 B.buildInstr(AMDGPU::SI_IF) 5594 .addDef(Def) 5595 .addUse(Use) 5596 .addMBB(UncondBrTarget); 5597 } else { 5598 B.buildInstr(AMDGPU::SI_ELSE) 5599 .addDef(Def) 5600 .addUse(Use) 5601 .addMBB(UncondBrTarget); 5602 } 5603 5604 if (Br) { 5605 Br->getOperand(0).setMBB(CondBrTarget); 5606 } else { 5607 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 5608 // since we're swapping branch targets it needs to be reinserted. 
5609 // FIXME: IRTranslator should probably not do this 5610 B.buildBr(*CondBrTarget); 5611 } 5612 5613 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 5614 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 5615 MI.eraseFromParent(); 5616 BrCond->eraseFromParent(); 5617 return true; 5618 } 5619 5620 return false; 5621 } 5622 case Intrinsic::amdgcn_loop: { 5623 MachineInstr *Br = nullptr; 5624 MachineBasicBlock *UncondBrTarget = nullptr; 5625 bool Negated = false; 5626 if (MachineInstr *BrCond = 5627 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 5628 const SIRegisterInfo *TRI 5629 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 5630 5631 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 5632 Register Reg = MI.getOperand(2).getReg(); 5633 5634 if (Negated) 5635 std::swap(CondBrTarget, UncondBrTarget); 5636 5637 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 5638 B.buildInstr(AMDGPU::SI_LOOP) 5639 .addUse(Reg) 5640 .addMBB(UncondBrTarget); 5641 5642 if (Br) 5643 Br->getOperand(0).setMBB(CondBrTarget); 5644 else 5645 B.buildBr(*CondBrTarget); 5646 5647 MI.eraseFromParent(); 5648 BrCond->eraseFromParent(); 5649 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 5650 return true; 5651 } 5652 5653 return false; 5654 } 5655 case Intrinsic::amdgcn_kernarg_segment_ptr: 5656 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 5657 // This only makes sense to call in a kernel, so just lower to null. 5658 B.buildConstant(MI.getOperand(0).getReg(), 0); 5659 MI.eraseFromParent(); 5660 return true; 5661 } 5662 5663 return legalizePreloadedArgIntrin( 5664 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 5665 case Intrinsic::amdgcn_implicitarg_ptr: 5666 return legalizeImplicitArgPtr(MI, MRI, B); 5667 case Intrinsic::amdgcn_workitem_id_x: 5668 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0, 5669 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 5670 case Intrinsic::amdgcn_workitem_id_y: 5671 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1, 5672 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 5673 case Intrinsic::amdgcn_workitem_id_z: 5674 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2, 5675 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 5676 case Intrinsic::amdgcn_workgroup_id_x: 5677 return legalizePreloadedArgIntrin(MI, MRI, B, 5678 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 5679 case Intrinsic::amdgcn_workgroup_id_y: 5680 return legalizePreloadedArgIntrin(MI, MRI, B, 5681 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 5682 case Intrinsic::amdgcn_workgroup_id_z: 5683 return legalizePreloadedArgIntrin(MI, MRI, B, 5684 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 5685 case Intrinsic::amdgcn_lds_kernel_id: 5686 return legalizePreloadedArgIntrin(MI, MRI, B, 5687 AMDGPUFunctionArgInfo::LDS_KERNEL_ID); 5688 case Intrinsic::amdgcn_dispatch_ptr: 5689 return legalizePreloadedArgIntrin(MI, MRI, B, 5690 AMDGPUFunctionArgInfo::DISPATCH_PTR); 5691 case Intrinsic::amdgcn_queue_ptr: 5692 return legalizePreloadedArgIntrin(MI, MRI, B, 5693 AMDGPUFunctionArgInfo::QUEUE_PTR); 5694 case Intrinsic::amdgcn_implicit_buffer_ptr: 5695 return legalizePreloadedArgIntrin( 5696 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 5697 case Intrinsic::amdgcn_dispatch_id: 5698 return legalizePreloadedArgIntrin(MI, MRI, B, 5699 AMDGPUFunctionArgInfo::DISPATCH_ID); 5700 case Intrinsic::r600_read_ngroups_x: 5701 // TODO: Emit error for hsa 5702 return legalizeKernargMemParameter(MI, B, 5703 SI::KernelInputOffsets::NGROUPS_X); 5704 case Intrinsic::r600_read_ngroups_y: 5705 return 
legalizeKernargMemParameter(MI, B, 5706 SI::KernelInputOffsets::NGROUPS_Y); 5707 case Intrinsic::r600_read_ngroups_z: 5708 return legalizeKernargMemParameter(MI, B, 5709 SI::KernelInputOffsets::NGROUPS_Z); 5710 case Intrinsic::r600_read_local_size_x: 5711 // TODO: Could insert G_ASSERT_ZEXT from s16 5712 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X); 5713 case Intrinsic::r600_read_local_size_y: 5714 // TODO: Could insert G_ASSERT_ZEXT from s16 5715 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y); 5716 // TODO: Could insert G_ASSERT_ZEXT from s16 5717 case Intrinsic::r600_read_local_size_z: 5718 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z); 5719 case Intrinsic::r600_read_global_size_x: 5720 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X); 5721 case Intrinsic::r600_read_global_size_y: 5722 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y); 5723 case Intrinsic::r600_read_global_size_z: 5724 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z); 5725 case Intrinsic::amdgcn_fdiv_fast: 5726 return legalizeFDIVFastIntrin(MI, MRI, B); 5727 case Intrinsic::amdgcn_is_shared: 5728 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 5729 case Intrinsic::amdgcn_is_private: 5730 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 5731 case Intrinsic::amdgcn_wavefrontsize: { 5732 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 5733 MI.eraseFromParent(); 5734 return true; 5735 } 5736 case Intrinsic::amdgcn_s_buffer_load: 5737 return legalizeSBufferLoad(Helper, MI); 5738 case Intrinsic::amdgcn_raw_buffer_store: 5739 case Intrinsic::amdgcn_struct_buffer_store: 5740 return legalizeBufferStore(MI, MRI, B, false, false); 5741 case Intrinsic::amdgcn_raw_buffer_store_format: 5742 case Intrinsic::amdgcn_struct_buffer_store_format: 5743 return legalizeBufferStore(MI, MRI, B, false, true); 5744 case Intrinsic::amdgcn_raw_tbuffer_store: 5745 case Intrinsic::amdgcn_struct_tbuffer_store: 5746 return legalizeBufferStore(MI, MRI, B, true, true); 5747 case Intrinsic::amdgcn_raw_buffer_load: 5748 case Intrinsic::amdgcn_struct_buffer_load: 5749 return legalizeBufferLoad(MI, MRI, B, false, false); 5750 case Intrinsic::amdgcn_raw_buffer_load_format: 5751 case Intrinsic::amdgcn_struct_buffer_load_format: 5752 return legalizeBufferLoad(MI, MRI, B, true, false); 5753 case Intrinsic::amdgcn_raw_tbuffer_load: 5754 case Intrinsic::amdgcn_struct_tbuffer_load: 5755 return legalizeBufferLoad(MI, MRI, B, true, true); 5756 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 5757 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 5758 case Intrinsic::amdgcn_raw_buffer_atomic_add: 5759 case Intrinsic::amdgcn_struct_buffer_atomic_add: 5760 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 5761 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 5762 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 5763 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 5764 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 5765 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 5766 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 5767 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 5768 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 5769 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 5770 case Intrinsic::amdgcn_raw_buffer_atomic_and: 5771 case Intrinsic::amdgcn_struct_buffer_atomic_and: 5772 case Intrinsic::amdgcn_raw_buffer_atomic_or: 5773 case 
Intrinsic::amdgcn_struct_buffer_atomic_or: 5774 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 5775 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 5776 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 5777 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 5778 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 5779 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 5780 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 5781 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 5782 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 5783 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 5784 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 5785 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 5786 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 5787 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 5788 return legalizeBufferAtomic(MI, B, IntrID); 5789 case Intrinsic::amdgcn_atomic_inc: 5790 return legalizeAtomicIncDec(MI, B, true); 5791 case Intrinsic::amdgcn_atomic_dec: 5792 return legalizeAtomicIncDec(MI, B, false); 5793 case Intrinsic::trap: 5794 return legalizeTrapIntrinsic(MI, MRI, B); 5795 case Intrinsic::debugtrap: 5796 return legalizeDebugTrapIntrinsic(MI, MRI, B); 5797 case Intrinsic::amdgcn_rsq_clamp: 5798 return legalizeRsqClampIntrinsic(MI, MRI, B); 5799 case Intrinsic::amdgcn_ds_fadd: 5800 case Intrinsic::amdgcn_ds_fmin: 5801 case Intrinsic::amdgcn_ds_fmax: 5802 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 5803 case Intrinsic::amdgcn_image_bvh_intersect_ray: 5804 return legalizeBVHIntrinsic(MI, B); 5805 default: { 5806 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 5807 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 5808 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 5809 return true; 5810 } 5811 } 5812 5813 return true; 5814 } 5815