//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx,
                          LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(
        TypeIdx,
        LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  LLT CoercedTy;
  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(
        TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
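  // Otherwise, only bitcast vectors whose element type cannot be used directly
  // in a register (e.g. s8 elements), and only when the total size either fits
  // in 32 bits or is a legal register size.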
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            uint64_t AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  :  ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::fixed_vector(2, 8);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V4S16 = LLT::fixed_vector(4, 16);

  const LLT V2S32 = LLT::fixed_vector(2, 32);
  const LLT V3S32 = LLT::fixed_vector(3, 32);
  const LLT V4S32 = LLT::fixed_vector(4, 32);
  const LLT V5S32 = LLT::fixed_vector(5, 32);
  const LLT V6S32 = LLT::fixed_vector(6, 32);
  const LLT V7S32 = LLT::fixed_vector(7, 32);
  const LLT V8S32 = LLT::fixed_vector(8, 32);
  const LLT V9S32 = LLT::fixed_vector(9, 32);
  const LLT V10S32 = LLT::fixed_vector(10, 32);
  const LLT V11S32 = LLT::fixed_vector(11, 32);
  const LLT V12S32 = LLT::fixed_vector(12, 32);
  const LLT V13S32 = LLT::fixed_vector(13, 32);
  const LLT V14S32 = LLT::fixed_vector(14, 32);
  const LLT V15S32 = LLT::fixed_vector(15, 32);
  const LLT V16S32 = LLT::fixed_vector(16, 32);
  const LLT V32S32 = LLT::fixed_vector(32, 32);

  const LLT V2S64 = LLT::fixed_vector(2, 64);
  const LLT V3S64 = LLT::fixed_vector(3, 64);
  const LLT V4S64 = LLT::fixed_vector(4, 64);
  const LLT V5S64 = LLT::fixed_vector(5, 64);
  const LLT V6S64 = LLT::fixed_vector(6, 64);
  const LLT V7S64 = LLT::fixed_vector(7, 64);
  const LLT V8S64 = LLT::fixed_vector(8, 64);
  const LLT V16S64 = LLT::fixed_vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  // s1 for VCC branches, s32 for SCC branches.
  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .minScalar(0, S16)
      .clampMaxNumElementsStrict(0, S16, 2)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .widenScalarToNextMultipleOf(0, 32)
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(
    {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
                   .legalFor({S32})
                   .maxScalar(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});

  auto &FPOpActions = getActionDefinitionsBuilder(
    {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElementsStrict(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElementsStrict(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
      // Use actual fsub instruction
      .legalFor({S32})
      // Must use fadd + fneg
      .lowerFor({S64, S16, V2S16})
      .scalarize(0)
      .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
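  // The custom action defers to legalizeFMad, which keeps G_FMAD only when
  // denormals are flushed for the type and lowers it to fmul + fadd otherwise.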
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S32, S64}, {S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S32}, {S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .widenScalarToNextPow2(0, 32)
       .scalarize(0)
       .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .widenScalarToNextPow2(1, 32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .custom();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElementsStrict(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    return false;
  };

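  // Minimum alignment (in bits) required by the explicit global/constant rules
  // below; 0 (no extra requirement) when the subtarget supports unaligned
  // buffer access.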
  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {V2S32, GlobalPtr, V2S32, GlobalAlign32},
                                      {V4S32, GlobalPtr, V4S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {V2S64, GlobalPtr, V2S64, GlobalAlign32},
                                      {V2S16, GlobalPtr, V2S16, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},

                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {V2S32, LocalPtr, V2S32, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {V2S16, LocalPtr, S32, 32},

                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {V2S16, PrivatePtr, S32, 32},

                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32},
                                      {V4S32, ConstantPtr, V4S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also cover
    // inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].MemoryTy);
      }, bitcastToRegisterType(0));

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              uint64_t Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
              if (MemSize > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                      0, LLT::scalarOrVector(
                             ElementCount::getFixed(MaxSize / EltSize), EltTy));
                }

                unsigned NumPieces = MemSize / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(
                    0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              if (DstTy.getSizeInBits() > MemSize)
                return std::make_pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(
                    0, LLT::scalarOrVector(
                           ElementCount::getFixed(FloorSize / EltSize), EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .minScalar(0, S32)
        .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
        .lower();
  }

  // FIXME: Unaligned accesses not lowered.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                                  {S32, GlobalPtr, S16, 2 * 8},
                                                  {S32, LocalPtr, S8, 8},
                                                  {S32, LocalPtr, S16, 16},
                                                  {S32, PrivatePtr, S8, 8},
                                                  {S32, PrivatePtr, S16, 16},
                                                  {S32, ConstantPtr, S8, 8},
                                                  {S32, ConstantPtr, S16, 2 * 8}})
                       .legalIf(
                         [=](const LegalityQuery &Query) -> bool {
                           return isLoadStoreLegal(ST, Query);
                         });

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
  }

  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
  // 64-bits.
  //
  // TODO: Should generalize bitcast action into coerce, which will also cover
  // inserting addrspacecasts.
  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAdd()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
  }
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});

  if (ST.hasGFX90AInsts()) {
    // These are legal with some caveats, and should have undergone expansion
    // in the IR in most situations
    // TODO: Move atomic expansion into legalizer
    // TODO: Also supports <2 x f16>
    Atomic.legalFor({
        {S32, GlobalPtr},
        {S64, GlobalPtr},
        {S64, FlatPtr}
      });
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
                               LocalPtr, FlatPtr, PrivatePtr,
                               LLT::fixed_vector(2, LocalPtr),
                               LLT::fixed_vector(2, PrivatePtr)},
                              {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
                     .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                  VecTy.getSizeInBits() % 32 == 0 &&
                  VecTy.getSizeInBits() <= MaxRegisterSize &&
                  IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx),
            scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
            VecTypeIdx,
            LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      .lowerIf([=](const LegalityQuery &Query) {
          // Sub-vector (or single element) insert and extract.
          // TODO: verify immediate offset here since lower only works with
          // whole elements.
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.isVector();
        })
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder({G_ROTR, G_ROTL})
    .scalarize(0)
    .lower();

  // TODO: Only try to form v2s16 with legal packed instructions.
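  // G_FSHR is only legal for 32-bit scalars (this maps to the hardware
  // alignbit operation); v2s16 is lowered directly, and everything else is
  // scalarized and lowered to shifts.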
  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .lowerFor({{V2S16, V2S16}})
    .clampMaxNumElementsStrict(0, S16, 2)
    .scalarize(0)
    .lower();

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
      .lowerFor({{V2S16, V2S16}})
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder(G_FSHL)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
    .scalarize(0)
    .minScalar(0, S32)
    .lower();

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
    .legalFor({{S32, S32}, {S64, S32}})
    .clampScalar(1, S32, S32)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
      .lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  getLegacyLegalizerInfo().computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
MRI, B); 1737 case TargetOpcode::G_UDIV: 1738 case TargetOpcode::G_UREM: 1739 case TargetOpcode::G_UDIVREM: 1740 return legalizeUnsignedDIV_REM(MI, MRI, B); 1741 case TargetOpcode::G_SDIV: 1742 case TargetOpcode::G_SREM: 1743 case TargetOpcode::G_SDIVREM: 1744 return legalizeSignedDIV_REM(MI, MRI, B); 1745 case TargetOpcode::G_ATOMIC_CMPXCHG: 1746 return legalizeAtomicCmpXChg(MI, MRI, B); 1747 case TargetOpcode::G_FLOG: 1748 return legalizeFlog(MI, B, numbers::ln2f); 1749 case TargetOpcode::G_FLOG10: 1750 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1751 case TargetOpcode::G_FEXP: 1752 return legalizeFExp(MI, B); 1753 case TargetOpcode::G_FPOW: 1754 return legalizeFPow(MI, B); 1755 case TargetOpcode::G_FFLOOR: 1756 return legalizeFFloor(MI, MRI, B); 1757 case TargetOpcode::G_BUILD_VECTOR: 1758 return legalizeBuildVector(MI, MRI, B); 1759 case TargetOpcode::G_CTLZ: 1760 case TargetOpcode::G_CTTZ: 1761 return legalizeCTLZ_CTTZ(MI, MRI, B); 1762 default: 1763 return false; 1764 } 1765 1766 llvm_unreachable("expected switch to return"); 1767 } 1768 1769 Register AMDGPULegalizerInfo::getSegmentAperture( 1770 unsigned AS, 1771 MachineRegisterInfo &MRI, 1772 MachineIRBuilder &B) const { 1773 MachineFunction &MF = B.getMF(); 1774 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1775 const LLT S32 = LLT::scalar(32); 1776 1777 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1778 1779 if (ST.hasApertureRegs()) { 1780 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1781 // getreg. 1782 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1783 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1784 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1785 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1786 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1787 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1788 unsigned Encoding = 1789 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1790 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1791 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1792 1793 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1794 1795 B.buildInstr(AMDGPU::S_GETREG_B32) 1796 .addDef(GetReg) 1797 .addImm(Encoding); 1798 MRI.setType(GetReg, S32); 1799 1800 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1801 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1802 } 1803 1804 Register QueuePtr = MRI.createGenericVirtualRegister( 1805 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1806 1807 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1808 return Register(); 1809 1810 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1811 // private_segment_aperture_base_hi. 1812 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1813 1814 // TODO: can we be smarter about machine pointer info? 1815 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1816 MachineMemOperand *MMO = MF.getMachineMemOperand( 1817 PtrInfo, 1818 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1819 MachineMemOperand::MOInvariant, 1820 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 1821 1822 Register LoadAddr; 1823 1824 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1825 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1826 } 1827 1828 /// Return true if the value is a known valid address, such that a null check is 1829 /// not necessary. 
1829 /// not necessary.
1830 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, 1831 const AMDGPUTargetMachine &TM, unsigned AddrSpace) { 1832 MachineInstr *Def = MRI.getVRegDef(Val); 1833 switch (Def->getOpcode()) { 1834 case AMDGPU::G_FRAME_INDEX: 1835 case AMDGPU::G_GLOBAL_VALUE: 1836 case AMDGPU::G_BLOCK_ADDR: 1837 return true; 1838 case AMDGPU::G_CONSTANT: { 1839 const ConstantInt *CI = Def->getOperand(1).getCImm(); 1840 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace); 1841 } 1842 default: 1843 return false; 1844 } 1845 1846 return false; 1847 } 1848 1849 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1850 MachineInstr &MI, MachineRegisterInfo &MRI, 1851 MachineIRBuilder &B) const { 1852 MachineFunction &MF = B.getMF(); 1853 1854 const LLT S32 = LLT::scalar(32); 1855 Register Dst = MI.getOperand(0).getReg(); 1856 Register Src = MI.getOperand(1).getReg(); 1857 1858 LLT DstTy = MRI.getType(Dst); 1859 LLT SrcTy = MRI.getType(Src); 1860 unsigned DestAS = DstTy.getAddressSpace(); 1861 unsigned SrcAS = SrcTy.getAddressSpace(); 1862 1863 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1864 // vector element. 1865 assert(!DstTy.isVector()); 1866 1867 const AMDGPUTargetMachine &TM 1868 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1869 1870 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1871 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1872 return true; 1873 } 1874 1875 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1876 // Truncate. 1877 B.buildExtract(Dst, Src, 0); 1878 MI.eraseFromParent(); 1879 return true; 1880 } 1881 1882 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1883 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1884 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1885 1886 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1887 // another. Merge operands are required to be the same type, but creating an 1888 // extra ptrtoint would be kind of pointless. 1889 auto HighAddr = B.buildConstant( 1890 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1891 B.buildMerge(Dst, {Src, HighAddr}); 1892 MI.eraseFromParent(); 1893 return true; 1894 } 1895 1896 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1897 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1898 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1899 1900 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 1901 // Extract low 32-bits of the pointer. 1902 B.buildExtract(Dst, Src, 0); 1903 MI.eraseFromParent(); 1904 return true; 1905 } 1906 1907 unsigned NullVal = TM.getNullPointerValue(DestAS); 1908 1909 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1910 auto FlatNull = B.buildConstant(SrcTy, 0); 1911 1912 // Extract low 32-bits of the pointer. 1913 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1914 1915 auto CmpRes = 1916 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1917 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1918 1919 MI.eraseFromParent(); 1920 return true; 1921 } 1922 1923 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1924 return false; 1925 1926 if (!ST.hasFlatAddressSpace()) 1927 return false; 1928 1929 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1930 if (!ApertureReg.isValid()) 1931 return false; 1932 1933 // Coerce the type of the low half of the result so we can use merge_values. 
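// As a concrete illustration (made-up vreg name, not from this file): casting
// an LDS pointer %p to flat yields {lo = %p as s32, hi = the segment aperture
// obtained above}. When %p is not known to be non-null, the compare/select
// emitted further down substitutes the flat null value for the segment null.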
1934 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1935 1936 // TODO: Should we allow mismatched types but matching sizes in merges to 1937 // avoid the ptrtoint? 1938 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1939 1940 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 1941 B.buildCopy(Dst, BuildPtr); 1942 MI.eraseFromParent(); 1943 return true; 1944 } 1945 1946 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1947 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1948 1949 auto CmpRes = 1950 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1951 1952 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1953 1954 MI.eraseFromParent(); 1955 return true; 1956 } 1957 1958 bool AMDGPULegalizerInfo::legalizeFrint( 1959 MachineInstr &MI, MachineRegisterInfo &MRI, 1960 MachineIRBuilder &B) const { 1961 Register Src = MI.getOperand(1).getReg(); 1962 LLT Ty = MRI.getType(Src); 1963 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1964 1965 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1966 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1967 1968 auto C1 = B.buildFConstant(Ty, C1Val); 1969 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1970 1971 // TODO: Should this propagate fast-math-flags? 1972 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1973 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1974 1975 auto C2 = B.buildFConstant(Ty, C2Val); 1976 auto Fabs = B.buildFAbs(Ty, Src); 1977 1978 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1979 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1980 MI.eraseFromParent(); 1981 return true; 1982 } 1983 1984 bool AMDGPULegalizerInfo::legalizeFceil( 1985 MachineInstr &MI, MachineRegisterInfo &MRI, 1986 MachineIRBuilder &B) const { 1987 1988 const LLT S1 = LLT::scalar(1); 1989 const LLT S64 = LLT::scalar(64); 1990 1991 Register Src = MI.getOperand(1).getReg(); 1992 assert(MRI.getType(Src) == S64); 1993 1994 // result = trunc(src) 1995 // if (src > 0.0 && src != result) 1996 // result += 1.0 1997 1998 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1999 2000 const auto Zero = B.buildFConstant(S64, 0.0); 2001 const auto One = B.buildFConstant(S64, 1.0); 2002 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 2003 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 2004 auto And = B.buildAnd(S1, Lt0, NeTrunc); 2005 auto Add = B.buildSelect(S64, And, One, Zero); 2006 2007 // TODO: Should this propagate fast-math-flags? 
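// Worked example: ceil(2.3) gives Trunc = 2.0; the condition (2.3 > 0.0 &&
// 2.3 != 2.0) holds, so Add = 1.0 and the result is 3.0. For ceil(-2.3) the
// condition fails, Add = 0.0, and the result stays at Trunc = -2.0.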
2008 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 2009 MI.eraseFromParent(); 2010 return true; 2011 } 2012 2013 bool AMDGPULegalizerInfo::legalizeFrem( 2014 MachineInstr &MI, MachineRegisterInfo &MRI, 2015 MachineIRBuilder &B) const { 2016 Register DstReg = MI.getOperand(0).getReg(); 2017 Register Src0Reg = MI.getOperand(1).getReg(); 2018 Register Src1Reg = MI.getOperand(2).getReg(); 2019 auto Flags = MI.getFlags(); 2020 LLT Ty = MRI.getType(DstReg); 2021 2022 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 2023 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 2024 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 2025 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 2026 MI.eraseFromParent(); 2027 return true; 2028 } 2029 2030 static MachineInstrBuilder extractF64Exponent(Register Hi, 2031 MachineIRBuilder &B) { 2032 const unsigned FractBits = 52; 2033 const unsigned ExpBits = 11; 2034 LLT S32 = LLT::scalar(32); 2035 2036 auto Const0 = B.buildConstant(S32, FractBits - 32); 2037 auto Const1 = B.buildConstant(S32, ExpBits); 2038 2039 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 2040 .addUse(Hi) 2041 .addUse(Const0.getReg(0)) 2042 .addUse(Const1.getReg(0)); 2043 2044 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 2045 } 2046 2047 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 2048 MachineInstr &MI, MachineRegisterInfo &MRI, 2049 MachineIRBuilder &B) const { 2050 const LLT S1 = LLT::scalar(1); 2051 const LLT S32 = LLT::scalar(32); 2052 const LLT S64 = LLT::scalar(64); 2053 2054 Register Src = MI.getOperand(1).getReg(); 2055 assert(MRI.getType(Src) == S64); 2056 2057 // TODO: Should this use extract since the low half is unused? 2058 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2059 Register Hi = Unmerge.getReg(1); 2060 2061 // Extract the upper half, since this is where we will find the sign and 2062 // exponent. 2063 auto Exp = extractF64Exponent(Hi, B); 2064 2065 const unsigned FractBits = 52; 2066 2067 // Extract the sign bit. 2068 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 2069 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 2070 2071 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 2072 2073 const auto Zero32 = B.buildConstant(S32, 0); 2074 2075 // Extend back to 64-bits. 2076 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 2077 2078 auto Shr = B.buildAShr(S64, FractMask, Exp); 2079 auto Not = B.buildNot(S64, Shr); 2080 auto Tmp0 = B.buildAnd(S64, Src, Not); 2081 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2082 2083 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2084 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2085 2086 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2087 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2088 MI.eraseFromParent(); 2089 return true; 2090 } 2091 2092 bool AMDGPULegalizerInfo::legalizeITOFP( 2093 MachineInstr &MI, MachineRegisterInfo &MRI, 2094 MachineIRBuilder &B, bool Signed) const { 2095 2096 Register Dst = MI.getOperand(0).getReg(); 2097 Register Src = MI.getOperand(1).getReg(); 2098 2099 const LLT S64 = LLT::scalar(64); 2100 const LLT S32 = LLT::scalar(32); 2101 2102 assert(MRI.getType(Src) == S64); 2103 2104 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2105 auto ThirtyTwo = B.buildConstant(S32, 32); 2106 2107 if (MRI.getType(Dst) == S64) { 2108 auto CvtHi = Signed ? 
B.buildSITOFP(S64, Unmerge.getReg(1)) 2109 : B.buildUITOFP(S64, Unmerge.getReg(1)); 2110 2111 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2112 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2113 .addUse(CvtHi.getReg(0)) 2114 .addUse(ThirtyTwo.getReg(0)); 2115 2116 // TODO: Should this propagate fast-math-flags? 2117 B.buildFAdd(Dst, LdExp, CvtLo); 2118 MI.eraseFromParent(); 2119 return true; 2120 } 2121 2122 assert(MRI.getType(Dst) == S32); 2123 2124 auto One = B.buildConstant(S32, 1); 2125 2126 MachineInstrBuilder ShAmt; 2127 if (Signed) { 2128 auto ThirtyOne = B.buildConstant(S32, 31); 2129 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); 2130 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); 2131 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); 2132 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}, 2133 /*HasSideEffects=*/false) 2134 .addUse(Unmerge.getReg(1)); 2135 auto LS2 = B.buildSub(S32, LS, One); 2136 ShAmt = B.buildUMin(S32, LS2, MaxShAmt); 2137 } else 2138 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); 2139 auto Norm = B.buildShl(S64, Src, ShAmt); 2140 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); 2141 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); 2142 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); 2143 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); 2144 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); 2145 B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst}, 2146 /*HasSideEffects=*/false) 2147 .addUse(FVal.getReg(0)) 2148 .addUse(Scale.getReg(0)); 2149 MI.eraseFromParent(); 2150 return true; 2151 } 2152 2153 // TODO: Copied from DAG implementation. Verify logic and document how this 2154 // actually works. 2155 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2156 MachineRegisterInfo &MRI, 2157 MachineIRBuilder &B, 2158 bool Signed) const { 2159 2160 Register Dst = MI.getOperand(0).getReg(); 2161 Register Src = MI.getOperand(1).getReg(); 2162 2163 const LLT S64 = LLT::scalar(64); 2164 const LLT S32 = LLT::scalar(32); 2165 2166 const LLT SrcLT = MRI.getType(Src); 2167 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2168 2169 unsigned Flags = MI.getFlags(); 2170 2171 // The basic idea of converting a floating point number into a pair of 32-bit 2172 // integers is illustrated as follows: 2173 // 2174 // tf := trunc(val); 2175 // hif := floor(tf * 2^-32); 2176 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2177 // hi := fptoi(hif); 2178 // lo := fptoi(lof); 2179 // 2180 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2181 MachineInstrBuilder Sign; 2182 if (Signed && SrcLT == S32) { 2183 // However, a 32-bit floating point number has only 23 bits mantissa and 2184 // it's not enough to hold all the significant bits of `lof` if val is 2185 // negative. To avoid the loss of precision, We need to take the absolute 2186 // value after truncating and flip the result back based on the original 2187 // signedness. 
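// A minimal sketch of the same fixup in plain integer arithmetic (illustrative
// only; 'v' and 's' are hypothetical values, with s being 0 or -1):
//   int64_t r = (v ^ s) - s; // negates v when s == -1, identity when s == 0
// Below, the source's sign bit is broadcast to form 's' and this xor/sub
// pattern is applied to the merged {lo, hi} result.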
2188 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2189 Trunc = B.buildFAbs(S32, Trunc, Flags); 2190 } 2191 MachineInstrBuilder K0, K1; 2192 if (SrcLT == S64) { 2193 K0 = B.buildFConstant(S64, 2194 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2195 K1 = B.buildFConstant(S64, 2196 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2197 } else { 2198 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); 2199 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); 2200 } 2201 2202 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2203 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2204 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2205 2206 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2207 : B.buildFPTOUI(S32, FloorMul); 2208 auto Lo = B.buildFPTOUI(S32, Fma); 2209 2210 if (Signed && SrcLT == S32) { 2211 // Flip the result based on the signedness, which is either all 0s or 1s. 2212 Sign = B.buildMerge(S64, {Sign, Sign}); 2213 // r := xor({lo, hi}, sign) - sign; 2214 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); 2215 } else 2216 B.buildMerge(Dst, {Lo, Hi}); 2217 MI.eraseFromParent(); 2218 2219 return true; 2220 } 2221 2222 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2223 MachineInstr &MI) const { 2224 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2225 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2226 2227 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2228 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2229 2230 // With ieee_mode disabled, the instructions have the correct behavior 2231 // already for G_FMINNUM/G_FMAXNUM 2232 if (!MFI->getMode().IEEE) 2233 return !IsIEEEOp; 2234 2235 if (IsIEEEOp) 2236 return true; 2237 2238 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2239 } 2240 2241 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2242 MachineInstr &MI, MachineRegisterInfo &MRI, 2243 MachineIRBuilder &B) const { 2244 // TODO: Should move some of this into LegalizerHelper. 2245 2246 // TODO: Promote dynamic indexing of s16 to s32 2247 2248 // FIXME: Artifact combiner probably should have replaced the truncated 2249 // constant before this, so we shouldn't need 2250 // getIConstantVRegValWithLookThrough. 2251 Optional<ValueAndVReg> MaybeIdxVal = 2252 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2253 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2254 return true; 2255 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2256 2257 Register Dst = MI.getOperand(0).getReg(); 2258 Register Vec = MI.getOperand(1).getReg(); 2259 2260 LLT VecTy = MRI.getType(Vec); 2261 LLT EltTy = VecTy.getElementType(); 2262 assert(EltTy == MRI.getType(Dst)); 2263 2264 if (IdxVal < VecTy.getNumElements()) { 2265 auto Unmerge = B.buildUnmerge(EltTy, Vec); 2266 B.buildCopy(Dst, Unmerge.getReg(IdxVal)); 2267 } else { 2268 B.buildUndef(Dst); 2269 } 2270 2271 MI.eraseFromParent(); 2272 return true; 2273 } 2274 2275 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2276 MachineInstr &MI, MachineRegisterInfo &MRI, 2277 MachineIRBuilder &B) const { 2278 // TODO: Should move some of this into LegalizerHelper. 2279 2280 // TODO: Promote dynamic indexing of s16 to s32 2281 2282 // FIXME: Artifact combiner probably should have replaced the truncated 2283 // constant before this, so we shouldn't need 2284 // getIConstantVRegValWithLookThrough. 
2284 // getIConstantVRegValWithLookThrough.
2285 Optional<ValueAndVReg> MaybeIdxVal = 2286 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2287 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2288 return true; 2289 2290 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2291 Register Dst = MI.getOperand(0).getReg(); 2292 Register Vec = MI.getOperand(1).getReg(); 2293 Register Ins = MI.getOperand(2).getReg(); 2294 2295 LLT VecTy = MRI.getType(Vec); 2296 LLT EltTy = VecTy.getElementType(); 2297 assert(EltTy == MRI.getType(Ins)); 2298 (void)Ins; 2299 2300 unsigned NumElts = VecTy.getNumElements(); 2301 if (IdxVal < NumElts) { 2302 SmallVector<Register, 8> SrcRegs; 2303 for (unsigned i = 0; i < NumElts; ++i) 2304 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy)); 2305 B.buildUnmerge(SrcRegs, Vec); 2306 2307 SrcRegs[IdxVal] = MI.getOperand(2).getReg(); 2308 B.buildMerge(Dst, SrcRegs); 2309 } else { 2310 B.buildUndef(Dst); 2311 } 2312 2313 MI.eraseFromParent(); 2314 return true; 2315 } 2316 2317 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2318 MachineInstr &MI, MachineRegisterInfo &MRI, 2319 MachineIRBuilder &B) const { 2320 const LLT V2S16 = LLT::fixed_vector(2, 16); 2321 2322 Register Dst = MI.getOperand(0).getReg(); 2323 Register Src0 = MI.getOperand(1).getReg(); 2324 LLT DstTy = MRI.getType(Dst); 2325 LLT SrcTy = MRI.getType(Src0); 2326 2327 if (SrcTy == V2S16 && DstTy == V2S16 && 2328 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2329 return true; 2330 2331 MachineIRBuilder HelperBuilder(MI); 2332 GISelObserverWrapper DummyObserver; 2333 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2334 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2335 } 2336 2337 bool AMDGPULegalizerInfo::legalizeSinCos( 2338 MachineInstr &MI, MachineRegisterInfo &MRI, 2339 MachineIRBuilder &B) const { 2340 2341 Register DstReg = MI.getOperand(0).getReg(); 2342 Register SrcReg = MI.getOperand(1).getReg(); 2343 LLT Ty = MRI.getType(DstReg); 2344 unsigned Flags = MI.getFlags(); 2345 2346 Register TrigVal; 2347 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2348 if (ST.hasTrigReducedRange()) { 2349 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2350 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2351 .addUse(MulVal.getReg(0)) 2352 .setMIFlags(Flags).getReg(0); 2353 } else 2354 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2355 2356 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 
2357 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2358 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2359 .addUse(TrigVal) 2360 .setMIFlags(Flags); 2361 MI.eraseFromParent(); 2362 return true; 2363 } 2364 2365 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2366 MachineIRBuilder &B, 2367 const GlobalValue *GV, 2368 int64_t Offset, 2369 unsigned GAFlags) const { 2370 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2371 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2372 // to the following code sequence: 2373 // 2374 // For constant address space: 2375 // s_getpc_b64 s[0:1] 2376 // s_add_u32 s0, s0, $symbol 2377 // s_addc_u32 s1, s1, 0 2378 // 2379 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2380 // a fixup or relocation is emitted to replace $symbol with a literal 2381 // constant, which is a pc-relative offset from the encoding of the $symbol 2382 // operand to the global variable. 2383 // 2384 // For global address space: 2385 // s_getpc_b64 s[0:1] 2386 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2387 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2388 // 2389 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2390 // fixups or relocations are emitted to replace $symbol@*@lo and 2391 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2392 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2393 // operand to the global variable. 2394 // 2395 // What we want here is an offset from the value returned by s_getpc 2396 // (which is the address of the s_add_u32 instruction) to the global 2397 // variable, but since the encoding of $symbol starts 4 bytes after the start 2398 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2399 // small. This requires us to add 4 to the global variable offset in order to 2400 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2401 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2402 // instruction. 2403 2404 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2405 2406 Register PCReg = PtrTy.getSizeInBits() != 32 ? 
DstReg : 2407 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2408 2409 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2410 .addDef(PCReg); 2411 2412 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2413 if (GAFlags == SIInstrInfo::MO_NONE) 2414 MIB.addImm(0); 2415 else 2416 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2417 2418 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2419 2420 if (PtrTy.getSizeInBits() == 32) 2421 B.buildExtract(DstReg, PCReg, 0); 2422 return true; 2423 } 2424 2425 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2426 MachineInstr &MI, MachineRegisterInfo &MRI, 2427 MachineIRBuilder &B) const { 2428 Register DstReg = MI.getOperand(0).getReg(); 2429 LLT Ty = MRI.getType(DstReg); 2430 unsigned AS = Ty.getAddressSpace(); 2431 2432 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2433 MachineFunction &MF = B.getMF(); 2434 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2435 2436 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2437 if (!MFI->isModuleEntryFunction() && 2438 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2439 const Function &Fn = MF.getFunction(); 2440 DiagnosticInfoUnsupported BadLDSDecl( 2441 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2442 DS_Warning); 2443 Fn.getContext().diagnose(BadLDSDecl); 2444 2445 // We currently don't have a way to correctly allocate LDS objects that 2446 // aren't directly associated with a kernel. We do force inlining of 2447 // functions that use local objects. However, if these dead functions are 2448 // not eliminated, we don't want a compile time error. Just emit a warning 2449 // and a trap, since there should be no callable path here. 2450 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2451 B.buildUndef(DstReg); 2452 MI.eraseFromParent(); 2453 return true; 2454 } 2455 2456 // TODO: We could emit code to handle the initialization somewhere. 2457 // We ignore the initializer for now and legalize it to allow selection. 2458 // The initializer will anyway get errored out during assembly emission. 2459 const SITargetLowering *TLI = ST.getTargetLowering(); 2460 if (!TLI->shouldUseLDSConstAddress(GV)) { 2461 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2462 return true; // Leave in place; 2463 } 2464 2465 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2466 Type *Ty = GV->getValueType(); 2467 // HIP uses an unsized array `extern __shared__ T s[]` or similar 2468 // zero-sized type in other languages to declare the dynamic shared 2469 // memory which size is not known at the compile time. They will be 2470 // allocated by the runtime and placed directly after the static 2471 // allocated ones. They all share the same offset. 2472 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2473 // Adjust alignment for that dynamic shared memory array. 
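// In effect the extern, zero-sized array resolves to the first byte past the
// statically allocated LDS: e.g. with 1024 bytes of static LDS (a made-up
// figure), amdgcn.groupstaticsize returns 1024 and the pointer built below is
// inttoptr(1024) in the local address space.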
2474 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2475 LLT S32 = LLT::scalar(32); 2476 auto Sz = 2477 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2478 B.buildIntToPtr(DstReg, Sz); 2479 MI.eraseFromParent(); 2480 return true; 2481 } 2482 } 2483 2484 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), 2485 *cast<GlobalVariable>(GV))); 2486 MI.eraseFromParent(); 2487 return true; 2488 } 2489 2490 const SITargetLowering *TLI = ST.getTargetLowering(); 2491 2492 if (TLI->shouldEmitFixup(GV)) { 2493 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2494 MI.eraseFromParent(); 2495 return true; 2496 } 2497 2498 if (TLI->shouldEmitPCReloc(GV)) { 2499 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2500 MI.eraseFromParent(); 2501 return true; 2502 } 2503 2504 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2505 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2506 2507 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty; 2508 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2509 MachinePointerInfo::getGOT(MF), 2510 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2511 MachineMemOperand::MOInvariant, 2512 LoadTy, Align(8)); 2513 2514 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2515 2516 if (Ty.getSizeInBits() == 32) { 2517 // Truncate if this is a 32-bit constant address. 2518 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2519 B.buildExtract(DstReg, Load, 0); 2520 } else 2521 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2522 2523 MI.eraseFromParent(); 2524 return true; 2525 } 2526 2527 static LLT widenToNextPowerOf2(LLT Ty) { 2528 if (Ty.isVector()) 2529 return Ty.changeElementCount( 2530 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2531 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2532 } 2533 2534 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2535 MachineInstr &MI) const { 2536 MachineIRBuilder &B = Helper.MIRBuilder; 2537 MachineRegisterInfo &MRI = *B.getMRI(); 2538 GISelChangeObserver &Observer = Helper.Observer; 2539 2540 Register PtrReg = MI.getOperand(1).getReg(); 2541 LLT PtrTy = MRI.getType(PtrReg); 2542 unsigned AddrSpace = PtrTy.getAddressSpace(); 2543 2544 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2545 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2546 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2547 Observer.changingInstr(MI); 2548 MI.getOperand(1).setReg(Cast.getReg(0)); 2549 Observer.changedInstr(MI); 2550 return true; 2551 } 2552 2553 if (MI.getOpcode() != AMDGPU::G_LOAD) 2554 return false; 2555 2556 Register ValReg = MI.getOperand(0).getReg(); 2557 LLT ValTy = MRI.getType(ValReg); 2558 2559 MachineMemOperand *MMO = *MI.memoperands_begin(); 2560 const unsigned ValSize = ValTy.getSizeInBits(); 2561 const LLT MemTy = MMO->getMemoryType(); 2562 const Align MemAlign = MMO->getAlign(); 2563 const unsigned MemSize = MemTy.getSizeInBits(); 2564 const uint64_t AlignInBits = 8 * MemAlign.value(); 2565 2566 // Widen non-power-of-2 loads to the alignment if needed 2567 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 2568 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2569 2570 // This was already the correct extending load result type, so just adjust 2571 // the memory type. 
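// For example, an s32 load with a 3-byte (s24) memory type and sufficient
// alignment: PowerOf2Ceil(24) == 32 already matches the 32-bit result, so
// only the MMO is rewritten to cover 4 bytes and the instruction itself is
// left alone.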
2572 if (WideMemSize == ValSize) { 2573 MachineFunction &MF = B.getMF(); 2574 2575 MachineMemOperand *WideMMO = 2576 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2577 Observer.changingInstr(MI); 2578 MI.setMemRefs(MF, {WideMMO}); 2579 Observer.changedInstr(MI); 2580 return true; 2581 } 2582 2583 // Don't bother handling edge case that should probably never be produced. 2584 if (ValSize > WideMemSize) 2585 return false; 2586 2587 LLT WideTy = widenToNextPowerOf2(ValTy); 2588 2589 Register WideLoad; 2590 if (!WideTy.isVector()) { 2591 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2592 B.buildTrunc(ValReg, WideLoad).getReg(0); 2593 } else { 2594 // Extract the subvector. 2595 2596 if (isRegisterType(ValTy)) { 2597 // If this a case where G_EXTRACT is legal, use it. 2598 // (e.g. <3 x s32> -> <4 x s32>) 2599 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2600 B.buildExtract(ValReg, WideLoad, 0); 2601 } else { 2602 // For cases where the widened type isn't a nice register value, unmerge 2603 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 2604 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2605 B.buildDeleteTrailingVectorElements(ValReg, WideLoad); 2606 } 2607 } 2608 2609 MI.eraseFromParent(); 2610 return true; 2611 } 2612 2613 return false; 2614 } 2615 2616 bool AMDGPULegalizerInfo::legalizeFMad( 2617 MachineInstr &MI, MachineRegisterInfo &MRI, 2618 MachineIRBuilder &B) const { 2619 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2620 assert(Ty.isScalar()); 2621 2622 MachineFunction &MF = B.getMF(); 2623 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2624 2625 // TODO: Always legal with future ftz flag. 2626 // FIXME: Do we need just output? 2627 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2628 return true; 2629 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2630 return true; 2631 2632 MachineIRBuilder HelperBuilder(MI); 2633 GISelObserverWrapper DummyObserver; 2634 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2635 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2636 } 2637 2638 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2639 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2640 Register DstReg = MI.getOperand(0).getReg(); 2641 Register PtrReg = MI.getOperand(1).getReg(); 2642 Register CmpVal = MI.getOperand(2).getReg(); 2643 Register NewVal = MI.getOperand(3).getReg(); 2644 2645 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2646 "this should not have been custom lowered"); 2647 2648 LLT ValTy = MRI.getType(CmpVal); 2649 LLT VecTy = LLT::fixed_vector(2, ValTy); 2650 2651 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2652 2653 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2654 .addDef(DstReg) 2655 .addUse(PtrReg) 2656 .addUse(PackedVal) 2657 .setMemRefs(MI.memoperands()); 2658 2659 MI.eraseFromParent(); 2660 return true; 2661 } 2662 2663 bool AMDGPULegalizerInfo::legalizeFlog( 2664 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2665 Register Dst = MI.getOperand(0).getReg(); 2666 Register Src = MI.getOperand(1).getReg(); 2667 LLT Ty = B.getMRI()->getType(Dst); 2668 unsigned Flags = MI.getFlags(); 2669 2670 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2671 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2672 2673 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2674 
MI.eraseFromParent(); 2675 return true; 2676 } 2677 2678 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2679 MachineIRBuilder &B) const { 2680 Register Dst = MI.getOperand(0).getReg(); 2681 Register Src = MI.getOperand(1).getReg(); 2682 unsigned Flags = MI.getFlags(); 2683 LLT Ty = B.getMRI()->getType(Dst); 2684 2685 auto K = B.buildFConstant(Ty, numbers::log2e); 2686 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2687 B.buildFExp2(Dst, Mul, Flags); 2688 MI.eraseFromParent(); 2689 return true; 2690 } 2691 2692 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2693 MachineIRBuilder &B) const { 2694 Register Dst = MI.getOperand(0).getReg(); 2695 Register Src0 = MI.getOperand(1).getReg(); 2696 Register Src1 = MI.getOperand(2).getReg(); 2697 unsigned Flags = MI.getFlags(); 2698 LLT Ty = B.getMRI()->getType(Dst); 2699 const LLT S16 = LLT::scalar(16); 2700 const LLT S32 = LLT::scalar(32); 2701 2702 if (Ty == S32) { 2703 auto Log = B.buildFLog2(S32, Src0, Flags); 2704 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2705 .addUse(Log.getReg(0)) 2706 .addUse(Src1) 2707 .setMIFlags(Flags); 2708 B.buildFExp2(Dst, Mul, Flags); 2709 } else if (Ty == S16) { 2710 // There's no f16 fmul_legacy, so we need to convert for it. 2711 auto Log = B.buildFLog2(S16, Src0, Flags); 2712 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2713 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2714 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2715 .addUse(Ext0.getReg(0)) 2716 .addUse(Ext1.getReg(0)) 2717 .setMIFlags(Flags); 2718 2719 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2720 } else 2721 return false; 2722 2723 MI.eraseFromParent(); 2724 return true; 2725 } 2726 2727 // Find a source register, ignoring any possible source modifiers. 2728 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2729 Register ModSrc = OrigSrc; 2730 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2731 ModSrc = SrcFNeg->getOperand(1).getReg(); 2732 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2733 ModSrc = SrcFAbs->getOperand(1).getReg(); 2734 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2735 ModSrc = SrcFAbs->getOperand(1).getReg(); 2736 return ModSrc; 2737 } 2738 2739 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2740 MachineRegisterInfo &MRI, 2741 MachineIRBuilder &B) const { 2742 2743 const LLT S1 = LLT::scalar(1); 2744 const LLT S64 = LLT::scalar(64); 2745 Register Dst = MI.getOperand(0).getReg(); 2746 Register OrigSrc = MI.getOperand(1).getReg(); 2747 unsigned Flags = MI.getFlags(); 2748 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2749 "this should not have been custom lowered"); 2750 2751 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2752 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2753 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2754 // V_FRACT bug is: 2755 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2756 // 2757 // Convert floor(x) to (x - fract(x)) 2758 2759 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2760 .addUse(OrigSrc) 2761 .setMIFlags(Flags); 2762 2763 // Give source modifier matching some assistance before obscuring a foldable 2764 // pattern. 2765 2766 // TODO: We can avoid the neg on the fract? The input sign to fract 2767 // shouldn't matter? 
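// Worked example: floor(-0.5) becomes -0.5 - fract(-0.5) = -0.5 - 0.5 = -1.0.
// The min against 0x3fefffffffffffff (the largest double below 1.0) only
// matters in the buggy SI case where V_FRACT can return exactly 1.0.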
2768 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2769 2770 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2771 2772 Register Min = MRI.createGenericVirtualRegister(S64); 2773 2774 // We don't need to concern ourselves with the snan handling difference, so 2775 // use the one which will directly select. 2776 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2777 if (MFI->getMode().IEEE) 2778 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2779 else 2780 B.buildFMinNum(Min, Fract, Const, Flags); 2781 2782 Register CorrectedFract = Min; 2783 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2784 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2785 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2786 } 2787 2788 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2789 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2790 2791 MI.eraseFromParent(); 2792 return true; 2793 } 2794 2795 // Turn an illegal packed v2s16 build vector into bit operations. 2796 // TODO: This should probably be a bitcast action in LegalizerHelper. 2797 bool AMDGPULegalizerInfo::legalizeBuildVector( 2798 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2799 Register Dst = MI.getOperand(0).getReg(); 2800 const LLT S32 = LLT::scalar(32); 2801 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 2802 2803 Register Src0 = MI.getOperand(1).getReg(); 2804 Register Src1 = MI.getOperand(2).getReg(); 2805 assert(MRI.getType(Src0) == LLT::scalar(16)); 2806 2807 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2808 B.buildBitcast(Dst, Merge); 2809 2810 MI.eraseFromParent(); 2811 return true; 2812 } 2813 2814 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to 2815 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input 2816 // case with a single min instruction instead of a compare+select. 2817 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, 2818 MachineRegisterInfo &MRI, 2819 MachineIRBuilder &B) const { 2820 Register Dst = MI.getOperand(0).getReg(); 2821 Register Src = MI.getOperand(1).getReg(); 2822 LLT DstTy = MRI.getType(Dst); 2823 LLT SrcTy = MRI.getType(Src); 2824 2825 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ 2826 ? AMDGPU::G_AMDGPU_FFBH_U32 2827 : AMDGPU::G_AMDGPU_FFBL_B32; 2828 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); 2829 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); 2830 2831 MI.eraseFromParent(); 2832 return true; 2833 } 2834 2835 // Check that this is a G_XOR x, -1 2836 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2837 if (MI.getOpcode() != TargetOpcode::G_XOR) 2838 return false; 2839 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2840 return ConstVal && *ConstVal == -1; 2841 } 2842 2843 // Return the use branch instruction, otherwise null if the usage is invalid. 
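// The expected shape is: the intrinsic's boolean result has a single use in a
// G_BRCOND in the same block (possibly through a G_XOR with -1, which sets
// \p Negated), and that conditional branch is either followed by a G_BR whose
// target becomes \p UncondBrTarget, or falls through to the next block.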
2844 static MachineInstr * 2845 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2846 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2847 Register CondDef = MI.getOperand(0).getReg(); 2848 if (!MRI.hasOneNonDBGUse(CondDef)) 2849 return nullptr; 2850 2851 MachineBasicBlock *Parent = MI.getParent(); 2852 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2853 2854 if (isNot(MRI, *UseMI)) { 2855 Register NegatedCond = UseMI->getOperand(0).getReg(); 2856 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2857 return nullptr; 2858 2859 // We're deleting the def of this value, so we need to remove it. 2860 eraseInstr(*UseMI, MRI); 2861 2862 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2863 Negated = true; 2864 } 2865 2866 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2867 return nullptr; 2868 2869 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2870 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2871 if (Next == Parent->end()) { 2872 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2873 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2874 return nullptr; 2875 UncondBrTarget = &*NextMBB; 2876 } else { 2877 if (Next->getOpcode() != AMDGPU::G_BR) 2878 return nullptr; 2879 Br = &*Next; 2880 UncondBrTarget = Br->getOperand(0).getMBB(); 2881 } 2882 2883 return UseMI; 2884 } 2885 2886 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2887 const ArgDescriptor *Arg, 2888 const TargetRegisterClass *ArgRC, 2889 LLT ArgTy) const { 2890 MCRegister SrcReg = Arg->getRegister(); 2891 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2892 assert(DstReg.isVirtual() && "Virtual register expected"); 2893 2894 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, 2895 *ArgRC, B.getDebugLoc(), ArgTy); 2896 if (Arg->isMasked()) { 2897 // TODO: Should we try to emit this once in the entry block? 2898 const LLT S32 = LLT::scalar(32); 2899 const unsigned Mask = Arg->getMask(); 2900 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2901 2902 Register AndMaskSrc = LiveIn; 2903 2904 // TODO: Avoid clearing the high bits if we know workitem id y/z are always 2905 // 0. 2906 if (Shift != 0) { 2907 auto ShiftAmt = B.buildConstant(S32, Shift); 2908 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2909 } 2910 2911 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2912 } else { 2913 B.buildCopy(DstReg, LiveIn); 2914 } 2915 2916 return true; 2917 } 2918 2919 bool AMDGPULegalizerInfo::loadInputValue( 2920 Register DstReg, MachineIRBuilder &B, 2921 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2922 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2923 const ArgDescriptor *Arg; 2924 const TargetRegisterClass *ArgRC; 2925 LLT ArgTy; 2926 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2927 2928 if (!Arg) { 2929 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { 2930 // The intrinsic may appear when we have a 0 sized kernarg segment, in which 2931 // case the pointer argument may be missing and we use null. 2932 B.buildConstant(DstReg, 0); 2933 return true; 2934 } 2935 2936 // It's undefined behavior if a function marked with the amdgpu-no-* 2937 // attributes uses the corresponding intrinsic. 
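// For instance (hypothetical IR), a call to llvm.amdgcn.workitem.id.x inside
// a function carrying "amdgpu-no-workitem-id-x" leaves no preloaded value to
// read, so an undef result is as good an answer as any.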
2938 B.buildUndef(DstReg); 2939 return true; 2940 } 2941 2942 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2943 return false; // TODO: Handle these 2944 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2945 } 2946 2947 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2948 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2949 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2950 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2951 return false; 2952 2953 MI.eraseFromParent(); 2954 return true; 2955 } 2956 2957 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2958 MachineRegisterInfo &MRI, 2959 MachineIRBuilder &B) const { 2960 Register Dst = MI.getOperand(0).getReg(); 2961 LLT DstTy = MRI.getType(Dst); 2962 LLT S16 = LLT::scalar(16); 2963 LLT S32 = LLT::scalar(32); 2964 LLT S64 = LLT::scalar(64); 2965 2966 if (DstTy == S16) 2967 return legalizeFDIV16(MI, MRI, B); 2968 if (DstTy == S32) 2969 return legalizeFDIV32(MI, MRI, B); 2970 if (DstTy == S64) 2971 return legalizeFDIV64(MI, MRI, B); 2972 2973 return false; 2974 } 2975 2976 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 2977 Register DstDivReg, 2978 Register DstRemReg, 2979 Register X, 2980 Register Y) const { 2981 const LLT S1 = LLT::scalar(1); 2982 const LLT S32 = LLT::scalar(32); 2983 2984 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2985 // algorithm used here. 2986 2987 // Initial estimate of inv(y). 2988 auto FloatY = B.buildUITOFP(S32, Y); 2989 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2990 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2991 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2992 auto Z = B.buildFPTOUI(S32, ScaledY); 2993 2994 // One round of UNR. 2995 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2996 auto NegYZ = B.buildMul(S32, NegY, Z); 2997 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2998 2999 // Quotient/remainder estimate. 3000 auto Q = B.buildUMulH(S32, X, Z); 3001 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 3002 3003 // First quotient/remainder refinement. 3004 auto One = B.buildConstant(S32, 1); 3005 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 3006 if (DstDivReg) 3007 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 3008 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 3009 3010 // Second quotient/remainder refinement. 
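// The reciprocal-based estimate can undershoot the true quotient by up to 2,
// which is why two rounds of "if (r >= y) { q += 1; r -= y; }" are enough to
// reach the exact result.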
3011 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 3012 if (DstDivReg) 3013 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 3014 3015 if (DstRemReg) 3016 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 3017 } 3018 3019 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 3020 // 3021 // Return lo, hi of result 3022 // 3023 // %cvt.lo = G_UITOFP Val.lo 3024 // %cvt.hi = G_UITOFP Val.hi 3025 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 3026 // %rcp = G_AMDGPU_RCP_IFLAG %mad 3027 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 3028 // %mul2 = G_FMUL %mul1, 2**(-32) 3029 // %trunc = G_INTRINSIC_TRUNC %mul2 3030 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 3031 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 3032 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 3033 Register Val) { 3034 const LLT S32 = LLT::scalar(32); 3035 auto Unmerge = B.buildUnmerge(S32, Val); 3036 3037 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 3038 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 3039 3040 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 3041 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 3042 3043 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 3044 auto Mul1 = 3045 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 3046 3047 // 2**(-32) 3048 auto Mul2 = 3049 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 3050 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 3051 3052 // -(2**32) 3053 auto Mad2 = B.buildFMAD(S32, Trunc, 3054 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 3055 3056 auto ResultLo = B.buildFPTOUI(S32, Mad2); 3057 auto ResultHi = B.buildFPTOUI(S32, Trunc); 3058 3059 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 3060 } 3061 3062 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 3063 Register DstDivReg, 3064 Register DstRemReg, 3065 Register Numer, 3066 Register Denom) const { 3067 const LLT S32 = LLT::scalar(32); 3068 const LLT S64 = LLT::scalar(64); 3069 const LLT S1 = LLT::scalar(1); 3070 Register RcpLo, RcpHi; 3071 3072 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 3073 3074 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 3075 3076 auto Zero64 = B.buildConstant(S64, 0); 3077 auto NegDenom = B.buildSub(S64, Zero64, Denom); 3078 3079 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 3080 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 3081 3082 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 3083 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 3084 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 3085 3086 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 3087 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 3088 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 3089 3090 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 3091 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 3092 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 3093 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 3094 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 3095 3096 auto Zero32 = B.buildConstant(S32, 0); 3097 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 3098 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1)); 3099 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 3100 3101 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 3102 Register NumerLo = UnmergeNumer.getReg(0); 3103 Register NumerHi = UnmergeNumer.getReg(1); 3104 3105 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 3106 auto Mul3 = B.buildMul(S64, Denom, 
MulHi3); 3107 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 3108 Register Mul3_Lo = UnmergeMul3.getReg(0); 3109 Register Mul3_Hi = UnmergeMul3.getReg(1); 3110 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 3111 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 3112 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 3113 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 3114 3115 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 3116 Register DenomLo = UnmergeDenom.getReg(0); 3117 Register DenomHi = UnmergeDenom.getReg(1); 3118 3119 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 3120 auto C1 = B.buildSExt(S32, CmpHi); 3121 3122 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 3123 auto C2 = B.buildSExt(S32, CmpLo); 3124 3125 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 3126 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 3127 3128 // TODO: Here and below portions of the code can be enclosed into if/endif. 3129 // Currently control flow is unconditional and we have 4 selects after 3130 // potential endif to substitute PHIs. 3131 3132 // if C3 != 0 ... 3133 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3134 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3135 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3136 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 3137 3138 auto One64 = B.buildConstant(S64, 1); 3139 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3140 3141 auto C4 = 3142 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3143 auto C5 = 3144 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3145 auto C6 = B.buildSelect( 3146 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3147 3148 // if (C6 != 0) 3149 auto Add4 = B.buildAdd(S64, Add3, One64); 3150 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3151 3152 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3153 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3154 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3155 3156 // endif C6 3157 // endif C3 3158 3159 if (DstDivReg) { 3160 auto Sel1 = B.buildSelect( 3161 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3162 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3163 Sel1, MulHi3); 3164 } 3165 3166 if (DstRemReg) { 3167 auto Sel2 = B.buildSelect( 3168 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3169 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3170 Sel2, Sub1); 3171 } 3172 } 3173 3174 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3175 MachineRegisterInfo &MRI, 3176 MachineIRBuilder &B) const { 3177 Register DstDivReg, DstRemReg; 3178 switch (MI.getOpcode()) { 3179 default: 3180 llvm_unreachable("Unexpected opcode!"); 3181 case AMDGPU::G_UDIV: { 3182 DstDivReg = MI.getOperand(0).getReg(); 3183 break; 3184 } 3185 case AMDGPU::G_UREM: { 3186 DstRemReg = MI.getOperand(0).getReg(); 3187 break; 3188 } 3189 case AMDGPU::G_UDIVREM: { 3190 DstDivReg = MI.getOperand(0).getReg(); 3191 DstRemReg = MI.getOperand(1).getReg(); 3192 break; 3193 } 3194 } 3195 3196 const LLT S64 = LLT::scalar(64); 3197 const LLT S32 = LLT::scalar(32); 3198 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3199 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3200 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3201 LLT Ty 
= MRI.getType(MI.getOperand(0).getReg()); 3202 3203 if (Ty == S32) 3204 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3205 else if (Ty == S64) 3206 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3207 else 3208 return false; 3209 3210 MI.eraseFromParent(); 3211 return true; 3212 } 3213 3214 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3215 MachineRegisterInfo &MRI, 3216 MachineIRBuilder &B) const { 3217 const LLT S64 = LLT::scalar(64); 3218 const LLT S32 = LLT::scalar(32); 3219 3220 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3221 if (Ty != S32 && Ty != S64) 3222 return false; 3223 3224 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3225 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3226 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3227 3228 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3229 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3230 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3231 3232 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3233 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3234 3235 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3236 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3237 3238 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3239 switch (MI.getOpcode()) { 3240 default: 3241 llvm_unreachable("Unexpected opcode!"); 3242 case AMDGPU::G_SDIV: { 3243 DstDivReg = MI.getOperand(0).getReg(); 3244 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3245 break; 3246 } 3247 case AMDGPU::G_SREM: { 3248 DstRemReg = MI.getOperand(0).getReg(); 3249 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3250 break; 3251 } 3252 case AMDGPU::G_SDIVREM: { 3253 DstDivReg = MI.getOperand(0).getReg(); 3254 DstRemReg = MI.getOperand(1).getReg(); 3255 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3256 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3257 break; 3258 } 3259 } 3260 3261 if (Ty == S32) 3262 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3263 else 3264 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3265 3266 if (DstDivReg) { 3267 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3268 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3269 B.buildSub(DstDivReg, SignXor, Sign); 3270 } 3271 3272 if (DstRemReg) { 3273 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3274 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3275 B.buildSub(DstRemReg, SignXor, Sign); 3276 } 3277 3278 MI.eraseFromParent(); 3279 return true; 3280 } 3281 3282 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3283 MachineRegisterInfo &MRI, 3284 MachineIRBuilder &B) const { 3285 Register Res = MI.getOperand(0).getReg(); 3286 Register LHS = MI.getOperand(1).getReg(); 3287 Register RHS = MI.getOperand(2).getReg(); 3288 uint16_t Flags = MI.getFlags(); 3289 LLT ResTy = MRI.getType(Res); 3290 3291 const MachineFunction &MF = B.getMF(); 3292 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3293 MI.getFlag(MachineInstr::FmAfn); 3294 3295 if (!AllowInaccurateRcp) 3296 return false; 3297 3298 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3299 // 1 / x -> RCP(x) 3300 if (CLHS->isExactlyValue(1.0)) { 3301 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3302 .addUse(RHS) 3303 .setMIFlags(Flags); 3304 3305 MI.eraseFromParent(); 3306 return true; 3307 } 3308 3309 // -1 / x -> RCP( FNEG(x) ) 3310 if (CLHS->isExactlyValue(-1.0)) { 3311 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 
3312 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3313 .addUse(FNeg.getReg(0)) 3314 .setMIFlags(Flags); 3315 3316 MI.eraseFromParent(); 3317 return true; 3318 } 3319 } 3320 3321 // x / y -> x * (1.0 / y) 3322 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3323 .addUse(RHS) 3324 .setMIFlags(Flags); 3325 B.buildFMul(Res, LHS, RCP, Flags); 3326 3327 MI.eraseFromParent(); 3328 return true; 3329 } 3330 3331 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3332 MachineRegisterInfo &MRI, 3333 MachineIRBuilder &B) const { 3334 Register Res = MI.getOperand(0).getReg(); 3335 Register X = MI.getOperand(1).getReg(); 3336 Register Y = MI.getOperand(2).getReg(); 3337 uint16_t Flags = MI.getFlags(); 3338 LLT ResTy = MRI.getType(Res); 3339 3340 const MachineFunction &MF = B.getMF(); 3341 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3342 MI.getFlag(MachineInstr::FmAfn); 3343 3344 if (!AllowInaccurateRcp) 3345 return false; 3346 3347 auto NegY = B.buildFNeg(ResTy, Y); 3348 auto One = B.buildFConstant(ResTy, 1.0); 3349 3350 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3351 .addUse(Y) 3352 .setMIFlags(Flags); 3353 3354 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3355 R = B.buildFMA(ResTy, Tmp0, R, R); 3356 3357 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3358 R = B.buildFMA(ResTy, Tmp1, R, R); 3359 3360 auto Ret = B.buildFMul(ResTy, X, R); 3361 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3362 3363 B.buildFMA(Res, Tmp2, R, Ret); 3364 MI.eraseFromParent(); 3365 return true; 3366 } 3367 3368 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3369 MachineRegisterInfo &MRI, 3370 MachineIRBuilder &B) const { 3371 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3372 return true; 3373 3374 Register Res = MI.getOperand(0).getReg(); 3375 Register LHS = MI.getOperand(1).getReg(); 3376 Register RHS = MI.getOperand(2).getReg(); 3377 3378 uint16_t Flags = MI.getFlags(); 3379 3380 LLT S16 = LLT::scalar(16); 3381 LLT S32 = LLT::scalar(32); 3382 3383 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3384 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3385 3386 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3387 .addUse(RHSExt.getReg(0)) 3388 .setMIFlags(Flags); 3389 3390 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3391 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3392 3393 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3394 .addUse(RDst.getReg(0)) 3395 .addUse(RHS) 3396 .addUse(LHS) 3397 .setMIFlags(Flags); 3398 3399 MI.eraseFromParent(); 3400 return true; 3401 } 3402 3403 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3404 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3405 static void toggleSPDenormMode(bool Enable, 3406 MachineIRBuilder &B, 3407 const GCNSubtarget &ST, 3408 AMDGPU::SIModeRegisterDefaults Mode) { 3409 // Set SP denorm mode to this value. 3410 unsigned SPDenormMode = 3411 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3412 3413 if (ST.hasDenormModeInst()) { 3414 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3415 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3416 3417 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3418 B.buildInstr(AMDGPU::S_DENORM_MODE) 3419 .addImm(NewDenormModeValue); 3420 3421 } else { 3422 // Select FP32 bit field in mode register. 
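// The immediate assembled below encodes hwreg(HW_REG_MODE, offset 4, width 2), i.e. the two FP32 denorm-control bits of the MODE register (assuming the usual MODE layout: FP_ROUND in bits 3:0, FP32 denorm in bits 5:4, FP64/FP16 denorm in bits 7:6). S_SETREG_IMM32_B32 then writes SPDenormMode into just that field, leaving the other mode bits untouched.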
3423 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3424 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3425 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3426 3427 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3428 .addImm(SPDenormMode) 3429 .addImm(SPDenormModeBitField); 3430 } 3431 } 3432 3433 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3434 MachineRegisterInfo &MRI, 3435 MachineIRBuilder &B) const { 3436 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3437 return true; 3438 3439 Register Res = MI.getOperand(0).getReg(); 3440 Register LHS = MI.getOperand(1).getReg(); 3441 Register RHS = MI.getOperand(2).getReg(); 3442 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3443 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3444 3445 uint16_t Flags = MI.getFlags(); 3446 3447 LLT S32 = LLT::scalar(32); 3448 LLT S1 = LLT::scalar(1); 3449 3450 auto One = B.buildFConstant(S32, 1.0f); 3451 3452 auto DenominatorScaled = 3453 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3454 .addUse(LHS) 3455 .addUse(RHS) 3456 .addImm(0) 3457 .setMIFlags(Flags); 3458 auto NumeratorScaled = 3459 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3460 .addUse(LHS) 3461 .addUse(RHS) 3462 .addImm(1) 3463 .setMIFlags(Flags); 3464 3465 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3466 .addUse(DenominatorScaled.getReg(0)) 3467 .setMIFlags(Flags); 3468 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3469 3470 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3471 // aren't modeled as reading it. 3472 if (!Mode.allFP32Denormals()) 3473 toggleSPDenormMode(true, B, ST, Mode); 3474 3475 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3476 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3477 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3478 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3479 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3480 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3481 3482 if (!Mode.allFP32Denormals()) 3483 toggleSPDenormMode(false, B, ST, Mode); 3484 3485 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3486 .addUse(Fma4.getReg(0)) 3487 .addUse(Fma1.getReg(0)) 3488 .addUse(Fma3.getReg(0)) 3489 .addUse(NumeratorScaled.getReg(1)) 3490 .setMIFlags(Flags); 3491 3492 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3493 .addUse(Fmas.getReg(0)) 3494 .addUse(RHS) 3495 .addUse(LHS) 3496 .setMIFlags(Flags); 3497 3498 MI.eraseFromParent(); 3499 return true; 3500 } 3501 3502 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3503 MachineRegisterInfo &MRI, 3504 MachineIRBuilder &B) const { 3505 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3506 return true; 3507 3508 Register Res = MI.getOperand(0).getReg(); 3509 Register LHS = MI.getOperand(1).getReg(); 3510 Register RHS = MI.getOperand(2).getReg(); 3511 3512 uint16_t Flags = MI.getFlags(); 3513 3514 LLT S64 = LLT::scalar(64); 3515 LLT S1 = LLT::scalar(1); 3516 3517 auto One = B.buildFConstant(S64, 1.0); 3518 3519 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3520 .addUse(LHS) 3521 .addUse(RHS) 3522 .addImm(0) 3523 .setMIFlags(Flags); 3524 3525 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3526 3527 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3528 .addUse(DivScale0.getReg(0)) 3529 .setMIFlags(Flags); 3530 3531 auto Fma0 = 
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3532 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3533 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3534 3535 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3536 .addUse(LHS) 3537 .addUse(RHS) 3538 .addImm(1) 3539 .setMIFlags(Flags); 3540 3541 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3542 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3543 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3544 3545 Register Scale; 3546 if (!ST.hasUsableDivScaleConditionOutput()) { 3547 // Workaround a hardware bug on SI where the condition output from div_scale 3548 // is not usable. 3549 3550 LLT S32 = LLT::scalar(32); 3551 3552 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3553 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3554 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3555 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3556 3557 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3558 Scale1Unmerge.getReg(1)); 3559 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3560 Scale0Unmerge.getReg(1)); 3561 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3562 } else { 3563 Scale = DivScale1.getReg(1); 3564 } 3565 3566 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3567 .addUse(Fma4.getReg(0)) 3568 .addUse(Fma3.getReg(0)) 3569 .addUse(Mul.getReg(0)) 3570 .addUse(Scale) 3571 .setMIFlags(Flags); 3572 3573 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3574 .addUse(Fmas.getReg(0)) 3575 .addUse(RHS) 3576 .addUse(LHS) 3577 .setMIFlags(Flags); 3578 3579 MI.eraseFromParent(); 3580 return true; 3581 } 3582 3583 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3584 MachineRegisterInfo &MRI, 3585 MachineIRBuilder &B) const { 3586 Register Res = MI.getOperand(0).getReg(); 3587 Register LHS = MI.getOperand(2).getReg(); 3588 Register RHS = MI.getOperand(3).getReg(); 3589 uint16_t Flags = MI.getFlags(); 3590 3591 LLT S32 = LLT::scalar(32); 3592 LLT S1 = LLT::scalar(1); 3593 3594 auto Abs = B.buildFAbs(S32, RHS, Flags); 3595 const APFloat C0Val(1.0f); 3596 3597 auto C0 = B.buildConstant(S32, 0x6f800000); 3598 auto C1 = B.buildConstant(S32, 0x2f800000); 3599 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3600 3601 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3602 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3603 3604 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3605 3606 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3607 .addUse(Mul0.getReg(0)) 3608 .setMIFlags(Flags); 3609 3610 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3611 3612 B.buildFMul(Res, Sel, Mul1, Flags); 3613 3614 MI.eraseFromParent(); 3615 return true; 3616 } 3617 3618 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3619 // FIXME: Why do we handle this one but not other removed instructions? 3620 // 3621 // Reciprocal square root. The clamp prevents infinite results, clamping 3622 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3623 // +-max_float. 
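// Roughly, the expansion built below is (a sketch in generic MIR terms):
//   %rsq = G_INTRINSIC amdgcn.rsq, %src
//   %min = G_FMINNUM(_IEEE) %rsq, +max_float
//   %dst = G_FMAXNUM(_IEEE) %min, -max_float
// with the IEEE variants chosen when the function runs in IEEE mode.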
3624 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3625 MachineRegisterInfo &MRI, 3626 MachineIRBuilder &B) const { 3627 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3628 return true; 3629 3630 Register Dst = MI.getOperand(0).getReg(); 3631 Register Src = MI.getOperand(2).getReg(); 3632 auto Flags = MI.getFlags(); 3633 3634 LLT Ty = MRI.getType(Dst); 3635 3636 const fltSemantics *FltSemantics; 3637 if (Ty == LLT::scalar(32)) 3638 FltSemantics = &APFloat::IEEEsingle(); 3639 else if (Ty == LLT::scalar(64)) 3640 FltSemantics = &APFloat::IEEEdouble(); 3641 else 3642 return false; 3643 3644 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3645 .addUse(Src) 3646 .setMIFlags(Flags); 3647 3648 // We don't need to concern ourselves with the snan handling difference, since 3649 // the rsq quieted (or not) so use the one which will directly select. 3650 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3651 const bool UseIEEE = MFI->getMode().IEEE; 3652 3653 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3654 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3655 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3656 3657 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3658 3659 if (UseIEEE) 3660 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3661 else 3662 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3663 MI.eraseFromParent(); 3664 return true; 3665 } 3666 3667 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3668 switch (IID) { 3669 case Intrinsic::amdgcn_ds_fadd: 3670 return AMDGPU::G_ATOMICRMW_FADD; 3671 case Intrinsic::amdgcn_ds_fmin: 3672 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3673 case Intrinsic::amdgcn_ds_fmax: 3674 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3675 default: 3676 llvm_unreachable("not a DS FP intrinsic"); 3677 } 3678 } 3679 3680 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3681 MachineInstr &MI, 3682 Intrinsic::ID IID) const { 3683 GISelChangeObserver &Observer = Helper.Observer; 3684 Observer.changingInstr(MI); 3685 3686 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3687 3688 // The remaining operands were used to set fields in the MemOperand on 3689 // construction. 3690 for (int I = 6; I > 3; --I) 3691 MI.RemoveOperand(I); 3692 3693 MI.RemoveOperand(1); // Remove the intrinsic ID. 
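// What remains after the operand removals is the plain (dst, ptr, value) form expected by the atomicrmw-style pseudos; the dropped operands only described memory properties that are already captured by the MMO.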
3694 Observer.changedInstr(MI); 3695 return true; 3696 } 3697 3698 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3699 MachineRegisterInfo &MRI, 3700 MachineIRBuilder &B) const { 3701 uint64_t Offset = 3702 ST.getTargetLowering()->getImplicitParameterOffset( 3703 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3704 LLT DstTy = MRI.getType(DstReg); 3705 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3706 3707 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3708 if (!loadInputValue(KernargPtrReg, B, 3709 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3710 return false; 3711 3712 // FIXME: This should be nuw 3713 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3714 return true; 3715 } 3716 3717 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3718 MachineRegisterInfo &MRI, 3719 MachineIRBuilder &B) const { 3720 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3721 if (!MFI->isEntryFunction()) { 3722 return legalizePreloadedArgIntrin(MI, MRI, B, 3723 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3724 } 3725 3726 Register DstReg = MI.getOperand(0).getReg(); 3727 if (!getImplicitArgPtr(DstReg, MRI, B)) 3728 return false; 3729 3730 MI.eraseFromParent(); 3731 return true; 3732 } 3733 3734 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3735 MachineRegisterInfo &MRI, 3736 MachineIRBuilder &B, 3737 unsigned AddrSpace) const { 3738 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3739 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3740 Register Hi32 = Unmerge.getReg(1); 3741 3742 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3743 MI.eraseFromParent(); 3744 return true; 3745 } 3746 3747 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3748 // offset (the offset that is included in bounds checking and swizzling, to be 3749 // split between the instruction's voffset and immoffset fields) and soffset 3750 // (the offset that is excluded from bounds checking and swizzling, to go in 3751 // the instruction's soffset field). This function takes the first kind of 3752 // offset and figures out how to split it between voffset and immoffset. 3753 std::pair<Register, unsigned> 3754 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3755 Register OrigOffset) const { 3756 const unsigned MaxImm = 4095; 3757 Register BaseReg; 3758 unsigned ImmOffset; 3759 const LLT S32 = LLT::scalar(32); 3760 MachineRegisterInfo &MRI = *B.getMRI(); 3761 3762 std::tie(BaseReg, ImmOffset) = 3763 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3764 3765 // If BaseReg is a pointer, convert it to int. 3766 if (MRI.getType(BaseReg).isPointer()) 3767 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3768 3769 // If the immediate value is too big for the immoffset field, put the value 3770 // and -4096 into the immoffset field so that the value that is copied/added 3771 // for the voffset field is a multiple of 4096, and it stands more chance 3772 // of being CSEd with the copy/add for another similar load/store. 3773 // However, do not do that rounding down to a multiple of 4096 if that is a 3774 // negative number, as it appears to be illegal to have a negative offset 3775 // in the vgpr, even if adding the immediate offset makes it positive. 
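// Worked example (illustrative): a constant offset of 5000 splits into ImmOffset = 904 with 4096 added to the voffset, while a negative constant such as -8 keeps ImmOffset = 0 and leaves the whole value in the voffset.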
3776 unsigned Overflow = ImmOffset & ~MaxImm; 3777 ImmOffset -= Overflow; 3778 if ((int32_t)Overflow < 0) { 3779 Overflow += ImmOffset; 3780 ImmOffset = 0; 3781 } 3782 3783 if (Overflow != 0) { 3784 if (!BaseReg) { 3785 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3786 } else { 3787 auto OverflowVal = B.buildConstant(S32, Overflow); 3788 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3789 } 3790 } 3791 3792 if (!BaseReg) 3793 BaseReg = B.buildConstant(S32, 0).getReg(0); 3794 3795 return std::make_pair(BaseReg, ImmOffset); 3796 } 3797 3798 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic. 3799 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, 3800 Register VOffset, Register SOffset, 3801 unsigned ImmOffset, Register VIndex, 3802 MachineRegisterInfo &MRI) const { 3803 Optional<ValueAndVReg> MaybeVOffsetVal = 3804 getIConstantVRegValWithLookThrough(VOffset, MRI); 3805 Optional<ValueAndVReg> MaybeSOffsetVal = 3806 getIConstantVRegValWithLookThrough(SOffset, MRI); 3807 Optional<ValueAndVReg> MaybeVIndexVal = 3808 getIConstantVRegValWithLookThrough(VIndex, MRI); 3809 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant, 3810 // update the MMO with that offset. The stride is unknown so we can only do 3811 // this if VIndex is constant 0. 3812 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && 3813 MaybeVIndexVal->Value == 0) { 3814 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + 3815 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; 3816 MMO->setOffset(TotalOffset); 3817 } else { 3818 // We don't have a constant combined offset to use in the MMO. Give up. 3819 MMO->setValue((Value *)nullptr); 3820 } 3821 } 3822 3823 /// Handle register layout difference for f16 images for some subtargets. 
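/// For example, on subtargets with unpacked D16 VMEM a <4 x s16> store value becomes a <4 x s32> vector with each 16-bit element any-extended into its own dword, while packed subtargets keep the value as-is apart from padding <3 x s16> out to <4 x s16> (plus a separate workaround path for the D16 image-store bug).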
3824 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3825 MachineRegisterInfo &MRI, 3826 Register Reg, 3827 bool ImageStore) const { 3828 const LLT S16 = LLT::scalar(16); 3829 const LLT S32 = LLT::scalar(32); 3830 LLT StoreVT = MRI.getType(Reg); 3831 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3832 3833 if (ST.hasUnpackedD16VMem()) { 3834 auto Unmerge = B.buildUnmerge(S16, Reg); 3835 3836 SmallVector<Register, 4> WideRegs; 3837 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3838 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3839 3840 int NumElts = StoreVT.getNumElements(); 3841 3842 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 3843 .getReg(0); 3844 } 3845 3846 if (ImageStore && ST.hasImageStoreD16Bug()) { 3847 if (StoreVT.getNumElements() == 2) { 3848 SmallVector<Register, 4> PackedRegs; 3849 Reg = B.buildBitcast(S32, Reg).getReg(0); 3850 PackedRegs.push_back(Reg); 3851 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3852 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 3853 .getReg(0); 3854 } 3855 3856 if (StoreVT.getNumElements() == 3) { 3857 SmallVector<Register, 4> PackedRegs; 3858 auto Unmerge = B.buildUnmerge(S16, Reg); 3859 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3860 PackedRegs.push_back(Unmerge.getReg(I)); 3861 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3862 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 3863 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 3864 } 3865 3866 if (StoreVT.getNumElements() == 4) { 3867 SmallVector<Register, 4> PackedRegs; 3868 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 3869 auto Unmerge = B.buildUnmerge(S32, Reg); 3870 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3871 PackedRegs.push_back(Unmerge.getReg(I)); 3872 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3873 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 3874 .getReg(0); 3875 } 3876 3877 llvm_unreachable("invalid data type"); 3878 } 3879 3880 if (StoreVT == LLT::fixed_vector(3, S16)) { 3881 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) 3882 .getReg(0); 3883 } 3884 return Reg; 3885 } 3886 3887 Register AMDGPULegalizerInfo::fixStoreSourceType( 3888 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3889 MachineRegisterInfo *MRI = B.getMRI(); 3890 LLT Ty = MRI->getType(VData); 3891 3892 const LLT S16 = LLT::scalar(16); 3893 3894 // Fixup illegal register types for i8 stores. 
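// Both s8 and s16 sources take this path: the value is any-extended to s32, and the byte/short buffer store only writes the low bits, so the extended bits are don't-care.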
3895 if (Ty == LLT::scalar(8) || Ty == S16) { 3896 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3897 return AnyExt; 3898 } 3899 3900 if (Ty.isVector()) { 3901 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3902 if (IsFormat) 3903 return handleD16VData(B, *MRI, VData); 3904 } 3905 } 3906 3907 return VData; 3908 } 3909 3910 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3911 MachineRegisterInfo &MRI, 3912 MachineIRBuilder &B, 3913 bool IsTyped, 3914 bool IsFormat) const { 3915 Register VData = MI.getOperand(1).getReg(); 3916 LLT Ty = MRI.getType(VData); 3917 LLT EltTy = Ty.getScalarType(); 3918 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3919 const LLT S32 = LLT::scalar(32); 3920 3921 VData = fixStoreSourceType(B, VData, IsFormat); 3922 Register RSrc = MI.getOperand(2).getReg(); 3923 3924 MachineMemOperand *MMO = *MI.memoperands_begin(); 3925 const int MemSize = MMO->getSize(); 3926 3927 unsigned ImmOffset; 3928 3929 // The typed intrinsics add an immediate after the registers. 3930 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3931 3932 // The struct intrinsic variants add one additional operand over raw. 3933 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3934 Register VIndex; 3935 int OpOffset = 0; 3936 if (HasVIndex) { 3937 VIndex = MI.getOperand(3).getReg(); 3938 OpOffset = 1; 3939 } else { 3940 VIndex = B.buildConstant(S32, 0).getReg(0); 3941 } 3942 3943 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3944 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3945 3946 unsigned Format = 0; 3947 if (IsTyped) { 3948 Format = MI.getOperand(5 + OpOffset).getImm(); 3949 ++OpOffset; 3950 } 3951 3952 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3953 3954 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 3955 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 3956 3957 unsigned Opc; 3958 if (IsTyped) { 3959 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3960 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3961 } else if (IsFormat) { 3962 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3963 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3964 } else { 3965 switch (MemSize) { 3966 case 1: 3967 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3968 break; 3969 case 2: 3970 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3971 break; 3972 default: 3973 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3974 break; 3975 } 3976 } 3977 3978 auto MIB = B.buildInstr(Opc) 3979 .addUse(VData) // vdata 3980 .addUse(RSrc) // rsrc 3981 .addUse(VIndex) // vindex 3982 .addUse(VOffset) // voffset 3983 .addUse(SOffset) // soffset 3984 .addImm(ImmOffset); // offset(imm) 3985 3986 if (IsTyped) 3987 MIB.addImm(Format); 3988 3989 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3990 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3991 .addMemOperand(MMO); 3992 3993 MI.eraseFromParent(); 3994 return true; 3995 } 3996 3997 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3998 MachineRegisterInfo &MRI, 3999 MachineIRBuilder &B, 4000 bool IsFormat, 4001 bool IsTyped) const { 4002 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 4003 MachineMemOperand *MMO = *MI.memoperands_begin(); 4004 const LLT MemTy = MMO->getMemoryType(); 4005 const LLT S32 = LLT::scalar(32); 4006 4007 Register Dst = MI.getOperand(0).getReg(); 4008 Register RSrc = MI.getOperand(2).getReg(); 4009 4010 // The typed intrinsics add an immediate after the registers. 
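// Illustrative operand counts, assuming the current intrinsic signatures (counting the def and the intrinsic ID): raw.buffer.load has 6 operands, struct.buffer.load 7, raw.tbuffer.load 7 and struct.tbuffer.load 8, so comparing against 7 (or 8 for the typed form) detects the struct variant with its extra vindex operand.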
4011 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 4012 4013 // The struct intrinsic variants add one additional operand over raw. 4014 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4015 Register VIndex; 4016 int OpOffset = 0; 4017 if (HasVIndex) { 4018 VIndex = MI.getOperand(3).getReg(); 4019 OpOffset = 1; 4020 } else { 4021 VIndex = B.buildConstant(S32, 0).getReg(0); 4022 } 4023 4024 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 4025 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 4026 4027 unsigned Format = 0; 4028 if (IsTyped) { 4029 Format = MI.getOperand(5 + OpOffset).getImm(); 4030 ++OpOffset; 4031 } 4032 4033 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 4034 unsigned ImmOffset; 4035 4036 LLT Ty = MRI.getType(Dst); 4037 LLT EltTy = Ty.getScalarType(); 4038 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 4039 const bool Unpacked = ST.hasUnpackedD16VMem(); 4040 4041 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4042 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 4043 4044 unsigned Opc; 4045 4046 if (IsTyped) { 4047 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 4048 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 4049 } else if (IsFormat) { 4050 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 4051 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 4052 } else { 4053 switch (MemTy.getSizeInBits()) { 4054 case 8: 4055 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 4056 break; 4057 case 16: 4058 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 4059 break; 4060 default: 4061 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 4062 break; 4063 } 4064 } 4065 4066 Register LoadDstReg; 4067 4068 bool IsExtLoad = 4069 (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector()); 4070 LLT UnpackedTy = Ty.changeElementSize(32); 4071 4072 if (IsExtLoad) 4073 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 4074 else if (Unpacked && IsD16 && Ty.isVector()) 4075 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 4076 else 4077 LoadDstReg = Dst; 4078 4079 auto MIB = B.buildInstr(Opc) 4080 .addDef(LoadDstReg) // vdata 4081 .addUse(RSrc) // rsrc 4082 .addUse(VIndex) // vindex 4083 .addUse(VOffset) // voffset 4084 .addUse(SOffset) // soffset 4085 .addImm(ImmOffset); // offset(imm) 4086 4087 if (IsTyped) 4088 MIB.addImm(Format); 4089 4090 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4091 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4092 .addMemOperand(MMO); 4093 4094 if (LoadDstReg != Dst) { 4095 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 4096 4097 // The result for an extending load was widened to 32 bits; truncate it back to the original type. 4098 if (IsExtLoad) 4099 B.buildTrunc(Dst, LoadDstReg); 4100 else { 4101 // Repack to original 16-bit vector result 4102 // FIXME: G_TRUNC should work, but legalization currently fails 4103 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 4104 SmallVector<Register, 4> Repack; 4105 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 4106 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 4107 B.buildMerge(Dst, Repack); 4108 } 4109 } 4110 4111 MI.eraseFromParent(); 4112 return true; 4113 } 4114 4115 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 4116 MachineIRBuilder &B, 4117 bool IsInc) const { 4118 unsigned Opc = IsInc ?
AMDGPU::G_AMDGPU_ATOMIC_INC : 4119 AMDGPU::G_AMDGPU_ATOMIC_DEC; 4120 B.buildInstr(Opc) 4121 .addDef(MI.getOperand(0).getReg()) 4122 .addUse(MI.getOperand(2).getReg()) 4123 .addUse(MI.getOperand(3).getReg()) 4124 .cloneMemRefs(MI); 4125 MI.eraseFromParent(); 4126 return true; 4127 } 4128 4129 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 4130 switch (IntrID) { 4131 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4132 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4133 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 4134 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4135 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4136 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 4137 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4138 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4139 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 4140 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4141 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4142 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 4143 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4144 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4145 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4146 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4147 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4148 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4149 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4150 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4151 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4152 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4153 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4154 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4155 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4156 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4157 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4158 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4159 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4160 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4161 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4162 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4163 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4164 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4165 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4166 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4167 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4168 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4169 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4170 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4171 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4172 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4173 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4174 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4175 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4176 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4177 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4178 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4179 default: 4180 llvm_unreachable("unhandled atomic opcode"); 4181 } 4182 } 4183 4184 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4185 MachineIRBuilder &B, 4186 Intrinsic::ID IID) const { 4187 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4188 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4189 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4190 4191 Register Dst; 4192 4193 int OpOffset = 0; 4194 if (HasReturn) { 4195 // A few FP atomics do not support return values. 
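// (For example, the buffer fadd atomics on gfx908 return void; in that case the intrinsic has no def and every source operand index below is shifted down by one via OpOffset.)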
4196 Dst = MI.getOperand(0).getReg(); 4197 } else { 4198 OpOffset = -1; 4199 } 4200 4201 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4202 Register CmpVal; 4203 4204 if (IsCmpSwap) { 4205 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4206 ++OpOffset; 4207 } 4208 4209 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4210 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4211 4212 // The struct intrinsic variants add one additional operand over raw. 4213 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4214 Register VIndex; 4215 if (HasVIndex) { 4216 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4217 ++OpOffset; 4218 } else { 4219 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4220 } 4221 4222 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4223 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4224 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4225 4226 MachineMemOperand *MMO = *MI.memoperands_begin(); 4227 4228 unsigned ImmOffset; 4229 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4230 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); 4231 4232 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4233 4234 if (HasReturn) 4235 MIB.addDef(Dst); 4236 4237 MIB.addUse(VData); // vdata 4238 4239 if (IsCmpSwap) 4240 MIB.addReg(CmpVal); 4241 4242 MIB.addUse(RSrc) // rsrc 4243 .addUse(VIndex) // vindex 4244 .addUse(VOffset) // voffset 4245 .addUse(SOffset) // soffset 4246 .addImm(ImmOffset) // offset(imm) 4247 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4248 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4249 .addMemOperand(MMO); 4250 4251 MI.eraseFromParent(); 4252 return true; 4253 } 4254 4255 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4256 /// vector with s16 typed elements. 4257 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4258 SmallVectorImpl<Register> &PackedAddrs, 4259 unsigned ArgOffset, 4260 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4261 bool IsA16, bool IsG16) { 4262 const LLT S16 = LLT::scalar(16); 4263 const LLT V2S16 = LLT::fixed_vector(2, 16); 4264 auto EndIdx = Intr->VAddrEnd; 4265 4266 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4267 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4268 if (!SrcOp.isReg()) 4269 continue; // _L to _LZ may have eliminated this. 4270 4271 Register AddrReg = SrcOp.getReg(); 4272 4273 if ((I < Intr->GradientStart) || 4274 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4275 (I >= Intr->CoordStart && !IsA16)) { 4276 if ((I < Intr->GradientStart) && IsA16 && 4277 (B.getMRI()->getType(AddrReg) == S16)) { 4278 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument"); 4279 // Special handling of bias when A16 is on. Bias is of type half but 4280 // occupies full 32-bit. 4281 PackedAddrs.push_back( 4282 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4283 .getReg(0)); 4284 } else { 4285 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) && 4286 "Bias needs to be converted to 16 bit in A16 mode"); 4287 // Handle any gradient or coordinate operands that should not be packed 4288 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4289 PackedAddrs.push_back(AddrReg); 4290 } 4291 } else { 4292 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4293 // derivatives dx/dh and dx/dv are packed with undef. 
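// For example, the four 2D derivatives (dx/dh, dy/dh, dx/dv, dy/dv) pack cleanly into two dwords, whereas each 1D derivative and any trailing odd coordinate is paired with an undef half.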
4294 if (((I + 1) >= EndIdx) || 4295 ((Intr->NumGradients / 2) % 2 == 1 && 4296 (I == static_cast<unsigned>(Intr->GradientStart + 4297 (Intr->NumGradients / 2) - 1) || 4298 I == static_cast<unsigned>(Intr->GradientStart + 4299 Intr->NumGradients - 1))) || 4300 // Check for _L to _LZ optimization 4301 !MI.getOperand(ArgOffset + I + 1).isReg()) { 4302 PackedAddrs.push_back( 4303 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4304 .getReg(0)); 4305 } else { 4306 PackedAddrs.push_back( 4307 B.buildBuildVector( 4308 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) 4309 .getReg(0)); 4310 ++I; 4311 } 4312 } 4313 } 4314 } 4315 4316 /// Convert from separate vaddr components to a single vector address register, 4317 /// and replace the remaining operands with $noreg. 4318 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 4319 int DimIdx, int NumVAddrs) { 4320 const LLT S32 = LLT::scalar(32); 4321 4322 SmallVector<Register, 8> AddrRegs; 4323 for (int I = 0; I != NumVAddrs; ++I) { 4324 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4325 if (SrcOp.isReg()) { 4326 AddrRegs.push_back(SrcOp.getReg()); 4327 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 4328 } 4329 } 4330 4331 int NumAddrRegs = AddrRegs.size(); 4332 if (NumAddrRegs != 1) { 4333 // Above 8 elements round up to next power of 2 (i.e. 16). 4334 if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) { 4335 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 4336 auto Undef = B.buildUndef(S32); 4337 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 4338 NumAddrRegs = RoundedNumRegs; 4339 } 4340 4341 auto VAddr = 4342 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs); 4343 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 4344 } 4345 4346 for (int I = 1; I != NumVAddrs; ++I) { 4347 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4348 if (SrcOp.isReg()) 4349 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 4350 } 4351 } 4352 4353 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 4354 /// 4355 /// Depending on the subtarget, loads/stores with 16-bit element data need to be 4356 /// rewritten to use the low half of 32-bit registers, or directly use a packed 4357 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 4358 /// registers. 4359 /// 4360 /// We don't want to directly select image instructions just yet, but also want 4361 /// to expose all register repacking to the legalizer/combiners. We also don't 4362 /// want a selected instruction entering RegBankSelect. In order to avoid 4363 /// defining a multitude of intermediate image instructions, directly hack on 4364 /// the intrinsic's arguments. In cases like a16 addresses, this requires 4365 /// padding now-unnecessary arguments with $noreg. 4366 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 4367 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 4368 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 4369 4370 const unsigned NumDefs = MI.getNumExplicitDefs(); 4371 const unsigned ArgOffset = NumDefs + 1; 4372 bool IsTFE = NumDefs == 2; 4373 // We are only processing the operands of d16 image operations on subtargets 4374 // that use the unpacked register layout, or need to repack the TFE result. 4375 4376 // TODO: Do we need to guard against already legalized intrinsics?
4377 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4378 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4379 4380 MachineRegisterInfo *MRI = B.getMRI(); 4381 const LLT S32 = LLT::scalar(32); 4382 const LLT S16 = LLT::scalar(16); 4383 const LLT V2S16 = LLT::fixed_vector(2, 16); 4384 4385 unsigned DMask = 0; 4386 Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg(); 4387 LLT Ty = MRI->getType(VData); 4388 4389 // Check for 16 bit addresses and pack if true. 4390 LLT GradTy = 4391 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4392 LLT AddrTy = 4393 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4394 const bool IsG16 = GradTy == S16; 4395 const bool IsA16 = AddrTy == S16; 4396 const bool IsD16 = Ty.getScalarType() == S16; 4397 4398 int DMaskLanes = 0; 4399 if (!BaseOpcode->Atomic) { 4400 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4401 if (BaseOpcode->Gather4) { 4402 DMaskLanes = 4; 4403 } else if (DMask != 0) { 4404 DMaskLanes = countPopulation(DMask); 4405 } else if (!IsTFE && !BaseOpcode->Store) { 4406 // If dmask is 0, this is a no-op load. This can be eliminated. 4407 B.buildUndef(MI.getOperand(0)); 4408 MI.eraseFromParent(); 4409 return true; 4410 } 4411 } 4412 4413 Observer.changingInstr(MI); 4414 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4415 4416 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16 4417 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE; 4418 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 4419 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4420 unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode; 4421 4422 // Track that we legalized this 4423 MI.setDesc(B.getTII().get(NewOpcode)); 4424 4425 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4426 // dmask to be at least 1 otherwise the instruction will fail 4427 if (IsTFE && DMask == 0) { 4428 DMask = 0x1; 4429 DMaskLanes = 1; 4430 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4431 } 4432 4433 if (BaseOpcode->Atomic) { 4434 Register VData0 = MI.getOperand(2).getReg(); 4435 LLT Ty = MRI->getType(VData0); 4436 4437 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4438 if (Ty.isVector()) 4439 return false; 4440 4441 if (BaseOpcode->AtomicX2) { 4442 Register VData1 = MI.getOperand(3).getReg(); 4443 // The two values are packed in one register. 4444 LLT PackedTy = LLT::fixed_vector(2, Ty); 4445 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4446 MI.getOperand(2).setReg(Concat.getReg(0)); 4447 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4448 } 4449 } 4450 4451 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4452 4453 // Rewrite the addressing register layout before doing anything else. 
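// In outline (a sketch of the cases handled below): targets without G16 require gradients and coordinates to agree on 16-bit vs 32-bit; when A16/G16 applies, 16-bit components are packed two per dword, and if NSA is unavailable the packed dwords are further concatenated into a single wide vector operand.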
4454 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4455 // 16 bit gradients are supported, but are tied to the A16 control 4456 // so both gradients and addresses must be 16 bit 4457 return false; 4458 } 4459 4460 if (IsA16 && !ST.hasA16()) { 4461 // A16 not supported 4462 return false; 4463 } 4464 4465 if (IsA16 || IsG16) { 4466 if (Intr->NumVAddrs > 1) { 4467 SmallVector<Register, 4> PackedRegs; 4468 4469 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4470 IsG16); 4471 4472 // See also below in the non-a16 branch 4473 const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 && 4474 PackedRegs.size() <= ST.getNSAMaxSize(); 4475 4476 if (!UseNSA && PackedRegs.size() > 1) { 4477 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 4478 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4479 PackedRegs[0] = Concat.getReg(0); 4480 PackedRegs.resize(1); 4481 } 4482 4483 const unsigned NumPacked = PackedRegs.size(); 4484 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4485 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4486 if (!SrcOp.isReg()) { 4487 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4488 continue; 4489 } 4490 4491 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4492 4493 if (I - Intr->VAddrStart < NumPacked) 4494 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4495 else 4496 SrcOp.setReg(AMDGPU::NoRegister); 4497 } 4498 } 4499 } else { 4500 // If the register allocator cannot place the address registers contiguously 4501 // without introducing moves, then using the non-sequential address encoding 4502 // is always preferable, since it saves VALU instructions and is usually a 4503 // wash in terms of code size or even better. 4504 // 4505 // However, we currently have no way of hinting to the register allocator 4506 // that MIMG addresses should be placed contiguously when it is possible to 4507 // do so, so force non-NSA for the common 2-address case as a heuristic. 4508 // 4509 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4510 // allocation when possible. 4511 const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 && 4512 CorrectedNumVAddrs <= ST.getNSAMaxSize(); 4513 4514 if (!UseNSA && Intr->NumVAddrs > 1) 4515 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4516 Intr->NumVAddrs); 4517 } 4518 4519 int Flags = 0; 4520 if (IsA16) 4521 Flags |= 1; 4522 if (IsG16) 4523 Flags |= 2; 4524 MI.addOperand(MachineOperand::CreateImm(Flags)); 4525 4526 if (BaseOpcode->Store) { // No TFE for stores? 4527 // TODO: Handle dmask trim 4528 if (!Ty.isVector() || !IsD16) 4529 return true; 4530 4531 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 4532 if (RepackedReg != VData) { 4533 MI.getOperand(1).setReg(RepackedReg); 4534 } 4535 4536 return true; 4537 } 4538 4539 Register DstReg = MI.getOperand(0).getReg(); 4540 const LLT EltTy = Ty.getScalarType(); 4541 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 4542 4543 // Confirm that the return type is large enough for the dmask specified 4544 if (NumElts < DMaskLanes) 4545 return false; 4546 4547 if (NumElts > 4 || DMaskLanes > 4) 4548 return false; 4549 4550 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4551 const LLT AdjustedTy = 4552 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); 4553 4554 // The raw dword aligned data component of the load. 
The only legal cases 4555 // where this matters should be when using the packed D16 format, for 4556 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>. 4557 LLT RoundedTy; 4558 4559 // S32 vector to cover all data, plus TFE result element. 4560 LLT TFETy; 4561 4562 // Register type to use for each loaded component. Will be S32 or V2S16. 4563 LLT RegTy; 4564 4565 if (IsD16 && ST.hasUnpackedD16VMem()) { 4566 RoundedTy = 4567 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32); 4568 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); 4569 RegTy = S32; 4570 } else { 4571 unsigned EltSize = EltTy.getSizeInBits(); 4572 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4573 unsigned RoundedSize = 32 * RoundedElts; 4574 RoundedTy = LLT::scalarOrVector( 4575 ElementCount::getFixed(RoundedSize / EltSize), EltSize); 4576 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); 4577 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4578 } 4579 4580 // The return type does not need adjustment. 4581 // TODO: Should we change s16 case to s32 or <2 x s16>? 4582 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4583 return true; 4584 4585 Register Dst1Reg; 4586 4587 // Insert after the instruction. 4588 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4589 4590 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4591 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4592 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4593 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4594 4595 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4596 4597 MI.getOperand(0).setReg(NewResultReg); 4598 4599 // In the IR, TFE is supposed to be used with a 2 element struct return 4600 // type. The instruction really returns these two values in one contiguous 4601 // register, with one additional dword beyond the loaded data. Rewrite the 4602 // return type to use a single register result. 4603 4604 if (IsTFE) { 4605 Dst1Reg = MI.getOperand(1).getReg(); 4606 if (MRI->getType(Dst1Reg) != S32) 4607 return false; 4608 4609 // TODO: Make sure the TFE operand bit is set. 4610 MI.RemoveOperand(1); 4611 4612 // Handle the easy case that requires no repack instructions. 4613 if (Ty == S32) { 4614 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4615 return true; 4616 } 4617 } 4618 4619 // Now figure out how to copy the new result register back into the old 4620 // result. 4621 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4622 4623 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4624 4625 if (ResultNumRegs == 1) { 4626 assert(!IsTFE); 4627 ResultRegs[0] = NewResultReg; 4628 } else { 4629 // We have to repack into a new vector of some kind. 4630 for (int I = 0; I != NumDataRegs; ++I) 4631 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4632 B.buildUnmerge(ResultRegs, NewResultReg); 4633 4634 // Drop the final TFE element to get the data part. The TFE result is 4635 // directly written to the right place already. 4636 if (IsTFE) 4637 ResultRegs.resize(NumDataRegs); 4638 } 4639 4640 // For an s16 scalar result, we form an s32 result with a truncate regardless 4641 // of packed vs. unpacked. 4642 if (IsD16 && !Ty.isVector()) { 4643 B.buildTrunc(DstReg, ResultRegs[0]); 4644 return true; 4645 } 4646 4647 // Avoid a build/concat_vector of 1 entry.
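// e.g. a packed <2 x s16> result that was loaded into a single s32 register only needs a bitcast back to the expected type.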
4648 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4649 B.buildBitcast(DstReg, ResultRegs[0]); 4650 return true; 4651 } 4652 4653 assert(Ty.isVector()); 4654 4655 if (IsD16) { 4656 // For packed D16 results with TFE enabled, all the data components are 4657 // S32. Cast back to the expected type. 4658 // 4659 // TODO: We don't really need to use load s32 elements. We would only need one 4660 // cast for the TFE result if a multiple of v2s16 was used. 4661 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4662 for (Register &Reg : ResultRegs) 4663 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4664 } else if (ST.hasUnpackedD16VMem()) { 4665 for (Register &Reg : ResultRegs) 4666 Reg = B.buildTrunc(S16, Reg).getReg(0); 4667 } 4668 } 4669 4670 auto padWithUndef = [&](LLT Ty, int NumElts) { 4671 if (NumElts == 0) 4672 return; 4673 Register Undef = B.buildUndef(Ty).getReg(0); 4674 for (int I = 0; I != NumElts; ++I) 4675 ResultRegs.push_back(Undef); 4676 }; 4677 4678 // Pad out any elements eliminated due to the dmask. 4679 LLT ResTy = MRI->getType(ResultRegs[0]); 4680 if (!ResTy.isVector()) { 4681 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4682 B.buildBuildVector(DstReg, ResultRegs); 4683 return true; 4684 } 4685 4686 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4687 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4688 4689 // Deal with the one annoying legal case. 4690 const LLT V3S16 = LLT::fixed_vector(3, 16); 4691 if (Ty == V3S16) { 4692 if (IsTFE) { 4693 if (ResultRegs.size() == 1) { 4694 NewResultReg = ResultRegs[0]; 4695 } else if (ResultRegs.size() == 2) { 4696 LLT V4S16 = LLT::fixed_vector(4, 16); 4697 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); 4698 } else { 4699 return false; 4700 } 4701 } 4702 4703 if (MRI->getType(DstReg).getNumElements() < 4704 MRI->getType(NewResultReg).getNumElements()) { 4705 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg); 4706 } else { 4707 B.buildPadVectorWithUndefElements(DstReg, NewResultReg); 4708 } 4709 return true; 4710 } 4711 4712 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4713 B.buildConcatVectors(DstReg, ResultRegs); 4714 return true; 4715 } 4716 4717 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4718 LegalizerHelper &Helper, MachineInstr &MI) const { 4719 MachineIRBuilder &B = Helper.MIRBuilder; 4720 GISelChangeObserver &Observer = Helper.Observer; 4721 4722 Register Dst = MI.getOperand(0).getReg(); 4723 LLT Ty = B.getMRI()->getType(Dst); 4724 unsigned Size = Ty.getSizeInBits(); 4725 MachineFunction &MF = B.getMF(); 4726 4727 Observer.changingInstr(MI); 4728 4729 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 4730 Ty = getBitcastRegisterType(Ty); 4731 Helper.bitcastDst(MI, Ty, 0); 4732 Dst = MI.getOperand(0).getReg(); 4733 B.setInsertPt(B.getMBB(), MI); 4734 } 4735 4736 // FIXME: We don't really need this intermediate instruction. The intrinsic 4737 // should be fixed to have a memory operand. Since it's readnone, we're not 4738 // allowed to add one. 4739 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4740 MI.RemoveOperand(1); // Remove intrinsic ID 4741 4742 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4743 // TODO: Should this use datalayout alignment? 
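// Worked example (illustrative): a 96-bit <3 x s32> result gets a 12-byte MMO here and, since 96 is not a power of two, is widened to <4 x s32> below; non-power-of-two scalars are widened to the next power-of-two scalar instead.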
4744 const unsigned MemSize = (Size + 7) / 8; 4745 const Align MemAlign(4); 4746 MachineMemOperand *MMO = MF.getMachineMemOperand( 4747 MachinePointerInfo(), 4748 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4749 MachineMemOperand::MOInvariant, 4750 MemSize, MemAlign); 4751 MI.addMemOperand(MF, MMO); 4752 4753 // There are no 96-bit result scalar loads, but widening to 128-bit should 4754 // always be legal. We may need to restore this to a 96-bit result if it turns 4755 // out this needs to be converted to a vector load during RegBankSelect. 4756 if (!isPowerOf2_32(Size)) { 4757 if (Ty.isVector()) 4758 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4759 else 4760 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4761 } 4762 4763 Observer.changedInstr(MI); 4764 return true; 4765 } 4766 4767 // TODO: Move to selection 4768 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4769 MachineRegisterInfo &MRI, 4770 MachineIRBuilder &B) const { 4771 if (!ST.isTrapHandlerEnabled() || 4772 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4773 return legalizeTrapEndpgm(MI, MRI, B); 4774 4775 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4776 switch (*HsaAbiVer) { 4777 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4778 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4779 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4780 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4781 case ELF::ELFABIVERSION_AMDGPU_HSA_V5: 4782 return ST.supportsGetDoorbellID() ? 4783 legalizeTrapHsa(MI, MRI, B) : 4784 legalizeTrapHsaQueuePtr(MI, MRI, B); 4785 } 4786 } 4787 4788 llvm_unreachable("Unknown trap handler"); 4789 } 4790 4791 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4792 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4793 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4794 MI.eraseFromParent(); 4795 return true; 4796 } 4797 4798 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4799 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4800 // Pass queue pointer to trap handler as input, and insert trap instruction 4801 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4802 Register LiveIn = 4803 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4804 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4805 return false; 4806 4807 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4808 B.buildCopy(SGPR01, LiveIn); 4809 B.buildInstr(AMDGPU::S_TRAP) 4810 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4811 .addReg(SGPR01, RegState::Implicit); 4812 4813 MI.eraseFromParent(); 4814 return true; 4815 } 4816 4817 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4818 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4819 B.buildInstr(AMDGPU::S_TRAP) 4820 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4821 MI.eraseFromParent(); 4822 return true; 4823 } 4824 4825 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4826 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4827 // Is non-HSA path or trap-handler disabled? 
Then, report a warning 4828 // accordingly 4829 if (!ST.isTrapHandlerEnabled() || 4830 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4831 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4832 "debugtrap handler not supported", 4833 MI.getDebugLoc(), DS_Warning); 4834 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4835 Ctx.diagnose(NoTrap); 4836 } else { 4837 // Insert debug-trap instruction 4838 B.buildInstr(AMDGPU::S_TRAP) 4839 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4840 } 4841 4842 MI.eraseFromParent(); 4843 return true; 4844 } 4845 4846 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4847 MachineIRBuilder &B) const { 4848 MachineRegisterInfo &MRI = *B.getMRI(); 4849 const LLT S16 = LLT::scalar(16); 4850 const LLT S32 = LLT::scalar(32); 4851 4852 Register DstReg = MI.getOperand(0).getReg(); 4853 Register NodePtr = MI.getOperand(2).getReg(); 4854 Register RayExtent = MI.getOperand(3).getReg(); 4855 Register RayOrigin = MI.getOperand(4).getReg(); 4856 Register RayDir = MI.getOperand(5).getReg(); 4857 Register RayInvDir = MI.getOperand(6).getReg(); 4858 Register TDescr = MI.getOperand(7).getReg(); 4859 4860 if (!ST.hasGFX10_AEncoding()) { 4861 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 4862 "intrinsic not supported on subtarget", 4863 MI.getDebugLoc()); 4864 B.getMF().getFunction().getContext().diagnose(BadIntrin); 4865 return false; 4866 } 4867 4868 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4869 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4870 const unsigned NumVDataDwords = 4; 4871 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); 4872 const bool UseNSA = 4873 ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize(); 4874 const unsigned BaseOpcodes[2][2] = { 4875 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, 4876 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, 4877 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; 4878 int Opcode; 4879 if (UseNSA) { 4880 Opcode = 4881 AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA, 4882 NumVDataDwords, NumVAddrDwords); 4883 } else { 4884 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 4885 AMDGPU::MIMGEncGfx10Default, NumVDataDwords, 4886 PowerOf2Ceil(NumVAddrDwords)); 4887 } 4888 assert(Opcode != -1); 4889 4890 SmallVector<Register, 12> Ops; 4891 if (Is64) { 4892 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4893 Ops.push_back(Unmerge.getReg(0)); 4894 Ops.push_back(Unmerge.getReg(1)); 4895 } else { 4896 Ops.push_back(NodePtr); 4897 } 4898 Ops.push_back(RayExtent); 4899 4900 auto packLanes = [&Ops, &S32, &B](Register Src) { 4901 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 4902 Ops.push_back(Unmerge.getReg(0)); 4903 Ops.push_back(Unmerge.getReg(1)); 4904 Ops.push_back(Unmerge.getReg(2)); 4905 }; 4906 4907 packLanes(RayOrigin); 4908 if (IsA16) { 4909 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 4910 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 4911 Register R1 = MRI.createGenericVirtualRegister(S32); 4912 Register R2 = MRI.createGenericVirtualRegister(S32); 4913 Register R3 = MRI.createGenericVirtualRegister(S32); 4914 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4915 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4916 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4917 Ops.push_back(R1); 4918 
  if (IsA16) {
    auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
    auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
    Register R1 = MRI.createGenericVirtualRegister(S32);
    Register R2 = MRI.createGenericVirtualRegister(S32);
    Register R3 = MRI.createGenericVirtualRegister(S32);
    B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
    B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
    B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
    Ops.push_back(R1);
    Ops.push_back(R2);
    Ops.push_back(R3);
  } else {
    packLanes(RayDir);
    packLanes(RayInvDir);
  }

  if (!UseNSA) {
    // Build a single vector containing all the operands prepared so far.
    LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
    Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
    Ops.clear();
    Ops.push_back(MergedOps);
  }

  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
                 .addDef(DstReg)
                 .addImm(Opcode);

  for (Register R : Ops) {
    MIB.addUse(R);
  }

  MIB.addUse(TDescr)
      .addImm(IsA16 ? 1 : 0)
      .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}
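
// Fold an intrinsic whose value is known at compile time (e.g. a workitem id
// in a dimension whose maximum id is 0) into a G_CONSTANT and erase the
// original instruction.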
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
                                int64_t C) {
  B.buildConstant(MI.getOperand(0).getReg(), C);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI =
          static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this.
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI =
          static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
          .addUse(Reg)
          .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
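  // Workitem id intrinsics: if the maximum workitem id in the queried
  // dimension is statically known to be 0, fold directly to the constant 0;
  // otherwise lower to a read of the preloaded argument.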
  case Intrinsic::amdgcn_workitem_id_x:
    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0)
      return replaceWithConstant(B, MI, 0);
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0)
      return replaceWithConstant(B, MI, 0);

    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0)
      return replaceWithConstant(B, MI, 0);

    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
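  // Buffer FP atomics: without gfx90a instructions only the no-return form
  // exists, so if the result is actually used, diagnose it and materialize an
  // undef value instead of failing legalization outright.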
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
    Register DstReg = MI.getOperand(0).getReg();
    if (!MRI.use_empty(DstReg) && !ST.hasGFX90AInsts()) {
      Function &F = B.getMF().getFunction();
      DiagnosticInfoUnsupported NoFpRet(
          F, "return versions of fp atomics not supported", B.getDebugLoc(),
          DS_Error);
      F.getContext().diagnose(NoFpRet);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    return legalizeBufferAtomic(MI, B, IntrID);
  }
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}