//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx,
                          LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(
        TypeIdx,
        LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(
        TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy =
        Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
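    // For example, a 256-bit flat load exceeds this 128-bit cap, so the
    // load/store splitting rules in the constructor below will break it into
    // 128-bit pieces.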
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
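  // (Illustrative examples of the above rules: a <4 x s8> access is bitcast
  // to s32, and a <6 x s16> load/store is bitcast to <3 x s32>; see
  // getBitcastRegisterType.)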
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            uint64_t AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::fixed_vector(2, 8);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V4S16 = LLT::fixed_vector(4, 16);

  const LLT V2S32 = LLT::fixed_vector(2, 32);
  const LLT V3S32 = LLT::fixed_vector(3, 32);
  const LLT V4S32 = LLT::fixed_vector(4, 32);
  const LLT V5S32 = LLT::fixed_vector(5, 32);
  const LLT V6S32 = LLT::fixed_vector(6, 32);
  const LLT V7S32 = LLT::fixed_vector(7, 32);
  const LLT V8S32 = LLT::fixed_vector(8, 32);
  const LLT V9S32 = LLT::fixed_vector(9, 32);
  const LLT V10S32 = LLT::fixed_vector(10, 32);
  const LLT V11S32 = LLT::fixed_vector(11, 32);
  const LLT V12S32 = LLT::fixed_vector(12, 32);
  const LLT V13S32 = LLT::fixed_vector(13, 32);
  const LLT V14S32 = LLT::fixed_vector(14, 32);
  const LLT V15S32 = LLT::fixed_vector(15, 32);
  const LLT V16S32 = LLT::fixed_vector(16, 32);
  const LLT V32S32 =
      LLT::fixed_vector(32, 32);

  const LLT V2S64 = LLT::fixed_vector(2, 64);
  const LLT V3S64 = LLT::fixed_vector(3, 64);
  const LLT V4S64 = LLT::fixed_vector(4, 64);
  const LLT V5S64 = LLT::fixed_vector(5, 64);
  const LLT V6S64 = LLT::fixed_vector(6, 64);
  const LLT V7S64 = LLT::fixed_vector(7, 64);
  const LLT V8S64 = LLT::fixed_vector(8, 64);
  const LLT V16S64 = LLT::fixed_vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  // s1 for VCC branches, s32 for SCC branches.
  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
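    // (i.e. packed 16-bit math, carry-less 32-bit add/sub and the integer
    // clamp bit, so s16/v2s16 arithmetic and the saturating forms below can
    // be marked legal directly.)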
    getActionDefinitionsBuilder({G_ADD, G_SUB})
      .legalFor({S32, S16, V2S16})
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32);

    getActionDefinitionsBuilder(G_MUL)
      .legalFor({S32, S16, V2S16})
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .custom();
    assert(ST.hasMad64_32());

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB})
      .legalFor({S32, S16})
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);

    getActionDefinitionsBuilder(G_MUL)
      .legalFor({S32, S16})
      .scalarize(0)
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .custom();
    assert(ST.hasMad64_32());

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB})
      .legalFor({S32})
      .widenScalarToNextMultipleOf(0, 32)
      .clampScalar(0, S32, S32)
      .scalarize(0);

    auto &Mul = getActionDefinitionsBuilder(G_MUL)
      .legalFor({S32})
      .scalarize(0)
      .minScalar(0, S32)
      .widenScalarToNextMultipleOf(0, 32);

    if (ST.hasMad64_32())
      Mul.custom();
    else
      Mul.maxScalar(0, S32);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(
    {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .maxScalar(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
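  // (For instance, a uniform s64 G_AND can be selected as a single scalar
  // s_and_b64, while the divergent case is split into two 32-bit VALU ops
  // after register bank selection.)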
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});

  auto &FPOpActions = getActionDefinitionsBuilder(
    {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElementsStrict(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ?
                   S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElementsStrict(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  auto &FSubActions = getActionDefinitionsBuilder(G_FSUB);
  if (ST.has16BitInsts()) {
    FSubActions
      // Use actual fsub instruction
      .legalFor({S32, S16})
      // Must use fadd + fneg
      .lowerFor({S64, V2S16});
  } else {
    FSubActions
      // Use actual fsub instruction
      .legalFor({S32})
      // Must use fadd + fneg
      .lowerFor({S64, S16, V2S16});
  }

  FSubActions
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In
    // unresolvable situations (like an invalid implicit use), we don't want
    // to infinite loop in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S32, S64}, {S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S32}, {S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .widenScalarToNextPow2(0, 32)
       .scalarize(0)
       .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
    .customFor({S16, S32})
    .scalarize(0)
    .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the
    // output, so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
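  // Note: G_FEXP2/G_FLOG2 correspond to the native base-2 hardware
  // instructions; the general exp/log/pow forms are custom-lowered in terms
  // of them (e.g. exp(x) as exp2(x * log2(e))).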
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .widenScalarToNextPow2(1, 32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .custom();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElementsStrict(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
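    // (The generic isLoadStoreLegal() predicate added below covers these
    // entries as well; the explicit list is presumably kept as a legality
    // fast path.)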
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {V2S32, GlobalPtr, V2S32, GlobalAlign32},
                                      {V4S32, GlobalPtr, V4S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {V2S64, GlobalPtr, V2S64, GlobalAlign32},
                                      {V2S16, GlobalPtr, V2S16, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},

                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {V2S32, LocalPtr, V2S32, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {V2S16, LocalPtr, S32, 32},

                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {V2S16, PrivatePtr, S32, 32},

                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32},
                                      {V4S32, ConstantPtr, V4S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also
    // cover inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].MemoryTy);
      }, bitcastToRegisterType(0));

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              uint64_t Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
              if (MemSize > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                      0, LLT::scalarOrVector(
                             ElementCount::getFixed(MaxSize / EltSize), EltTy));
                }

                unsigned NumPieces = MemSize / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(
                    0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              if (DstTy.getSizeInBits() > MemSize)
                return std::make_pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(
                    0, LLT::scalarOrVector(
                           ElementCount::getFixed(FloorSize / EltSize), EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .minScalar(0, S32)
        .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
        .lower();
  }

  // FIXME: Unaligned accesses not lowered.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
    .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                               {S32, GlobalPtr, S16, 2 * 8},
                               {S32, LocalPtr, S8, 8},
                               {S32, LocalPtr, S16, 16},
                               {S32, PrivatePtr, S8, 8},
                               {S32, PrivatePtr, S16, 16},
                               {S32, ConstantPtr, S8, 8},
                               {S32, ConstantPtr, S16, 2 * 8}})
    .legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query);
      });

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
  }

  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
  // 64-bits.
  //
  // TODO: Should generalize bitcast action into coerce, which will also cover
  // inserting addrspacecasts.
  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAdd()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
    if (ST.hasGFX940Insts())
      Atomic.legalFor({{V2S16, LocalPtr}});
  }
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});

  if (ST.hasGFX90AInsts()) {
    // These are legal with some caveats, and should have undergone expansion
    // in the IR in most situations
    // TODO: Move atomic expansion into legalizer
    // TODO: Also supports <2 x f16>
    Atomic.legalFor({
        {S32, GlobalPtr},
        {S64, GlobalPtr},
        {S64, FlatPtr}
      });
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
  // demarshalling.
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
                               LocalPtr, FlatPtr, PrivatePtr,
                               LLT::fixed_vector(2, LocalPtr),
                               LLT::fixed_vector(2, PrivatePtr)},
                              {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
        .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= MaxRegisterSize &&
                 IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx),
            scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
            VecTypeIdx,
            LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      .lowerIf([=](const LegalityQuery &Query) {
          // Sub-vector (or single element) insert and extract.
          // TODO: verify immediate offset here since lower only works with
          // whole elements.
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.isVector();
        })
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
               Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder({G_ROTR, G_ROTL})
    .scalarize(0)
    .lower();

  // TODO: Only try to form v2s16 with legal packed instructions.
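  // Note: 32-bit funnel shifts can be selected directly (v_alignbit_b32 for
  // G_FSHR); other widths are scalarized and expanded below.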
1671 getActionDefinitionsBuilder(G_FSHR) 1672 .legalFor({{S32, S32}}) 1673 .lowerFor({{V2S16, V2S16}}) 1674 .clampMaxNumElementsStrict(0, S16, 2) 1675 .scalarize(0) 1676 .lower(); 1677 1678 if (ST.hasVOP3PInsts()) { 1679 getActionDefinitionsBuilder(G_FSHL) 1680 .lowerFor({{V2S16, V2S16}}) 1681 .clampMaxNumElementsStrict(0, S16, 2) 1682 .scalarize(0) 1683 .lower(); 1684 } else { 1685 getActionDefinitionsBuilder(G_FSHL) 1686 .scalarize(0) 1687 .lower(); 1688 } 1689 1690 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1691 .legalFor({S64}); 1692 1693 getActionDefinitionsBuilder(G_FENCE) 1694 .alwaysLegal(); 1695 1696 getActionDefinitionsBuilder({G_SMULO, G_UMULO}) 1697 .scalarize(0) 1698 .minScalar(0, S32) 1699 .lower(); 1700 1701 getActionDefinitionsBuilder({G_SBFX, G_UBFX}) 1702 .legalFor({{S32, S32}, {S64, S32}}) 1703 .clampScalar(1, S32, S32) 1704 .clampScalar(0, S32, S64) 1705 .widenScalarToNextPow2(0) 1706 .scalarize(0); 1707 1708 getActionDefinitionsBuilder({ 1709 // TODO: Verify V_BFI_B32 is generated from expanded bit ops 1710 G_FCOPYSIGN, 1711 1712 G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1713 G_ATOMICRMW_NAND, 1714 G_ATOMICRMW_FSUB, 1715 G_READ_REGISTER, 1716 G_WRITE_REGISTER, 1717 1718 G_SADDO, G_SSUBO, 1719 1720 // TODO: Implement 1721 G_FMINIMUM, G_FMAXIMUM}).lower(); 1722 1723 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET}) 1724 .lower(); 1725 1726 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 1727 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1728 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1729 .unsupported(); 1730 1731 getLegacyLegalizerInfo().computeTables(); 1732 verify(*ST.getInstrInfo()); 1733 } 1734 1735 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 1736 MachineInstr &MI) const { 1737 MachineIRBuilder &B = Helper.MIRBuilder; 1738 MachineRegisterInfo &MRI = *B.getMRI(); 1739 1740 switch (MI.getOpcode()) { 1741 case TargetOpcode::G_ADDRSPACE_CAST: 1742 return legalizeAddrSpaceCast(MI, MRI, B); 1743 case TargetOpcode::G_FRINT: 1744 return legalizeFrint(MI, MRI, B); 1745 case TargetOpcode::G_FCEIL: 1746 return legalizeFceil(MI, MRI, B); 1747 case TargetOpcode::G_FREM: 1748 return legalizeFrem(MI, MRI, B); 1749 case TargetOpcode::G_INTRINSIC_TRUNC: 1750 return legalizeIntrinsicTrunc(MI, MRI, B); 1751 case TargetOpcode::G_SITOFP: 1752 return legalizeITOFP(MI, MRI, B, true); 1753 case TargetOpcode::G_UITOFP: 1754 return legalizeITOFP(MI, MRI, B, false); 1755 case TargetOpcode::G_FPTOSI: 1756 return legalizeFPTOI(MI, MRI, B, true); 1757 case TargetOpcode::G_FPTOUI: 1758 return legalizeFPTOI(MI, MRI, B, false); 1759 case TargetOpcode::G_FMINNUM: 1760 case TargetOpcode::G_FMAXNUM: 1761 case TargetOpcode::G_FMINNUM_IEEE: 1762 case TargetOpcode::G_FMAXNUM_IEEE: 1763 return legalizeMinNumMaxNum(Helper, MI); 1764 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1765 return legalizeExtractVectorElt(MI, MRI, B); 1766 case TargetOpcode::G_INSERT_VECTOR_ELT: 1767 return legalizeInsertVectorElt(MI, MRI, B); 1768 case TargetOpcode::G_SHUFFLE_VECTOR: 1769 return legalizeShuffleVector(MI, MRI, B); 1770 case TargetOpcode::G_FSIN: 1771 case TargetOpcode::G_FCOS: 1772 return legalizeSinCos(MI, MRI, B); 1773 case TargetOpcode::G_GLOBAL_VALUE: 1774 return legalizeGlobalValue(MI, MRI, B); 1775 case TargetOpcode::G_LOAD: 1776 case TargetOpcode::G_SEXTLOAD: 1777 case TargetOpcode::G_ZEXTLOAD: 1778 return legalizeLoad(Helper, MI); 1779 case TargetOpcode::G_FMAD: 1780 return legalizeFMad(MI, MRI, B); 1781 case TargetOpcode::G_FDIV: 1782 return legalizeFDIV(MI, 
MRI, B); 1783 case TargetOpcode::G_UDIV: 1784 case TargetOpcode::G_UREM: 1785 case TargetOpcode::G_UDIVREM: 1786 return legalizeUnsignedDIV_REM(MI, MRI, B); 1787 case TargetOpcode::G_SDIV: 1788 case TargetOpcode::G_SREM: 1789 case TargetOpcode::G_SDIVREM: 1790 return legalizeSignedDIV_REM(MI, MRI, B); 1791 case TargetOpcode::G_ATOMIC_CMPXCHG: 1792 return legalizeAtomicCmpXChg(MI, MRI, B); 1793 case TargetOpcode::G_FLOG: 1794 return legalizeFlog(MI, B, numbers::ln2f); 1795 case TargetOpcode::G_FLOG10: 1796 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1797 case TargetOpcode::G_FEXP: 1798 return legalizeFExp(MI, B); 1799 case TargetOpcode::G_FPOW: 1800 return legalizeFPow(MI, B); 1801 case TargetOpcode::G_FFLOOR: 1802 return legalizeFFloor(MI, MRI, B); 1803 case TargetOpcode::G_BUILD_VECTOR: 1804 return legalizeBuildVector(MI, MRI, B); 1805 case TargetOpcode::G_MUL: 1806 return legalizeMul(Helper, MI); 1807 case TargetOpcode::G_CTLZ: 1808 case TargetOpcode::G_CTTZ: 1809 return legalizeCTLZ_CTTZ(MI, MRI, B); 1810 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND: 1811 return legalizeFPTruncRound(MI, B); 1812 default: 1813 return false; 1814 } 1815 1816 llvm_unreachable("expected switch to return"); 1817 } 1818 1819 Register AMDGPULegalizerInfo::getSegmentAperture( 1820 unsigned AS, 1821 MachineRegisterInfo &MRI, 1822 MachineIRBuilder &B) const { 1823 MachineFunction &MF = B.getMF(); 1824 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1825 const LLT S32 = LLT::scalar(32); 1826 1827 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1828 1829 if (ST.hasApertureRegs()) { 1830 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1831 // getreg. 1832 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1833 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1834 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1835 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1836 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1837 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1838 unsigned Encoding = 1839 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1840 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1841 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1842 1843 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1844 1845 B.buildInstr(AMDGPU::S_GETREG_B32) 1846 .addDef(GetReg) 1847 .addImm(Encoding); 1848 MRI.setType(GetReg, S32); 1849 1850 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1851 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1852 } 1853 1854 // TODO: can we be smarter about machine pointer info? 1855 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1856 Register LoadAddr = MRI.createGenericVirtualRegister( 1857 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1858 // For code object version 5, private_base and shared_base are passed through 1859 // implicit kernargs. 1860 if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { 1861 AMDGPUTargetLowering::ImplicitParameter Param = 1862 AS == AMDGPUAS::LOCAL_ADDRESS ? 
AMDGPUTargetLowering::SHARED_BASE 1863 : AMDGPUTargetLowering::PRIVATE_BASE; 1864 uint64_t Offset = 1865 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); 1866 1867 Register KernargPtrReg = MRI.createGenericVirtualRegister( 1868 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1869 1870 if (!loadInputValue(KernargPtrReg, B, 1871 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 1872 return Register(); 1873 1874 MachineMemOperand *MMO = MF.getMachineMemOperand( 1875 PtrInfo, 1876 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1877 MachineMemOperand::MOInvariant, 1878 LLT::scalar(32), commonAlignment(Align(64), Offset)); 1879 1880 // Pointer address 1881 B.buildPtrAdd(LoadAddr, KernargPtrReg, 1882 B.buildConstant(LLT::scalar(64), Offset).getReg(0)); 1883 // Load address 1884 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1885 } 1886 1887 Register QueuePtr = MRI.createGenericVirtualRegister( 1888 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1889 1890 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1891 return Register(); 1892 1893 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1894 // private_segment_aperture_base_hi. 1895 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1896 1897 MachineMemOperand *MMO = MF.getMachineMemOperand( 1898 PtrInfo, 1899 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1900 MachineMemOperand::MOInvariant, 1901 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 1902 1903 B.buildPtrAdd(LoadAddr, QueuePtr, 1904 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0)); 1905 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1906 } 1907 1908 /// Return true if the value is a known valid address, such that a null check is 1909 /// not necessary. 1910 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, 1911 const AMDGPUTargetMachine &TM, unsigned AddrSpace) { 1912 MachineInstr *Def = MRI.getVRegDef(Val); 1913 switch (Def->getOpcode()) { 1914 case AMDGPU::G_FRAME_INDEX: 1915 case AMDGPU::G_GLOBAL_VALUE: 1916 case AMDGPU::G_BLOCK_ADDR: 1917 return true; 1918 case AMDGPU::G_CONSTANT: { 1919 const ConstantInt *CI = Def->getOperand(1).getCImm(); 1920 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace); 1921 } 1922 default: 1923 return false; 1924 } 1925 1926 return false; 1927 } 1928 1929 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1930 MachineInstr &MI, MachineRegisterInfo &MRI, 1931 MachineIRBuilder &B) const { 1932 MachineFunction &MF = B.getMF(); 1933 1934 const LLT S32 = LLT::scalar(32); 1935 Register Dst = MI.getOperand(0).getReg(); 1936 Register Src = MI.getOperand(1).getReg(); 1937 1938 LLT DstTy = MRI.getType(Dst); 1939 LLT SrcTy = MRI.getType(Src); 1940 unsigned DestAS = DstTy.getAddressSpace(); 1941 unsigned SrcAS = SrcTy.getAddressSpace(); 1942 1943 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1944 // vector element. 1945 assert(!DstTy.isVector()); 1946 1947 const AMDGPUTargetMachine &TM 1948 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1949 1950 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1951 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1952 return true; 1953 } 1954 1955 if (SrcAS == AMDGPUAS::FLAT_ADDRESS && 1956 (DestAS == AMDGPUAS::LOCAL_ADDRESS || 1957 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) { 1958 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 1959 // Extract low 32-bits of the pointer. 
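      // (For a flat pointer already known to point into LDS or private memory,
      // the low 32 bits are exactly the segment offset, so no null check or
      // aperture lookup is needed.)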
1960 B.buildExtract(Dst, Src, 0); 1961 MI.eraseFromParent(); 1962 return true; 1963 } 1964 1965 unsigned NullVal = TM.getNullPointerValue(DestAS); 1966 1967 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1968 auto FlatNull = B.buildConstant(SrcTy, 0); 1969 1970 // Extract low 32-bits of the pointer. 1971 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1972 1973 auto CmpRes = 1974 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1975 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1976 1977 MI.eraseFromParent(); 1978 return true; 1979 } 1980 1981 if (DestAS == AMDGPUAS::FLAT_ADDRESS && 1982 (SrcAS == AMDGPUAS::LOCAL_ADDRESS || 1983 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) { 1984 if (!ST.hasFlatAddressSpace()) 1985 return false; 1986 1987 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1988 if (!ApertureReg.isValid()) 1989 return false; 1990 1991 // Coerce the type of the low half of the result so we can use merge_values. 1992 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1993 1994 // TODO: Should we allow mismatched types but matching sizes in merges to 1995 // avoid the ptrtoint? 1996 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1997 1998 if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 1999 B.buildCopy(Dst, BuildPtr); 2000 MI.eraseFromParent(); 2001 return true; 2002 } 2003 2004 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 2005 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 2006 2007 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, 2008 SegmentNull.getReg(0)); 2009 2010 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 2011 2012 MI.eraseFromParent(); 2013 return true; 2014 } 2015 2016 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 2017 SrcTy.getSizeInBits() == 64) { 2018 // Truncate. 2019 B.buildExtract(Dst, Src, 0); 2020 MI.eraseFromParent(); 2021 return true; 2022 } 2023 2024 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 2025 DstTy.getSizeInBits() == 64) { 2026 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 2027 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 2028 2029 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 2030 // another. Merge operands are required to be the same type, but creating an 2031 // extra ptrtoint would be kind of pointless. 2032 auto HighAddr = B.buildConstant( 2033 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 2034 B.buildMerge(Dst, {Src, HighAddr}); 2035 MI.eraseFromParent(); 2036 return true; 2037 } 2038 2039 DiagnosticInfoUnsupported InvalidAddrSpaceCast( 2040 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc()); 2041 2042 LLVMContext &Ctx = MF.getFunction().getContext(); 2043 Ctx.diagnose(InvalidAddrSpaceCast); 2044 B.buildUndef(Dst); 2045 MI.eraseFromParent(); 2046 return true; 2047 } 2048 2049 bool AMDGPULegalizerInfo::legalizeFrint( 2050 MachineInstr &MI, MachineRegisterInfo &MRI, 2051 MachineIRBuilder &B) const { 2052 Register Src = MI.getOperand(1).getReg(); 2053 LLT Ty = MRI.getType(Src); 2054 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 2055 2056 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 2057 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 2058 2059 auto C1 = B.buildFConstant(Ty, C1Val); 2060 auto CopySign = B.buildFCopysign(Ty, C1, Src); 2061 2062 // TODO: Should this propagate fast-math-flags? 
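  // Rounding trick: every double with magnitude >= 2^52 is already an integer,
  // so adding and then subtracting copysign(2^52, Src) rounds the fractional
  // bits away in the intermediate sum, e.g. 3.7 + 2^52 rounds to 2^52 + 4 and
  // subtracting 2^52 leaves 4.0. Inputs with magnitude above C2 are already
  // integral and are passed through by the select below.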
2063 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 2064 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 2065 2066 auto C2 = B.buildFConstant(Ty, C2Val); 2067 auto Fabs = B.buildFAbs(Ty, Src); 2068 2069 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 2070 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 2071 MI.eraseFromParent(); 2072 return true; 2073 } 2074 2075 bool AMDGPULegalizerInfo::legalizeFceil( 2076 MachineInstr &MI, MachineRegisterInfo &MRI, 2077 MachineIRBuilder &B) const { 2078 2079 const LLT S1 = LLT::scalar(1); 2080 const LLT S64 = LLT::scalar(64); 2081 2082 Register Src = MI.getOperand(1).getReg(); 2083 assert(MRI.getType(Src) == S64); 2084 2085 // result = trunc(src) 2086 // if (src > 0.0 && src != result) 2087 // result += 1.0 2088 2089 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 2090 2091 const auto Zero = B.buildFConstant(S64, 0.0); 2092 const auto One = B.buildFConstant(S64, 1.0); 2093 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 2094 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 2095 auto And = B.buildAnd(S1, Lt0, NeTrunc); 2096 auto Add = B.buildSelect(S64, And, One, Zero); 2097 2098 // TODO: Should this propagate fast-math-flags? 2099 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 2100 MI.eraseFromParent(); 2101 return true; 2102 } 2103 2104 bool AMDGPULegalizerInfo::legalizeFrem( 2105 MachineInstr &MI, MachineRegisterInfo &MRI, 2106 MachineIRBuilder &B) const { 2107 Register DstReg = MI.getOperand(0).getReg(); 2108 Register Src0Reg = MI.getOperand(1).getReg(); 2109 Register Src1Reg = MI.getOperand(2).getReg(); 2110 auto Flags = MI.getFlags(); 2111 LLT Ty = MRI.getType(DstReg); 2112 2113 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 2114 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 2115 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 2116 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 2117 MI.eraseFromParent(); 2118 return true; 2119 } 2120 2121 static MachineInstrBuilder extractF64Exponent(Register Hi, 2122 MachineIRBuilder &B) { 2123 const unsigned FractBits = 52; 2124 const unsigned ExpBits = 11; 2125 LLT S32 = LLT::scalar(32); 2126 2127 auto Const0 = B.buildConstant(S32, FractBits - 32); 2128 auto Const1 = B.buildConstant(S32, ExpBits); 2129 2130 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 2131 .addUse(Hi) 2132 .addUse(Const0.getReg(0)) 2133 .addUse(Const1.getReg(0)); 2134 2135 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 2136 } 2137 2138 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 2139 MachineInstr &MI, MachineRegisterInfo &MRI, 2140 MachineIRBuilder &B) const { 2141 const LLT S1 = LLT::scalar(1); 2142 const LLT S32 = LLT::scalar(32); 2143 const LLT S64 = LLT::scalar(64); 2144 2145 Register Src = MI.getOperand(1).getReg(); 2146 assert(MRI.getType(Src) == S64); 2147 2148 // TODO: Should this use extract since the low half is unused? 2149 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2150 Register Hi = Unmerge.getReg(1); 2151 2152 // Extract the upper half, since this is where we will find the sign and 2153 // exponent. 2154 auto Exp = extractF64Exponent(Hi, B); 2155 2156 const unsigned FractBits = 52; 2157 2158 // Extract the sign bit. 2159 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 2160 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 2161 2162 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 2163 2164 const auto Zero32 = B.buildConstant(S32, 0); 2165 2166 // Extend back to 64-bits. 
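  // Pairing the masked sign word with a zero low word produces a 64-bit value
  // whose only possibly-set bit is bit 63, i.e. +0.0 or -0.0 with the input's
  // sign; it is selected below whenever the exponent says |Src| < 1.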
2167 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 2168 2169 auto Shr = B.buildAShr(S64, FractMask, Exp); 2170 auto Not = B.buildNot(S64, Shr); 2171 auto Tmp0 = B.buildAnd(S64, Src, Not); 2172 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2173 2174 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2175 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2176 2177 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2178 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2179 MI.eraseFromParent(); 2180 return true; 2181 } 2182 2183 bool AMDGPULegalizerInfo::legalizeITOFP( 2184 MachineInstr &MI, MachineRegisterInfo &MRI, 2185 MachineIRBuilder &B, bool Signed) const { 2186 2187 Register Dst = MI.getOperand(0).getReg(); 2188 Register Src = MI.getOperand(1).getReg(); 2189 2190 const LLT S64 = LLT::scalar(64); 2191 const LLT S32 = LLT::scalar(32); 2192 2193 assert(MRI.getType(Src) == S64); 2194 2195 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2196 auto ThirtyTwo = B.buildConstant(S32, 32); 2197 2198 if (MRI.getType(Dst) == S64) { 2199 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1)) 2200 : B.buildUITOFP(S64, Unmerge.getReg(1)); 2201 2202 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2203 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2204 .addUse(CvtHi.getReg(0)) 2205 .addUse(ThirtyTwo.getReg(0)); 2206 2207 // TODO: Should this propagate fast-math-flags? 2208 B.buildFAdd(Dst, LdExp, CvtLo); 2209 MI.eraseFromParent(); 2210 return true; 2211 } 2212 2213 assert(MRI.getType(Dst) == S32); 2214 2215 auto One = B.buildConstant(S32, 1); 2216 2217 MachineInstrBuilder ShAmt; 2218 if (Signed) { 2219 auto ThirtyOne = B.buildConstant(S32, 31); 2220 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); 2221 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); 2222 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); 2223 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}, 2224 /*HasSideEffects=*/false) 2225 .addUse(Unmerge.getReg(1)); 2226 auto LS2 = B.buildSub(S32, LS, One); 2227 ShAmt = B.buildUMin(S32, LS2, MaxShAmt); 2228 } else 2229 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); 2230 auto Norm = B.buildShl(S64, Src, ShAmt); 2231 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); 2232 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); 2233 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); 2234 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); 2235 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); 2236 B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst}, 2237 /*HasSideEffects=*/false) 2238 .addUse(FVal.getReg(0)) 2239 .addUse(Scale.getReg(0)); 2240 MI.eraseFromParent(); 2241 return true; 2242 } 2243 2244 // TODO: Copied from DAG implementation. Verify logic and document how this 2245 // actually works. 
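// Illustrative example of the hi/lo split performed below: for val = 2^40 + 3,
//   tf  = trunc(val)        = 2^40 + 3
//   hif = floor(tf * 2^-32) = 2^8        -> hi = 256
//   lof = tf - hif * 2^32   = 3          -> lo = 3
// and merging {lo, hi} reconstructs 256 * 2^32 + 3 = 2^40 + 3.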
2246 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2247 MachineRegisterInfo &MRI, 2248 MachineIRBuilder &B, 2249 bool Signed) const { 2250 2251 Register Dst = MI.getOperand(0).getReg(); 2252 Register Src = MI.getOperand(1).getReg(); 2253 2254 const LLT S64 = LLT::scalar(64); 2255 const LLT S32 = LLT::scalar(32); 2256 2257 const LLT SrcLT = MRI.getType(Src); 2258 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2259 2260 unsigned Flags = MI.getFlags(); 2261 2262 // The basic idea of converting a floating point number into a pair of 32-bit 2263 // integers is illustrated as follows: 2264 // 2265 // tf := trunc(val); 2266 // hif := floor(tf * 2^-32); 2267 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2268 // hi := fptoi(hif); 2269 // lo := fptoi(lof); 2270 // 2271 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2272 MachineInstrBuilder Sign; 2273 if (Signed && SrcLT == S32) { 2274 // However, a 32-bit floating point number has only 23 bits mantissa and 2275 // it's not enough to hold all the significant bits of `lof` if val is 2276 // negative. To avoid the loss of precision, We need to take the absolute 2277 // value after truncating and flip the result back based on the original 2278 // signedness. 2279 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2280 Trunc = B.buildFAbs(S32, Trunc, Flags); 2281 } 2282 MachineInstrBuilder K0, K1; 2283 if (SrcLT == S64) { 2284 K0 = B.buildFConstant(S64, 2285 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2286 K1 = B.buildFConstant(S64, 2287 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2288 } else { 2289 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); 2290 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); 2291 } 2292 2293 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2294 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2295 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2296 2297 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2298 : B.buildFPTOUI(S32, FloorMul); 2299 auto Lo = B.buildFPTOUI(S32, Fma); 2300 2301 if (Signed && SrcLT == S32) { 2302 // Flip the result based on the signedness, which is either all 0s or 1s. 2303 Sign = B.buildMerge(S64, {Sign, Sign}); 2304 // r := xor({lo, hi}, sign) - sign; 2305 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); 2306 } else 2307 B.buildMerge(Dst, {Lo, Hi}); 2308 MI.eraseFromParent(); 2309 2310 return true; 2311 } 2312 2313 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2314 MachineInstr &MI) const { 2315 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2316 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2317 2318 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2319 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2320 2321 // With ieee_mode disabled, the instructions have the correct behavior 2322 // already for G_FMINNUM/G_FMAXNUM 2323 if (!MFI->getMode().IEEE) 2324 return !IsIEEEOp; 2325 2326 if (IsIEEEOp) 2327 return true; 2328 2329 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2330 } 2331 2332 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2333 MachineInstr &MI, MachineRegisterInfo &MRI, 2334 MachineIRBuilder &B) const { 2335 // TODO: Should move some of this into LegalizerHelper. 
2336 2337 // TODO: Promote dynamic indexing of s16 to s32 2338 2339 // FIXME: Artifact combiner probably should have replaced the truncated 2340 // constant before this, so we shouldn't need 2341 // getIConstantVRegValWithLookThrough. 2342 Optional<ValueAndVReg> MaybeIdxVal = 2343 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2344 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2345 return true; 2346 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2347 2348 Register Dst = MI.getOperand(0).getReg(); 2349 Register Vec = MI.getOperand(1).getReg(); 2350 2351 LLT VecTy = MRI.getType(Vec); 2352 LLT EltTy = VecTy.getElementType(); 2353 assert(EltTy == MRI.getType(Dst)); 2354 2355 if (IdxVal < VecTy.getNumElements()) { 2356 auto Unmerge = B.buildUnmerge(EltTy, Vec); 2357 B.buildCopy(Dst, Unmerge.getReg(IdxVal)); 2358 } else { 2359 B.buildUndef(Dst); 2360 } 2361 2362 MI.eraseFromParent(); 2363 return true; 2364 } 2365 2366 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2367 MachineInstr &MI, MachineRegisterInfo &MRI, 2368 MachineIRBuilder &B) const { 2369 // TODO: Should move some of this into LegalizerHelper. 2370 2371 // TODO: Promote dynamic indexing of s16 to s32 2372 2373 // FIXME: Artifact combiner probably should have replaced the truncated 2374 // constant before this, so we shouldn't need 2375 // getIConstantVRegValWithLookThrough. 2376 Optional<ValueAndVReg> MaybeIdxVal = 2377 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2378 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2379 return true; 2380 2381 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2382 Register Dst = MI.getOperand(0).getReg(); 2383 Register Vec = MI.getOperand(1).getReg(); 2384 Register Ins = MI.getOperand(2).getReg(); 2385 2386 LLT VecTy = MRI.getType(Vec); 2387 LLT EltTy = VecTy.getElementType(); 2388 assert(EltTy == MRI.getType(Ins)); 2389 (void)Ins; 2390 2391 unsigned NumElts = VecTy.getNumElements(); 2392 if (IdxVal < NumElts) { 2393 SmallVector<Register, 8> SrcRegs; 2394 for (unsigned i = 0; i < NumElts; ++i) 2395 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy)); 2396 B.buildUnmerge(SrcRegs, Vec); 2397 2398 SrcRegs[IdxVal] = MI.getOperand(2).getReg(); 2399 B.buildMerge(Dst, SrcRegs); 2400 } else { 2401 B.buildUndef(Dst); 2402 } 2403 2404 MI.eraseFromParent(); 2405 return true; 2406 } 2407 2408 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2409 MachineInstr &MI, MachineRegisterInfo &MRI, 2410 MachineIRBuilder &B) const { 2411 const LLT V2S16 = LLT::fixed_vector(2, 16); 2412 2413 Register Dst = MI.getOperand(0).getReg(); 2414 Register Src0 = MI.getOperand(1).getReg(); 2415 LLT DstTy = MRI.getType(Dst); 2416 LLT SrcTy = MRI.getType(Src0); 2417 2418 if (SrcTy == V2S16 && DstTy == V2S16 && 2419 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2420 return true; 2421 2422 MachineIRBuilder HelperBuilder(MI); 2423 GISelObserverWrapper DummyObserver; 2424 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2425 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2426 } 2427 2428 bool AMDGPULegalizerInfo::legalizeSinCos( 2429 MachineInstr &MI, MachineRegisterInfo &MRI, 2430 MachineIRBuilder &B) const { 2431 2432 Register DstReg = MI.getOperand(0).getReg(); 2433 Register SrcReg = MI.getOperand(1).getReg(); 2434 LLT Ty = MRI.getType(DstReg); 2435 unsigned Flags = MI.getFlags(); 2436 2437 Register TrigVal; 2438 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * 
numbers::inv_pi); 2439 if (ST.hasTrigReducedRange()) { 2440 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2441 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2442 .addUse(MulVal.getReg(0)) 2443 .setMIFlags(Flags).getReg(0); 2444 } else 2445 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2446 2447 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2448 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2449 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2450 .addUse(TrigVal) 2451 .setMIFlags(Flags); 2452 MI.eraseFromParent(); 2453 return true; 2454 } 2455 2456 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2457 MachineIRBuilder &B, 2458 const GlobalValue *GV, 2459 int64_t Offset, 2460 unsigned GAFlags) const { 2461 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2462 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2463 // to the following code sequence: 2464 // 2465 // For constant address space: 2466 // s_getpc_b64 s[0:1] 2467 // s_add_u32 s0, s0, $symbol 2468 // s_addc_u32 s1, s1, 0 2469 // 2470 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2471 // a fixup or relocation is emitted to replace $symbol with a literal 2472 // constant, which is a pc-relative offset from the encoding of the $symbol 2473 // operand to the global variable. 2474 // 2475 // For global address space: 2476 // s_getpc_b64 s[0:1] 2477 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2478 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2479 // 2480 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2481 // fixups or relocations are emitted to replace $symbol@*@lo and 2482 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2483 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2484 // operand to the global variable. 2485 // 2486 // What we want here is an offset from the value returned by s_getpc 2487 // (which is the address of the s_add_u32 instruction) to the global 2488 // variable, but since the encoding of $symbol starts 4 bytes after the start 2489 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2490 // small. This requires us to add 4 to the global variable offset in order to 2491 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2492 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2493 // instruction. 2494 2495 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2496 2497 Register PCReg = PtrTy.getSizeInBits() != 32 ? 
    DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}

bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction() &&
        !GV->getName().equals("llvm.amdgcn.module.lds")) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
        DS_Warning);
      Fn.getContext().diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // The initializer will be rejected during assembly emission anyway.
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->shouldUseLDSConstAddress(GV)) {
      MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
      return true; // Leave in place.
    }

    if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
      Type *Ty = GV->getValueType();
      // HIP uses an unsized array `extern __shared__ T s[]` or a similar
      // zero-sized type in other languages to declare dynamic shared memory
      // whose size is not known at compile time. It is allocated by the
      // runtime and placed directly after the statically allocated objects;
      // all such declarations share the same offset.
      if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
        // Adjust alignment for that dynamic shared memory array.
2565 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2566 LLT S32 = LLT::scalar(32); 2567 auto Sz = 2568 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2569 B.buildIntToPtr(DstReg, Sz); 2570 MI.eraseFromParent(); 2571 return true; 2572 } 2573 } 2574 2575 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), 2576 *cast<GlobalVariable>(GV))); 2577 MI.eraseFromParent(); 2578 return true; 2579 } 2580 2581 const SITargetLowering *TLI = ST.getTargetLowering(); 2582 2583 if (TLI->shouldEmitFixup(GV)) { 2584 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2585 MI.eraseFromParent(); 2586 return true; 2587 } 2588 2589 if (TLI->shouldEmitPCReloc(GV)) { 2590 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2591 MI.eraseFromParent(); 2592 return true; 2593 } 2594 2595 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2596 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2597 2598 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty; 2599 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2600 MachinePointerInfo::getGOT(MF), 2601 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2602 MachineMemOperand::MOInvariant, 2603 LoadTy, Align(8)); 2604 2605 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2606 2607 if (Ty.getSizeInBits() == 32) { 2608 // Truncate if this is a 32-bit constant address. 2609 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2610 B.buildExtract(DstReg, Load, 0); 2611 } else 2612 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2613 2614 MI.eraseFromParent(); 2615 return true; 2616 } 2617 2618 static LLT widenToNextPowerOf2(LLT Ty) { 2619 if (Ty.isVector()) 2620 return Ty.changeElementCount( 2621 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2622 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2623 } 2624 2625 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2626 MachineInstr &MI) const { 2627 MachineIRBuilder &B = Helper.MIRBuilder; 2628 MachineRegisterInfo &MRI = *B.getMRI(); 2629 GISelChangeObserver &Observer = Helper.Observer; 2630 2631 Register PtrReg = MI.getOperand(1).getReg(); 2632 LLT PtrTy = MRI.getType(PtrReg); 2633 unsigned AddrSpace = PtrTy.getAddressSpace(); 2634 2635 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2636 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2637 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2638 Observer.changingInstr(MI); 2639 MI.getOperand(1).setReg(Cast.getReg(0)); 2640 Observer.changedInstr(MI); 2641 return true; 2642 } 2643 2644 if (MI.getOpcode() != AMDGPU::G_LOAD) 2645 return false; 2646 2647 Register ValReg = MI.getOperand(0).getReg(); 2648 LLT ValTy = MRI.getType(ValReg); 2649 2650 MachineMemOperand *MMO = *MI.memoperands_begin(); 2651 const unsigned ValSize = ValTy.getSizeInBits(); 2652 const LLT MemTy = MMO->getMemoryType(); 2653 const Align MemAlign = MMO->getAlign(); 2654 const unsigned MemSize = MemTy.getSizeInBits(); 2655 const uint64_t AlignInBits = 8 * MemAlign.value(); 2656 2657 // Widen non-power-of-2 loads to the alignment if needed 2658 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 2659 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2660 2661 // This was already the correct extending load result type, so just adjust 2662 // the memory type. 
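    // (e.g. a G_ZEXTLOAD of s24 into s32: PowerOf2Ceil(24) == 32 already
    // matches the 32-bit result, so only the MMO needs to be widened to
    // describe 4 bytes.)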
2663 if (WideMemSize == ValSize) { 2664 MachineFunction &MF = B.getMF(); 2665 2666 MachineMemOperand *WideMMO = 2667 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2668 Observer.changingInstr(MI); 2669 MI.setMemRefs(MF, {WideMMO}); 2670 Observer.changedInstr(MI); 2671 return true; 2672 } 2673 2674 // Don't bother handling edge case that should probably never be produced. 2675 if (ValSize > WideMemSize) 2676 return false; 2677 2678 LLT WideTy = widenToNextPowerOf2(ValTy); 2679 2680 Register WideLoad; 2681 if (!WideTy.isVector()) { 2682 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2683 B.buildTrunc(ValReg, WideLoad).getReg(0); 2684 } else { 2685 // Extract the subvector. 2686 2687 if (isRegisterType(ValTy)) { 2688 // If this a case where G_EXTRACT is legal, use it. 2689 // (e.g. <3 x s32> -> <4 x s32>) 2690 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2691 B.buildExtract(ValReg, WideLoad, 0); 2692 } else { 2693 // For cases where the widened type isn't a nice register value, unmerge 2694 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 2695 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2696 B.buildDeleteTrailingVectorElements(ValReg, WideLoad); 2697 } 2698 } 2699 2700 MI.eraseFromParent(); 2701 return true; 2702 } 2703 2704 return false; 2705 } 2706 2707 bool AMDGPULegalizerInfo::legalizeFMad( 2708 MachineInstr &MI, MachineRegisterInfo &MRI, 2709 MachineIRBuilder &B) const { 2710 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2711 assert(Ty.isScalar()); 2712 2713 MachineFunction &MF = B.getMF(); 2714 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2715 2716 // TODO: Always legal with future ftz flag. 2717 // FIXME: Do we need just output? 2718 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2719 return true; 2720 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2721 return true; 2722 2723 MachineIRBuilder HelperBuilder(MI); 2724 GISelObserverWrapper DummyObserver; 2725 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2726 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2727 } 2728 2729 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2730 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2731 Register DstReg = MI.getOperand(0).getReg(); 2732 Register PtrReg = MI.getOperand(1).getReg(); 2733 Register CmpVal = MI.getOperand(2).getReg(); 2734 Register NewVal = MI.getOperand(3).getReg(); 2735 2736 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2737 "this should not have been custom lowered"); 2738 2739 LLT ValTy = MRI.getType(CmpVal); 2740 LLT VecTy = LLT::fixed_vector(2, ValTy); 2741 2742 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2743 2744 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2745 .addDef(DstReg) 2746 .addUse(PtrReg) 2747 .addUse(PackedVal) 2748 .setMemRefs(MI.memoperands()); 2749 2750 MI.eraseFromParent(); 2751 return true; 2752 } 2753 2754 bool AMDGPULegalizerInfo::legalizeFlog( 2755 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2756 Register Dst = MI.getOperand(0).getReg(); 2757 Register Src = MI.getOperand(1).getReg(); 2758 LLT Ty = B.getMRI()->getType(Dst); 2759 unsigned Flags = MI.getFlags(); 2760 2761 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2762 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2763 2764 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2765 
MI.eraseFromParent(); 2766 return true; 2767 } 2768 2769 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2770 MachineIRBuilder &B) const { 2771 Register Dst = MI.getOperand(0).getReg(); 2772 Register Src = MI.getOperand(1).getReg(); 2773 unsigned Flags = MI.getFlags(); 2774 LLT Ty = B.getMRI()->getType(Dst); 2775 2776 auto K = B.buildFConstant(Ty, numbers::log2e); 2777 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2778 B.buildFExp2(Dst, Mul, Flags); 2779 MI.eraseFromParent(); 2780 return true; 2781 } 2782 2783 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2784 MachineIRBuilder &B) const { 2785 Register Dst = MI.getOperand(0).getReg(); 2786 Register Src0 = MI.getOperand(1).getReg(); 2787 Register Src1 = MI.getOperand(2).getReg(); 2788 unsigned Flags = MI.getFlags(); 2789 LLT Ty = B.getMRI()->getType(Dst); 2790 const LLT S16 = LLT::scalar(16); 2791 const LLT S32 = LLT::scalar(32); 2792 2793 if (Ty == S32) { 2794 auto Log = B.buildFLog2(S32, Src0, Flags); 2795 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2796 .addUse(Log.getReg(0)) 2797 .addUse(Src1) 2798 .setMIFlags(Flags); 2799 B.buildFExp2(Dst, Mul, Flags); 2800 } else if (Ty == S16) { 2801 // There's no f16 fmul_legacy, so we need to convert for it. 2802 auto Log = B.buildFLog2(S16, Src0, Flags); 2803 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2804 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2805 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2806 .addUse(Ext0.getReg(0)) 2807 .addUse(Ext1.getReg(0)) 2808 .setMIFlags(Flags); 2809 2810 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2811 } else 2812 return false; 2813 2814 MI.eraseFromParent(); 2815 return true; 2816 } 2817 2818 // Find a source register, ignoring any possible source modifiers. 2819 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2820 Register ModSrc = OrigSrc; 2821 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2822 ModSrc = SrcFNeg->getOperand(1).getReg(); 2823 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2824 ModSrc = SrcFAbs->getOperand(1).getReg(); 2825 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2826 ModSrc = SrcFAbs->getOperand(1).getReg(); 2827 return ModSrc; 2828 } 2829 2830 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2831 MachineRegisterInfo &MRI, 2832 MachineIRBuilder &B) const { 2833 2834 const LLT S1 = LLT::scalar(1); 2835 const LLT S64 = LLT::scalar(64); 2836 Register Dst = MI.getOperand(0).getReg(); 2837 Register OrigSrc = MI.getOperand(1).getReg(); 2838 unsigned Flags = MI.getFlags(); 2839 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2840 "this should not have been custom lowered"); 2841 2842 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2843 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2844 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2845 // V_FRACT bug is: 2846 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2847 // 2848 // Convert floor(x) to (x - fract(x)) 2849 2850 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2851 .addUse(OrigSrc) 2852 .setMIFlags(Flags); 2853 2854 // Give source modifier matching some assistance before obscuring a foldable 2855 // pattern. 2856 2857 // TODO: We can avoid the neg on the fract? The input sign to fract 2858 // shouldn't matter? 
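  // Compare against the source with any fneg/fabs stripped: isnan does not
  // depend on the sign, and using the unmodified value leaves those modifiers
  // foldable as source modifiers of the eventual compare/select.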
2859 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2860 2861 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2862 2863 Register Min = MRI.createGenericVirtualRegister(S64); 2864 2865 // We don't need to concern ourselves with the snan handling difference, so 2866 // use the one which will directly select. 2867 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2868 if (MFI->getMode().IEEE) 2869 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2870 else 2871 B.buildFMinNum(Min, Fract, Const, Flags); 2872 2873 Register CorrectedFract = Min; 2874 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2875 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2876 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2877 } 2878 2879 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2880 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2881 2882 MI.eraseFromParent(); 2883 return true; 2884 } 2885 2886 // Turn an illegal packed v2s16 build vector into bit operations. 2887 // TODO: This should probably be a bitcast action in LegalizerHelper. 2888 bool AMDGPULegalizerInfo::legalizeBuildVector( 2889 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2890 Register Dst = MI.getOperand(0).getReg(); 2891 const LLT S32 = LLT::scalar(32); 2892 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 2893 2894 Register Src0 = MI.getOperand(1).getReg(); 2895 Register Src1 = MI.getOperand(2).getReg(); 2896 assert(MRI.getType(Src0) == LLT::scalar(16)); 2897 2898 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2899 B.buildBitcast(Dst, Merge); 2900 2901 MI.eraseFromParent(); 2902 return true; 2903 } 2904 2905 // Build a big integer multiply or multiply-add using MAD_64_32 instructions. 2906 // 2907 // Source and accumulation registers must all be 32-bits. 2908 // 2909 // TODO: When the multiply is uniform, we should produce a code sequence 2910 // that is better suited to instruction selection on the SALU. Instead of 2911 // the outer loop going over parts of the result, the outer loop should go 2912 // over parts of one of the factors. This should result in instruction 2913 // selection that makes full use of S_ADDC_U32 instructions. 2914 void AMDGPULegalizerInfo::buildMultiply( 2915 LegalizerHelper &Helper, MutableArrayRef<Register> Accum, 2916 ArrayRef<Register> Src0, ArrayRef<Register> Src1, 2917 bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const { 2918 // Use (possibly empty) vectors of S1 registers to represent the set of 2919 // carries from one pair of positions to the next. 2920 using Carry = SmallVector<Register, 2>; 2921 2922 MachineIRBuilder &B = Helper.MIRBuilder; 2923 2924 const LLT S1 = LLT::scalar(1); 2925 const LLT S32 = LLT::scalar(32); 2926 const LLT S64 = LLT::scalar(64); 2927 2928 Register Zero32; 2929 Register Zero64; 2930 2931 auto getZero32 = [&]() -> Register { 2932 if (!Zero32) 2933 Zero32 = B.buildConstant(S32, 0).getReg(0); 2934 return Zero32; 2935 }; 2936 auto getZero64 = [&]() -> Register { 2937 if (!Zero64) 2938 Zero64 = B.buildConstant(S64, 0).getReg(0); 2939 return Zero64; 2940 }; 2941 2942 // Merge the given carries into the 32-bit LocalAccum, which is modified 2943 // in-place. 2944 // 2945 // Returns the carry-out, which is a single S1 register or null. 
2946 auto mergeCarry = 2947 [&](Register &LocalAccum, const Carry &CarryIn) -> Register { 2948 if (CarryIn.empty()) 2949 return Register(); 2950 2951 bool HaveCarryOut = true; 2952 Register CarryAccum; 2953 if (CarryIn.size() == 1) { 2954 if (!LocalAccum) { 2955 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); 2956 return Register(); 2957 } 2958 2959 CarryAccum = getZero32(); 2960 } else { 2961 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); 2962 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) { 2963 CarryAccum = 2964 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i]) 2965 .getReg(0); 2966 } 2967 2968 if (!LocalAccum) { 2969 LocalAccum = getZero32(); 2970 HaveCarryOut = false; 2971 } 2972 } 2973 2974 auto Add = 2975 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back()); 2976 LocalAccum = Add.getReg(0); 2977 return HaveCarryOut ? Add.getReg(1) : Register(); 2978 }; 2979 2980 // Build a multiply-add chain to compute 2981 // 2982 // LocalAccum + (partial products at DstIndex) 2983 // + (opportunistic subset of CarryIn) 2984 // 2985 // LocalAccum is an array of one or two 32-bit registers that are updated 2986 // in-place. The incoming registers may be null. 2987 // 2988 // In some edge cases, carry-ins can be consumed "for free". In that case, 2989 // the consumed carry bits are removed from CarryIn in-place. 2990 auto buildMadChain = 2991 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn) 2992 -> Carry { 2993 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) || 2994 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1)); 2995 2996 Carry CarryOut; 2997 unsigned j0 = 0; 2998 2999 // Use plain 32-bit multiplication for the most significant part of the 3000 // result by default. 3001 if (LocalAccum.size() == 1 && 3002 (!UsePartialMad64_32 || !CarryIn.empty())) { 3003 do { 3004 unsigned j1 = DstIndex - j0; 3005 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]); 3006 if (!LocalAccum[0]) { 3007 LocalAccum[0] = Mul.getReg(0); 3008 } else { 3009 if (CarryIn.empty()) { 3010 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0); 3011 } else { 3012 LocalAccum[0] = 3013 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back()) 3014 .getReg(0); 3015 CarryIn.pop_back(); 3016 } 3017 } 3018 ++j0; 3019 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty())); 3020 } 3021 3022 // Build full 64-bit multiplies. 
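      // Each iteration below folds one 32 x 32 -> 64 partial product into the
      // running 64-bit accumulator Tmp with MAD_U64_U32. Once the accumulator
      // is no longer known to be small, the instruction's carry-out must be
      // recorded and handed to the neighbouring column.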
3023 if (j0 <= DstIndex) { 3024 bool HaveSmallAccum = false; 3025 Register Tmp; 3026 3027 if (LocalAccum[0]) { 3028 if (LocalAccum.size() == 1) { 3029 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0); 3030 HaveSmallAccum = true; 3031 } else if (LocalAccum[1]) { 3032 Tmp = B.buildMerge(S64, LocalAccum).getReg(0); 3033 HaveSmallAccum = false; 3034 } else { 3035 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0); 3036 HaveSmallAccum = true; 3037 } 3038 } else { 3039 assert(LocalAccum.size() == 1 || !LocalAccum[1]); 3040 Tmp = getZero64(); 3041 HaveSmallAccum = true; 3042 } 3043 3044 do { 3045 unsigned j1 = DstIndex - j0; 3046 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1}, 3047 {Src0[j0], Src1[j1], Tmp}); 3048 Tmp = Mad.getReg(0); 3049 if (!HaveSmallAccum) 3050 CarryOut.push_back(Mad.getReg(1)); 3051 HaveSmallAccum = false; 3052 ++j0; 3053 } while (j0 <= DstIndex); 3054 3055 auto Unmerge = B.buildUnmerge(S32, Tmp); 3056 LocalAccum[0] = Unmerge.getReg(0); 3057 if (LocalAccum.size() > 1) 3058 LocalAccum[1] = Unmerge.getReg(1); 3059 } 3060 3061 return CarryOut; 3062 }; 3063 3064 // Outer multiply loop, iterating over destination parts from least 3065 // significant to most significant parts. 3066 // 3067 // The columns of the following diagram correspond to the destination parts 3068 // affected by one iteration of the outer loop (ignoring boundary 3069 // conditions). 3070 // 3071 // Dest index relative to 2 * i: 1 0 -1 3072 // ------ 3073 // Carries from previous iteration: e o 3074 // Even-aligned partial product sum: E E . 3075 // Odd-aligned partial product sum: O O 3076 // 3077 // 'o' is OddCarry, 'e' is EvenCarry. 3078 // EE and OO are computed from partial products via buildMadChain and use 3079 // accumulation where possible and appropriate. 3080 // 3081 Register SeparateOddCarry; 3082 Carry EvenCarry; 3083 Carry OddCarry; 3084 3085 for (unsigned i = 0; i <= Accum.size() / 2; ++i) { 3086 Carry OddCarryIn = std::move(OddCarry); 3087 Carry EvenCarryIn = std::move(EvenCarry); 3088 OddCarry.clear(); 3089 EvenCarry.clear(); 3090 3091 // Partial products at offset 2 * i. 3092 if (2 * i < Accum.size()) { 3093 auto LocalAccum = Accum.drop_front(2 * i).take_front(2); 3094 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn); 3095 } 3096 3097 // Partial products at offset 2 * i - 1. 3098 if (i > 0) { 3099 if (!SeparateOddAlignedProducts) { 3100 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2); 3101 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn); 3102 } else { 3103 bool IsHighest = 2 * i >= Accum.size(); 3104 Register SeparateOddOut[2]; 3105 auto LocalAccum = makeMutableArrayRef(SeparateOddOut) 3106 .take_front(IsHighest ? 
1 : 2); 3107 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn); 3108 3109 MachineInstr *Lo; 3110 3111 if (i == 1) { 3112 if (!IsHighest) 3113 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]); 3114 else 3115 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]); 3116 } else { 3117 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0], 3118 SeparateOddCarry); 3119 } 3120 Accum[2 * i - 1] = Lo->getOperand(0).getReg(); 3121 3122 if (!IsHighest) { 3123 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1], 3124 Lo->getOperand(1).getReg()); 3125 Accum[2 * i] = Hi.getReg(0); 3126 SeparateOddCarry = Hi.getReg(1); 3127 } 3128 } 3129 } 3130 3131 // Add in the carries from the previous iteration 3132 if (i > 0) { 3133 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn)) 3134 EvenCarryIn.push_back(CarryOut); 3135 3136 if (2 * i < Accum.size()) { 3137 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn)) 3138 OddCarry.push_back(CarryOut); 3139 } 3140 } 3141 } 3142 } 3143 3144 // Custom narrowing of wide multiplies using wide multiply-add instructions. 3145 // 3146 // TODO: If the multiply is followed by an addition, we should attempt to 3147 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities. 3148 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper, 3149 MachineInstr &MI) const { 3150 assert(ST.hasMad64_32()); 3151 assert(MI.getOpcode() == TargetOpcode::G_MUL); 3152 3153 MachineIRBuilder &B = Helper.MIRBuilder; 3154 MachineRegisterInfo &MRI = *B.getMRI(); 3155 3156 Register DstReg = MI.getOperand(0).getReg(); 3157 Register Src0 = MI.getOperand(1).getReg(); 3158 Register Src1 = MI.getOperand(2).getReg(); 3159 3160 LLT Ty = MRI.getType(DstReg); 3161 assert(Ty.isScalar()); 3162 3163 unsigned Size = Ty.getSizeInBits(); 3164 unsigned NumParts = Size / 32; 3165 assert((Size % 32) == 0); 3166 assert(NumParts >= 2); 3167 3168 // Whether to use MAD_64_32 for partial products whose high half is 3169 // discarded. This avoids some ADD instructions but risks false dependency 3170 // stalls on some subtargets in some cases. 3171 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10; 3172 3173 // Whether to compute odd-aligned partial products separately. This is 3174 // advisable on subtargets where the accumulator of MAD_64_32 must be placed 3175 // in an even-aligned VGPR. 3176 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops(); 3177 3178 LLT S32 = LLT::scalar(32); 3179 SmallVector<Register, 2> Src0Parts, Src1Parts; 3180 for (unsigned i = 0; i < NumParts; ++i) { 3181 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32)); 3182 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32)); 3183 } 3184 B.buildUnmerge(Src0Parts, Src0); 3185 B.buildUnmerge(Src1Parts, Src1); 3186 3187 SmallVector<Register, 2> AccumRegs(NumParts); 3188 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32, 3189 SeparateOddAlignedProducts); 3190 3191 B.buildMerge(DstReg, AccumRegs); 3192 MI.eraseFromParent(); 3193 return true; 3194 3195 } 3196 3197 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to 3198 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input 3199 // case with a single min instruction instead of a compare+select. 
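// For example, for a 32-bit G_CTLZ of zero the FFBH result is all ones, and
// umin(0xffffffff, 32) == 32, which is exactly the value G_CTLZ defines for a
// zero input.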
3200 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, 3201 MachineRegisterInfo &MRI, 3202 MachineIRBuilder &B) const { 3203 Register Dst = MI.getOperand(0).getReg(); 3204 Register Src = MI.getOperand(1).getReg(); 3205 LLT DstTy = MRI.getType(Dst); 3206 LLT SrcTy = MRI.getType(Src); 3207 3208 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ 3209 ? AMDGPU::G_AMDGPU_FFBH_U32 3210 : AMDGPU::G_AMDGPU_FFBL_B32; 3211 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); 3212 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); 3213 3214 MI.eraseFromParent(); 3215 return true; 3216 } 3217 3218 // Check that this is a G_XOR x, -1 3219 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 3220 if (MI.getOpcode() != TargetOpcode::G_XOR) 3221 return false; 3222 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 3223 return ConstVal && *ConstVal == -1; 3224 } 3225 3226 // Return the use branch instruction, otherwise null if the usage is invalid. 3227 static MachineInstr * 3228 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 3229 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 3230 Register CondDef = MI.getOperand(0).getReg(); 3231 if (!MRI.hasOneNonDBGUse(CondDef)) 3232 return nullptr; 3233 3234 MachineBasicBlock *Parent = MI.getParent(); 3235 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 3236 3237 if (isNot(MRI, *UseMI)) { 3238 Register NegatedCond = UseMI->getOperand(0).getReg(); 3239 if (!MRI.hasOneNonDBGUse(NegatedCond)) 3240 return nullptr; 3241 3242 // We're deleting the def of this value, so we need to remove it. 3243 eraseInstr(*UseMI, MRI); 3244 3245 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 3246 Negated = true; 3247 } 3248 3249 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 3250 return nullptr; 3251 3252 // Make sure the cond br is followed by a G_BR, or is the last instruction. 3253 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 3254 if (Next == Parent->end()) { 3255 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 3256 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 3257 return nullptr; 3258 UncondBrTarget = &*NextMBB; 3259 } else { 3260 if (Next->getOpcode() != AMDGPU::G_BR) 3261 return nullptr; 3262 Br = &*Next; 3263 UncondBrTarget = Br->getOperand(0).getMBB(); 3264 } 3265 3266 return UseMI; 3267 } 3268 3269 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 3270 const ArgDescriptor *Arg, 3271 const TargetRegisterClass *ArgRC, 3272 LLT ArgTy) const { 3273 MCRegister SrcReg = Arg->getRegister(); 3274 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 3275 assert(DstReg.isVirtual() && "Virtual register expected"); 3276 3277 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, 3278 *ArgRC, B.getDebugLoc(), ArgTy); 3279 if (Arg->isMasked()) { 3280 // TODO: Should we try to emit this once in the entry block? 3281 const LLT S32 = LLT::scalar(32); 3282 const unsigned Mask = Arg->getMask(); 3283 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 3284 3285 Register AndMaskSrc = LiveIn; 3286 3287 // TODO: Avoid clearing the high bits if we know workitem id y/z are always 3288 // 0. 
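    // (e.g. with packed workitem IDs the X/Y/Z components share one 32-bit
    // input register in 10-bit fields; for the Y component the descriptor's
    // mask covers bits 10..19, so Shift is 10 and the post-shift mask is
    // 0x3ff.)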
3289 if (Shift != 0) { 3290 auto ShiftAmt = B.buildConstant(S32, Shift); 3291 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 3292 } 3293 3294 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 3295 } else { 3296 B.buildCopy(DstReg, LiveIn); 3297 } 3298 3299 return true; 3300 } 3301 3302 bool AMDGPULegalizerInfo::loadInputValue( 3303 Register DstReg, MachineIRBuilder &B, 3304 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 3305 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3306 const ArgDescriptor *Arg; 3307 const TargetRegisterClass *ArgRC; 3308 LLT ArgTy; 3309 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 3310 3311 if (!Arg) { 3312 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { 3313 // The intrinsic may appear when we have a 0 sized kernarg segment, in which 3314 // case the pointer argument may be missing and we use null. 3315 B.buildConstant(DstReg, 0); 3316 return true; 3317 } 3318 3319 // It's undefined behavior if a function marked with the amdgpu-no-* 3320 // attributes uses the corresponding intrinsic. 3321 B.buildUndef(DstReg); 3322 return true; 3323 } 3324 3325 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 3326 return false; // TODO: Handle these 3327 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 3328 } 3329 3330 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 3331 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 3332 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 3333 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 3334 return false; 3335 3336 MI.eraseFromParent(); 3337 return true; 3338 } 3339 3340 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, 3341 int64_t C) { 3342 B.buildConstant(MI.getOperand(0).getReg(), C); 3343 MI.eraseFromParent(); 3344 return true; 3345 } 3346 3347 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic( 3348 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 3349 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 3350 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim); 3351 if (MaxID == 0) 3352 return replaceWithConstant(B, MI, 0); 3353 3354 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3355 const ArgDescriptor *Arg; 3356 const TargetRegisterClass *ArgRC; 3357 LLT ArgTy; 3358 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 3359 3360 Register DstReg = MI.getOperand(0).getReg(); 3361 if (!Arg) { 3362 // It's undefined behavior if a function marked with the amdgpu-no-* 3363 // attributes uses the corresponding intrinsic. 3364 B.buildUndef(DstReg); 3365 MI.eraseFromParent(); 3366 return true; 3367 } 3368 3369 if (Arg->isMasked()) { 3370 // Don't bother inserting AssertZext for packed IDs since we're emitting the 3371 // masking operations anyway. 3372 // 3373 // TODO: We could assert the top bit is 0 for the source copy. 
3374 if (!loadInputValue(DstReg, B, ArgType)) 3375 return false; 3376 } else { 3377 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); 3378 if (!loadInputValue(TmpReg, B, ArgType)) 3379 return false; 3380 B.buildAssertZExt(DstReg, TmpReg, 32 - countLeadingZeros(MaxID)); 3381 } 3382 3383 MI.eraseFromParent(); 3384 return true; 3385 } 3386 3387 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B, 3388 int64_t Offset) const { 3389 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 3390 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy); 3391 3392 // TODO: If we passed in the base kernel offset we could have a better 3393 // alignment than 4, but we don't really need it. 3394 if (!loadInputValue(KernArgReg, B, 3395 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3396 llvm_unreachable("failed to find kernarg segment ptr"); 3397 3398 auto COffset = B.buildConstant(LLT::scalar(64), Offset); 3399 // TODO: Should get nuw 3400 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0); 3401 } 3402 3403 /// Legalize a value that's loaded from kernel arguments. This is only used by 3404 /// legacy intrinsics. 3405 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI, 3406 MachineIRBuilder &B, 3407 uint64_t Offset, 3408 Align Alignment) const { 3409 Register DstReg = MI.getOperand(0).getReg(); 3410 3411 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) && 3412 "unexpected kernarg parameter type"); 3413 3414 Register Ptr = getKernargParameterPtr(B, Offset); 3415 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 3416 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4), 3417 MachineMemOperand::MODereferenceable | 3418 MachineMemOperand::MOInvariant); 3419 MI.eraseFromParent(); 3420 return true; 3421 } 3422 3423 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 3424 MachineRegisterInfo &MRI, 3425 MachineIRBuilder &B) const { 3426 Register Dst = MI.getOperand(0).getReg(); 3427 LLT DstTy = MRI.getType(Dst); 3428 LLT S16 = LLT::scalar(16); 3429 LLT S32 = LLT::scalar(32); 3430 LLT S64 = LLT::scalar(64); 3431 3432 if (DstTy == S16) 3433 return legalizeFDIV16(MI, MRI, B); 3434 if (DstTy == S32) 3435 return legalizeFDIV32(MI, MRI, B); 3436 if (DstTy == S64) 3437 return legalizeFDIV64(MI, MRI, B); 3438 3439 return false; 3440 } 3441 3442 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 3443 Register DstDivReg, 3444 Register DstRemReg, 3445 Register X, 3446 Register Y) const { 3447 const LLT S1 = LLT::scalar(1); 3448 const LLT S32 = LLT::scalar(32); 3449 3450 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 3451 // algorithm used here. 3452 3453 // Initial estimate of inv(y). 3454 auto FloatY = B.buildUITOFP(S32, Y); 3455 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 3456 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 3457 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 3458 auto Z = B.buildFPTOUI(S32, ScaledY); 3459 3460 // One round of UNR. 3461 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 3462 auto NegYZ = B.buildMul(S32, NegY, Z); 3463 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 3464 3465 // Quotient/remainder estimate. 3466 auto Q = B.buildUMulH(S32, X, Z); 3467 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 3468 3469 // First quotient/remainder refinement. 
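  // Note: the initial estimate can leave Q slightly short of the exact
  // quotient, so each refinement below checks R >= Y and, if so, increments Q
  // and subtracts Y from R. Two rounds match the reference expansion in
  // AMDGPUCodeGenPrepare::expandDivRem32.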
3470 auto One = B.buildConstant(S32, 1); 3471 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 3472 if (DstDivReg) 3473 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 3474 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 3475 3476 // Second quotient/remainder refinement. 3477 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 3478 if (DstDivReg) 3479 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 3480 3481 if (DstRemReg) 3482 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 3483 } 3484 3485 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 3486 // 3487 // Return lo, hi of result 3488 // 3489 // %cvt.lo = G_UITOFP Val.lo 3490 // %cvt.hi = G_UITOFP Val.hi 3491 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 3492 // %rcp = G_AMDGPU_RCP_IFLAG %mad 3493 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 3494 // %mul2 = G_FMUL %mul1, 2**(-32) 3495 // %trunc = G_INTRINSIC_TRUNC %mul2 3496 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 3497 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 3498 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 3499 Register Val) { 3500 const LLT S32 = LLT::scalar(32); 3501 auto Unmerge = B.buildUnmerge(S32, Val); 3502 3503 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 3504 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 3505 3506 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 3507 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 3508 3509 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 3510 auto Mul1 = 3511 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 3512 3513 // 2**(-32) 3514 auto Mul2 = 3515 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 3516 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 3517 3518 // -(2**32) 3519 auto Mad2 = B.buildFMAD(S32, Trunc, 3520 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 3521 3522 auto ResultLo = B.buildFPTOUI(S32, Mad2); 3523 auto ResultHi = B.buildFPTOUI(S32, Trunc); 3524 3525 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 3526 } 3527 3528 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 3529 Register DstDivReg, 3530 Register DstRemReg, 3531 Register Numer, 3532 Register Denom) const { 3533 const LLT S32 = LLT::scalar(32); 3534 const LLT S64 = LLT::scalar(64); 3535 const LLT S1 = LLT::scalar(1); 3536 Register RcpLo, RcpHi; 3537 3538 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 3539 3540 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 3541 3542 auto Zero64 = B.buildConstant(S64, 0); 3543 auto NegDenom = B.buildSub(S64, Zero64, Denom); 3544 3545 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 3546 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 3547 3548 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 3549 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 3550 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 3551 3552 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 3553 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 3554 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 3555 3556 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 3557 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 3558 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 3559 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 3560 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 3561 3562 auto Zero32 = B.buildConstant(S32, 0); 3563 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 3564 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1)); 3565 auto 
Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 3566 3567 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 3568 Register NumerLo = UnmergeNumer.getReg(0); 3569 Register NumerHi = UnmergeNumer.getReg(1); 3570 3571 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 3572 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 3573 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 3574 Register Mul3_Lo = UnmergeMul3.getReg(0); 3575 Register Mul3_Hi = UnmergeMul3.getReg(1); 3576 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 3577 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 3578 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 3579 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 3580 3581 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 3582 Register DenomLo = UnmergeDenom.getReg(0); 3583 Register DenomHi = UnmergeDenom.getReg(1); 3584 3585 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 3586 auto C1 = B.buildSExt(S32, CmpHi); 3587 3588 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 3589 auto C2 = B.buildSExt(S32, CmpLo); 3590 3591 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 3592 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 3593 3594 // TODO: Here and below portions of the code can be enclosed into if/endif. 3595 // Currently control flow is unconditional and we have 4 selects after 3596 // potential endif to substitute PHIs. 3597 3598 // if C3 != 0 ... 3599 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3600 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3601 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3602 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 3603 3604 auto One64 = B.buildConstant(S64, 1); 3605 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3606 3607 auto C4 = 3608 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3609 auto C5 = 3610 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3611 auto C6 = B.buildSelect( 3612 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3613 3614 // if (C6 != 0) 3615 auto Add4 = B.buildAdd(S64, Add3, One64); 3616 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3617 3618 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3619 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3620 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3621 3622 // endif C6 3623 // endif C3 3624 3625 if (DstDivReg) { 3626 auto Sel1 = B.buildSelect( 3627 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3628 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3629 Sel1, MulHi3); 3630 } 3631 3632 if (DstRemReg) { 3633 auto Sel2 = B.buildSelect( 3634 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3635 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3636 Sel2, Sub1); 3637 } 3638 } 3639 3640 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3641 MachineRegisterInfo &MRI, 3642 MachineIRBuilder &B) const { 3643 Register DstDivReg, DstRemReg; 3644 switch (MI.getOpcode()) { 3645 default: 3646 llvm_unreachable("Unexpected opcode!"); 3647 case AMDGPU::G_UDIV: { 3648 DstDivReg = MI.getOperand(0).getReg(); 3649 break; 3650 } 3651 case AMDGPU::G_UREM: { 3652 DstRemReg = MI.getOperand(0).getReg(); 3653 break; 3654 } 3655 case AMDGPU::G_UDIVREM: { 3656 DstDivReg = MI.getOperand(0).getReg(); 3657 DstRemReg = MI.getOperand(1).getReg(); 3658 
break; 3659 } 3660 } 3661 3662 const LLT S64 = LLT::scalar(64); 3663 const LLT S32 = LLT::scalar(32); 3664 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3665 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3666 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3667 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3668 3669 if (Ty == S32) 3670 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3671 else if (Ty == S64) 3672 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3673 else 3674 return false; 3675 3676 MI.eraseFromParent(); 3677 return true; 3678 } 3679 3680 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3681 MachineRegisterInfo &MRI, 3682 MachineIRBuilder &B) const { 3683 const LLT S64 = LLT::scalar(64); 3684 const LLT S32 = LLT::scalar(32); 3685 3686 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3687 if (Ty != S32 && Ty != S64) 3688 return false; 3689 3690 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3691 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3692 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3693 3694 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3695 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3696 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3697 3698 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3699 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3700 3701 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3702 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3703 3704 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3705 switch (MI.getOpcode()) { 3706 default: 3707 llvm_unreachable("Unexpected opcode!"); 3708 case AMDGPU::G_SDIV: { 3709 DstDivReg = MI.getOperand(0).getReg(); 3710 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3711 break; 3712 } 3713 case AMDGPU::G_SREM: { 3714 DstRemReg = MI.getOperand(0).getReg(); 3715 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3716 break; 3717 } 3718 case AMDGPU::G_SDIVREM: { 3719 DstDivReg = MI.getOperand(0).getReg(); 3720 DstRemReg = MI.getOperand(1).getReg(); 3721 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3722 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3723 break; 3724 } 3725 } 3726 3727 if (Ty == S32) 3728 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3729 else 3730 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3731 3732 if (DstDivReg) { 3733 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3734 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3735 B.buildSub(DstDivReg, SignXor, Sign); 3736 } 3737 3738 if (DstRemReg) { 3739 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3740 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3741 B.buildSub(DstRemReg, SignXor, Sign); 3742 } 3743 3744 MI.eraseFromParent(); 3745 return true; 3746 } 3747 3748 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3749 MachineRegisterInfo &MRI, 3750 MachineIRBuilder &B) const { 3751 Register Res = MI.getOperand(0).getReg(); 3752 Register LHS = MI.getOperand(1).getReg(); 3753 Register RHS = MI.getOperand(2).getReg(); 3754 uint16_t Flags = MI.getFlags(); 3755 LLT ResTy = MRI.getType(Res); 3756 3757 const MachineFunction &MF = B.getMF(); 3758 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3759 MI.getFlag(MachineInstr::FmAfn); 3760 3761 if (!AllowInaccurateRcp) 3762 return false; 3763 3764 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3765 // 1 / x -> RCP(x) 3766 if 
(CLHS->isExactlyValue(1.0)) { 3767 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3768 .addUse(RHS) 3769 .setMIFlags(Flags); 3770 3771 MI.eraseFromParent(); 3772 return true; 3773 } 3774 3775 // -1 / x -> RCP( FNEG(x) ) 3776 if (CLHS->isExactlyValue(-1.0)) { 3777 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3778 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3779 .addUse(FNeg.getReg(0)) 3780 .setMIFlags(Flags); 3781 3782 MI.eraseFromParent(); 3783 return true; 3784 } 3785 } 3786 3787 // x / y -> x * (1.0 / y) 3788 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3789 .addUse(RHS) 3790 .setMIFlags(Flags); 3791 B.buildFMul(Res, LHS, RCP, Flags); 3792 3793 MI.eraseFromParent(); 3794 return true; 3795 } 3796 3797 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3798 MachineRegisterInfo &MRI, 3799 MachineIRBuilder &B) const { 3800 Register Res = MI.getOperand(0).getReg(); 3801 Register X = MI.getOperand(1).getReg(); 3802 Register Y = MI.getOperand(2).getReg(); 3803 uint16_t Flags = MI.getFlags(); 3804 LLT ResTy = MRI.getType(Res); 3805 3806 const MachineFunction &MF = B.getMF(); 3807 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3808 MI.getFlag(MachineInstr::FmAfn); 3809 3810 if (!AllowInaccurateRcp) 3811 return false; 3812 3813 auto NegY = B.buildFNeg(ResTy, Y); 3814 auto One = B.buildFConstant(ResTy, 1.0); 3815 3816 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3817 .addUse(Y) 3818 .setMIFlags(Flags); 3819 3820 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3821 R = B.buildFMA(ResTy, Tmp0, R, R); 3822 3823 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3824 R = B.buildFMA(ResTy, Tmp1, R, R); 3825 3826 auto Ret = B.buildFMul(ResTy, X, R); 3827 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3828 3829 B.buildFMA(Res, Tmp2, R, Ret); 3830 MI.eraseFromParent(); 3831 return true; 3832 } 3833 3834 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3835 MachineRegisterInfo &MRI, 3836 MachineIRBuilder &B) const { 3837 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3838 return true; 3839 3840 Register Res = MI.getOperand(0).getReg(); 3841 Register LHS = MI.getOperand(1).getReg(); 3842 Register RHS = MI.getOperand(2).getReg(); 3843 3844 uint16_t Flags = MI.getFlags(); 3845 3846 LLT S16 = LLT::scalar(16); 3847 LLT S32 = LLT::scalar(32); 3848 3849 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3850 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3851 3852 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3853 .addUse(RHSExt.getReg(0)) 3854 .setMIFlags(Flags); 3855 3856 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3857 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3858 3859 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3860 .addUse(RDst.getReg(0)) 3861 .addUse(RHS) 3862 .addUse(LHS) 3863 .setMIFlags(Flags); 3864 3865 MI.eraseFromParent(); 3866 return true; 3867 } 3868 3869 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3870 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3871 static void toggleSPDenormMode(bool Enable, 3872 MachineIRBuilder &B, 3873 const GCNSubtarget &ST, 3874 AMDGPU::SIModeRegisterDefaults Mode) { 3875 // Set SP denorm mode to this value. 3876 unsigned SPDenormMode = 3877 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3878 3879 if (ST.hasDenormModeInst()) { 3880 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 
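    // S_DENORM_MODE takes a 4-bit immediate: bits [1:0] hold the FP32 denorm
    // mode and bits [3:2] hold the FP64/FP16 mode, hence the shift by 2 below.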
3881 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3882 3883 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3884 B.buildInstr(AMDGPU::S_DENORM_MODE) 3885 .addImm(NewDenormModeValue); 3886 3887 } else { 3888 // Select FP32 bit field in mode register. 3889 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3890 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3891 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3892 3893 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3894 .addImm(SPDenormMode) 3895 .addImm(SPDenormModeBitField); 3896 } 3897 } 3898 3899 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3900 MachineRegisterInfo &MRI, 3901 MachineIRBuilder &B) const { 3902 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3903 return true; 3904 3905 Register Res = MI.getOperand(0).getReg(); 3906 Register LHS = MI.getOperand(1).getReg(); 3907 Register RHS = MI.getOperand(2).getReg(); 3908 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3909 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3910 3911 uint16_t Flags = MI.getFlags(); 3912 3913 LLT S32 = LLT::scalar(32); 3914 LLT S1 = LLT::scalar(1); 3915 3916 auto One = B.buildFConstant(S32, 1.0f); 3917 3918 auto DenominatorScaled = 3919 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3920 .addUse(LHS) 3921 .addUse(RHS) 3922 .addImm(0) 3923 .setMIFlags(Flags); 3924 auto NumeratorScaled = 3925 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3926 .addUse(LHS) 3927 .addUse(RHS) 3928 .addImm(1) 3929 .setMIFlags(Flags); 3930 3931 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3932 .addUse(DenominatorScaled.getReg(0)) 3933 .setMIFlags(Flags); 3934 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3935 3936 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3937 // aren't modeled as reading it. 
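  // If FP32 denormals are flushed by default, temporarily enable them around
  // the FMA refinement chain below; the div_scale'd intermediates may be
  // denormal, and flushing them would defeat the correction steps.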
3938 if (!Mode.allFP32Denormals()) 3939 toggleSPDenormMode(true, B, ST, Mode); 3940 3941 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3942 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3943 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3944 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3945 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3946 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3947 3948 if (!Mode.allFP32Denormals()) 3949 toggleSPDenormMode(false, B, ST, Mode); 3950 3951 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3952 .addUse(Fma4.getReg(0)) 3953 .addUse(Fma1.getReg(0)) 3954 .addUse(Fma3.getReg(0)) 3955 .addUse(NumeratorScaled.getReg(1)) 3956 .setMIFlags(Flags); 3957 3958 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3959 .addUse(Fmas.getReg(0)) 3960 .addUse(RHS) 3961 .addUse(LHS) 3962 .setMIFlags(Flags); 3963 3964 MI.eraseFromParent(); 3965 return true; 3966 } 3967 3968 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3969 MachineRegisterInfo &MRI, 3970 MachineIRBuilder &B) const { 3971 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3972 return true; 3973 3974 Register Res = MI.getOperand(0).getReg(); 3975 Register LHS = MI.getOperand(1).getReg(); 3976 Register RHS = MI.getOperand(2).getReg(); 3977 3978 uint16_t Flags = MI.getFlags(); 3979 3980 LLT S64 = LLT::scalar(64); 3981 LLT S1 = LLT::scalar(1); 3982 3983 auto One = B.buildFConstant(S64, 1.0); 3984 3985 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3986 .addUse(LHS) 3987 .addUse(RHS) 3988 .addImm(0) 3989 .setMIFlags(Flags); 3990 3991 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3992 3993 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3994 .addUse(DivScale0.getReg(0)) 3995 .setMIFlags(Flags); 3996 3997 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3998 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3999 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 4000 4001 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 4002 .addUse(LHS) 4003 .addUse(RHS) 4004 .addImm(1) 4005 .setMIFlags(Flags); 4006 4007 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 4008 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 4009 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 4010 4011 Register Scale; 4012 if (!ST.hasUsableDivScaleConditionOutput()) { 4013 // Workaround a hardware bug on SI where the condition output from div_scale 4014 // is not usable. 
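    // Instead, reconstruct the flag div_fmas expects by hand: compare the high
    // dword of the numerator with that of div_scale1, the high dword of the
    // denominator with that of div_scale0, and XOR the two comparisons.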
4015 4016 LLT S32 = LLT::scalar(32); 4017 4018 auto NumUnmerge = B.buildUnmerge(S32, LHS); 4019 auto DenUnmerge = B.buildUnmerge(S32, RHS); 4020 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 4021 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 4022 4023 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 4024 Scale1Unmerge.getReg(1)); 4025 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 4026 Scale0Unmerge.getReg(1)); 4027 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 4028 } else { 4029 Scale = DivScale1.getReg(1); 4030 } 4031 4032 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 4033 .addUse(Fma4.getReg(0)) 4034 .addUse(Fma3.getReg(0)) 4035 .addUse(Mul.getReg(0)) 4036 .addUse(Scale) 4037 .setMIFlags(Flags); 4038 4039 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 4040 .addUse(Fmas.getReg(0)) 4041 .addUse(RHS) 4042 .addUse(LHS) 4043 .setMIFlags(Flags); 4044 4045 MI.eraseFromParent(); 4046 return true; 4047 } 4048 4049 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 4050 MachineRegisterInfo &MRI, 4051 MachineIRBuilder &B) const { 4052 Register Res = MI.getOperand(0).getReg(); 4053 Register LHS = MI.getOperand(2).getReg(); 4054 Register RHS = MI.getOperand(3).getReg(); 4055 uint16_t Flags = MI.getFlags(); 4056 4057 LLT S32 = LLT::scalar(32); 4058 LLT S1 = LLT::scalar(1); 4059 4060 auto Abs = B.buildFAbs(S32, RHS, Flags); 4061 const APFloat C0Val(1.0f); 4062 4063 auto C0 = B.buildConstant(S32, 0x6f800000); 4064 auto C1 = B.buildConstant(S32, 0x2f800000); 4065 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 4066 4067 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 4068 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 4069 4070 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 4071 4072 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 4073 .addUse(Mul0.getReg(0)) 4074 .setMIFlags(Flags); 4075 4076 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 4077 4078 B.buildFMul(Res, Sel, Mul1, Flags); 4079 4080 MI.eraseFromParent(); 4081 return true; 4082 } 4083 4084 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 4085 // FIXME: Why do we handle this one but not other removed instructions? 4086 // 4087 // Reciprocal square root. The clamp prevents infinite results, clamping 4088 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 4089 // +-max_float. 4090 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 4091 MachineRegisterInfo &MRI, 4092 MachineIRBuilder &B) const { 4093 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 4094 return true; 4095 4096 Register Dst = MI.getOperand(0).getReg(); 4097 Register Src = MI.getOperand(2).getReg(); 4098 auto Flags = MI.getFlags(); 4099 4100 LLT Ty = MRI.getType(Dst); 4101 4102 const fltSemantics *FltSemantics; 4103 if (Ty == LLT::scalar(32)) 4104 FltSemantics = &APFloat::IEEEsingle(); 4105 else if (Ty == LLT::scalar(64)) 4106 FltSemantics = &APFloat::IEEEdouble(); 4107 else 4108 return false; 4109 4110 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 4111 .addUse(Src) 4112 .setMIFlags(Flags); 4113 4114 // We don't need to concern ourselves with the snan handling difference, since 4115 // the rsq quieted (or not) so use the one which will directly select. 
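  // The clamp itself is min(rsq, +max_float) followed by max(..., -max_float),
  // using the IEEE minnum/maxnum variants when the function runs in IEEE mode.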
4116 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4117 const bool UseIEEE = MFI->getMode().IEEE; 4118 4119 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 4120 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 4121 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 4122 4123 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 4124 4125 if (UseIEEE) 4126 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 4127 else 4128 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 4129 MI.eraseFromParent(); 4130 return true; 4131 } 4132 4133 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 4134 switch (IID) { 4135 case Intrinsic::amdgcn_ds_fadd: 4136 return AMDGPU::G_ATOMICRMW_FADD; 4137 case Intrinsic::amdgcn_ds_fmin: 4138 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 4139 case Intrinsic::amdgcn_ds_fmax: 4140 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 4141 default: 4142 llvm_unreachable("not a DS FP intrinsic"); 4143 } 4144 } 4145 4146 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 4147 MachineInstr &MI, 4148 Intrinsic::ID IID) const { 4149 GISelChangeObserver &Observer = Helper.Observer; 4150 Observer.changingInstr(MI); 4151 4152 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 4153 4154 // The remaining operands were used to set fields in the MemOperand on 4155 // construction. 4156 for (int I = 6; I > 3; --I) 4157 MI.removeOperand(I); 4158 4159 MI.removeOperand(1); // Remove the intrinsic ID. 4160 Observer.changedInstr(MI); 4161 return true; 4162 } 4163 4164 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 4165 MachineRegisterInfo &MRI, 4166 MachineIRBuilder &B) const { 4167 uint64_t Offset = 4168 ST.getTargetLowering()->getImplicitParameterOffset( 4169 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 4170 LLT DstTy = MRI.getType(DstReg); 4171 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 4172 4173 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 4174 if (!loadInputValue(KernargPtrReg, B, 4175 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 4176 return false; 4177 4178 // FIXME: This should be nuw 4179 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 4180 return true; 4181 } 4182 4183 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 4184 MachineRegisterInfo &MRI, 4185 MachineIRBuilder &B) const { 4186 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4187 if (!MFI->isEntryFunction()) { 4188 return legalizePreloadedArgIntrin(MI, MRI, B, 4189 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 4190 } 4191 4192 Register DstReg = MI.getOperand(0).getReg(); 4193 if (!getImplicitArgPtr(DstReg, MRI, B)) 4194 return false; 4195 4196 MI.eraseFromParent(); 4197 return true; 4198 } 4199 4200 bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg, 4201 MachineRegisterInfo &MRI, 4202 MachineIRBuilder &B) const { 4203 Function &F = B.getMF().getFunction(); 4204 Optional<uint32_t> KnownSize = 4205 AMDGPUMachineFunction::getLDSKernelIdMetadata(F); 4206 if (KnownSize.has_value()) 4207 B.buildConstant(DstReg, KnownSize.value()); 4208 return false; 4209 } 4210 4211 bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI, 4212 MachineRegisterInfo &MRI, 4213 MachineIRBuilder &B) const { 4214 4215 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4216 if (!MFI->isEntryFunction()) { 4217 return legalizePreloadedArgIntrin(MI, MRI, B, 4218 
AMDGPUFunctionArgInfo::LDS_KERNEL_ID); 4219 } 4220 4221 Register DstReg = MI.getOperand(0).getReg(); 4222 if (!getLDSKernelId(DstReg, MRI, B)) 4223 return false; 4224 4225 MI.eraseFromParent(); 4226 return true; 4227 } 4228 4229 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 4230 MachineRegisterInfo &MRI, 4231 MachineIRBuilder &B, 4232 unsigned AddrSpace) const { 4233 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 4234 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 4235 Register Hi32 = Unmerge.getReg(1); 4236 4237 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 4238 MI.eraseFromParent(); 4239 return true; 4240 } 4241 4242 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 4243 // offset (the offset that is included in bounds checking and swizzling, to be 4244 // split between the instruction's voffset and immoffset fields) and soffset 4245 // (the offset that is excluded from bounds checking and swizzling, to go in 4246 // the instruction's soffset field). This function takes the first kind of 4247 // offset and figures out how to split it between voffset and immoffset. 4248 std::pair<Register, unsigned> 4249 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 4250 Register OrigOffset) const { 4251 const unsigned MaxImm = 4095; 4252 Register BaseReg; 4253 unsigned ImmOffset; 4254 const LLT S32 = LLT::scalar(32); 4255 MachineRegisterInfo &MRI = *B.getMRI(); 4256 4257 std::tie(BaseReg, ImmOffset) = 4258 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 4259 4260 // If BaseReg is a pointer, convert it to int. 4261 if (MRI.getType(BaseReg).isPointer()) 4262 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 4263 4264 // If the immediate value is too big for the immoffset field, put the value 4265 // and -4096 into the immoffset field so that the value that is copied/added 4266 // for the voffset field is a multiple of 4096, and it stands more chance 4267 // of being CSEd with the copy/add for another similar load/store. 4268 // However, do not do that rounding down to a multiple of 4096 if that is a 4269 // negative number, as it appears to be illegal to have a negative offset 4270 // in the vgpr, even if adding the immediate offset makes it positive. 4271 unsigned Overflow = ImmOffset & ~MaxImm; 4272 ImmOffset -= Overflow; 4273 if ((int32_t)Overflow < 0) { 4274 Overflow += ImmOffset; 4275 ImmOffset = 0; 4276 } 4277 4278 if (Overflow != 0) { 4279 if (!BaseReg) { 4280 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 4281 } else { 4282 auto OverflowVal = B.buildConstant(S32, Overflow); 4283 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 4284 } 4285 } 4286 4287 if (!BaseReg) 4288 BaseReg = B.buildConstant(S32, 0).getReg(0); 4289 4290 return std::make_pair(BaseReg, ImmOffset); 4291 } 4292 4293 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic. 
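/// If the voffset, soffset and vindex inputs are all known constants (with
/// vindex required to be 0, since the stride is unknown), their sum plus
/// \p ImmOffset is folded into the MMO offset; otherwise the MMO's value info
/// is dropped.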
4294 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, 4295 Register VOffset, Register SOffset, 4296 unsigned ImmOffset, Register VIndex, 4297 MachineRegisterInfo &MRI) const { 4298 Optional<ValueAndVReg> MaybeVOffsetVal = 4299 getIConstantVRegValWithLookThrough(VOffset, MRI); 4300 Optional<ValueAndVReg> MaybeSOffsetVal = 4301 getIConstantVRegValWithLookThrough(SOffset, MRI); 4302 Optional<ValueAndVReg> MaybeVIndexVal = 4303 getIConstantVRegValWithLookThrough(VIndex, MRI); 4304 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant, 4305 // update the MMO with that offset. The stride is unknown so we can only do 4306 // this if VIndex is constant 0. 4307 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && 4308 MaybeVIndexVal->Value == 0) { 4309 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + 4310 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; 4311 MMO->setOffset(TotalOffset); 4312 } else { 4313 // We don't have a constant combined offset to use in the MMO. Give up. 4314 MMO->setValue((Value *)nullptr); 4315 } 4316 } 4317 4318 /// Handle register layout difference for f16 images for some subtargets. 4319 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 4320 MachineRegisterInfo &MRI, 4321 Register Reg, 4322 bool ImageStore) const { 4323 const LLT S16 = LLT::scalar(16); 4324 const LLT S32 = LLT::scalar(32); 4325 LLT StoreVT = MRI.getType(Reg); 4326 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 4327 4328 if (ST.hasUnpackedD16VMem()) { 4329 auto Unmerge = B.buildUnmerge(S16, Reg); 4330 4331 SmallVector<Register, 4> WideRegs; 4332 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 4333 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 4334 4335 int NumElts = StoreVT.getNumElements(); 4336 4337 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 4338 .getReg(0); 4339 } 4340 4341 if (ImageStore && ST.hasImageStoreD16Bug()) { 4342 if (StoreVT.getNumElements() == 2) { 4343 SmallVector<Register, 4> PackedRegs; 4344 Reg = B.buildBitcast(S32, Reg).getReg(0); 4345 PackedRegs.push_back(Reg); 4346 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 4347 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 4348 .getReg(0); 4349 } 4350 4351 if (StoreVT.getNumElements() == 3) { 4352 SmallVector<Register, 4> PackedRegs; 4353 auto Unmerge = B.buildUnmerge(S16, Reg); 4354 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 4355 PackedRegs.push_back(Unmerge.getReg(I)); 4356 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 4357 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 4358 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 4359 } 4360 4361 if (StoreVT.getNumElements() == 4) { 4362 SmallVector<Register, 4> PackedRegs; 4363 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 4364 auto Unmerge = B.buildUnmerge(S32, Reg); 4365 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 4366 PackedRegs.push_back(Unmerge.getReg(I)); 4367 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 4368 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 4369 .getReg(0); 4370 } 4371 4372 llvm_unreachable("invalid data type"); 4373 } 4374 4375 if (StoreVT == LLT::fixed_vector(3, S16)) { 4376 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) 4377 .getReg(0); 4378 } 4379 return Reg; 4380 } 4381 4382 Register AMDGPULegalizerInfo::fixStoreSourceType( 4383 
MachineIRBuilder &B, Register VData, bool IsFormat) const { 4384 MachineRegisterInfo *MRI = B.getMRI(); 4385 LLT Ty = MRI->getType(VData); 4386 4387 const LLT S16 = LLT::scalar(16); 4388 4389 // Fixup illegal register types for i8 stores. 4390 if (Ty == LLT::scalar(8) || Ty == S16) { 4391 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 4392 return AnyExt; 4393 } 4394 4395 if (Ty.isVector()) { 4396 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 4397 if (IsFormat) 4398 return handleD16VData(B, *MRI, VData); 4399 } 4400 } 4401 4402 return VData; 4403 } 4404 4405 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 4406 MachineRegisterInfo &MRI, 4407 MachineIRBuilder &B, 4408 bool IsTyped, 4409 bool IsFormat) const { 4410 Register VData = MI.getOperand(1).getReg(); 4411 LLT Ty = MRI.getType(VData); 4412 LLT EltTy = Ty.getScalarType(); 4413 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 4414 const LLT S32 = LLT::scalar(32); 4415 4416 VData = fixStoreSourceType(B, VData, IsFormat); 4417 Register RSrc = MI.getOperand(2).getReg(); 4418 4419 MachineMemOperand *MMO = *MI.memoperands_begin(); 4420 const int MemSize = MMO->getSize(); 4421 4422 unsigned ImmOffset; 4423 4424 // The typed intrinsics add an immediate after the registers. 4425 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 4426 4427 // The struct intrinsic variants add one additional operand over raw. 4428 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4429 Register VIndex; 4430 int OpOffset = 0; 4431 if (HasVIndex) { 4432 VIndex = MI.getOperand(3).getReg(); 4433 OpOffset = 1; 4434 } else { 4435 VIndex = B.buildConstant(S32, 0).getReg(0); 4436 } 4437 4438 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 4439 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 4440 4441 unsigned Format = 0; 4442 if (IsTyped) { 4443 Format = MI.getOperand(5 + OpOffset).getImm(); 4444 ++OpOffset; 4445 } 4446 4447 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 4448 4449 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4450 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 4451 4452 unsigned Opc; 4453 if (IsTyped) { 4454 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 4455 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 4456 } else if (IsFormat) { 4457 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 4458 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 4459 } else { 4460 switch (MemSize) { 4461 case 1: 4462 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 4463 break; 4464 case 2: 4465 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 4466 break; 4467 default: 4468 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 4469 break; 4470 } 4471 } 4472 4473 auto MIB = B.buildInstr(Opc) 4474 .addUse(VData) // vdata 4475 .addUse(RSrc) // rsrc 4476 .addUse(VIndex) // vindex 4477 .addUse(VOffset) // voffset 4478 .addUse(SOffset) // soffset 4479 .addImm(ImmOffset); // offset(imm) 4480 4481 if (IsTyped) 4482 MIB.addImm(Format); 4483 4484 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4485 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4486 .addMemOperand(MMO); 4487 4488 MI.eraseFromParent(); 4489 return true; 4490 } 4491 4492 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 4493 MachineRegisterInfo &MRI, 4494 MachineIRBuilder &B, 4495 bool IsFormat, 4496 bool IsTyped) const { 4497 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
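  // Operand layout reminder, derived from the checks below: the raw form is
  // {dst, intrinsic id, rsrc, voffset, soffset, aux}; the struct form inserts
  // vindex after rsrc, and the typed forms add a format immediate before aux.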
4498 MachineMemOperand *MMO = *MI.memoperands_begin(); 4499 const LLT MemTy = MMO->getMemoryType(); 4500 const LLT S32 = LLT::scalar(32); 4501 4502 Register Dst = MI.getOperand(0).getReg(); 4503 Register RSrc = MI.getOperand(2).getReg(); 4504 4505 // The typed intrinsics add an immediate after the registers. 4506 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 4507 4508 // The struct intrinsic variants add one additional operand over raw. 4509 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4510 Register VIndex; 4511 int OpOffset = 0; 4512 if (HasVIndex) { 4513 VIndex = MI.getOperand(3).getReg(); 4514 OpOffset = 1; 4515 } else { 4516 VIndex = B.buildConstant(S32, 0).getReg(0); 4517 } 4518 4519 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 4520 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 4521 4522 unsigned Format = 0; 4523 if (IsTyped) { 4524 Format = MI.getOperand(5 + OpOffset).getImm(); 4525 ++OpOffset; 4526 } 4527 4528 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 4529 unsigned ImmOffset; 4530 4531 LLT Ty = MRI.getType(Dst); 4532 LLT EltTy = Ty.getScalarType(); 4533 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 4534 const bool Unpacked = ST.hasUnpackedD16VMem(); 4535 4536 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4537 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 4538 4539 unsigned Opc; 4540 4541 if (IsTyped) { 4542 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 4543 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 4544 } else if (IsFormat) { 4545 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 4546 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 4547 } else { 4548 switch (MemTy.getSizeInBits()) { 4549 case 8: 4550 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 4551 break; 4552 case 16: 4553 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 4554 break; 4555 default: 4556 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 4557 break; 4558 } 4559 } 4560 4561 Register LoadDstReg; 4562 4563 bool IsExtLoad = 4564 (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector()); 4565 LLT UnpackedTy = Ty.changeElementSize(32); 4566 4567 if (IsExtLoad) 4568 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 4569 else if (Unpacked && IsD16 && Ty.isVector()) 4570 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 4571 else 4572 LoadDstReg = Dst; 4573 4574 auto MIB = B.buildInstr(Opc) 4575 .addDef(LoadDstReg) // vdata 4576 .addUse(RSrc) // rsrc 4577 .addUse(VIndex) // vindex 4578 .addUse(VOffset) // voffset 4579 .addUse(SOffset) // soffset 4580 .addImm(ImmOffset); // offset(imm) 4581 4582 if (IsTyped) 4583 MIB.addImm(Format); 4584 4585 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4586 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4587 .addMemOperand(MMO); 4588 4589 if (LoadDstReg != Dst) { 4590 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 4591 4592 // Widen result for extending loads was widened. 
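    // (The load was emitted into a wider register; truncate or repack it back
    // into the original result type here.)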
4593 if (IsExtLoad) 4594 B.buildTrunc(Dst, LoadDstReg); 4595 else { 4596 // Repack to original 16-bit vector result 4597 // FIXME: G_TRUNC should work, but legalization currently fails 4598 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 4599 SmallVector<Register, 4> Repack; 4600 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 4601 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 4602 B.buildMerge(Dst, Repack); 4603 } 4604 } 4605 4606 MI.eraseFromParent(); 4607 return true; 4608 } 4609 4610 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 4611 MachineIRBuilder &B, 4612 bool IsInc) const { 4613 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : 4614 AMDGPU::G_AMDGPU_ATOMIC_DEC; 4615 B.buildInstr(Opc) 4616 .addDef(MI.getOperand(0).getReg()) 4617 .addUse(MI.getOperand(2).getReg()) 4618 .addUse(MI.getOperand(3).getReg()) 4619 .cloneMemRefs(MI); 4620 MI.eraseFromParent(); 4621 return true; 4622 } 4623 4624 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 4625 switch (IntrID) { 4626 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4627 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4628 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 4629 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4630 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4631 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 4632 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4633 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4634 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 4635 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4636 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4637 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 4638 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4639 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4640 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4641 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4642 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4643 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4644 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4645 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4646 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4647 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4648 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4649 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4650 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4651 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4652 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4653 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4654 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4655 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4656 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4657 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4658 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4659 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4660 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4661 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4662 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4663 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4664 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4665 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4666 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4667 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4668 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4669 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4670 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4671 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4672 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4673 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4674 default: 
4675 llvm_unreachable("unhandled atomic opcode"); 4676 } 4677 } 4678 4679 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4680 MachineIRBuilder &B, 4681 Intrinsic::ID IID) const { 4682 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4683 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4684 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4685 4686 Register Dst; 4687 4688 int OpOffset = 0; 4689 if (HasReturn) { 4690 // A few FP atomics do not support return values. 4691 Dst = MI.getOperand(0).getReg(); 4692 } else { 4693 OpOffset = -1; 4694 } 4695 4696 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4697 Register CmpVal; 4698 4699 if (IsCmpSwap) { 4700 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4701 ++OpOffset; 4702 } 4703 4704 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4705 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4706 4707 // The struct intrinsic variants add one additional operand over raw. 4708 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4709 Register VIndex; 4710 if (HasVIndex) { 4711 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4712 ++OpOffset; 4713 } else { 4714 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4715 } 4716 4717 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4718 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4719 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4720 4721 MachineMemOperand *MMO = *MI.memoperands_begin(); 4722 4723 unsigned ImmOffset; 4724 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4725 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); 4726 4727 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4728 4729 if (HasReturn) 4730 MIB.addDef(Dst); 4731 4732 MIB.addUse(VData); // vdata 4733 4734 if (IsCmpSwap) 4735 MIB.addReg(CmpVal); 4736 4737 MIB.addUse(RSrc) // rsrc 4738 .addUse(VIndex) // vindex 4739 .addUse(VOffset) // voffset 4740 .addUse(SOffset) // soffset 4741 .addImm(ImmOffset) // offset(imm) 4742 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4743 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4744 .addMemOperand(MMO); 4745 4746 MI.eraseFromParent(); 4747 return true; 4748 } 4749 4750 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4751 /// vector with s16 typed elements. 4752 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4753 SmallVectorImpl<Register> &PackedAddrs, 4754 unsigned ArgOffset, 4755 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4756 bool IsA16, bool IsG16) { 4757 const LLT S16 = LLT::scalar(16); 4758 const LLT V2S16 = LLT::fixed_vector(2, 16); 4759 auto EndIdx = Intr->VAddrEnd; 4760 4761 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4762 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4763 if (!SrcOp.isReg()) 4764 continue; // _L to _LZ may have eliminated this. 4765 4766 Register AddrReg = SrcOp.getReg(); 4767 4768 if ((I < Intr->GradientStart) || 4769 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4770 (I >= Intr->CoordStart && !IsA16)) { 4771 if ((I < Intr->GradientStart) && IsA16 && 4772 (B.getMRI()->getType(AddrReg) == S16)) { 4773 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument"); 4774 // Special handling of bias when A16 is on. Bias is of type half but 4775 // occupies full 32-bit. 
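        // Pad the 16-bit bias with an undef high half so it still fills a
        // dword.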
4776 PackedAddrs.push_back( 4777 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4778 .getReg(0)); 4779 } else { 4780 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) && 4781 "Bias needs to be converted to 16 bit in A16 mode"); 4782 // Handle any gradient or coordinate operands that should not be packed 4783 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4784 PackedAddrs.push_back(AddrReg); 4785 } 4786 } else { 4787 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4788 // derivatives dx/dh and dx/dv are packed with undef. 4789 if (((I + 1) >= EndIdx) || 4790 ((Intr->NumGradients / 2) % 2 == 1 && 4791 (I == static_cast<unsigned>(Intr->GradientStart + 4792 (Intr->NumGradients / 2) - 1) || 4793 I == static_cast<unsigned>(Intr->GradientStart + 4794 Intr->NumGradients - 1))) || 4795 // Check for _L to _LZ optimization 4796 !MI.getOperand(ArgOffset + I + 1).isReg()) { 4797 PackedAddrs.push_back( 4798 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4799 .getReg(0)); 4800 } else { 4801 PackedAddrs.push_back( 4802 B.buildBuildVector( 4803 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) 4804 .getReg(0)); 4805 ++I; 4806 } 4807 } 4808 } 4809 } 4810 4811 /// Convert from separate vaddr components to a single vector address register, 4812 /// and replace the remaining operands with $noreg. 4813 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 4814 int DimIdx, int NumVAddrs) { 4815 const LLT S32 = LLT::scalar(32); 4816 4817 SmallVector<Register, 8> AddrRegs; 4818 for (int I = 0; I != NumVAddrs; ++I) { 4819 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4820 if (SrcOp.isReg()) { 4821 AddrRegs.push_back(SrcOp.getReg()); 4822 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 4823 } 4824 } 4825 4826 int NumAddrRegs = AddrRegs.size(); 4827 if (NumAddrRegs != 1) { 4828 // Above 8 elements round up to next power of 2 (i.e. 16). 4829 if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) { 4830 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 4831 auto Undef = B.buildUndef(S32); 4832 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 4833 NumAddrRegs = RoundedNumRegs; 4834 } 4835 4836 auto VAddr = 4837 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs); 4838 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 4839 } 4840 4841 for (int I = 1; I != NumVAddrs; ++I) { 4842 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 4843 if (SrcOp.isReg()) 4844 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 4845 } 4846 } 4847 4848 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 4849 /// 4850 /// Depending on the subtarget, load/store with 16-bit element data need to be 4851 /// rewritten to use the low half of 32-bit registers, or directly use a packed 4852 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 4853 /// registers. 4854 /// 4855 /// We don't want to directly select image instructions just yet, but also want 4856 /// to exposes all register repacking to the legalizer/combiners. We also don't 4857 /// want a selected instruction entering RegBankSelect. In order to avoid 4858 /// defining a multitude of intermediate image instructions, directly hack on 4859 /// the intrinsic's arguments. In cases like a16 addresses, this requires 4860 /// padding now unnecessary arguments with $noreg. 
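/// As a rough example (for illustration only): with A16 enabled, a 2D sample
/// whose s16 coordinates arrive as two separate vaddr operands is rewritten so
/// that the first operand holds a <2 x s16> build_vector of both coordinates
/// and the second operand becomes $noreg.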
4861 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 4862 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 4863 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 4864 4865 const unsigned NumDefs = MI.getNumExplicitDefs(); 4866 const unsigned ArgOffset = NumDefs + 1; 4867 bool IsTFE = NumDefs == 2; 4868 // We are only processing the operands of d16 image operations on subtargets 4869 // that use the unpacked register layout, or need to repack the TFE result. 4870 4871 // TODO: Do we need to guard against already legalized intrinsics? 4872 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4873 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4874 4875 MachineRegisterInfo *MRI = B.getMRI(); 4876 const LLT S32 = LLT::scalar(32); 4877 const LLT S16 = LLT::scalar(16); 4878 const LLT V2S16 = LLT::fixed_vector(2, 16); 4879 4880 unsigned DMask = 0; 4881 Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg(); 4882 LLT Ty = MRI->getType(VData); 4883 4884 // Check for 16 bit addresses and pack if true. 4885 LLT GradTy = 4886 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4887 LLT AddrTy = 4888 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4889 const bool IsG16 = GradTy == S16; 4890 const bool IsA16 = AddrTy == S16; 4891 const bool IsD16 = Ty.getScalarType() == S16; 4892 4893 int DMaskLanes = 0; 4894 if (!BaseOpcode->Atomic) { 4895 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4896 if (BaseOpcode->Gather4) { 4897 DMaskLanes = 4; 4898 } else if (DMask != 0) { 4899 DMaskLanes = countPopulation(DMask); 4900 } else if (!IsTFE && !BaseOpcode->Store) { 4901 // If dmask is 0, this is a no-op load. This can be eliminated. 4902 B.buildUndef(MI.getOperand(0)); 4903 MI.eraseFromParent(); 4904 return true; 4905 } 4906 } 4907 4908 Observer.changingInstr(MI); 4909 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4910 4911 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16 4912 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE; 4913 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 4914 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4915 unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode; 4916 4917 // Track that we legalized this 4918 MI.setDesc(B.getTII().get(NewOpcode)); 4919 4920 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4921 // dmask to be at least 1 otherwise the instruction will fail 4922 if (IsTFE && DMask == 0) { 4923 DMask = 0x1; 4924 DMaskLanes = 1; 4925 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4926 } 4927 4928 if (BaseOpcode->Atomic) { 4929 Register VData0 = MI.getOperand(2).getReg(); 4930 LLT Ty = MRI->getType(VData0); 4931 4932 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4933 if (Ty.isVector()) 4934 return false; 4935 4936 if (BaseOpcode->AtomicX2) { 4937 Register VData1 = MI.getOperand(3).getReg(); 4938 // The two values are packed in one register. 4939 LLT PackedTy = LLT::fixed_vector(2, Ty); 4940 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4941 MI.getOperand(2).setReg(Concat.getReg(0)); 4942 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4943 } 4944 } 4945 4946 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4947 4948 // Rewrite the addressing register layout before doing anything else. 
  if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
    // 16 bit gradients are supported, but are tied to the A16 control
    // so both gradients and addresses must be 16 bit
    return false;
  }

  if (IsA16 && !ST.hasA16()) {
    // A16 not supported
    return false;
  }

  if (IsA16 || IsG16) {
    if (Intr->NumVAddrs > 1) {
      SmallVector<Register, 4> PackedRegs;

      packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
                                IsG16);

      // See also below in the non-a16 branch
      const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
                          PackedRegs.size() <= ST.getNSAMaxSize();

      if (!UseNSA && PackedRegs.size() > 1) {
        LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
        auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
        PackedRegs[0] = Concat.getReg(0);
        PackedRegs.resize(1);
      }

      const unsigned NumPacked = PackedRegs.size();
      for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
        MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
        if (!SrcOp.isReg()) {
          assert(SrcOp.isImm() && SrcOp.getImm() == 0);
          continue;
        }

        assert(SrcOp.getReg() != AMDGPU::NoRegister);

        if (I - Intr->VAddrStart < NumPacked)
          SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
        else
          SrcOp.setReg(AMDGPU::NoRegister);
      }
    }
  } else {
    // If the register allocator cannot place the address registers contiguously
    // without introducing moves, then using the non-sequential address encoding
    // is always preferable, since it saves VALU instructions and is usually a
    // wash in terms of code size or even better.
    //
    // However, we currently have no way of hinting to the register allocator
    // that MIMG addresses should be placed contiguously when it is possible to
    // do so, so force non-NSA for the common 2-address case as a heuristic.
    //
    // SIShrinkInstructions will convert NSA encodings to non-NSA after register
    // allocation when possible.
    //
    // TODO: we can actually allow partial NSA where the final register is a
    // contiguous set of the remaining addresses.
    // This could help where there are more addresses than supported.
    const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
                        CorrectedNumVAddrs <= ST.getNSAMaxSize();

    if (!UseNSA && Intr->NumVAddrs > 1)
      convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
                               Intr->NumVAddrs);
  }

  int Flags = 0;
  if (IsA16)
    Flags |= 1;
  if (IsG16)
    Flags |= 2;
  MI.addOperand(MachineOperand::CreateImm(Flags));

  if (BaseOpcode->Store) { // No TFE for stores?
    // TODO: Handle dmask trim
    if (!Ty.isVector() || !IsD16)
      return true;

    Register RepackedReg = handleD16VData(B, *MRI, VData, true);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  const LLT EltTy = Ty.getScalarType();
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy =
      Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy =
        LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
    TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(
        ElementCount::getFixed(RoundedSize / EltSize), EltSize);
    TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.removeOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(NumDataRegs);
  }
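
  // At this point ResultRegs holds the loaded data components (with the TFE
  // status dword, if any, already written straight into Dst1Reg); the code
  // below rebuilds the original result type from those pieces.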

  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

  // Avoid a build/concat_vector of 1 entry.
  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);
    return true;
  }

  assert(Ty.isVector());

  if (IsD16) {
    // For packed D16 results with TFE enabled, all the data components are
    // S32. Cast back to the expected type.
    //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildBitcast(V2S16, Reg).getReg(0);
    } else if (ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildTrunc(S16, Reg).getReg(0);
    }
  }

  auto padWithUndef = [&](LLT Ty, int NumElts) {
    if (NumElts == 0)
      return;
    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)
      ResultRegs.push_back(Undef);
  };

  // Pad out any elements eliminated due to the dmask.
  LLT ResTy = MRI->getType(ResultRegs[0]);
  if (!ResTy.isVector()) {
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);
    return true;
  }

  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

  // Deal with the one annoying legal case.
  const LLT V3S16 = LLT::fixed_vector(3, 16);
  if (Ty == V3S16) {
    if (IsTFE) {
      if (ResultRegs.size() == 1) {
        NewResultReg = ResultRegs[0];
      } else if (ResultRegs.size() == 2) {
        LLT V4S16 = LLT::fixed_vector(4, 16);
        NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
      } else {
        return false;
      }
    }

    if (MRI->getType(DstReg).getNumElements() <
        MRI->getType(NewResultReg).getNumElements()) {
      B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
    } else {
      B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
    }
    return true;
  }

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
  return true;
}

bool AMDGPULegalizerInfo::legalizeSBufferLoad(
    LegalizerHelper &Helper, MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  GISelChangeObserver &Observer = Helper.Observer;

  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();

  Observer.changingInstr(MI);

  if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
    Ty = getBitcastRegisterType(Ty);
    Helper.bitcastDst(MI, Ty, 0);
    Dst = MI.getOperand(0).getReg();
    B.setInsertPt(B.getMBB(), MI);
  }

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
  MI.removeOperand(1); // Remove intrinsic ID

  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
  // TODO: Should this use datalayout alignment?
  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign(4);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);

  // There are no 96-bit result scalar loads, but widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
  if (!isPowerOf2_32(Size)) {
    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}

// TODO: Move to selection
bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  if (!ST.isTrapHandlerEnabled() ||
      ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
    return legalizeTrapEndpgm(MI, MRI, B);

  if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
    switch (*HsaAbiVer) {
    case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
    case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
      return legalizeTrapHsaQueuePtr(MI, MRI, B);
    case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
    case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
      return ST.supportsGetDoorbellID() ?
          legalizeTrapHsa(MI, MRI, B) :
          legalizeTrapHsaQueuePtr(MI, MRI, B);
    }
  }

  llvm_unreachable("Unknown trap handler");
}

bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const LLT S64 = LLT::scalar(64);

  Register SGPR01(AMDGPU::SGPR0_SGPR1);
  // For code object version 5, queue_ptr is passed through implicit kernarg.
  if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
    AMDGPUTargetLowering::ImplicitParameter Param =
        AMDGPUTargetLowering::QUEUE_PTR;
    uint64_t Offset =
        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);

    Register KernargPtrReg = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

    if (!loadInputValue(KernargPtrReg, B,
                        AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
      return false;
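
    // The queue pointer lives in the implicit kernel arguments at Offset;
    // load it from there and pass it to the trap handler in SGPR0_SGPR1.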
    // TODO: can we be smarter about machine pointer info?
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    MachineMemOperand *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
            MachineMemOperand::MOInvariant,
        LLT::scalar(64), commonAlignment(Align(64), Offset));

    // Compute the address of the queue_ptr implicit kernarg.
    Register LoadAddr = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
    B.buildPtrAdd(LoadAddr, KernargPtrReg,
                  B.buildConstant(LLT::scalar(64), Offset).getReg(0));
    // Load the queue pointer value itself.
    Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
    B.buildCopy(SGPR01, Temp);
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
        .addReg(SGPR01, RegState::Implicit);
    MI.eraseFromParent();
    return true;
  }

  // Pass queue pointer to trap handler as input, and insert trap instruction
  // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
  Register LiveIn = MRI.createGenericVirtualRegister(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
  if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return false;

  B.buildCopy(SGPR01, LiveIn);
  B.buildInstr(AMDGPU::S_TRAP)
      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
      .addReg(SGPR01, RegState::Implicit);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeTrapHsa(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  B.buildInstr(AMDGPU::S_TRAP)
      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // If this is a non-HSA path or the trap handler is disabled, report a
  // warning accordingly.
  if (!ST.isTrapHandlerEnabled() ||
      ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert debug-trap instruction
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V3S32 = LLT::fixed_vector(3, 32);

  Register DstReg = MI.getOperand(0).getReg();
  Register NodePtr = MI.getOperand(2).getReg();
  Register RayExtent = MI.getOperand(3).getReg();
  Register RayOrigin = MI.getOperand(4).getReg();
  Register RayDir = MI.getOperand(5).getReg();
  Register RayInvDir = MI.getOperand(6).getReg();
  Register TDescr = MI.getOperand(7).getReg();

  if (!ST.hasGFX10_AEncoding()) {
    DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
                                        "intrinsic not supported on subtarget",
                                        MI.getDebugLoc());
    B.getMF().getFunction().getContext().diagnose(BadIntrin);
    return false;
  }

  const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
  const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
  const unsigned NumVDataDwords = 4;
  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
  const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
  const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize();
  const unsigned BaseOpcodes[2][2] = {
      {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
      {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
       AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
  int Opcode;
  if (UseNSA) {
    Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
                                   IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA
                                               : AMDGPU::MIMGEncGfx10NSA,
                                   NumVDataDwords, NumVAddrDwords);
  } else {
    Opcode = AMDGPU::getMIMGOpcode(
        BaseOpcodes[Is64][IsA16],
        IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default,
        NumVDataDwords, PowerOf2Ceil(NumVAddrDwords));
  }
  assert(Opcode != -1);
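
  // Build the flat list of vaddr source operands. On GFX11+ with NSA the ray
  // origin/direction components are packed into <3 x s32> registers below;
  // otherwise each 32-bit (or merged 16-bit pair) component is passed
  // separately, and in the non-NSA case everything is merged into one vector
  // register at the end.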
  SmallVector<Register, 12> Ops;
  if (UseNSA && IsGFX11Plus) {
    auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      auto Merged = B.buildMerge(
          V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
      Ops.push_back(Merged.getReg(0));
    };

    Ops.push_back(NodePtr);
    Ops.push_back(RayExtent);
    packLanes(RayOrigin);

    if (IsA16) {
      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
      auto MergedDir = B.buildMerge(
          V3S32,
          {B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(0),
                                                    UnmergeRayDir.getReg(0)}))
               .getReg(0),
           B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(1),
                                                    UnmergeRayDir.getReg(1)}))
               .getReg(0),
           B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(2),
                                                    UnmergeRayDir.getReg(2)}))
               .getReg(0)});
      Ops.push_back(MergedDir.getReg(0));
    } else {
      packLanes(RayDir);
      packLanes(RayInvDir);
    }
  } else {
    if (Is64) {
      auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
      Ops.push_back(Unmerge.getReg(0));
      Ops.push_back(Unmerge.getReg(1));
    } else {
      Ops.push_back(NodePtr);
    }
    Ops.push_back(RayExtent);

    auto packLanes = [&Ops, &S32, &B](Register Src) {
      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      Ops.push_back(Unmerge.getReg(0));
      Ops.push_back(Unmerge.getReg(1));
      Ops.push_back(Unmerge.getReg(2));
    };

    packLanes(RayOrigin);
    if (IsA16) {
      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
      Register R1 = MRI.createGenericVirtualRegister(S32);
      Register R2 = MRI.createGenericVirtualRegister(S32);
      Register R3 = MRI.createGenericVirtualRegister(S32);
      B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
      B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
      B.buildMerge(R3,
                   {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
      Ops.push_back(R1);
      Ops.push_back(R2);
      Ops.push_back(R3);
    } else {
      packLanes(RayDir);
      packLanes(RayInvDir);
    }
  }

  if (!UseNSA) {
    // Build a single vector containing all the operands so far prepared.
    LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
    Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
    Ops.clear();
    Ops.push_back(MergedOps);
  }

  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
                 .addDef(DstReg)
                 .addImm(Opcode);

  for (Register R : Ops) {
    MIB.addUse(R);
  }

  MIB.addUse(TDescr)
      .addImm(IsA16 ? 1 : 0)
      .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  unsigned Opc;
  int RoundMode = MI.getOperand(2).getImm();

  if (RoundMode == (int)RoundingMode::TowardPositive)
    Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
  else if (RoundMode == (int)RoundingMode::TowardNegative)
    Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
  else
    return false;

  B.buildInstr(Opc)
      .addDef(MI.getOperand(0).getReg())
      .addUse(MI.getOperand(1).getReg());

  MI.eraseFromParent();

  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use of G_BRCOND with the exec-manipulating branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
          .addUse(Reg)
          .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_lds_kernel_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
        MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::r600_read_ngroups_x:
    // TODO: Emit error for hsa
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_X);
  case Intrinsic::r600_read_ngroups_y:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Y);
  case Intrinsic::r600_read_ngroups_z:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Z);
  case Intrinsic::r600_read_local_size_x:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
  case Intrinsic::r600_read_local_size_y:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
  case Intrinsic::r600_read_local_size_z:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
  case Intrinsic::r600_read_global_size_x:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
  case Intrinsic::r600_read_global_size_y:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
  case Intrinsic::r600_read_global_size_z:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
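  // All of the plain buffer atomics below share a single legalization path;
  // only the fp-add variants further down need the extra diagnostic for
  // unsupported return values.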
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
    Register DstReg = MI.getOperand(0).getReg();
    if (!MRI.use_empty(DstReg) &&
        !AMDGPU::hasAtomicFaddRtnForTy(ST, MRI.getType(DstReg))) {
      Function &F = B.getMF().getFunction();
      DiagnosticInfoUnsupported NoFpRet(
          F, "return versions of fp atomics not supported", B.getDebugLoc(),
          DS_Error);
      F.getContext().diagnose(NoFpRet);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    return legalizeBufferAtomic(MI, B, IntrID);
  }
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}