//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
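/// Vectors such as <3 x s32> are left alone, since their total size is already
/// a multiple of 32 bits.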
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  LLT CoercedTy;
  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(Size / 32, 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
                             unsigned Opcode) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const unsigned MemSizeInBits) {
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;
  return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case when the memory access
/// itself changes, not the size of the result register.
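/// (In practice the memory size is rounded up to the next power of two, per
/// shouldWidenLoad below, while the result register keeps its original type.)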
static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
                            unsigned AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  :  ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::vector(2, 8);
  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32); // FIXME: min should be 16

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
                   .legalFor({S32})
                   .maxScalarOrElt(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
      .legalIf(isRegisterType(0))
      // s1 and s16 are special cases because they have legal operations on
      // them, but don't really occupy registers in the normal way.
      .legalFor({S1, S16})
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampScalarOrElt(0, S32, MaxScalar)
      .widenScalarToNextPow2(0, 32)
      .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
      // Use actual fsub instruction
      .legalFor({S32})
      // Must use fadd + fneg
      .lowerFor({S64, S16, V2S16})
      .scalarize(0)
      .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
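  // (The custom lowering is expected to keep G_FMAD only when denormals are
  // flushed for the type; otherwise it is expanded, roughly to fmul + fadd.)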
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
      // The compare output type differs based on the register bank of the
      // output, so make both s1 and s32 legal.
      //
      // Scalar compares producing output in scc will be promoted to s32, as
      // that is the allocatable register type that will be needed for the copy
      // from scc. This will be promoted during RegBankSelect, and we assume
      // something before that won't try to use s32 result types.
      //
      // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
      // bank.
      .legalForCartesianProduct(
        {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
      .legalForCartesianProduct(
        {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned AlignBits = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, AlignBits);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (AlignBits < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                      Align(AlignBits / 8));
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query, Op);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also cover
    // inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].SizeInBits);
      }, bitcastToRegisterType(0));

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
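      // For example, a 96-bit global load with 128-bit alignment may be widened
      // to 128 bits on subtargets without dwordx3 loads (see shouldWidenLoad).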
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(0, LLT::scalar(FloorSize));
              }

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              if (Query.MMODescrs[0].SizeInBits > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                    0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
                }

                unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(0,
                                      LLT::vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              unsigned MemSize = Query.MMODescrs[0].SizeInBits;
              if (DstTy.getSizeInBits() > MemSize)
                return std::make_pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(
                  0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
              }

              // Need to split because of alignment.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
        .lowerIfMemSizeNotPow2()
        .minScalar(0, S32);

    if (IsStore)
      Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));

    Actions
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
        .lower();
  }

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
                                                  {S32, GlobalPtr, 16, 2 * 8},
                                                  {S32, LocalPtr, 8, 8},
                                                  {S32, LocalPtr, 16, 16},
                                                  {S32, PrivatePtr, 8, 8},
                                                  {S32, PrivatePtr, 16, 16},
                                                  {S32, ConstantPtr, 8, 8},
                                                  {S32, ConstantPtr, 16, 2 * 8}});
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
  }

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .unsupportedIfMemSizeNotPow2()
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  if (ST.hasLDSFPAtomics()) {
    getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
      .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
    .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
          GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
          LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
    .clampScalar(0, S16, S64)
    .scalarize(1)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(numElementsNotEven(0), scalarize(0))
    .clampMaxNumElements(0, S32, 2)
    .clampMaxNumElements(0, LocalPtr, 2)
    .clampMaxNumElements(0, PrivatePtr, 2)
    .scalarize(0)
    .widenScalarToNextPow2(0)
    .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
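  // (4, 5 and 6 bits for 16-bit, 32-bit and 64-bit shifts respectively.)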
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                  VecTy.getSizeInBits() % 32 == 0 &&
                  VecTy.getSizeInBits() <= MaxRegisterSize &&
                  IdxTy.getSizeInBits() == 32;
        })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
            VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
      .customFor({V2S16, V2S16})
      .lower();
  } else
    getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return !isPowerOf2_32(Ty.getSizeInBits()) &&
          Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit shifts.
    // This avoids a lot of intermediate truncate and extend operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM,
      G_FSHL
    }).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
return legalizeFFloor(MI, MRI, B); 1694 case TargetOpcode::G_BUILD_VECTOR: 1695 return legalizeBuildVector(MI, MRI, B); 1696 default: 1697 return false; 1698 } 1699 1700 llvm_unreachable("expected switch to return"); 1701 } 1702 1703 Register AMDGPULegalizerInfo::getSegmentAperture( 1704 unsigned AS, 1705 MachineRegisterInfo &MRI, 1706 MachineIRBuilder &B) const { 1707 MachineFunction &MF = B.getMF(); 1708 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1709 const LLT S32 = LLT::scalar(32); 1710 1711 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1712 1713 if (ST.hasApertureRegs()) { 1714 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1715 // getreg. 1716 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1717 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1718 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1719 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1720 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1721 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1722 unsigned Encoding = 1723 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1724 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1725 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1726 1727 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1728 1729 B.buildInstr(AMDGPU::S_GETREG_B32) 1730 .addDef(GetReg) 1731 .addImm(Encoding); 1732 MRI.setType(GetReg, S32); 1733 1734 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1735 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1736 } 1737 1738 Register QueuePtr = MRI.createGenericVirtualRegister( 1739 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1740 1741 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1742 return Register(); 1743 1744 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1745 // private_segment_aperture_base_hi. 1746 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 1747 1748 // TODO: can we be smarter about machine pointer info? 1749 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1750 MachineMemOperand *MMO = MF.getMachineMemOperand( 1751 PtrInfo, 1752 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1753 MachineMemOperand::MOInvariant, 1754 4, commonAlignment(Align(64), StructOffset)); 1755 1756 Register LoadAddr; 1757 1758 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1759 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1760 } 1761 1762 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1763 MachineInstr &MI, MachineRegisterInfo &MRI, 1764 MachineIRBuilder &B) const { 1765 MachineFunction &MF = B.getMF(); 1766 1767 const LLT S32 = LLT::scalar(32); 1768 Register Dst = MI.getOperand(0).getReg(); 1769 Register Src = MI.getOperand(1).getReg(); 1770 1771 LLT DstTy = MRI.getType(Dst); 1772 LLT SrcTy = MRI.getType(Src); 1773 unsigned DestAS = DstTy.getAddressSpace(); 1774 unsigned SrcAS = SrcTy.getAddressSpace(); 1775 1776 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1777 // vector element. 1778 assert(!DstTy.isVector()); 1779 1780 const AMDGPUTargetMachine &TM 1781 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1782 1783 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1784 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1785 return true; 1786 } 1787 1788 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1789 // Truncate. 
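// The 32-bit constant address space aliases the low half of the 64-bit
// constant address space (the reverse cast below re-attaches the high bits),
// so this direction reduces to keeping the low 32 bits of the source pointer.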
1790 B.buildExtract(Dst, Src, 0); 1791 MI.eraseFromParent(); 1792 return true; 1793 } 1794 1795 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1796 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1797 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1798 1799 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1800 // another. Merge operands are required to be the same type, but creating an 1801 // extra ptrtoint would be kind of pointless. 1802 auto HighAddr = B.buildConstant( 1803 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1804 B.buildMerge(Dst, {Src, HighAddr}); 1805 MI.eraseFromParent(); 1806 return true; 1807 } 1808 1809 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1810 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1811 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1812 unsigned NullVal = TM.getNullPointerValue(DestAS); 1813 1814 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1815 auto FlatNull = B.buildConstant(SrcTy, 0); 1816 1817 // Extract low 32-bits of the pointer. 1818 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1819 1820 auto CmpRes = 1821 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1822 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1823 1824 MI.eraseFromParent(); 1825 return true; 1826 } 1827 1828 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1829 return false; 1830 1831 if (!ST.hasFlatAddressSpace()) 1832 return false; 1833 1834 auto SegmentNull = 1835 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1836 auto FlatNull = 1837 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1838 1839 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1840 if (!ApertureReg.isValid()) 1841 return false; 1842 1843 auto CmpRes = 1844 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1845 1846 // Coerce the type of the low half of the result so we can use merge_values. 1847 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1848 1849 // TODO: Should we allow mismatched types but matching sizes in merges to 1850 // avoid the ptrtoint? 1851 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1852 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1853 1854 MI.eraseFromParent(); 1855 return true; 1856 } 1857 1858 bool AMDGPULegalizerInfo::legalizeFrint( 1859 MachineInstr &MI, MachineRegisterInfo &MRI, 1860 MachineIRBuilder &B) const { 1861 Register Src = MI.getOperand(1).getReg(); 1862 LLT Ty = MRI.getType(Src); 1863 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1864 1865 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1866 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1867 1868 auto C1 = B.buildFConstant(Ty, C1Val); 1869 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1870 1871 // TODO: Should this propagate fast-math-flags? 
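// Round-to-nearest via the usual 2^52 trick: for |src| < 2^52, adding
// copysign(2^52, src) pushes the fractional bits out of the significand
// (rounding in the current mode), and the subtract recovers the rounded
// value. E.g. with src = 2.3 (illustrative), 2.3 + 2^52 rounds to 2^52 + 2.0
// since the spacing there is 1.0, and subtracting 2^52 leaves 2.0. Inputs
// with |src| >= 2^52 are already integral and are kept by the final select.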
1872 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1873 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1874 1875 auto C2 = B.buildFConstant(Ty, C2Val); 1876 auto Fabs = B.buildFAbs(Ty, Src); 1877 1878 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1879 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1880 MI.eraseFromParent(); 1881 return true; 1882 } 1883 1884 bool AMDGPULegalizerInfo::legalizeFceil( 1885 MachineInstr &MI, MachineRegisterInfo &MRI, 1886 MachineIRBuilder &B) const { 1887 1888 const LLT S1 = LLT::scalar(1); 1889 const LLT S64 = LLT::scalar(64); 1890 1891 Register Src = MI.getOperand(1).getReg(); 1892 assert(MRI.getType(Src) == S64); 1893 1894 // result = trunc(src) 1895 // if (src > 0.0 && src != result) 1896 // result += 1.0 1897 1898 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1899 1900 const auto Zero = B.buildFConstant(S64, 0.0); 1901 const auto One = B.buildFConstant(S64, 1.0); 1902 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1903 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1904 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1905 auto Add = B.buildSelect(S64, And, One, Zero); 1906 1907 // TODO: Should this propagate fast-math-flags? 1908 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 1909 return true; 1910 } 1911 1912 bool AMDGPULegalizerInfo::legalizeFrem( 1913 MachineInstr &MI, MachineRegisterInfo &MRI, 1914 MachineIRBuilder &B) const { 1915 Register DstReg = MI.getOperand(0).getReg(); 1916 Register Src0Reg = MI.getOperand(1).getReg(); 1917 Register Src1Reg = MI.getOperand(2).getReg(); 1918 auto Flags = MI.getFlags(); 1919 LLT Ty = MRI.getType(DstReg); 1920 1921 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 1922 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 1923 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 1924 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 1925 MI.eraseFromParent(); 1926 return true; 1927 } 1928 1929 static MachineInstrBuilder extractF64Exponent(Register Hi, 1930 MachineIRBuilder &B) { 1931 const unsigned FractBits = 52; 1932 const unsigned ExpBits = 11; 1933 LLT S32 = LLT::scalar(32); 1934 1935 auto Const0 = B.buildConstant(S32, FractBits - 32); 1936 auto Const1 = B.buildConstant(S32, ExpBits); 1937 1938 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1939 .addUse(Hi) 1940 .addUse(Const0.getReg(0)) 1941 .addUse(Const1.getReg(0)); 1942 1943 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 1944 } 1945 1946 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 1947 MachineInstr &MI, MachineRegisterInfo &MRI, 1948 MachineIRBuilder &B) const { 1949 const LLT S1 = LLT::scalar(1); 1950 const LLT S32 = LLT::scalar(32); 1951 const LLT S64 = LLT::scalar(64); 1952 1953 Register Src = MI.getOperand(1).getReg(); 1954 assert(MRI.getType(Src) == S64); 1955 1956 // TODO: Should this use extract since the low half is unused? 1957 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 1958 Register Hi = Unmerge.getReg(1); 1959 1960 // Extract the upper half, since this is where we will find the sign and 1961 // exponent. 1962 auto Exp = extractF64Exponent(Hi, B); 1963 1964 const unsigned FractBits = 52; 1965 1966 // Extract the sign bit. 1967 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 1968 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 1969 1970 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 1971 1972 const auto Zero32 = B.buildConstant(S32, 0); 1973 1974 // Extend back to 64-bits. 
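// i.e. a 64-bit value whose only (possibly) set bit is the original sign
// bit; it becomes the result below when the exponent is negative, since
// |src| < 1.0 truncates to +/-0.0.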
1975 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 1976 1977 auto Shr = B.buildAShr(S64, FractMask, Exp); 1978 auto Not = B.buildNot(S64, Shr); 1979 auto Tmp0 = B.buildAnd(S64, Src, Not); 1980 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 1981 1982 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 1983 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 1984 1985 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 1986 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 1987 MI.eraseFromParent(); 1988 return true; 1989 } 1990 1991 bool AMDGPULegalizerInfo::legalizeITOFP( 1992 MachineInstr &MI, MachineRegisterInfo &MRI, 1993 MachineIRBuilder &B, bool Signed) const { 1994 1995 Register Dst = MI.getOperand(0).getReg(); 1996 Register Src = MI.getOperand(1).getReg(); 1997 1998 const LLT S64 = LLT::scalar(64); 1999 const LLT S32 = LLT::scalar(32); 2000 2001 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2002 2003 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2004 2005 auto CvtHi = Signed ? 2006 B.buildSITOFP(S64, Unmerge.getReg(1)) : 2007 B.buildUITOFP(S64, Unmerge.getReg(1)); 2008 2009 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2010 2011 auto ThirtyTwo = B.buildConstant(S32, 32); 2012 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2013 .addUse(CvtHi.getReg(0)) 2014 .addUse(ThirtyTwo.getReg(0)); 2015 2016 // TODO: Should this propagate fast-math-flags? 2017 B.buildFAdd(Dst, LdExp, CvtLo); 2018 MI.eraseFromParent(); 2019 return true; 2020 } 2021 2022 // TODO: Copied from DAG implementation. Verify logic and document how this 2023 // actually works. 2024 bool AMDGPULegalizerInfo::legalizeFPTOI( 2025 MachineInstr &MI, MachineRegisterInfo &MRI, 2026 MachineIRBuilder &B, bool Signed) const { 2027 2028 Register Dst = MI.getOperand(0).getReg(); 2029 Register Src = MI.getOperand(1).getReg(); 2030 2031 const LLT S64 = LLT::scalar(64); 2032 const LLT S32 = LLT::scalar(32); 2033 2034 assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 2035 2036 unsigned Flags = MI.getFlags(); 2037 2038 auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 2039 auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 2040 auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 2041 2042 auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 2043 auto FloorMul = B.buildFFloor(S64, Mul, Flags); 2044 auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 2045 2046 auto Hi = Signed ? 
2047 B.buildFPTOSI(S32, FloorMul) : 2048 B.buildFPTOUI(S32, FloorMul); 2049 auto Lo = B.buildFPTOUI(S32, Fma); 2050 2051 B.buildMerge(Dst, { Lo, Hi }); 2052 MI.eraseFromParent(); 2053 2054 return true; 2055 } 2056 2057 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2058 MachineInstr &MI) const { 2059 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2060 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2061 2062 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2063 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2064 2065 // With ieee_mode disabled, the instructions have the correct behavior 2066 // already for G_FMINNUM/G_FMAXNUM 2067 if (!MFI->getMode().IEEE) 2068 return !IsIEEEOp; 2069 2070 if (IsIEEEOp) 2071 return true; 2072 2073 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2074 } 2075 2076 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2077 MachineInstr &MI, MachineRegisterInfo &MRI, 2078 MachineIRBuilder &B) const { 2079 // TODO: Should move some of this into LegalizerHelper. 2080 2081 // TODO: Promote dynamic indexing of s16 to s32 2082 2083 // FIXME: Artifact combiner probably should have replaced the truncated 2084 // constant before this, so we shouldn't need 2085 // getConstantVRegValWithLookThrough. 2086 Optional<ValueAndVReg> MaybeIdxVal = 2087 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2088 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2089 return true; 2090 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2091 2092 Register Dst = MI.getOperand(0).getReg(); 2093 Register Vec = MI.getOperand(1).getReg(); 2094 2095 LLT VecTy = MRI.getType(Vec); 2096 LLT EltTy = VecTy.getElementType(); 2097 assert(EltTy == MRI.getType(Dst)); 2098 2099 if (IdxVal < VecTy.getNumElements()) 2100 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); 2101 else 2102 B.buildUndef(Dst); 2103 2104 MI.eraseFromParent(); 2105 return true; 2106 } 2107 2108 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2109 MachineInstr &MI, MachineRegisterInfo &MRI, 2110 MachineIRBuilder &B) const { 2111 // TODO: Should move some of this into LegalizerHelper. 2112 2113 // TODO: Promote dynamic indexing of s16 to s32 2114 2115 // FIXME: Artifact combiner probably should have replaced the truncated 2116 // constant before this, so we shouldn't need 2117 // getConstantVRegValWithLookThrough. 2118 Optional<ValueAndVReg> MaybeIdxVal = 2119 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2120 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 
2121 return true; 2122 2123 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2124 Register Dst = MI.getOperand(0).getReg(); 2125 Register Vec = MI.getOperand(1).getReg(); 2126 Register Ins = MI.getOperand(2).getReg(); 2127 2128 LLT VecTy = MRI.getType(Vec); 2129 LLT EltTy = VecTy.getElementType(); 2130 assert(EltTy == MRI.getType(Ins)); 2131 2132 if (IdxVal < VecTy.getNumElements()) 2133 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); 2134 else 2135 B.buildUndef(Dst); 2136 2137 MI.eraseFromParent(); 2138 return true; 2139 } 2140 2141 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2142 MachineInstr &MI, MachineRegisterInfo &MRI, 2143 MachineIRBuilder &B) const { 2144 const LLT V2S16 = LLT::vector(2, 16); 2145 2146 Register Dst = MI.getOperand(0).getReg(); 2147 Register Src0 = MI.getOperand(1).getReg(); 2148 LLT DstTy = MRI.getType(Dst); 2149 LLT SrcTy = MRI.getType(Src0); 2150 2151 if (SrcTy == V2S16 && DstTy == V2S16 && 2152 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2153 return true; 2154 2155 MachineIRBuilder HelperBuilder(MI); 2156 GISelObserverWrapper DummyObserver; 2157 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2158 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2159 } 2160 2161 bool AMDGPULegalizerInfo::legalizeSinCos( 2162 MachineInstr &MI, MachineRegisterInfo &MRI, 2163 MachineIRBuilder &B) const { 2164 2165 Register DstReg = MI.getOperand(0).getReg(); 2166 Register SrcReg = MI.getOperand(1).getReg(); 2167 LLT Ty = MRI.getType(DstReg); 2168 unsigned Flags = MI.getFlags(); 2169 2170 Register TrigVal; 2171 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2172 if (ST.hasTrigReducedRange()) { 2173 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2174 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2175 .addUse(MulVal.getReg(0)) 2176 .setMIFlags(Flags).getReg(0); 2177 } else 2178 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2179 2180 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2181 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2182 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2183 .addUse(TrigVal) 2184 .setMIFlags(Flags); 2185 MI.eraseFromParent(); 2186 return true; 2187 } 2188 2189 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2190 MachineIRBuilder &B, 2191 const GlobalValue *GV, 2192 int64_t Offset, 2193 unsigned GAFlags) const { 2194 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2195 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2196 // to the following code sequence: 2197 // 2198 // For constant address space: 2199 // s_getpc_b64 s[0:1] 2200 // s_add_u32 s0, s0, $symbol 2201 // s_addc_u32 s1, s1, 0 2202 // 2203 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2204 // a fixup or relocation is emitted to replace $symbol with a literal 2205 // constant, which is a pc-relative offset from the encoding of the $symbol 2206 // operand to the global variable. 
2207 // 2208 // For global address space: 2209 // s_getpc_b64 s[0:1] 2210 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2211 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2212 // 2213 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2214 // fixups or relocations are emitted to replace $symbol@*@lo and 2215 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2216 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2217 // operand to the global variable. 2218 // 2219 // What we want here is an offset from the value returned by s_getpc 2220 // (which is the address of the s_add_u32 instruction) to the global 2221 // variable, but since the encoding of $symbol starts 4 bytes after the start 2222 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2223 // small. This requires us to add 4 to the global variable offset in order to 2224 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2225 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2226 // instruction. 2227 2228 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2229 2230 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2231 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2232 2233 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2234 .addDef(PCReg); 2235 2236 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2237 if (GAFlags == SIInstrInfo::MO_NONE) 2238 MIB.addImm(0); 2239 else 2240 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2241 2242 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2243 2244 if (PtrTy.getSizeInBits() == 32) 2245 B.buildExtract(DstReg, PCReg, 0); 2246 return true; 2247 } 2248 2249 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2250 MachineInstr &MI, MachineRegisterInfo &MRI, 2251 MachineIRBuilder &B) const { 2252 Register DstReg = MI.getOperand(0).getReg(); 2253 LLT Ty = MRI.getType(DstReg); 2254 unsigned AS = Ty.getAddressSpace(); 2255 2256 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2257 MachineFunction &MF = B.getMF(); 2258 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2259 2260 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2261 if (!MFI->isModuleEntryFunction()) { 2262 const Function &Fn = MF.getFunction(); 2263 DiagnosticInfoUnsupported BadLDSDecl( 2264 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2265 DS_Warning); 2266 Fn.getContext().diagnose(BadLDSDecl); 2267 2268 // We currently don't have a way to correctly allocate LDS objects that 2269 // aren't directly associated with a kernel. We do force inlining of 2270 // functions that use local objects. However, if these dead functions are 2271 // not eliminated, we don't want a compile time error. Just emit a warning 2272 // and a trap, since there should be no callable path here. 2273 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2274 B.buildUndef(DstReg); 2275 MI.eraseFromParent(); 2276 return true; 2277 } 2278 2279 // TODO: We could emit code to handle the initialization somewhere. 
2280 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 2281 const SITargetLowering *TLI = ST.getTargetLowering(); 2282 if (!TLI->shouldUseLDSConstAddress(GV)) { 2283 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2284 return true; // Leave in place. 2285 } 2286 2287 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2288 Type *Ty = GV->getValueType(); 2289 // HIP uses an unsized array `extern __shared__ T s[]` or a similar 2290 // zero-sized type in other languages to declare the dynamic shared 2291 // memory whose size is not known at compile time. These arrays are 2292 // allocated by the runtime and placed directly after the statically 2293 // allocated ones. They all share the same offset. 2294 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2295 // Adjust alignment for that dynamic shared memory array. 2296 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2297 LLT S32 = LLT::scalar(32); 2298 auto Sz = 2299 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2300 B.buildIntToPtr(DstReg, Sz); 2301 MI.eraseFromParent(); 2302 return true; 2303 } 2304 } 2305 2306 B.buildConstant( 2307 DstReg, 2308 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV))); 2309 MI.eraseFromParent(); 2310 return true; 2311 } 2312 2313 const Function &Fn = MF.getFunction(); 2314 DiagnosticInfoUnsupported BadInit( 2315 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2316 Fn.getContext().diagnose(BadInit); 2317 return true; 2318 } 2319 2320 const SITargetLowering *TLI = ST.getTargetLowering(); 2321 2322 if (TLI->shouldEmitFixup(GV)) { 2323 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2324 MI.eraseFromParent(); 2325 return true; 2326 } 2327 2328 if (TLI->shouldEmitPCReloc(GV)) { 2329 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2330 MI.eraseFromParent(); 2331 return true; 2332 } 2333 2334 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2335 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2336 2337 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2338 MachinePointerInfo::getGOT(MF), 2339 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2340 MachineMemOperand::MOInvariant, 2341 8 /*Size*/, Align(8)); 2342 2343 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2344 2345 if (Ty.getSizeInBits() == 32) { 2346 // Truncate if this is a 32-bit constant address.
2347 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2348 B.buildExtract(DstReg, Load, 0); 2349 } else 2350 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2351 2352 MI.eraseFromParent(); 2353 return true; 2354 } 2355 2356 static LLT widenToNextPowerOf2(LLT Ty) { 2357 if (Ty.isVector()) 2358 return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements())); 2359 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2360 } 2361 2362 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2363 MachineInstr &MI) const { 2364 MachineIRBuilder &B = Helper.MIRBuilder; 2365 MachineRegisterInfo &MRI = *B.getMRI(); 2366 GISelChangeObserver &Observer = Helper.Observer; 2367 2368 Register PtrReg = MI.getOperand(1).getReg(); 2369 LLT PtrTy = MRI.getType(PtrReg); 2370 unsigned AddrSpace = PtrTy.getAddressSpace(); 2371 2372 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2373 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2374 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2375 Observer.changingInstr(MI); 2376 MI.getOperand(1).setReg(Cast.getReg(0)); 2377 Observer.changedInstr(MI); 2378 return true; 2379 } 2380 2381 Register ValReg = MI.getOperand(0).getReg(); 2382 LLT ValTy = MRI.getType(ValReg); 2383 2384 MachineMemOperand *MMO = *MI.memoperands_begin(); 2385 const unsigned ValSize = ValTy.getSizeInBits(); 2386 const unsigned MemSize = 8 * MMO->getSize(); 2387 const Align MemAlign = MMO->getAlign(); 2388 const unsigned AlignInBits = 8 * MemAlign.value(); 2389 2390 // Widen non-power-of-2 loads to the alignment if needed 2391 if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) { 2392 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2393 2394 // This was already the correct extending load result type, so just adjust 2395 // the memory type. 2396 if (WideMemSize == ValSize) { 2397 MachineFunction &MF = B.getMF(); 2398 2399 MachineMemOperand *WideMMO = 2400 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2401 Observer.changingInstr(MI); 2402 MI.setMemRefs(MF, {WideMMO}); 2403 Observer.changedInstr(MI); 2404 return true; 2405 } 2406 2407 // Don't bother handling edge case that should probably never be produced. 2408 if (ValSize > WideMemSize) 2409 return false; 2410 2411 LLT WideTy = widenToNextPowerOf2(ValTy); 2412 2413 Register WideLoad; 2414 if (!WideTy.isVector()) { 2415 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2416 B.buildTrunc(ValReg, WideLoad).getReg(0); 2417 } else { 2418 // Extract the subvector. 2419 2420 if (isRegisterType(ValTy)) { 2421 // If this a case where G_EXTRACT is legal, use it. 2422 // (e.g. <3 x s32> -> <4 x s32>) 2423 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2424 B.buildExtract(ValReg, WideLoad, 0); 2425 } else { 2426 // For cases where the widened type isn't a nice register value, unmerge 2427 // from a widened register (e.g. 
<3 x s16> -> <4 x s16>) 2428 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2429 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2430 B.setInsertPt(B.getMBB(), MI.getIterator()); 2431 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2432 } 2433 } 2434 2435 MI.eraseFromParent(); 2436 return true; 2437 } 2438 2439 return false; 2440 } 2441 2442 bool AMDGPULegalizerInfo::legalizeFMad( 2443 MachineInstr &MI, MachineRegisterInfo &MRI, 2444 MachineIRBuilder &B) const { 2445 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2446 assert(Ty.isScalar()); 2447 2448 MachineFunction &MF = B.getMF(); 2449 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2450 2451 // TODO: Always legal with future ftz flag. 2452 // FIXME: Do we need just output? 2453 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2454 return true; 2455 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2456 return true; 2457 2458 MachineIRBuilder HelperBuilder(MI); 2459 GISelObserverWrapper DummyObserver; 2460 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2461 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2462 } 2463 2464 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2465 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2466 Register DstReg = MI.getOperand(0).getReg(); 2467 Register PtrReg = MI.getOperand(1).getReg(); 2468 Register CmpVal = MI.getOperand(2).getReg(); 2469 Register NewVal = MI.getOperand(3).getReg(); 2470 2471 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2472 "this should not have been custom lowered"); 2473 2474 LLT ValTy = MRI.getType(CmpVal); 2475 LLT VecTy = LLT::vector(2, ValTy); 2476 2477 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2478 2479 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2480 .addDef(DstReg) 2481 .addUse(PtrReg) 2482 .addUse(PackedVal) 2483 .setMemRefs(MI.memoperands()); 2484 2485 MI.eraseFromParent(); 2486 return true; 2487 } 2488 2489 bool AMDGPULegalizerInfo::legalizeFlog( 2490 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2491 Register Dst = MI.getOperand(0).getReg(); 2492 Register Src = MI.getOperand(1).getReg(); 2493 LLT Ty = B.getMRI()->getType(Dst); 2494 unsigned Flags = MI.getFlags(); 2495 2496 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2497 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2498 2499 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2500 MI.eraseFromParent(); 2501 return true; 2502 } 2503 2504 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2505 MachineIRBuilder &B) const { 2506 Register Dst = MI.getOperand(0).getReg(); 2507 Register Src = MI.getOperand(1).getReg(); 2508 unsigned Flags = MI.getFlags(); 2509 LLT Ty = B.getMRI()->getType(Dst); 2510 2511 auto K = B.buildFConstant(Ty, numbers::log2e); 2512 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2513 B.buildFExp2(Dst, Mul, Flags); 2514 MI.eraseFromParent(); 2515 return true; 2516 } 2517 2518 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2519 MachineIRBuilder &B) const { 2520 Register Dst = MI.getOperand(0).getReg(); 2521 Register Src0 = MI.getOperand(1).getReg(); 2522 Register Src1 = MI.getOperand(2).getReg(); 2523 unsigned Flags = MI.getFlags(); 2524 LLT Ty = B.getMRI()->getType(Dst); 2525 const LLT S16 = LLT::scalar(16); 2526 const LLT S32 = LLT::scalar(32); 2527 2528 if (Ty == S32) { 2529 auto Log = B.buildFLog2(S32, Src0, Flags); 2530 auto Mul = 
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2531 .addUse(Log.getReg(0)) 2532 .addUse(Src1) 2533 .setMIFlags(Flags); 2534 B.buildFExp2(Dst, Mul, Flags); 2535 } else if (Ty == S16) { 2536 // There's no f16 fmul_legacy, so we need to convert for it. 2537 auto Log = B.buildFLog2(S16, Src0, Flags); 2538 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2539 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2540 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2541 .addUse(Ext0.getReg(0)) 2542 .addUse(Ext1.getReg(0)) 2543 .setMIFlags(Flags); 2544 2545 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2546 } else 2547 return false; 2548 2549 MI.eraseFromParent(); 2550 return true; 2551 } 2552 2553 // Find a source register, ignoring any possible source modifiers. 2554 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2555 Register ModSrc = OrigSrc; 2556 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2557 ModSrc = SrcFNeg->getOperand(1).getReg(); 2558 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2559 ModSrc = SrcFAbs->getOperand(1).getReg(); 2560 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2561 ModSrc = SrcFAbs->getOperand(1).getReg(); 2562 return ModSrc; 2563 } 2564 2565 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2566 MachineRegisterInfo &MRI, 2567 MachineIRBuilder &B) const { 2568 2569 const LLT S1 = LLT::scalar(1); 2570 const LLT S64 = LLT::scalar(64); 2571 Register Dst = MI.getOperand(0).getReg(); 2572 Register OrigSrc = MI.getOperand(1).getReg(); 2573 unsigned Flags = MI.getFlags(); 2574 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2575 "this should not have been custom lowered"); 2576 2577 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2578 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2579 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2580 // V_FRACT bug is: 2581 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2582 // 2583 // Convert floor(x) to (x - fract(x)) 2584 2585 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2586 .addUse(OrigSrc) 2587 .setMIFlags(Flags); 2588 2589 // Give source modifier matching some assistance before obscuring a foldable 2590 // pattern. 2591 2592 // TODO: We can avoid the neg on the fract? The input sign to fract 2593 // shouldn't matter? 2594 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2595 2596 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2597 2598 Register Min = MRI.createGenericVirtualRegister(S64); 2599 2600 // We don't need to concern ourselves with the snan handling difference, so 2601 // use the one which will directly select. 
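// (0x3fefffffffffffff is the largest double strictly below 1.0, i.e. the
// 0.99999999999999999 from the formula above.)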
2602 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2603 if (MFI->getMode().IEEE) 2604 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2605 else 2606 B.buildFMinNum(Min, Fract, Const, Flags); 2607 2608 Register CorrectedFract = Min; 2609 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2610 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2611 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2612 } 2613 2614 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2615 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2616 2617 MI.eraseFromParent(); 2618 return true; 2619 } 2620 2621 // Turn an illegal packed v2s16 build vector into bit operations. 2622 // TODO: This should probably be a bitcast action in LegalizerHelper. 2623 bool AMDGPULegalizerInfo::legalizeBuildVector( 2624 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2625 Register Dst = MI.getOperand(0).getReg(); 2626 const LLT S32 = LLT::scalar(32); 2627 assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2628 2629 Register Src0 = MI.getOperand(1).getReg(); 2630 Register Src1 = MI.getOperand(2).getReg(); 2631 assert(MRI.getType(Src0) == LLT::scalar(16)); 2632 2633 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2634 B.buildBitcast(Dst, Merge); 2635 2636 MI.eraseFromParent(); 2637 return true; 2638 } 2639 2640 // Check that this is a G_XOR x, -1 2641 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2642 if (MI.getOpcode() != TargetOpcode::G_XOR) 2643 return false; 2644 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2645 return ConstVal && *ConstVal == -1; 2646 } 2647 2648 // Return the use branch instruction, otherwise null if the usage is invalid. 2649 static MachineInstr * 2650 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2651 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2652 Register CondDef = MI.getOperand(0).getReg(); 2653 if (!MRI.hasOneNonDBGUse(CondDef)) 2654 return nullptr; 2655 2656 MachineBasicBlock *Parent = MI.getParent(); 2657 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2658 2659 if (isNot(MRI, *UseMI)) { 2660 Register NegatedCond = UseMI->getOperand(0).getReg(); 2661 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2662 return nullptr; 2663 2664 // We're deleting the def of this value, so we need to remove it. 2665 UseMI->eraseFromParent(); 2666 2667 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2668 Negated = true; 2669 } 2670 2671 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2672 return nullptr; 2673 2674 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2675 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2676 if (Next == Parent->end()) { 2677 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2678 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 
2679 return nullptr; 2680 UncondBrTarget = &*NextMBB; 2681 } else { 2682 if (Next->getOpcode() != AMDGPU::G_BR) 2683 return nullptr; 2684 Br = &*Next; 2685 UncondBrTarget = Br->getOperand(0).getMBB(); 2686 } 2687 2688 return UseMI; 2689 } 2690 2691 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2692 const ArgDescriptor *Arg, 2693 const TargetRegisterClass *ArgRC, 2694 LLT ArgTy) const { 2695 MCRegister SrcReg = Arg->getRegister(); 2696 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2697 assert(DstReg.isVirtual() && "Virtual register expected"); 2698 2699 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2700 ArgTy); 2701 if (Arg->isMasked()) { 2702 // TODO: Should we try to emit this once in the entry block? 2703 const LLT S32 = LLT::scalar(32); 2704 const unsigned Mask = Arg->getMask(); 2705 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2706 2707 Register AndMaskSrc = LiveIn; 2708 2709 if (Shift != 0) { 2710 auto ShiftAmt = B.buildConstant(S32, Shift); 2711 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2712 } 2713 2714 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2715 } else { 2716 B.buildCopy(DstReg, LiveIn); 2717 } 2718 2719 return true; 2720 } 2721 2722 bool AMDGPULegalizerInfo::loadInputValue( 2723 Register DstReg, MachineIRBuilder &B, 2724 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2725 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2726 const ArgDescriptor *Arg; 2727 const TargetRegisterClass *ArgRC; 2728 LLT ArgTy; 2729 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2730 2731 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2732 return false; // TODO: Handle these 2733 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2734 } 2735 2736 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2737 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2738 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2739 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2740 return false; 2741 2742 MI.eraseFromParent(); 2743 return true; 2744 } 2745 2746 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2747 MachineRegisterInfo &MRI, 2748 MachineIRBuilder &B) const { 2749 Register Dst = MI.getOperand(0).getReg(); 2750 LLT DstTy = MRI.getType(Dst); 2751 LLT S16 = LLT::scalar(16); 2752 LLT S32 = LLT::scalar(32); 2753 LLT S64 = LLT::scalar(64); 2754 2755 if (DstTy == S16) 2756 return legalizeFDIV16(MI, MRI, B); 2757 if (DstTy == S32) 2758 return legalizeFDIV32(MI, MRI, B); 2759 if (DstTy == S64) 2760 return legalizeFDIV64(MI, MRI, B); 2761 2762 return false; 2763 } 2764 2765 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2766 Register DstReg, 2767 Register X, 2768 Register Y, 2769 bool IsDiv) const { 2770 const LLT S1 = LLT::scalar(1); 2771 const LLT S32 = LLT::scalar(32); 2772 2773 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2774 // algorithm used here. 2775 2776 // Initial estimate of inv(y). 2777 auto FloatY = B.buildUITOFP(S32, Y); 2778 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2779 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2780 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2781 auto Z = B.buildFPTOUI(S32, ScaledY); 2782 2783 // One round of UNR. 
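// (Unsigned Newton-Raphson step: with z ~= 2^32/y, the error is roughly
// e = 2^32 - y*z, which is just (0 - y) * z in 32 bits; the refined estimate
// is z + umulh(z, e), which is what the instructions below compute.)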
2784 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2785 auto NegYZ = B.buildMul(S32, NegY, Z); 2786 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2787 2788 // Quotient/remainder estimate. 2789 auto Q = B.buildUMulH(S32, X, Z); 2790 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2791 2792 // First quotient/remainder refinement. 2793 auto One = B.buildConstant(S32, 1); 2794 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2795 if (IsDiv) 2796 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2797 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2798 2799 // Second quotient/remainder refinement. 2800 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2801 if (IsDiv) 2802 B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q); 2803 else 2804 B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R); 2805 } 2806 2807 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2808 MachineRegisterInfo &MRI, 2809 MachineIRBuilder &B) const { 2810 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2811 Register DstReg = MI.getOperand(0).getReg(); 2812 Register Num = MI.getOperand(1).getReg(); 2813 Register Den = MI.getOperand(2).getReg(); 2814 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2815 MI.eraseFromParent(); 2816 return true; 2817 } 2818 2819 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2820 // 2821 // Return lo, hi of result 2822 // 2823 // %cvt.lo = G_UITOFP Val.lo 2824 // %cvt.hi = G_UITOFP Val.hi 2825 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2826 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2827 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2828 // %mul2 = G_FMUL %mul1, 2**(-32) 2829 // %trunc = G_INTRINSIC_TRUNC %mul2 2830 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2831 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2832 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2833 Register Val) { 2834 const LLT S32 = LLT::scalar(32); 2835 auto Unmerge = B.buildUnmerge(S32, Val); 2836 2837 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2838 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2839 2840 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2841 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2842 2843 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2844 auto Mul1 = 2845 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2846 2847 // 2**(-32) 2848 auto Mul2 = 2849 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2850 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2851 2852 // -(2**32) 2853 auto Mad2 = B.buildFMAD(S32, Trunc, 2854 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2855 2856 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2857 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2858 2859 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2860 } 2861 2862 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B, 2863 Register DstReg, 2864 Register Numer, 2865 Register Denom, 2866 bool IsDiv) const { 2867 const LLT S32 = LLT::scalar(32); 2868 const LLT S64 = LLT::scalar(64); 2869 const LLT S1 = LLT::scalar(1); 2870 Register RcpLo, RcpHi; 2871 2872 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2873 2874 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2875 2876 auto Zero64 = B.buildConstant(S64, 0); 2877 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2878 2879 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2880 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2881 2882 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2883 Register 
MulHi1_Lo = UnmergeMulHi1.getReg(0); 2884 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2885 2886 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2887 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2888 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2889 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2890 2891 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2892 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2893 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2894 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2895 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2896 2897 auto Zero32 = B.buildConstant(S32, 0); 2898 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2899 auto Add2_HiC = 2900 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2901 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2902 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2903 2904 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2905 Register NumerLo = UnmergeNumer.getReg(0); 2906 Register NumerHi = UnmergeNumer.getReg(1); 2907 2908 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2909 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2910 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2911 Register Mul3_Lo = UnmergeMul3.getReg(0); 2912 Register Mul3_Hi = UnmergeMul3.getReg(1); 2913 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2914 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2915 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2916 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2917 2918 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2919 Register DenomLo = UnmergeDenom.getReg(0); 2920 Register DenomHi = UnmergeDenom.getReg(1); 2921 2922 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2923 auto C1 = B.buildSExt(S32, CmpHi); 2924 2925 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2926 auto C2 = B.buildSExt(S32, CmpLo); 2927 2928 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2929 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2930 2931 // TODO: Here and below portions of the code can be enclosed into if/endif. 2932 // Currently control flow is unconditional and we have 4 selects after 2933 // potential endif to substitute PHIs. 2934 2935 // if C3 != 0 ... 
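// Unconditionally compute what the body would produce: Sub2 is the
// once-corrected remainder and Add3 the once-corrected quotient; the selects
// at the end stand in for the branches on C3/C6.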
2936 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2937 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2938 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2939 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2940 2941 auto One64 = B.buildConstant(S64, 1); 2942 auto Add3 = B.buildAdd(S64, MulHi3, One64); 2943 2944 auto C4 = 2945 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2946 auto C5 = 2947 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2948 auto C6 = B.buildSelect( 2949 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2950 2951 // if (C6 != 0) 2952 auto Add4 = B.buildAdd(S64, Add3, One64); 2953 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2954 2955 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2956 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2957 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2958 2959 // endif C6 2960 // endif C3 2961 2962 if (IsDiv) { 2963 auto Sel1 = B.buildSelect( 2964 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2965 B.buildSelect(DstReg, 2966 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2967 } else { 2968 auto Sel2 = B.buildSelect( 2969 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2970 B.buildSelect(DstReg, 2971 B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2972 } 2973 } 2974 2975 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2976 MachineRegisterInfo &MRI, 2977 MachineIRBuilder &B) const { 2978 const LLT S64 = LLT::scalar(64); 2979 const LLT S32 = LLT::scalar(32); 2980 const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2981 Register DstReg = MI.getOperand(0).getReg(); 2982 Register Num = MI.getOperand(1).getReg(); 2983 Register Den = MI.getOperand(2).getReg(); 2984 LLT Ty = MRI.getType(DstReg); 2985 2986 if (Ty == S32) 2987 legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2988 else if (Ty == S64) 2989 legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 2990 else 2991 return false; 2992 2993 MI.eraseFromParent(); 2994 return true; 2995 2996 } 2997 2998 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2999 MachineRegisterInfo &MRI, 3000 MachineIRBuilder &B) const { 3001 const LLT S64 = LLT::scalar(64); 3002 const LLT S32 = LLT::scalar(32); 3003 3004 Register DstReg = MI.getOperand(0).getReg(); 3005 const LLT Ty = MRI.getType(DstReg); 3006 if (Ty != S32 && Ty != S64) 3007 return false; 3008 3009 const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 3010 3011 Register LHS = MI.getOperand(1).getReg(); 3012 Register RHS = MI.getOperand(2).getReg(); 3013 3014 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3015 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3016 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3017 3018 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3019 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3020 3021 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 3022 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3023 3024 Register UDivRem = MRI.createGenericVirtualRegister(Ty); 3025 if (Ty == S32) 3026 legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 3027 else 3028 legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 3029 3030 Register Sign; 3031 if (IsDiv) 3032 Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3033 else 3034 Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3035 3036 UDivRem = B.buildXor(Ty, 
UDivRem, Sign).getReg(0); 3037 B.buildSub(DstReg, UDivRem, Sign); 3038 3039 MI.eraseFromParent(); 3040 return true; 3041 } 3042 3043 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3044 MachineRegisterInfo &MRI, 3045 MachineIRBuilder &B) const { 3046 Register Res = MI.getOperand(0).getReg(); 3047 Register LHS = MI.getOperand(1).getReg(); 3048 Register RHS = MI.getOperand(2).getReg(); 3049 uint16_t Flags = MI.getFlags(); 3050 LLT ResTy = MRI.getType(Res); 3051 3052 const MachineFunction &MF = B.getMF(); 3053 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3054 MI.getFlag(MachineInstr::FmAfn); 3055 3056 if (!AllowInaccurateRcp) 3057 return false; 3058 3059 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3060 // 1 / x -> RCP(x) 3061 if (CLHS->isExactlyValue(1.0)) { 3062 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3063 .addUse(RHS) 3064 .setMIFlags(Flags); 3065 3066 MI.eraseFromParent(); 3067 return true; 3068 } 3069 3070 // -1 / x -> RCP( FNEG(x) ) 3071 if (CLHS->isExactlyValue(-1.0)) { 3072 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3073 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3074 .addUse(FNeg.getReg(0)) 3075 .setMIFlags(Flags); 3076 3077 MI.eraseFromParent(); 3078 return true; 3079 } 3080 } 3081 3082 // x / y -> x * (1.0 / y) 3083 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3084 .addUse(RHS) 3085 .setMIFlags(Flags); 3086 B.buildFMul(Res, LHS, RCP, Flags); 3087 3088 MI.eraseFromParent(); 3089 return true; 3090 } 3091 3092 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3093 MachineRegisterInfo &MRI, 3094 MachineIRBuilder &B) const { 3095 Register Res = MI.getOperand(0).getReg(); 3096 Register X = MI.getOperand(1).getReg(); 3097 Register Y = MI.getOperand(2).getReg(); 3098 uint16_t Flags = MI.getFlags(); 3099 LLT ResTy = MRI.getType(Res); 3100 3101 const MachineFunction &MF = B.getMF(); 3102 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3103 MI.getFlag(MachineInstr::FmAfn); 3104 3105 if (!AllowInaccurateRcp) 3106 return false; 3107 3108 auto NegY = B.buildFNeg(ResTy, Y); 3109 auto One = B.buildFConstant(ResTy, 1.0); 3110 3111 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3112 .addUse(Y) 3113 .setMIFlags(Flags); 3114 3115 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3116 R = B.buildFMA(ResTy, Tmp0, R, R); 3117 3118 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3119 R = B.buildFMA(ResTy, Tmp1, R, R); 3120 3121 auto Ret = B.buildFMul(ResTy, X, R); 3122 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3123 3124 B.buildFMA(Res, Tmp2, R, Ret); 3125 MI.eraseFromParent(); 3126 return true; 3127 } 3128 3129 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3130 MachineRegisterInfo &MRI, 3131 MachineIRBuilder &B) const { 3132 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3133 return true; 3134 3135 Register Res = MI.getOperand(0).getReg(); 3136 Register LHS = MI.getOperand(1).getReg(); 3137 Register RHS = MI.getOperand(2).getReg(); 3138 3139 uint16_t Flags = MI.getFlags(); 3140 3141 LLT S16 = LLT::scalar(16); 3142 LLT S32 = LLT::scalar(32); 3143 3144 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3145 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3146 3147 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3148 .addUse(RHSExt.getReg(0)) 3149 .setMIFlags(Flags); 3150 3151 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3152 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3153 3154 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3155 
.addUse(RDst.getReg(0)) 3156 .addUse(RHS) 3157 .addUse(LHS) 3158 .setMIFlags(Flags); 3159 3160 MI.eraseFromParent(); 3161 return true; 3162 } 3163 3164 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3165 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3166 static void toggleSPDenormMode(bool Enable, 3167 MachineIRBuilder &B, 3168 const GCNSubtarget &ST, 3169 AMDGPU::SIModeRegisterDefaults Mode) { 3170 // Set SP denorm mode to this value. 3171 unsigned SPDenormMode = 3172 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3173 3174 if (ST.hasDenormModeInst()) { 3175 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3176 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3177 3178 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3179 B.buildInstr(AMDGPU::S_DENORM_MODE) 3180 .addImm(NewDenormModeValue); 3181 3182 } else { 3183 // Select FP32 bit field in mode register. 3184 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3185 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3186 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3187 3188 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3189 .addImm(SPDenormMode) 3190 .addImm(SPDenormModeBitField); 3191 } 3192 } 3193 3194 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3195 MachineRegisterInfo &MRI, 3196 MachineIRBuilder &B) const { 3197 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3198 return true; 3199 3200 Register Res = MI.getOperand(0).getReg(); 3201 Register LHS = MI.getOperand(1).getReg(); 3202 Register RHS = MI.getOperand(2).getReg(); 3203 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3204 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3205 3206 uint16_t Flags = MI.getFlags(); 3207 3208 LLT S32 = LLT::scalar(32); 3209 LLT S1 = LLT::scalar(1); 3210 3211 auto One = B.buildFConstant(S32, 1.0f); 3212 3213 auto DenominatorScaled = 3214 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3215 .addUse(LHS) 3216 .addUse(RHS) 3217 .addImm(0) 3218 .setMIFlags(Flags); 3219 auto NumeratorScaled = 3220 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3221 .addUse(LHS) 3222 .addUse(RHS) 3223 .addImm(1) 3224 .setMIFlags(Flags); 3225 3226 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3227 .addUse(DenominatorScaled.getReg(0)) 3228 .setMIFlags(Flags); 3229 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3230 3231 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3232 // aren't modeled as reading it. 
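// In outline, the sequence below is one Newton-Raphson refinement of the
// approximate reciprocal (Fma0, Fma1), an initial quotient (Mul), a
// residual/correction pair (Fma2, Fma3), and a final residual (Fma4) that
// feeds div_fmas/div_fixup to produce the final result.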
3233 if (!Mode.allFP32Denormals()) 3234 toggleSPDenormMode(true, B, ST, Mode); 3235 3236 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3237 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3238 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3239 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3240 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3241 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3242 3243 if (!Mode.allFP32Denormals()) 3244 toggleSPDenormMode(false, B, ST, Mode); 3245 3246 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3247 .addUse(Fma4.getReg(0)) 3248 .addUse(Fma1.getReg(0)) 3249 .addUse(Fma3.getReg(0)) 3250 .addUse(NumeratorScaled.getReg(1)) 3251 .setMIFlags(Flags); 3252 3253 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3254 .addUse(Fmas.getReg(0)) 3255 .addUse(RHS) 3256 .addUse(LHS) 3257 .setMIFlags(Flags); 3258 3259 MI.eraseFromParent(); 3260 return true; 3261 } 3262 3263 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3264 MachineRegisterInfo &MRI, 3265 MachineIRBuilder &B) const { 3266 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3267 return true; 3268 3269 Register Res = MI.getOperand(0).getReg(); 3270 Register LHS = MI.getOperand(1).getReg(); 3271 Register RHS = MI.getOperand(2).getReg(); 3272 3273 uint16_t Flags = MI.getFlags(); 3274 3275 LLT S64 = LLT::scalar(64); 3276 LLT S1 = LLT::scalar(1); 3277 3278 auto One = B.buildFConstant(S64, 1.0); 3279 3280 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3281 .addUse(LHS) 3282 .addUse(RHS) 3283 .addImm(0) 3284 .setMIFlags(Flags); 3285 3286 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3287 3288 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3289 .addUse(DivScale0.getReg(0)) 3290 .setMIFlags(Flags); 3291 3292 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3293 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3294 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3295 3296 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3297 .addUse(LHS) 3298 .addUse(RHS) 3299 .addImm(1) 3300 .setMIFlags(Flags); 3301 3302 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3303 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3304 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3305 3306 Register Scale; 3307 if (!ST.hasUsableDivScaleConditionOutput()) { 3308 // Workaround a hardware bug on SI where the condition output from div_scale 3309 // is not usable. 
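// Instead, recompute the flag div_fmas expects: check (via the high 32
// bits) whether each div_scale result still matches its original operand,
// and xor the two tests. The DAG lowering of f64 fdiv uses the same
// workaround.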
3310 3311 LLT S32 = LLT::scalar(32); 3312 3313 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3314 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3315 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3316 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3317 3318 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3319 Scale1Unmerge.getReg(1)); 3320 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3321 Scale0Unmerge.getReg(1)); 3322 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3323 } else { 3324 Scale = DivScale1.getReg(1); 3325 } 3326 3327 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3328 .addUse(Fma4.getReg(0)) 3329 .addUse(Fma3.getReg(0)) 3330 .addUse(Mul.getReg(0)) 3331 .addUse(Scale) 3332 .setMIFlags(Flags); 3333 3334 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3335 .addUse(Fmas.getReg(0)) 3336 .addUse(RHS) 3337 .addUse(LHS) 3338 .setMIFlags(Flags); 3339 3340 MI.eraseFromParent(); 3341 return true; 3342 } 3343 3344 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3345 MachineRegisterInfo &MRI, 3346 MachineIRBuilder &B) const { 3347 Register Res = MI.getOperand(0).getReg(); 3348 Register LHS = MI.getOperand(2).getReg(); 3349 Register RHS = MI.getOperand(3).getReg(); 3350 uint16_t Flags = MI.getFlags(); 3351 3352 LLT S32 = LLT::scalar(32); 3353 LLT S1 = LLT::scalar(1); 3354 3355 auto Abs = B.buildFAbs(S32, RHS, Flags); 3356 const APFloat C0Val(1.0f); 3357 3358 auto C0 = B.buildConstant(S32, 0x6f800000); 3359 auto C1 = B.buildConstant(S32, 0x2f800000); 3360 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3361 3362 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3363 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3364 3365 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3366 3367 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3368 .addUse(Mul0.getReg(0)) 3369 .setMIFlags(Flags); 3370 3371 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3372 3373 B.buildFMul(Res, Sel, Mul1, Flags); 3374 3375 MI.eraseFromParent(); 3376 return true; 3377 } 3378 3379 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3380 // FIXME: Why do we handle this one but not other removed instructions? 3381 // 3382 // Reciprocal square root. The clamp prevents infinite results, clamping 3383 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3384 // +-max_float. 3385 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3386 MachineRegisterInfo &MRI, 3387 MachineIRBuilder &B) const { 3388 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3389 return true; 3390 3391 Register Dst = MI.getOperand(0).getReg(); 3392 Register Src = MI.getOperand(2).getReg(); 3393 auto Flags = MI.getFlags(); 3394 3395 LLT Ty = MRI.getType(Dst); 3396 3397 const fltSemantics *FltSemantics; 3398 if (Ty == LLT::scalar(32)) 3399 FltSemantics = &APFloat::IEEEsingle(); 3400 else if (Ty == LLT::scalar(64)) 3401 FltSemantics = &APFloat::IEEEdouble(); 3402 else 3403 return false; 3404 3405 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3406 .addUse(Src) 3407 .setMIFlags(Flags); 3408 3409 // We don't need to concern ourselves with the snan handling difference, since 3410 // the rsq quieted (or not) so use the one which will directly select. 
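  // Clamp the rsq result to +/-max_float: take the minimum with the largest
  // finite value, then the maximum with its negation, using the IEEE variants
  // of min/max when the function runs with the IEEE mode bit set.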
3411 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3412 const bool UseIEEE = MFI->getMode().IEEE; 3413 3414 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3415 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3416 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3417 3418 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3419 3420 if (UseIEEE) 3421 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3422 else 3423 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3424 MI.eraseFromParent(); 3425 return true; 3426 } 3427 3428 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3429 switch (IID) { 3430 case Intrinsic::amdgcn_ds_fadd: 3431 return AMDGPU::G_ATOMICRMW_FADD; 3432 case Intrinsic::amdgcn_ds_fmin: 3433 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3434 case Intrinsic::amdgcn_ds_fmax: 3435 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3436 default: 3437 llvm_unreachable("not a DS FP intrinsic"); 3438 } 3439 } 3440 3441 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3442 MachineInstr &MI, 3443 Intrinsic::ID IID) const { 3444 GISelChangeObserver &Observer = Helper.Observer; 3445 Observer.changingInstr(MI); 3446 3447 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3448 3449 // The remaining operands were used to set fields in the MemOperand on 3450 // construction. 3451 for (int I = 6; I > 3; --I) 3452 MI.RemoveOperand(I); 3453 3454 MI.RemoveOperand(1); // Remove the intrinsic ID. 3455 Observer.changedInstr(MI); 3456 return true; 3457 } 3458 3459 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3460 MachineRegisterInfo &MRI, 3461 MachineIRBuilder &B) const { 3462 uint64_t Offset = 3463 ST.getTargetLowering()->getImplicitParameterOffset( 3464 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3465 LLT DstTy = MRI.getType(DstReg); 3466 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3467 3468 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3469 if (!loadInputValue(KernargPtrReg, B, 3470 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3471 return false; 3472 3473 // FIXME: This should be nuw 3474 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3475 return true; 3476 } 3477 3478 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3479 MachineRegisterInfo &MRI, 3480 MachineIRBuilder &B) const { 3481 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3482 if (!MFI->isEntryFunction()) { 3483 return legalizePreloadedArgIntrin(MI, MRI, B, 3484 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3485 } 3486 3487 Register DstReg = MI.getOperand(0).getReg(); 3488 if (!getImplicitArgPtr(DstReg, MRI, B)) 3489 return false; 3490 3491 MI.eraseFromParent(); 3492 return true; 3493 } 3494 3495 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3496 MachineRegisterInfo &MRI, 3497 MachineIRBuilder &B, 3498 unsigned AddrSpace) const { 3499 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3500 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3501 Register Hi32 = Unmerge.getReg(1); 3502 3503 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3504 MI.eraseFromParent(); 3505 return true; 3506 } 3507 3508 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3509 // offset (the offset that is included in bounds checking and swizzling, to be 3510 // split between the instruction's voffset and immoffset fields) 
and soffset
3511 // (the offset that is excluded from bounds checking and swizzling, to go in
3512 // the instruction's soffset field). This function takes the first kind of
3513 // offset and figures out how to split it between voffset and immoffset.
3514 std::tuple<Register, unsigned, unsigned>
3515 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3516 Register OrigOffset) const {
3517 const unsigned MaxImm = 4095;
3518 Register BaseReg;
3519 unsigned TotalConstOffset;
3520 const LLT S32 = LLT::scalar(32);
3521
3522 std::tie(BaseReg, TotalConstOffset) =
3523 AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3524
3525 unsigned ImmOffset = TotalConstOffset;
3526
3527 // If the immediate value is too big for the immoffset field, keep only its
3528 // low 12 bits (value & 4095) in the immoffset field, so that the remainder
3529 // copied/added into the voffset field is a multiple of 4096 and stands more
3530 // chance of being CSEd with the copy/add for another similar load/store (e.g. a constant offset of 5000 splits into immoffset 904 plus a voffset add of 4096).
3531 // However, do not do that rounding down to a multiple of 4096 if that is a
3532 // negative number, as it appears to be illegal to have a negative offset
3533 // in the vgpr, even if adding the immediate offset makes it positive.
3534 unsigned Overflow = ImmOffset & ~MaxImm;
3535 ImmOffset -= Overflow;
3536 if ((int32_t)Overflow < 0) {
3537 Overflow += ImmOffset;
3538 ImmOffset = 0;
3539 }
3540
3541 if (Overflow != 0) {
3542 if (!BaseReg) {
3543 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3544 } else {
3545 auto OverflowVal = B.buildConstant(S32, Overflow);
3546 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3547 }
3548 }
3549
3550 if (!BaseReg)
3551 BaseReg = B.buildConstant(S32, 0).getReg(0);
3552
3553 return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3554 }
3555
3556 /// Handle register layout difference for f16 images for some subtargets.
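/// In broad strokes: on subtargets with unpacked d16 VMEM each s16 element is
/// any-extended into its own s32 register and rebuilt as an <N x s32> vector;
/// on subtargets with the image-store d16 bug the packed data is padded with
/// undef elements up to a wider packed type; otherwise the value is returned
/// unchanged.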
3557 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3558 MachineRegisterInfo &MRI, 3559 Register Reg, 3560 bool ImageStore) const { 3561 const LLT S16 = LLT::scalar(16); 3562 const LLT S32 = LLT::scalar(32); 3563 LLT StoreVT = MRI.getType(Reg); 3564 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3565 3566 if (ST.hasUnpackedD16VMem()) { 3567 auto Unmerge = B.buildUnmerge(S16, Reg); 3568 3569 SmallVector<Register, 4> WideRegs; 3570 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3571 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3572 3573 int NumElts = StoreVT.getNumElements(); 3574 3575 return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 3576 } 3577 3578 if (ImageStore && ST.hasImageStoreD16Bug()) { 3579 if (StoreVT.getNumElements() == 2) { 3580 SmallVector<Register, 4> PackedRegs; 3581 Reg = B.buildBitcast(S32, Reg).getReg(0); 3582 PackedRegs.push_back(Reg); 3583 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3584 return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0); 3585 } 3586 3587 if (StoreVT.getNumElements() == 3) { 3588 SmallVector<Register, 4> PackedRegs; 3589 auto Unmerge = B.buildUnmerge(S16, Reg); 3590 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3591 PackedRegs.push_back(Unmerge.getReg(I)); 3592 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3593 Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0); 3594 return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0); 3595 } 3596 3597 if (StoreVT.getNumElements() == 4) { 3598 SmallVector<Register, 4> PackedRegs; 3599 Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0); 3600 auto Unmerge = B.buildUnmerge(S32, Reg); 3601 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3602 PackedRegs.push_back(Unmerge.getReg(I)); 3603 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3604 return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0); 3605 } 3606 3607 llvm_unreachable("invalid data type"); 3608 } 3609 3610 return Reg; 3611 } 3612 3613 Register AMDGPULegalizerInfo::fixStoreSourceType( 3614 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3615 MachineRegisterInfo *MRI = B.getMRI(); 3616 LLT Ty = MRI->getType(VData); 3617 3618 const LLT S16 = LLT::scalar(16); 3619 3620 // Fixup illegal register types for i8 stores. 3621 if (Ty == LLT::scalar(8) || Ty == S16) { 3622 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3623 return AnyExt; 3624 } 3625 3626 if (Ty.isVector()) { 3627 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3628 if (IsFormat) 3629 return handleD16VData(B, *MRI, VData); 3630 } 3631 } 3632 3633 return VData; 3634 } 3635 3636 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3637 MachineRegisterInfo &MRI, 3638 MachineIRBuilder &B, 3639 bool IsTyped, 3640 bool IsFormat) const { 3641 Register VData = MI.getOperand(1).getReg(); 3642 LLT Ty = MRI.getType(VData); 3643 LLT EltTy = Ty.getScalarType(); 3644 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3645 const LLT S32 = LLT::scalar(32); 3646 3647 VData = fixStoreSourceType(B, VData, IsFormat); 3648 Register RSrc = MI.getOperand(2).getReg(); 3649 3650 MachineMemOperand *MMO = *MI.memoperands_begin(); 3651 const int MemSize = MMO->getSize(); 3652 3653 unsigned ImmOffset; 3654 unsigned TotalOffset; 3655 3656 // The typed intrinsics add an immediate after the registers. 3657 const unsigned NumVIndexOps = IsTyped ? 
8 : 7; 3658 3659 // The struct intrinsic variants add one additional operand over raw. 3660 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3661 Register VIndex; 3662 int OpOffset = 0; 3663 if (HasVIndex) { 3664 VIndex = MI.getOperand(3).getReg(); 3665 OpOffset = 1; 3666 } 3667 3668 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3669 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3670 3671 unsigned Format = 0; 3672 if (IsTyped) { 3673 Format = MI.getOperand(5 + OpOffset).getImm(); 3674 ++OpOffset; 3675 } 3676 3677 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3678 3679 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3680 if (TotalOffset != 0) 3681 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3682 3683 unsigned Opc; 3684 if (IsTyped) { 3685 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3686 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3687 } else if (IsFormat) { 3688 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3689 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3690 } else { 3691 switch (MemSize) { 3692 case 1: 3693 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3694 break; 3695 case 2: 3696 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3697 break; 3698 default: 3699 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3700 break; 3701 } 3702 } 3703 3704 if (!VIndex) 3705 VIndex = B.buildConstant(S32, 0).getReg(0); 3706 3707 auto MIB = B.buildInstr(Opc) 3708 .addUse(VData) // vdata 3709 .addUse(RSrc) // rsrc 3710 .addUse(VIndex) // vindex 3711 .addUse(VOffset) // voffset 3712 .addUse(SOffset) // soffset 3713 .addImm(ImmOffset); // offset(imm) 3714 3715 if (IsTyped) 3716 MIB.addImm(Format); 3717 3718 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3719 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3720 .addMemOperand(MMO); 3721 3722 MI.eraseFromParent(); 3723 return true; 3724 } 3725 3726 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3727 MachineRegisterInfo &MRI, 3728 MachineIRBuilder &B, 3729 bool IsFormat, 3730 bool IsTyped) const { 3731 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3732 MachineMemOperand *MMO = *MI.memoperands_begin(); 3733 const int MemSize = MMO->getSize(); 3734 const LLT S32 = LLT::scalar(32); 3735 3736 Register Dst = MI.getOperand(0).getReg(); 3737 Register RSrc = MI.getOperand(2).getReg(); 3738 3739 // The typed intrinsics add an immediate after the registers. 3740 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3741 3742 // The struct intrinsic variants add one additional operand over raw. 
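  // For reference: the raw form of these loads carries (dst, intrinsic ID,
  // rsrc, voffset, soffset, aux), with an extra format immediate for the
  // typed variants; the struct form inserts a vindex operand after rsrc,
  // which is what the operand-count check below detects.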
3743 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3744 Register VIndex; 3745 int OpOffset = 0; 3746 if (HasVIndex) { 3747 VIndex = MI.getOperand(3).getReg(); 3748 OpOffset = 1; 3749 } 3750 3751 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3752 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3753 3754 unsigned Format = 0; 3755 if (IsTyped) { 3756 Format = MI.getOperand(5 + OpOffset).getImm(); 3757 ++OpOffset; 3758 } 3759 3760 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3761 unsigned ImmOffset; 3762 unsigned TotalOffset; 3763 3764 LLT Ty = MRI.getType(Dst); 3765 LLT EltTy = Ty.getScalarType(); 3766 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3767 const bool Unpacked = ST.hasUnpackedD16VMem(); 3768 3769 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3770 if (TotalOffset != 0) 3771 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); 3772 3773 unsigned Opc; 3774 3775 if (IsTyped) { 3776 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3777 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3778 } else if (IsFormat) { 3779 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3780 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3781 } else { 3782 switch (MemSize) { 3783 case 1: 3784 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3785 break; 3786 case 2: 3787 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3788 break; 3789 default: 3790 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3791 break; 3792 } 3793 } 3794 3795 Register LoadDstReg; 3796 3797 bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); 3798 LLT UnpackedTy = Ty.changeElementSize(32); 3799 3800 if (IsExtLoad) 3801 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3802 else if (Unpacked && IsD16 && Ty.isVector()) 3803 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3804 else 3805 LoadDstReg = Dst; 3806 3807 if (!VIndex) 3808 VIndex = B.buildConstant(S32, 0).getReg(0); 3809 3810 auto MIB = B.buildInstr(Opc) 3811 .addDef(LoadDstReg) // vdata 3812 .addUse(RSrc) // rsrc 3813 .addUse(VIndex) // vindex 3814 .addUse(VOffset) // voffset 3815 .addUse(SOffset) // soffset 3816 .addImm(ImmOffset); // offset(imm) 3817 3818 if (IsTyped) 3819 MIB.addImm(Format); 3820 3821 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3822 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3823 .addMemOperand(MMO); 3824 3825 if (LoadDstReg != Dst) { 3826 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3827 3828 // Widen result for extending loads was widened. 3829 if (IsExtLoad) 3830 B.buildTrunc(Dst, LoadDstReg); 3831 else { 3832 // Repack to original 16-bit vector result 3833 // FIXME: G_TRUNC should work, but legalization currently fails 3834 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 3835 SmallVector<Register, 4> Repack; 3836 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 3837 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 3838 B.buildMerge(Dst, Repack); 3839 } 3840 } 3841 3842 MI.eraseFromParent(); 3843 return true; 3844 } 3845 3846 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 3847 MachineIRBuilder &B, 3848 bool IsInc) const { 3849 unsigned Opc = IsInc ? 
AMDGPU::G_AMDGPU_ATOMIC_INC : 3850 AMDGPU::G_AMDGPU_ATOMIC_DEC; 3851 B.buildInstr(Opc) 3852 .addDef(MI.getOperand(0).getReg()) 3853 .addUse(MI.getOperand(2).getReg()) 3854 .addUse(MI.getOperand(3).getReg()) 3855 .cloneMemRefs(MI); 3856 MI.eraseFromParent(); 3857 return true; 3858 } 3859 3860 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 3861 switch (IntrID) { 3862 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 3863 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 3864 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 3865 case Intrinsic::amdgcn_raw_buffer_atomic_add: 3866 case Intrinsic::amdgcn_struct_buffer_atomic_add: 3867 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 3868 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 3869 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 3870 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 3871 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 3872 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 3873 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 3874 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 3875 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 3876 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 3877 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 3878 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 3879 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 3880 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 3881 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 3882 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 3883 case Intrinsic::amdgcn_raw_buffer_atomic_and: 3884 case Intrinsic::amdgcn_struct_buffer_atomic_and: 3885 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 3886 case Intrinsic::amdgcn_raw_buffer_atomic_or: 3887 case Intrinsic::amdgcn_struct_buffer_atomic_or: 3888 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 3889 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 3890 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 3891 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 3892 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 3893 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 3894 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 3895 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 3896 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 3897 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 3898 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 3899 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 3900 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 3901 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 3902 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 3903 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 3904 default: 3905 llvm_unreachable("unhandled atomic opcode"); 3906 } 3907 } 3908 3909 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 3910 MachineIRBuilder &B, 3911 Intrinsic::ID IID) const { 3912 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 3913 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 3914 const bool HasReturn = MI.getNumExplicitDefs() != 0; 3915 3916 Register Dst; 3917 3918 int OpOffset = 0; 3919 if (HasReturn) { 3920 // A few FP atomics do not support return values. 3921 Dst = MI.getOperand(0).getReg(); 3922 } else { 3923 OpOffset = -1; 3924 } 3925 3926 Register VData = MI.getOperand(2 + OpOffset).getReg(); 3927 Register CmpVal; 3928 3929 if (IsCmpSwap) { 3930 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 3931 ++OpOffset; 3932 } 3933 3934 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 3935 const unsigned NumVIndexOps = (IsCmpSwap ? 
8 : 7) + HasReturn; 3936 3937 // The struct intrinsic variants add one additional operand over raw. 3938 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3939 Register VIndex; 3940 if (HasVIndex) { 3941 VIndex = MI.getOperand(4 + OpOffset).getReg(); 3942 ++OpOffset; 3943 } 3944 3945 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 3946 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 3947 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 3948 3949 MachineMemOperand *MMO = *MI.memoperands_begin(); 3950 3951 unsigned ImmOffset; 3952 unsigned TotalOffset; 3953 std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); 3954 if (TotalOffset != 0) 3955 MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); 3956 3957 if (!VIndex) 3958 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 3959 3960 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 3961 3962 if (HasReturn) 3963 MIB.addDef(Dst); 3964 3965 MIB.addUse(VData); // vdata 3966 3967 if (IsCmpSwap) 3968 MIB.addReg(CmpVal); 3969 3970 MIB.addUse(RSrc) // rsrc 3971 .addUse(VIndex) // vindex 3972 .addUse(VOffset) // voffset 3973 .addUse(SOffset) // soffset 3974 .addImm(ImmOffset) // offset(imm) 3975 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3976 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3977 .addMemOperand(MMO); 3978 3979 MI.eraseFromParent(); 3980 return true; 3981 } 3982 3983 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized 3984 /// vector with s16 typed elements. 3985 static void packImageA16AddressToDwords( 3986 MachineIRBuilder &B, MachineInstr &MI, 3987 SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset, 3988 const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) { 3989 const LLT S16 = LLT::scalar(16); 3990 const LLT V2S16 = LLT::vector(2, 16); 3991 3992 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 3993 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 3994 if (!SrcOp.isReg()) 3995 continue; // _L to _LZ may have eliminated this. 3996 3997 Register AddrReg = SrcOp.getReg(); 3998 3999 if (I < Intr->GradientStart) { 4000 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4001 PackedAddrs.push_back(AddrReg); 4002 } else { 4003 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4004 // derivatives dx/dh and dx/dv are packed with undef. 4005 if (((I + 1) >= EndIdx) || 4006 ((Intr->NumGradients / 2) % 2 == 1 && 4007 (I == static_cast<unsigned>(Intr->GradientStart + 4008 (Intr->NumGradients / 2) - 1) || 4009 I == static_cast<unsigned>(Intr->GradientStart + 4010 Intr->NumGradients - 1))) || 4011 // Check for _L to _LZ optimization 4012 !MI.getOperand(ArgOffset + I + 1).isReg()) { 4013 PackedAddrs.push_back( 4014 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 4015 .getReg(0)); 4016 } else { 4017 PackedAddrs.push_back( 4018 B.buildBuildVector( 4019 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) 4020 .getReg(0)); 4021 ++I; 4022 } 4023 } 4024 } 4025 } 4026 4027 /// Convert from separate vaddr components to a single vector address register, 4028 /// and replace the remaining operands with $noreg. 
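/// Only operands that are still registers are gathered; address counts of 5-7
/// are padded with undef up to 8 since no intermediate register class exists,
/// and every remaining vaddr register operand after the first is cleared to
/// $noreg once the build_vector has been formed.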
4029 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4030 int DimIdx, int NumVAddrs) {
4031 const LLT S32 = LLT::scalar(32);
4032
4033 SmallVector<Register, 8> AddrRegs;
4034 for (int I = 0; I != NumVAddrs; ++I) {
4035 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4036 if (SrcOp.isReg()) {
4037 AddrRegs.push_back(SrcOp.getReg());
4038 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4039 }
4040 }
4041
4042 int NumAddrRegs = AddrRegs.size();
4043 if (NumAddrRegs != 1) {
4044 // Round up to 8 elements for v5-v7
4045 // FIXME: Missing intermediate sized register classes and instructions.
4046 if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4047 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4048 auto Undef = B.buildUndef(S32);
4049 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4050 NumAddrRegs = RoundedNumRegs;
4051 }
4052
4053 auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4054 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4055 }
4056
4057 for (int I = 1; I != NumVAddrs; ++I) {
4058 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4059 if (SrcOp.isReg())
4060 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4061 }
4062 }
4063
4064 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4065 ///
4066 /// Depending on the subtarget, loads/stores with 16-bit element data need to be
4067 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4068 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4069 /// registers.
4070 ///
4071 /// We don't want to directly select image instructions just yet, but also want
4072 /// to expose all register repacking to the legalizer/combiners. We also don't
4073 /// want a selected instruction entering RegBankSelect. In order to avoid
4074 /// defining a multitude of intermediate image instructions, directly hack on
4075 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4076 /// the now-unnecessary arguments with $noreg.
4077 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4078 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4079 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4080
4081 const unsigned NumDefs = MI.getNumExplicitDefs();
4082 const unsigned ArgOffset = NumDefs + 1;
4083 bool IsTFE = NumDefs == 2;
4084 // We are only processing the operands of d16 image operations on subtargets
4085 // that use the unpacked register layout, or need to repack the TFE result.
4086
4087 // TODO: Do we need to guard against already legalized intrinsics?
4088 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4089 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4090
4091 MachineRegisterInfo *MRI = B.getMRI();
4092 const LLT S32 = LLT::scalar(32);
4093 const LLT S16 = LLT::scalar(16);
4094 const LLT V2S16 = LLT::vector(2, 16);
4095
4096 unsigned DMask = 0;
4097
4098 // Check for 16-bit addresses and pack if true.
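  // A16/G16 is inferred from the register types of the first gradient and
  // first coordinate operands: an s16 type means the intrinsic was emitted
  // with 16-bit derivatives and/or 16-bit addresses.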
4099 LLT GradTy = 4100 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4101 LLT AddrTy = 4102 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4103 const bool IsG16 = GradTy == S16; 4104 const bool IsA16 = AddrTy == S16; 4105 4106 int DMaskLanes = 0; 4107 if (!BaseOpcode->Atomic) { 4108 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4109 if (BaseOpcode->Gather4) { 4110 DMaskLanes = 4; 4111 } else if (DMask != 0) { 4112 DMaskLanes = countPopulation(DMask); 4113 } else if (!IsTFE && !BaseOpcode->Store) { 4114 // If dmask is 0, this is a no-op load. This can be eliminated. 4115 B.buildUndef(MI.getOperand(0)); 4116 MI.eraseFromParent(); 4117 return true; 4118 } 4119 } 4120 4121 Observer.changingInstr(MI); 4122 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4123 4124 unsigned NewOpcode = NumDefs == 0 ? 4125 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4126 4127 // Track that we legalized this 4128 MI.setDesc(B.getTII().get(NewOpcode)); 4129 4130 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4131 // dmask to be at least 1 otherwise the instruction will fail 4132 if (IsTFE && DMask == 0) { 4133 DMask = 0x1; 4134 DMaskLanes = 1; 4135 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4136 } 4137 4138 if (BaseOpcode->Atomic) { 4139 Register VData0 = MI.getOperand(2).getReg(); 4140 LLT Ty = MRI->getType(VData0); 4141 4142 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4143 if (Ty.isVector()) 4144 return false; 4145 4146 if (BaseOpcode->AtomicX2) { 4147 Register VData1 = MI.getOperand(3).getReg(); 4148 // The two values are packed in one register. 4149 LLT PackedTy = LLT::vector(2, Ty); 4150 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4151 MI.getOperand(2).setReg(Concat.getReg(0)); 4152 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4153 } 4154 } 4155 4156 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4157 4158 // Optimize _L to _LZ when _L is zero 4159 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4160 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4161 const ConstantFP *ConstantLod; 4162 4163 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4164 m_GFCst(ConstantLod))) { 4165 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4166 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 4167 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4168 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, 4169 Intr->Dim); 4170 4171 // The starting indexes should remain in the same place. 4172 --CorrectedNumVAddrs; 4173 4174 MI.getOperand(MI.getNumExplicitDefs()) 4175 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); 4176 MI.RemoveOperand(ArgOffset + Intr->LodIndex); 4177 Intr = NewImageDimIntr; 4178 } 4179 } 4180 } 4181 4182 // Optimize _mip away, when 'lod' is zero 4183 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { 4184 int64_t ConstantLod; 4185 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, 4186 m_ICst(ConstantLod))) { 4187 if (ConstantLod == 0) { 4188 // TODO: Change intrinsic opcode and remove operand instead or replacing 4189 // it with 0, as the _L to _LZ handling is done above. 4190 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); 4191 --CorrectedNumVAddrs; 4192 } 4193 } 4194 } 4195 4196 // Rewrite the addressing register layout before doing anything else. 
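  // With 16-bit addresses or derivatives, pairs of s16 operands are packed
  // into v2s16 dwords (odd leftovers are padded with undef) and, when NSA is
  // not used, concatenated into a single wide vector register. On the 32-bit
  // path, NSA is only used when at least three address registers remain;
  // otherwise the components are collapsed into one vector operand.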
4197 if (IsA16 || IsG16) { 4198 if (IsA16) { 4199 // Target must support the feature and gradients need to be 16 bit too 4200 if (!ST.hasA16() || !IsG16) 4201 return false; 4202 } else if (!ST.hasG16()) 4203 return false; 4204 4205 if (Intr->NumVAddrs > 1) { 4206 SmallVector<Register, 4> PackedRegs; 4207 // Don't compress addresses for G16 4208 const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart; 4209 packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr, 4210 PackEndIdx); 4211 4212 if (!IsA16) { 4213 // Add uncompressed address 4214 for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) { 4215 int AddrReg = MI.getOperand(ArgOffset + I).getReg(); 4216 assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 4217 PackedRegs.push_back(AddrReg); 4218 } 4219 } 4220 4221 // See also below in the non-a16 branch 4222 const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 4223 4224 if (!UseNSA && PackedRegs.size() > 1) { 4225 LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 4226 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4227 PackedRegs[0] = Concat.getReg(0); 4228 PackedRegs.resize(1); 4229 } 4230 4231 const unsigned NumPacked = PackedRegs.size(); 4232 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4233 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4234 if (!SrcOp.isReg()) { 4235 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4236 continue; 4237 } 4238 4239 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4240 4241 if (I - Intr->VAddrStart < NumPacked) 4242 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4243 else 4244 SrcOp.setReg(AMDGPU::NoRegister); 4245 } 4246 } 4247 } else { 4248 // If the register allocator cannot place the address registers contiguously 4249 // without introducing moves, then using the non-sequential address encoding 4250 // is always preferable, since it saves VALU instructions and is usually a 4251 // wash in terms of code size or even better. 4252 // 4253 // However, we currently have no way of hinting to the register allocator 4254 // that MIMG addresses should be placed contiguously when it is possible to 4255 // do so, so force non-NSA for the common 2-address case as a heuristic. 4256 // 4257 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4258 // allocation when possible. 4259 const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 4260 4261 if (!UseNSA && Intr->NumVAddrs > 1) 4262 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4263 Intr->NumVAddrs); 4264 } 4265 4266 int Flags = 0; 4267 if (IsA16) 4268 Flags |= 1; 4269 if (IsG16) 4270 Flags |= 2; 4271 MI.addOperand(MachineOperand::CreateImm(Flags)); 4272 4273 if (BaseOpcode->Store) { // No TFE for stores? 4274 // TODO: Handle dmask trim 4275 Register VData = MI.getOperand(1).getReg(); 4276 LLT Ty = MRI->getType(VData); 4277 if (!Ty.isVector() || Ty.getElementType() != S16) 4278 return true; 4279 4280 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 4281 if (RepackedReg != VData) { 4282 MI.getOperand(1).setReg(RepackedReg); 4283 } 4284 4285 return true; 4286 } 4287 4288 Register DstReg = MI.getOperand(0).getReg(); 4289 LLT Ty = MRI->getType(DstReg); 4290 const LLT EltTy = Ty.getScalarType(); 4291 const bool IsD16 = Ty.getScalarType() == S16; 4292 const int NumElts = Ty.isVector() ? 
Ty.getNumElements() : 1; 4293 4294 // Confirm that the return type is large enough for the dmask specified 4295 if (NumElts < DMaskLanes) 4296 return false; 4297 4298 if (NumElts > 4 || DMaskLanes > 4) 4299 return false; 4300 4301 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 4302 const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 4303 4304 // The raw dword aligned data component of the load. The only legal cases 4305 // where this matters should be when using the packed D16 format, for 4306 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 4307 LLT RoundedTy; 4308 4309 // S32 vector to to cover all data, plus TFE result element. 4310 LLT TFETy; 4311 4312 // Register type to use for each loaded component. Will be S32 or V2S16. 4313 LLT RegTy; 4314 4315 if (IsD16 && ST.hasUnpackedD16VMem()) { 4316 RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); 4317 TFETy = LLT::vector(AdjustedNumElts + 1, 32); 4318 RegTy = S32; 4319 } else { 4320 unsigned EltSize = EltTy.getSizeInBits(); 4321 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 4322 unsigned RoundedSize = 32 * RoundedElts; 4323 RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 4324 TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 4325 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 4326 } 4327 4328 // The return type does not need adjustment. 4329 // TODO: Should we change s16 case to s32 or <2 x s16>? 4330 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 4331 return true; 4332 4333 Register Dst1Reg; 4334 4335 // Insert after the instruction. 4336 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 4337 4338 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 4339 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 4340 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 4341 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 4342 4343 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 4344 4345 MI.getOperand(0).setReg(NewResultReg); 4346 4347 // In the IR, TFE is supposed to be used with a 2 element struct return 4348 // type. The intruction really returns these two values in one contiguous 4349 // register, with one additional dword beyond the loaded data. Rewrite the 4350 // return type to use a single register result. 4351 4352 if (IsTFE) { 4353 Dst1Reg = MI.getOperand(1).getReg(); 4354 if (MRI->getType(Dst1Reg) != S32) 4355 return false; 4356 4357 // TODO: Make sure the TFE operand bit is set. 4358 MI.RemoveOperand(1); 4359 4360 // Handle the easy case that requires no repack instructions. 4361 if (Ty == S32) { 4362 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 4363 return true; 4364 } 4365 } 4366 4367 // Now figure out how to copy the new result register back into the old 4368 // result. 4369 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4370 4371 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4372 4373 if (ResultNumRegs == 1) { 4374 assert(!IsTFE); 4375 ResultRegs[0] = NewResultReg; 4376 } else { 4377 // We have to repack into a new vector of some kind. 4378 for (int I = 0; I != NumDataRegs; ++I) 4379 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4380 B.buildUnmerge(ResultRegs, NewResultReg); 4381 4382 // Drop the final TFE element to get the data part. The TFE result is 4383 // directly written to the right place already. 
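    // (ResultRegs was seeded with Dst1Reg, so the unmerge above has already
    // placed the trailing status dword in the TFE result register.)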
4384 if (IsTFE) 4385 ResultRegs.resize(NumDataRegs); 4386 } 4387 4388 // For an s16 scalar result, we form an s32 result with a truncate regardless 4389 // of packed vs. unpacked. 4390 if (IsD16 && !Ty.isVector()) { 4391 B.buildTrunc(DstReg, ResultRegs[0]); 4392 return true; 4393 } 4394 4395 // Avoid a build/concat_vector of 1 entry. 4396 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4397 B.buildBitcast(DstReg, ResultRegs[0]); 4398 return true; 4399 } 4400 4401 assert(Ty.isVector()); 4402 4403 if (IsD16) { 4404 // For packed D16 results with TFE enabled, all the data components are 4405 // S32. Cast back to the expected type. 4406 // 4407 // TODO: We don't really need to use load s32 elements. We would only need one 4408 // cast for the TFE result if a multiple of v2s16 was used. 4409 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4410 for (Register &Reg : ResultRegs) 4411 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4412 } else if (ST.hasUnpackedD16VMem()) { 4413 for (Register &Reg : ResultRegs) 4414 Reg = B.buildTrunc(S16, Reg).getReg(0); 4415 } 4416 } 4417 4418 auto padWithUndef = [&](LLT Ty, int NumElts) { 4419 if (NumElts == 0) 4420 return; 4421 Register Undef = B.buildUndef(Ty).getReg(0); 4422 for (int I = 0; I != NumElts; ++I) 4423 ResultRegs.push_back(Undef); 4424 }; 4425 4426 // Pad out any elements eliminated due to the dmask. 4427 LLT ResTy = MRI->getType(ResultRegs[0]); 4428 if (!ResTy.isVector()) { 4429 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4430 B.buildBuildVector(DstReg, ResultRegs); 4431 return true; 4432 } 4433 4434 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4435 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4436 4437 // Deal with the one annoying legal case. 4438 const LLT V3S16 = LLT::vector(3, 16); 4439 if (Ty == V3S16) { 4440 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4441 auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 4442 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4443 return true; 4444 } 4445 4446 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4447 B.buildConcatVectors(DstReg, ResultRegs); 4448 return true; 4449 } 4450 4451 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4452 LegalizerHelper &Helper, MachineInstr &MI) const { 4453 MachineIRBuilder &B = Helper.MIRBuilder; 4454 GISelChangeObserver &Observer = Helper.Observer; 4455 4456 Register Dst = MI.getOperand(0).getReg(); 4457 LLT Ty = B.getMRI()->getType(Dst); 4458 unsigned Size = Ty.getSizeInBits(); 4459 MachineFunction &MF = B.getMF(); 4460 4461 Observer.changingInstr(MI); 4462 4463 if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4464 Ty = getBitcastRegisterType(Ty); 4465 Helper.bitcastDst(MI, Ty, 0); 4466 Dst = MI.getOperand(0).getReg(); 4467 B.setInsertPt(B.getMBB(), MI); 4468 } 4469 4470 // FIXME: We don't really need this intermediate instruction. The intrinsic 4471 // should be fixed to have a memory operand. Since it's readnone, we're not 4472 // allowed to add one. 4473 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4474 MI.RemoveOperand(1); // Remove intrinsic ID 4475 4476 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4477 // TODO: Should this use datalayout alignment? 
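  // Synthesize the memory operand by hand: model the access as a
  // dereferenceable, invariant load of ceil(Size / 8) bytes with an assumed
  // 4-byte alignment.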
4478 const unsigned MemSize = (Size + 7) / 8; 4479 const Align MemAlign(4); 4480 MachineMemOperand *MMO = MF.getMachineMemOperand( 4481 MachinePointerInfo(), 4482 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4483 MachineMemOperand::MOInvariant, 4484 MemSize, MemAlign); 4485 MI.addMemOperand(MF, MMO); 4486 4487 // There are no 96-bit result scalar loads, but widening to 128-bit should 4488 // always be legal. We may need to restore this to a 96-bit result if it turns 4489 // out this needs to be converted to a vector load during RegBankSelect. 4490 if (!isPowerOf2_32(Size)) { 4491 if (Ty.isVector()) 4492 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4493 else 4494 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4495 } 4496 4497 Observer.changedInstr(MI); 4498 return true; 4499 } 4500 4501 // TODO: Move to selection 4502 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4503 MachineRegisterInfo &MRI, 4504 MachineIRBuilder &B) const { 4505 // Is non-HSA path or trap-handler disabled? then, insert s_endpgm instruction 4506 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4507 !ST.isTrapHandlerEnabled()) { 4508 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4509 } else { 4510 // Pass queue pointer to trap handler as input, and insert trap instruction 4511 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4512 MachineRegisterInfo &MRI = *B.getMRI(); 4513 4514 Register LiveIn = 4515 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4516 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4517 return false; 4518 4519 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4520 B.buildCopy(SGPR01, LiveIn); 4521 B.buildInstr(AMDGPU::S_TRAP) 4522 .addImm(GCNSubtarget::TrapIDLLVMTrap) 4523 .addReg(SGPR01, RegState::Implicit); 4524 } 4525 4526 MI.eraseFromParent(); 4527 return true; 4528 } 4529 4530 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4531 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4532 // Is non-HSA path or trap-handler disabled? then, report a warning 4533 // accordingly 4534 if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 4535 !ST.isTrapHandlerEnabled()) { 4536 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4537 "debugtrap handler not supported", 4538 MI.getDebugLoc(), DS_Warning); 4539 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4540 Ctx.diagnose(NoTrap); 4541 } else { 4542 // Insert debug-trap instruction 4543 B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap); 4544 } 4545 4546 MI.eraseFromParent(); 4547 return true; 4548 } 4549 4550 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4551 MachineIRBuilder &B) const { 4552 MachineRegisterInfo &MRI = *B.getMRI(); 4553 const LLT S16 = LLT::scalar(16); 4554 const LLT S32 = LLT::scalar(32); 4555 4556 Register DstReg = MI.getOperand(0).getReg(); 4557 Register NodePtr = MI.getOperand(2).getReg(); 4558 Register RayExtent = MI.getOperand(3).getReg(); 4559 Register RayOrigin = MI.getOperand(4).getReg(); 4560 Register RayDir = MI.getOperand(5).getReg(); 4561 Register RayInvDir = MI.getOperand(6).getReg(); 4562 Register TDescr = MI.getOperand(7).getReg(); 4563 4564 bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4565 bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4566 unsigned Opcode = IsA16 ? Is64 ? 
AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa 4567 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa 4568 : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa 4569 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; 4570 4571 SmallVector<Register, 12> Ops; 4572 if (Is64) { 4573 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4574 Ops.push_back(Unmerge.getReg(0)); 4575 Ops.push_back(Unmerge.getReg(1)); 4576 } else { 4577 Ops.push_back(NodePtr); 4578 } 4579 Ops.push_back(RayExtent); 4580 4581 auto packLanes = [&Ops, &S32, &B] (Register Src) { 4582 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); 4583 Ops.push_back(Unmerge.getReg(0)); 4584 Ops.push_back(Unmerge.getReg(1)); 4585 Ops.push_back(Unmerge.getReg(2)); 4586 }; 4587 4588 packLanes(RayOrigin); 4589 if (IsA16) { 4590 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); 4591 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); 4592 Register R1 = MRI.createGenericVirtualRegister(S32); 4593 Register R2 = MRI.createGenericVirtualRegister(S32); 4594 Register R3 = MRI.createGenericVirtualRegister(S32); 4595 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4596 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4597 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4598 Ops.push_back(R1); 4599 Ops.push_back(R2); 4600 Ops.push_back(R3); 4601 } else { 4602 packLanes(RayDir); 4603 packLanes(RayInvDir); 4604 } 4605 4606 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4607 .addDef(DstReg) 4608 .addImm(Opcode); 4609 4610 for (Register R : Ops) { 4611 MIB.addUse(R); 4612 } 4613 4614 MIB.addUse(TDescr) 4615 .addImm(IsA16 ? 1 : 0) 4616 .cloneMemRefs(MI); 4617 4618 MI.eraseFromParent(); 4619 return true; 4620 } 4621 4622 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4623 MachineInstr &MI) const { 4624 MachineIRBuilder &B = Helper.MIRBuilder; 4625 MachineRegisterInfo &MRI = *B.getMRI(); 4626 4627 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 4628 auto IntrID = MI.getIntrinsicID(); 4629 switch (IntrID) { 4630 case Intrinsic::amdgcn_if: 4631 case Intrinsic::amdgcn_else: { 4632 MachineInstr *Br = nullptr; 4633 MachineBasicBlock *UncondBrTarget = nullptr; 4634 bool Negated = false; 4635 if (MachineInstr *BrCond = 4636 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4637 const SIRegisterInfo *TRI 4638 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4639 4640 Register Def = MI.getOperand(1).getReg(); 4641 Register Use = MI.getOperand(3).getReg(); 4642 4643 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4644 4645 if (Negated) 4646 std::swap(CondBrTarget, UncondBrTarget); 4647 4648 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4649 if (IntrID == Intrinsic::amdgcn_if) { 4650 B.buildInstr(AMDGPU::SI_IF) 4651 .addDef(Def) 4652 .addUse(Use) 4653 .addMBB(UncondBrTarget); 4654 } else { 4655 B.buildInstr(AMDGPU::SI_ELSE) 4656 .addDef(Def) 4657 .addUse(Use) 4658 .addMBB(UncondBrTarget); 4659 } 4660 4661 if (Br) { 4662 Br->getOperand(0).setMBB(CondBrTarget); 4663 } else { 4664 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4665 // since we're swapping branch targets it needs to be reinserted. 
4666 // FIXME: IRTranslator should probably not do this 4667 B.buildBr(*CondBrTarget); 4668 } 4669 4670 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4671 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4672 MI.eraseFromParent(); 4673 BrCond->eraseFromParent(); 4674 return true; 4675 } 4676 4677 return false; 4678 } 4679 case Intrinsic::amdgcn_loop: { 4680 MachineInstr *Br = nullptr; 4681 MachineBasicBlock *UncondBrTarget = nullptr; 4682 bool Negated = false; 4683 if (MachineInstr *BrCond = 4684 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4685 const SIRegisterInfo *TRI 4686 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4687 4688 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4689 Register Reg = MI.getOperand(2).getReg(); 4690 4691 if (Negated) 4692 std::swap(CondBrTarget, UncondBrTarget); 4693 4694 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4695 B.buildInstr(AMDGPU::SI_LOOP) 4696 .addUse(Reg) 4697 .addMBB(UncondBrTarget); 4698 4699 if (Br) 4700 Br->getOperand(0).setMBB(CondBrTarget); 4701 else 4702 B.buildBr(*CondBrTarget); 4703 4704 MI.eraseFromParent(); 4705 BrCond->eraseFromParent(); 4706 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4707 return true; 4708 } 4709 4710 return false; 4711 } 4712 case Intrinsic::amdgcn_kernarg_segment_ptr: 4713 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4714 // This only makes sense to call in a kernel, so just lower to null. 4715 B.buildConstant(MI.getOperand(0).getReg(), 0); 4716 MI.eraseFromParent(); 4717 return true; 4718 } 4719 4720 return legalizePreloadedArgIntrin( 4721 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 4722 case Intrinsic::amdgcn_implicitarg_ptr: 4723 return legalizeImplicitArgPtr(MI, MRI, B); 4724 case Intrinsic::amdgcn_workitem_id_x: 4725 return legalizePreloadedArgIntrin(MI, MRI, B, 4726 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 4727 case Intrinsic::amdgcn_workitem_id_y: 4728 return legalizePreloadedArgIntrin(MI, MRI, B, 4729 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 4730 case Intrinsic::amdgcn_workitem_id_z: 4731 return legalizePreloadedArgIntrin(MI, MRI, B, 4732 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 4733 case Intrinsic::amdgcn_workgroup_id_x: 4734 return legalizePreloadedArgIntrin(MI, MRI, B, 4735 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 4736 case Intrinsic::amdgcn_workgroup_id_y: 4737 return legalizePreloadedArgIntrin(MI, MRI, B, 4738 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 4739 case Intrinsic::amdgcn_workgroup_id_z: 4740 return legalizePreloadedArgIntrin(MI, MRI, B, 4741 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 4742 case Intrinsic::amdgcn_dispatch_ptr: 4743 return legalizePreloadedArgIntrin(MI, MRI, B, 4744 AMDGPUFunctionArgInfo::DISPATCH_PTR); 4745 case Intrinsic::amdgcn_queue_ptr: 4746 return legalizePreloadedArgIntrin(MI, MRI, B, 4747 AMDGPUFunctionArgInfo::QUEUE_PTR); 4748 case Intrinsic::amdgcn_implicit_buffer_ptr: 4749 return legalizePreloadedArgIntrin( 4750 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 4751 case Intrinsic::amdgcn_dispatch_id: 4752 return legalizePreloadedArgIntrin(MI, MRI, B, 4753 AMDGPUFunctionArgInfo::DISPATCH_ID); 4754 case Intrinsic::amdgcn_fdiv_fast: 4755 return legalizeFDIVFastIntrin(MI, MRI, B); 4756 case Intrinsic::amdgcn_is_shared: 4757 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 4758 case Intrinsic::amdgcn_is_private: 4759 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 4760 case Intrinsic::amdgcn_wavefrontsize: { 4761 
B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 4762 MI.eraseFromParent(); 4763 return true; 4764 } 4765 case Intrinsic::amdgcn_s_buffer_load: 4766 return legalizeSBufferLoad(Helper, MI); 4767 case Intrinsic::amdgcn_raw_buffer_store: 4768 case Intrinsic::amdgcn_struct_buffer_store: 4769 return legalizeBufferStore(MI, MRI, B, false, false); 4770 case Intrinsic::amdgcn_raw_buffer_store_format: 4771 case Intrinsic::amdgcn_struct_buffer_store_format: 4772 return legalizeBufferStore(MI, MRI, B, false, true); 4773 case Intrinsic::amdgcn_raw_tbuffer_store: 4774 case Intrinsic::amdgcn_struct_tbuffer_store: 4775 return legalizeBufferStore(MI, MRI, B, true, true); 4776 case Intrinsic::amdgcn_raw_buffer_load: 4777 case Intrinsic::amdgcn_struct_buffer_load: 4778 return legalizeBufferLoad(MI, MRI, B, false, false); 4779 case Intrinsic::amdgcn_raw_buffer_load_format: 4780 case Intrinsic::amdgcn_struct_buffer_load_format: 4781 return legalizeBufferLoad(MI, MRI, B, true, false); 4782 case Intrinsic::amdgcn_raw_tbuffer_load: 4783 case Intrinsic::amdgcn_struct_tbuffer_load: 4784 return legalizeBufferLoad(MI, MRI, B, true, true); 4785 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4786 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4787 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4788 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4789 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4790 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4791 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4792 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4793 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4794 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4795 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4796 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4797 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4798 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4799 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4800 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4801 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4802 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4803 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4804 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4805 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4806 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4807 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4808 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4809 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4810 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4811 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4812 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4813 return legalizeBufferAtomic(MI, B, IntrID); 4814 case Intrinsic::amdgcn_atomic_inc: 4815 return legalizeAtomicIncDec(MI, B, true); 4816 case Intrinsic::amdgcn_atomic_dec: 4817 return legalizeAtomicIncDec(MI, B, false); 4818 case Intrinsic::trap: 4819 return legalizeTrapIntrinsic(MI, MRI, B); 4820 case Intrinsic::debugtrap: 4821 return legalizeDebugTrapIntrinsic(MI, MRI, B); 4822 case Intrinsic::amdgcn_rsq_clamp: 4823 return legalizeRsqClampIntrinsic(MI, MRI, B); 4824 case Intrinsic::amdgcn_ds_fadd: 4825 case Intrinsic::amdgcn_ds_fmin: 4826 case Intrinsic::amdgcn_ds_fmax: 4827 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 4828 case Intrinsic::amdgcn_image_bvh_intersect_ray: 4829 return legalizeBVHIntrinsic(MI, B); 4830 default: { 4831 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 4832 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 4833 return legalizeImageIntrinsic(MI, B, Helper.Observer, 
ImageDimIntr); 4834 return true; 4835 } 4836 } 4837 4838 return true; 4839 } 4840