//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx,
                          LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(
      TypeIdx,
      LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32
// bits.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  LLT CoercedTy;
  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::make_pair(
      TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note that in this case the memory access itself
/// changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            unsigned AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::fixed_vector(2, 8);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V4S16 = LLT::fixed_vector(4, 16);

  const LLT V2S32 = LLT::fixed_vector(2, 32);
  const LLT V3S32 = LLT::fixed_vector(3, 32);
  const LLT V4S32 = LLT::fixed_vector(4, 32);
  const LLT V5S32 = LLT::fixed_vector(5, 32);
  const LLT V6S32 = LLT::fixed_vector(6, 32);
  const LLT V7S32 = LLT::fixed_vector(7, 32);
  const LLT V8S32 = LLT::fixed_vector(8, 32);
  const LLT V9S32 = LLT::fixed_vector(9, 32);
  const LLT V10S32 = LLT::fixed_vector(10, 32);
  const LLT V11S32 = LLT::fixed_vector(11, 32);
  const LLT V12S32 = LLT::fixed_vector(12, 32);
  const LLT V13S32 = LLT::fixed_vector(13, 32);
  const LLT V14S32 = LLT::fixed_vector(14, 32);
  const LLT V15S32 = LLT::fixed_vector(15, 32);
  const LLT V16S32 = LLT::fixed_vector(16, 32);
  const LLT V32S32 = LLT::fixed_vector(32, 32);

  const LLT V2S64 = LLT::fixed_vector(2, 64);
  const LLT V3S64 = LLT::fixed_vector(3, 64);
  const LLT V4S64 = LLT::fixed_vector(4, 64);
  const LLT V5S64 = LLT::fixed_vector(5, 64);
  const LLT V6S64 = LLT::fixed_vector(6, 64);
  const LLT V7S64 = LLT::fixed_vector(7, 64);
  const LLT V8S64 = LLT::fixed_vector(8, 64);
  const LLT V16S64 = LLT::fixed_vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  // s1 for VCC branches, s32 for SCC branches.
  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32); // FIXME: min should be 16

    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
556 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 557 .legalFor({S32, S16}) // Clamp modifier 558 .minScalar(0, S16) 559 .scalarize(0) 560 .widenScalarToNextPow2(0, 16) 561 .lower(); 562 563 // We're just lowering this, but it helps get a better result to try to 564 // coerce to the desired type first. 565 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 566 .minScalar(0, S16) 567 .scalarize(0) 568 .lower(); 569 } else { 570 getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 571 .legalFor({S32}) 572 .clampScalar(0, S32, S32) 573 .scalarize(0); 574 575 if (ST.hasIntClamp()) { 576 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 577 .legalFor({S32}) // Clamp modifier. 578 .scalarize(0) 579 .minScalarOrElt(0, S32) 580 .lower(); 581 } else { 582 // Clamp bit support was added in VI, along with 16-bit operations. 583 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 584 .minScalar(0, S32) 585 .scalarize(0) 586 .lower(); 587 } 588 589 // FIXME: DAG expansion gets better results. The widening uses the smaller 590 // range values and goes for the min/max lowering directly. 591 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 592 .minScalar(0, S32) 593 .scalarize(0) 594 .lower(); 595 } 596 597 getActionDefinitionsBuilder( 598 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) 599 .customFor({S32, S64}) 600 .clampScalar(0, S32, S64) 601 .widenScalarToNextPow2(0, 32) 602 .scalarize(0); 603 604 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 605 .legalFor({S32}) 606 .maxScalarOrElt(0, S32); 607 608 if (ST.hasVOP3PInsts()) { 609 Mulh 610 .clampMaxNumElements(0, S8, 2) 611 .lowerFor({V2S8}); 612 } 613 614 Mulh 615 .scalarize(0) 616 .lower(); 617 618 // Report legal for any types we can handle anywhere. For the cases only legal 619 // on the SALU, RegBankSelect will be able to re-legalize. 620 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 621 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 622 .clampScalar(0, S32, S64) 623 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 624 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 625 .widenScalarToNextPow2(0) 626 .scalarize(0); 627 628 getActionDefinitionsBuilder({G_UADDO, G_USUBO, 629 G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 630 .legalFor({{S32, S1}, {S32, S32}}) 631 .minScalar(0, S32) 632 // TODO: .scalarize(0) 633 .lower(); 634 635 getActionDefinitionsBuilder(G_BITCAST) 636 // Don't worry about the size constraint. 637 .legalIf(all(isRegisterType(0), isRegisterType(1))) 638 .lower(); 639 640 641 getActionDefinitionsBuilder(G_CONSTANT) 642 .legalFor({S1, S32, S64, S16, GlobalPtr, 643 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 644 .legalIf(isPointer(0)) 645 .clampScalar(0, S32, S64) 646 .widenScalarToNextPow2(0); 647 648 getActionDefinitionsBuilder(G_FCONSTANT) 649 .legalFor({S32, S64, S16}) 650 .clampScalar(0, S16, S64); 651 652 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 653 .legalIf(isRegisterType(0)) 654 // s1 and s16 are special cases because they have legal operations on 655 // them, but don't really occupy registers in the normal way. 656 .legalFor({S1, S16}) 657 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 658 .clampScalarOrElt(0, S32, MaxScalar) 659 .widenScalarToNextPow2(0, 32) 660 .clampMaxNumElements(0, S32, 16); 661 662 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr}); 663 664 // If the amount is divergent, we have to do a wave reduction to get the 665 // maximum value, so this is expanded during RegBankSelect. 
666 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 667 .legalFor({{PrivatePtr, S32}}); 668 669 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 670 .customIf(typeIsNot(0, PrivatePtr)); 671 672 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr}); 673 674 auto &FPOpActions = getActionDefinitionsBuilder( 675 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 676 .legalFor({S32, S64}); 677 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 678 .customFor({S32, S64}); 679 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 680 .customFor({S32, S64}); 681 682 if (ST.has16BitInsts()) { 683 if (ST.hasVOP3PInsts()) 684 FPOpActions.legalFor({S16, V2S16}); 685 else 686 FPOpActions.legalFor({S16}); 687 688 TrigActions.customFor({S16}); 689 FDIVActions.customFor({S16}); 690 } 691 692 auto &MinNumMaxNum = getActionDefinitionsBuilder({ 693 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 694 695 if (ST.hasVOP3PInsts()) { 696 MinNumMaxNum.customFor(FPTypesPK16) 697 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 698 .clampMaxNumElements(0, S16, 2) 699 .clampScalar(0, S16, S64) 700 .scalarize(0); 701 } else if (ST.has16BitInsts()) { 702 MinNumMaxNum.customFor(FPTypes16) 703 .clampScalar(0, S16, S64) 704 .scalarize(0); 705 } else { 706 MinNumMaxNum.customFor(FPTypesBase) 707 .clampScalar(0, S32, S64) 708 .scalarize(0); 709 } 710 711 if (ST.hasVOP3PInsts()) 712 FPOpActions.clampMaxNumElements(0, S16, 2); 713 714 FPOpActions 715 .scalarize(0) 716 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 717 718 TrigActions 719 .scalarize(0) 720 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 721 722 FDIVActions 723 .scalarize(0) 724 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 725 726 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 727 .legalFor(FPTypesPK16) 728 .clampMaxNumElements(0, S16, 2) 729 .scalarize(0) 730 .clampScalar(0, S16, S64); 731 732 if (ST.has16BitInsts()) { 733 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 734 .legalFor({S32, S64, S16}) 735 .scalarize(0) 736 .clampScalar(0, S16, S64); 737 } else { 738 getActionDefinitionsBuilder(G_FSQRT) 739 .legalFor({S32, S64}) 740 .scalarize(0) 741 .clampScalar(0, S32, S64); 742 743 if (ST.hasFractBug()) { 744 getActionDefinitionsBuilder(G_FFLOOR) 745 .customFor({S64}) 746 .legalFor({S32, S64}) 747 .scalarize(0) 748 .clampScalar(0, S32, S64); 749 } else { 750 getActionDefinitionsBuilder(G_FFLOOR) 751 .legalFor({S32, S64}) 752 .scalarize(0) 753 .clampScalar(0, S32, S64); 754 } 755 } 756 757 getActionDefinitionsBuilder(G_FPTRUNC) 758 .legalFor({{S32, S64}, {S16, S32}}) 759 .scalarize(0) 760 .lower(); 761 762 getActionDefinitionsBuilder(G_FPEXT) 763 .legalFor({{S64, S32}, {S32, S16}}) 764 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 765 .scalarize(0); 766 767 getActionDefinitionsBuilder(G_FSUB) 768 // Use actual fsub instruction 769 .legalFor({S32}) 770 // Must use fadd + fneg 771 .lowerFor({S64, S16, V2S16}) 772 .scalarize(0) 773 .clampScalar(0, S32, S64); 774 775 // Whether this is legal depends on the floating point mode for the function. 
776 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 777 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 778 FMad.customFor({S32, S16}); 779 else if (ST.hasMadMacF32Insts()) 780 FMad.customFor({S32}); 781 else if (ST.hasMadF16()) 782 FMad.customFor({S16}); 783 FMad.scalarize(0) 784 .lower(); 785 786 auto &FRem = getActionDefinitionsBuilder(G_FREM); 787 if (ST.has16BitInsts()) { 788 FRem.customFor({S16, S32, S64}); 789 } else { 790 FRem.minScalar(0, S32) 791 .customFor({S32, S64}); 792 } 793 FRem.scalarize(0); 794 795 // TODO: Do we need to clamp maximum bitwidth? 796 getActionDefinitionsBuilder(G_TRUNC) 797 .legalIf(isScalar(0)) 798 .legalFor({{V2S16, V2S32}}) 799 .clampMaxNumElements(0, S16, 2) 800 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 801 // situations (like an invalid implicit use), we don't want to infinite loop 802 // in the legalizer. 803 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 804 .alwaysLegal(); 805 806 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 807 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 808 {S32, S1}, {S64, S1}, {S16, S1}}) 809 .scalarize(0) 810 .clampScalar(0, S32, S64) 811 .widenScalarToNextPow2(1, 32); 812 813 // TODO: Split s1->s64 during regbankselect for VALU. 814 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 815 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 816 .lowerFor({{S32, S64}}) 817 .lowerIf(typeIs(1, S1)) 818 .customFor({{S64, S64}}); 819 if (ST.has16BitInsts()) 820 IToFP.legalFor({{S16, S16}}); 821 IToFP.clampScalar(1, S32, S64) 822 .minScalar(0, S32) 823 .scalarize(0) 824 .widenScalarToNextPow2(1); 825 826 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 827 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 828 .customFor({{S64, S32}, {S64, S64}}) 829 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 830 if (ST.has16BitInsts()) 831 FPToI.legalFor({{S16, S16}}); 832 else 833 FPToI.minScalar(1, S32); 834 835 FPToI.minScalar(0, S32) 836 .widenScalarToNextPow2(0, 32) 837 .scalarize(0) 838 .lower(); 839 840 // Lower roundeven into G_FRINT 841 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 842 .scalarize(0) 843 .lower(); 844 845 if (ST.has16BitInsts()) { 846 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 847 .legalFor({S16, S32, S64}) 848 .clampScalar(0, S16, S64) 849 .scalarize(0); 850 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 851 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 852 .legalFor({S32, S64}) 853 .clampScalar(0, S32, S64) 854 .scalarize(0); 855 } else { 856 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 857 .legalFor({S32}) 858 .customFor({S64}) 859 .clampScalar(0, S32, S64) 860 .scalarize(0); 861 } 862 863 getActionDefinitionsBuilder(G_PTR_ADD) 864 .legalIf(all(isPointer(0), sameSize(0, 1))) 865 .scalarize(0) 866 .scalarSameSizeAs(1, 0); 867 868 getActionDefinitionsBuilder(G_PTRMASK) 869 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 870 .scalarSameSizeAs(1, 0) 871 .scalarize(0); 872 873 auto &CmpBuilder = 874 getActionDefinitionsBuilder(G_ICMP) 875 // The compare output type differs based on the register bank of the output, 876 // so make both s1 and s32 legal. 877 // 878 // Scalar compares producing output in scc will be promoted to s32, as that 879 // is the allocatable register type that will be needed for the copy from 880 // scc. 
This will be promoted during RegBankSelect, and we assume something 881 // before that won't try to use s32 result types. 882 // 883 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 884 // bank. 885 .legalForCartesianProduct( 886 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 887 .legalForCartesianProduct( 888 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 889 if (ST.has16BitInsts()) { 890 CmpBuilder.legalFor({{S1, S16}}); 891 } 892 893 CmpBuilder 894 .widenScalarToNextPow2(1) 895 .clampScalar(1, S32, S64) 896 .scalarize(0) 897 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 898 899 getActionDefinitionsBuilder(G_FCMP) 900 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 901 .widenScalarToNextPow2(1) 902 .clampScalar(1, S32, S64) 903 .scalarize(0); 904 905 // FIXME: fpow has a selection pattern that should move to custom lowering. 906 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 907 if (ST.has16BitInsts()) 908 Exp2Ops.legalFor({S32, S16}); 909 else 910 Exp2Ops.legalFor({S32}); 911 Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 912 Exp2Ops.scalarize(0); 913 914 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 915 if (ST.has16BitInsts()) 916 ExpOps.customFor({{S32}, {S16}}); 917 else 918 ExpOps.customFor({S32}); 919 ExpOps.clampScalar(0, MinScalarFPTy, S32) 920 .scalarize(0); 921 922 getActionDefinitionsBuilder(G_FPOWI) 923 .clampScalar(0, MinScalarFPTy, S32) 924 .lower(); 925 926 // The 64-bit versions produce 32-bit results, but only on the SALU. 927 getActionDefinitionsBuilder(G_CTPOP) 928 .legalFor({{S32, S32}, {S32, S64}}) 929 .clampScalar(0, S32, S32) 930 .clampScalar(1, S32, S64) 931 .scalarize(0) 932 .widenScalarToNextPow2(0, 32) 933 .widenScalarToNextPow2(1, 32); 934 935 // The hardware instructions return a different result on 0 than the generic 936 // instructions expect. The hardware produces -1, but these produce the 937 // bitwidth. 938 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 939 .scalarize(0) 940 .clampScalar(0, S32, S32) 941 .clampScalar(1, S32, S64) 942 .widenScalarToNextPow2(0, 32) 943 .widenScalarToNextPow2(1, 32) 944 .lower(); 945 946 // The 64-bit versions produce 32-bit results, but only on the SALU. 947 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 948 .legalFor({{S32, S32}, {S32, S64}}) 949 .clampScalar(0, S32, S32) 950 .clampScalar(1, S32, S64) 951 .scalarize(0) 952 .widenScalarToNextPow2(0, 32) 953 .widenScalarToNextPow2(1, 32); 954 955 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 956 // RegBankSelect. 957 getActionDefinitionsBuilder(G_BITREVERSE) 958 .legalFor({S32, S64}) 959 .clampScalar(0, S32, S64) 960 .scalarize(0) 961 .widenScalarToNextPow2(0); 962 963 if (ST.has16BitInsts()) { 964 getActionDefinitionsBuilder(G_BSWAP) 965 .legalFor({S16, S32, V2S16}) 966 .clampMaxNumElements(0, S16, 2) 967 // FIXME: Fixing non-power-of-2 before clamp is workaround for 968 // narrowScalar limitation. 
969 .widenScalarToNextPow2(0) 970 .clampScalar(0, S16, S32) 971 .scalarize(0); 972 973 if (ST.hasVOP3PInsts()) { 974 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 975 .legalFor({S32, S16, V2S16}) 976 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 977 .clampMaxNumElements(0, S16, 2) 978 .minScalar(0, S16) 979 .widenScalarToNextPow2(0) 980 .scalarize(0) 981 .lower(); 982 } else { 983 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 984 .legalFor({S32, S16}) 985 .widenScalarToNextPow2(0) 986 .minScalar(0, S16) 987 .scalarize(0) 988 .lower(); 989 } 990 } else { 991 // TODO: Should have same legality without v_perm_b32 992 getActionDefinitionsBuilder(G_BSWAP) 993 .legalFor({S32}) 994 .lowerIf(scalarNarrowerThan(0, 32)) 995 // FIXME: Fixing non-power-of-2 before clamp is workaround for 996 // narrowScalar limitation. 997 .widenScalarToNextPow2(0) 998 .maxScalar(0, S32) 999 .scalarize(0) 1000 .lower(); 1001 1002 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1003 .legalFor({S32}) 1004 .minScalar(0, S32) 1005 .widenScalarToNextPow2(0) 1006 .scalarize(0) 1007 .lower(); 1008 } 1009 1010 getActionDefinitionsBuilder(G_INTTOPTR) 1011 // List the common cases 1012 .legalForCartesianProduct(AddrSpaces64, {S64}) 1013 .legalForCartesianProduct(AddrSpaces32, {S32}) 1014 .scalarize(0) 1015 // Accept any address space as long as the size matches 1016 .legalIf(sameSize(0, 1)) 1017 .widenScalarIf(smallerThan(1, 0), 1018 [](const LegalityQuery &Query) { 1019 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1020 }) 1021 .narrowScalarIf(largerThan(1, 0), 1022 [](const LegalityQuery &Query) { 1023 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1024 }); 1025 1026 getActionDefinitionsBuilder(G_PTRTOINT) 1027 // List the common cases 1028 .legalForCartesianProduct(AddrSpaces64, {S64}) 1029 .legalForCartesianProduct(AddrSpaces32, {S32}) 1030 .scalarize(0) 1031 // Accept any address space as long as the size matches 1032 .legalIf(sameSize(0, 1)) 1033 .widenScalarIf(smallerThan(0, 1), 1034 [](const LegalityQuery &Query) { 1035 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1036 }) 1037 .narrowScalarIf( 1038 largerThan(0, 1), 1039 [](const LegalityQuery &Query) { 1040 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1041 }); 1042 1043 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1044 .scalarize(0) 1045 .custom(); 1046 1047 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1048 bool IsLoad) -> bool { 1049 const LLT DstTy = Query.Types[0]; 1050 1051 // Split vector extloads. 1052 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1053 unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 1054 1055 if (MemSize < DstTy.getSizeInBits()) 1056 MemSize = std::max(MemSize, AlignBits); 1057 1058 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1059 return true; 1060 1061 const LLT PtrTy = Query.Types[1]; 1062 unsigned AS = PtrTy.getAddressSpace(); 1063 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 1064 return true; 1065 1066 // Catch weird sized loads that don't evenly divide into the access sizes 1067 // TODO: May be able to widen depending on alignment etc. 1068 unsigned NumRegs = (MemSize + 31) / 32; 1069 if (NumRegs == 3) { 1070 if (!ST.hasDwordx3LoadStores()) 1071 return true; 1072 } else { 1073 // If the alignment allows, these should have been widened. 
1074 if (!isPowerOf2_32(NumRegs)) 1075 return true; 1076 } 1077 1078 if (AlignBits < MemSize) { 1079 const SITargetLowering *TLI = ST.getTargetLowering(); 1080 return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 1081 Align(AlignBits / 8)); 1082 } 1083 1084 return false; 1085 }; 1086 1087 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; 1088 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1089 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1090 1091 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1092 // LDS 1093 // TODO: Unsupported flat for SI. 1094 1095 for (unsigned Op : {G_LOAD, G_STORE}) { 1096 const bool IsStore = Op == G_STORE; 1097 1098 auto &Actions = getActionDefinitionsBuilder(Op); 1099 // Explicitly list some common cases. 1100 // TODO: Does this help compile time at all? 1101 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32}, 1102 {V2S32, GlobalPtr, V2S32, GlobalAlign32}, 1103 {V4S32, GlobalPtr, V4S32, GlobalAlign32}, 1104 {S64, GlobalPtr, S64, GlobalAlign32}, 1105 {V2S64, GlobalPtr, V2S64, GlobalAlign32}, 1106 {V2S16, GlobalPtr, V2S16, GlobalAlign32}, 1107 {S32, GlobalPtr, S8, GlobalAlign8}, 1108 {S32, GlobalPtr, S16, GlobalAlign16}, 1109 1110 {S32, LocalPtr, S32, 32}, 1111 {S64, LocalPtr, S64, 32}, 1112 {V2S32, LocalPtr, V2S32, 32}, 1113 {S32, LocalPtr, S8, 8}, 1114 {S32, LocalPtr, S16, 16}, 1115 {V2S16, LocalPtr, S32, 32}, 1116 1117 {S32, PrivatePtr, S32, 32}, 1118 {S32, PrivatePtr, S8, 8}, 1119 {S32, PrivatePtr, S16, 16}, 1120 {V2S16, PrivatePtr, S32, 32}, 1121 1122 {S32, ConstantPtr, S32, GlobalAlign32}, 1123 {V2S32, ConstantPtr, V2S32, GlobalAlign32}, 1124 {V4S32, ConstantPtr, V4S32, GlobalAlign32}, 1125 {S64, ConstantPtr, S64, GlobalAlign32}, 1126 {V2S32, ConstantPtr, V2S32, GlobalAlign32}}); 1127 Actions.legalIf( 1128 [=](const LegalityQuery &Query) -> bool { 1129 return isLoadStoreLegal(ST, Query); 1130 }); 1131 1132 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1133 // 64-bits. 1134 // 1135 // TODO: Should generalize bitcast action into coerce, which will also cover 1136 // inserting addrspacecasts. 1137 Actions.customIf(typeIs(1, Constant32Ptr)); 1138 1139 // Turn any illegal element vectors into something easier to deal 1140 // with. These will ultimately produce 32-bit scalar shifts to extract the 1141 // parts anyway. 1142 // 1143 // For odd 16-bit element vectors, prefer to split those into pieces with 1144 // 16-bit vector parts. 1145 Actions.bitcastIf( 1146 [=](const LegalityQuery &Query) -> bool { 1147 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1148 Query.MMODescrs[0].MemoryTy); 1149 }, bitcastToRegisterType(0)); 1150 1151 if (!IsStore) { 1152 // Widen suitably aligned loads by loading extra bytes. The standard 1153 // legalization actions can't properly express widening memory operands. 
1154 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1155 return shouldWidenLoad(ST, Query, G_LOAD); 1156 }); 1157 } 1158 1159 // FIXME: load/store narrowing should be moved to lower action 1160 Actions 1161 .narrowScalarIf( 1162 [=](const LegalityQuery &Query) -> bool { 1163 return !Query.Types[0].isVector() && 1164 needToSplitMemOp(Query, Op == G_LOAD); 1165 }, 1166 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1167 const LLT DstTy = Query.Types[0]; 1168 const LLT PtrTy = Query.Types[1]; 1169 1170 const unsigned DstSize = DstTy.getSizeInBits(); 1171 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1172 1173 // Split extloads. 1174 if (DstSize > MemSize) 1175 return std::make_pair(0, LLT::scalar(MemSize)); 1176 1177 if (!isPowerOf2_32(DstSize)) { 1178 // We're probably decomposing an odd sized store. Try to split 1179 // to the widest type. TODO: Account for alignment. As-is it 1180 // should be OK, since the new parts will be further legalized. 1181 unsigned FloorSize = PowerOf2Floor(DstSize); 1182 return std::make_pair(0, LLT::scalar(FloorSize)); 1183 } 1184 1185 if (DstSize > 32 && (DstSize % 32 != 0)) { 1186 // FIXME: Need a way to specify non-extload of larger size if 1187 // suitably aligned. 1188 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 1189 } 1190 1191 unsigned MaxSize = maxSizeForAddrSpace(ST, 1192 PtrTy.getAddressSpace(), 1193 Op == G_LOAD); 1194 if (MemSize > MaxSize) 1195 return std::make_pair(0, LLT::scalar(MaxSize)); 1196 1197 unsigned Align = Query.MMODescrs[0].AlignInBits; 1198 return std::make_pair(0, LLT::scalar(Align)); 1199 }) 1200 .fewerElementsIf( 1201 [=](const LegalityQuery &Query) -> bool { 1202 return Query.Types[0].isVector() && 1203 needToSplitMemOp(Query, Op == G_LOAD); 1204 }, 1205 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1206 const LLT DstTy = Query.Types[0]; 1207 const LLT PtrTy = Query.Types[1]; 1208 1209 LLT EltTy = DstTy.getElementType(); 1210 unsigned MaxSize = maxSizeForAddrSpace(ST, 1211 PtrTy.getAddressSpace(), 1212 Op == G_LOAD); 1213 1214 // FIXME: Handle widened to power of 2 results better. This ends 1215 // up scalarizing. 1216 // FIXME: 3 element stores scalarized on SI 1217 1218 // Split if it's too large for the address space. 1219 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1220 if (MemSize > MaxSize) { 1221 unsigned NumElts = DstTy.getNumElements(); 1222 unsigned EltSize = EltTy.getSizeInBits(); 1223 1224 if (MaxSize % EltSize == 0) { 1225 return std::make_pair( 1226 0, LLT::scalarOrVector( 1227 ElementCount::getFixed(MaxSize / EltSize), EltTy)); 1228 } 1229 1230 unsigned NumPieces = MemSize / MaxSize; 1231 1232 // FIXME: Refine when odd breakdowns handled 1233 // The scalars will need to be re-legalized. 1234 if (NumPieces == 1 || NumPieces >= NumElts || 1235 NumElts % NumPieces != 0) 1236 return std::make_pair(0, EltTy); 1237 1238 return std::make_pair( 1239 0, LLT::fixed_vector(NumElts / NumPieces, EltTy)); 1240 } 1241 1242 // FIXME: We could probably handle weird extending loads better. 1243 if (DstTy.getSizeInBits() > MemSize) 1244 return std::make_pair(0, EltTy); 1245 1246 unsigned EltSize = EltTy.getSizeInBits(); 1247 unsigned DstSize = DstTy.getSizeInBits(); 1248 if (!isPowerOf2_32(DstSize)) { 1249 // We're probably decomposing an odd sized store. Try to split 1250 // to the widest type. TODO: Account for alignment. As-is it 1251 // should be OK, since the new parts will be further legalized. 
1252 unsigned FloorSize = PowerOf2Floor(DstSize); 1253 return std::make_pair( 1254 0, LLT::scalarOrVector( 1255 ElementCount::getFixed(FloorSize / EltSize), EltTy)); 1256 } 1257 1258 // Need to split because of alignment. 1259 unsigned Align = Query.MMODescrs[0].AlignInBits; 1260 if (EltSize > Align && 1261 (EltSize / Align < DstTy.getNumElements())) { 1262 return std::make_pair( 1263 0, LLT::fixed_vector(EltSize / Align, EltTy)); 1264 } 1265 1266 // May need relegalization for the scalars. 1267 return std::make_pair(0, EltTy); 1268 }) 1269 .lowerIfMemSizeNotPow2() 1270 .minScalar(0, S32) 1271 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) 1272 .widenScalarToNextPow2(0) 1273 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1274 .lower(); 1275 } 1276 1277 // FIXME: Unaligned accesses not lowered. 1278 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1279 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8}, 1280 {S32, GlobalPtr, S16, 2 * 8}, 1281 {S32, LocalPtr, S8, 8}, 1282 {S32, LocalPtr, S16, 16}, 1283 {S32, PrivatePtr, S8, 8}, 1284 {S32, PrivatePtr, S16, 16}, 1285 {S32, ConstantPtr, S8, 8}, 1286 {S32, ConstantPtr, S16, 2 * 8}}) 1287 .legalIf( 1288 [=](const LegalityQuery &Query) -> bool { 1289 return isLoadStoreLegal(ST, Query); 1290 }); 1291 1292 if (ST.hasFlatAddressSpace()) { 1293 ExtLoads.legalForTypesWithMemDesc( 1294 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}}); 1295 } 1296 1297 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1298 // 64-bits. 1299 // 1300 // TODO: Should generalize bitcast action into coerce, which will also cover 1301 // inserting addrspacecasts. 1302 ExtLoads.customIf(typeIs(1, Constant32Ptr)); 1303 1304 ExtLoads.clampScalar(0, S32, S32) 1305 .widenScalarToNextPow2(0) 1306 .lower(); 1307 1308 auto &Atomics = getActionDefinitionsBuilder( 1309 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1310 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1311 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1312 G_ATOMICRMW_UMIN}) 1313 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1314 {S64, GlobalPtr}, {S64, LocalPtr}, 1315 {S32, RegionPtr}, {S64, RegionPtr}}); 1316 if (ST.hasFlatAddressSpace()) { 1317 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1318 } 1319 1320 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1321 if (ST.hasLDSFPAtomics()) { 1322 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1323 if (ST.hasGFX90AInsts()) 1324 Atomic.legalFor({{S64, LocalPtr}}); 1325 } 1326 if (ST.hasAtomicFaddInsts()) 1327 Atomic.legalFor({{S32, GlobalPtr}}); 1328 1329 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1330 // demarshalling 1331 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1332 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1333 {S32, FlatPtr}, {S64, FlatPtr}}) 1334 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1335 {S32, RegionPtr}, {S64, RegionPtr}}); 1336 // TODO: Pointer types, any 32-bit or 64-bit vector 1337 1338 // Condition should be s32 for scalar, s1 for vector. 
1339 getActionDefinitionsBuilder(G_SELECT) 1340 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr, 1341 LocalPtr, FlatPtr, PrivatePtr, 1342 LLT::fixed_vector(2, LocalPtr), 1343 LLT::fixed_vector(2, PrivatePtr)}, 1344 {S1, S32}) 1345 .clampScalar(0, S16, S64) 1346 .scalarize(1) 1347 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1348 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1349 .clampMaxNumElements(0, S32, 2) 1350 .clampMaxNumElements(0, LocalPtr, 2) 1351 .clampMaxNumElements(0, PrivatePtr, 2) 1352 .scalarize(0) 1353 .widenScalarToNextPow2(0) 1354 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1355 1356 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1357 // be more flexible with the shift amount type. 1358 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1359 .legalFor({{S32, S32}, {S64, S32}}); 1360 if (ST.has16BitInsts()) { 1361 if (ST.hasVOP3PInsts()) { 1362 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1363 .clampMaxNumElements(0, S16, 2); 1364 } else 1365 Shifts.legalFor({{S16, S16}}); 1366 1367 // TODO: Support 16-bit shift amounts for all types 1368 Shifts.widenScalarIf( 1369 [=](const LegalityQuery &Query) { 1370 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1371 // 32-bit amount. 1372 const LLT ValTy = Query.Types[0]; 1373 const LLT AmountTy = Query.Types[1]; 1374 return ValTy.getSizeInBits() <= 16 && 1375 AmountTy.getSizeInBits() < 16; 1376 }, changeTo(1, S16)); 1377 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1378 Shifts.clampScalar(1, S32, S32); 1379 Shifts.clampScalar(0, S16, S64); 1380 Shifts.widenScalarToNextPow2(0, 16); 1381 1382 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1383 .minScalar(0, S16) 1384 .scalarize(0) 1385 .lower(); 1386 } else { 1387 // Make sure we legalize the shift amount type first, as the general 1388 // expansion for the shifted type will produce much worse code if it hasn't 1389 // been truncated already. 1390 Shifts.clampScalar(1, S32, S32); 1391 Shifts.clampScalar(0, S32, S64); 1392 Shifts.widenScalarToNextPow2(0, 32); 1393 1394 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1395 .minScalar(0, S32) 1396 .scalarize(0) 1397 .lower(); 1398 } 1399 Shifts.scalarize(0); 1400 1401 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1402 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1403 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 1404 unsigned IdxTypeIdx = 2; 1405 1406 getActionDefinitionsBuilder(Op) 1407 .customIf([=](const LegalityQuery &Query) { 1408 const LLT EltTy = Query.Types[EltTypeIdx]; 1409 const LLT VecTy = Query.Types[VecTypeIdx]; 1410 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1411 const unsigned EltSize = EltTy.getSizeInBits(); 1412 return (EltSize == 32 || EltSize == 64) && 1413 VecTy.getSizeInBits() % 32 == 0 && 1414 VecTy.getSizeInBits() <= MaxRegisterSize && 1415 IdxTy.getSizeInBits() == 32; 1416 }) 1417 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1418 bitcastToVectorElement32(VecTypeIdx)) 1419 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1420 .bitcastIf( 1421 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1422 [=](const LegalityQuery &Query) { 1423 // For > 64-bit element types, try to turn this into a 64-bit 1424 // element vector since we may be able to do better indexing 1425 // if this is scalar. If not, fall back to 32. 
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
            VecTypeIdx,
            LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
  // pre-legalize.
1521 if (ST.hasVOP3PInsts()) { 1522 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 1523 .customFor({V2S16, V2S16}) 1524 .lower(); 1525 } else 1526 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1527 1528 // Merge/Unmerge 1529 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1530 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1531 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1532 1533 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1534 const LLT Ty = Query.Types[TypeIdx]; 1535 if (Ty.isVector()) { 1536 const LLT &EltTy = Ty.getElementType(); 1537 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1538 return true; 1539 if (!isPowerOf2_32(EltTy.getSizeInBits())) 1540 return true; 1541 } 1542 return false; 1543 }; 1544 1545 auto &Builder = getActionDefinitionsBuilder(Op) 1546 .legalIf(all(isRegisterType(0), isRegisterType(1))) 1547 .lowerFor({{S16, V2S16}}) 1548 .lowerIf([=](const LegalityQuery &Query) { 1549 const LLT BigTy = Query.Types[BigTyIdx]; 1550 return BigTy.getSizeInBits() == 32; 1551 }) 1552 // Try to widen to s16 first for small types. 1553 // TODO: Only do this on targets with legal s16 shifts 1554 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1555 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1556 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1557 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1558 elementTypeIs(1, S16)), 1559 changeTo(1, V2S16)) 1560 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 1561 // worth considering the multiples of 64 since 2*192 and 2*384 are not 1562 // valid. 1563 .clampScalar(LitTyIdx, S32, S512) 1564 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1565 // Break up vectors with weird elements into scalars 1566 .fewerElementsIf( 1567 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 1568 scalarize(0)) 1569 .fewerElementsIf( 1570 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 1571 scalarize(1)) 1572 .clampScalar(BigTyIdx, S32, MaxScalar); 1573 1574 if (Op == G_MERGE_VALUES) { 1575 Builder.widenScalarIf( 1576 // TODO: Use 16-bit shifts if legal for 8-bit values? 1577 [=](const LegalityQuery &Query) { 1578 const LLT Ty = Query.Types[LitTyIdx]; 1579 return Ty.getSizeInBits() < 32; 1580 }, 1581 changeTo(LitTyIdx, S32)); 1582 } 1583 1584 Builder.widenScalarIf( 1585 [=](const LegalityQuery &Query) { 1586 const LLT Ty = Query.Types[BigTyIdx]; 1587 return !isPowerOf2_32(Ty.getSizeInBits()) && 1588 Ty.getSizeInBits() % 16 != 0; 1589 }, 1590 [=](const LegalityQuery &Query) { 1591 // Pick the next power of 2, or a multiple of 64 over 128. 1592 // Whichever is smaller. 1593 const LLT &Ty = Query.Types[BigTyIdx]; 1594 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 1595 if (NewSizeInBits >= 256) { 1596 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 1597 if (RoundedTo < NewSizeInBits) 1598 NewSizeInBits = RoundedTo; 1599 } 1600 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 1601 }) 1602 // Any vectors left are the wrong size. Scalarize them. 1603 .scalarize(0) 1604 .scalarize(1); 1605 } 1606 1607 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1608 // RegBankSelect. 
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  // TODO: Only try to form v2s16 with legal packed instructions.
  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .lowerFor({{V2S16, V2S16}})
    .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
    .scalarize(0)
    .lower();

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
      .lowerFor({{V2S16, V2S16}})
      .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder(G_FSHL)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
    .scalarize(0)
    .minScalar(0, S32)
    .lower();

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
    .legalFor({{S32, S32}, {S64, S32}})
    .clampScalar(1, S32, S32)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({
      // TODO: Verify V_BFI_B32 is generated from expanded bit ops
      G_FCOPYSIGN,

      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  getLegacyLegalizerInfo().computeTables();
  verify(*ST.getInstrInfo());
}

bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case
TargetOpcode::G_FMAXNUM_IEEE: 1721 return legalizeMinNumMaxNum(Helper, MI); 1722 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1723 return legalizeExtractVectorElt(MI, MRI, B); 1724 case TargetOpcode::G_INSERT_VECTOR_ELT: 1725 return legalizeInsertVectorElt(MI, MRI, B); 1726 case TargetOpcode::G_SHUFFLE_VECTOR: 1727 return legalizeShuffleVector(MI, MRI, B); 1728 case TargetOpcode::G_FSIN: 1729 case TargetOpcode::G_FCOS: 1730 return legalizeSinCos(MI, MRI, B); 1731 case TargetOpcode::G_GLOBAL_VALUE: 1732 return legalizeGlobalValue(MI, MRI, B); 1733 case TargetOpcode::G_LOAD: 1734 case TargetOpcode::G_SEXTLOAD: 1735 case TargetOpcode::G_ZEXTLOAD: 1736 return legalizeLoad(Helper, MI); 1737 case TargetOpcode::G_FMAD: 1738 return legalizeFMad(MI, MRI, B); 1739 case TargetOpcode::G_FDIV: 1740 return legalizeFDIV(MI, MRI, B); 1741 case TargetOpcode::G_UDIV: 1742 case TargetOpcode::G_UREM: 1743 case TargetOpcode::G_UDIVREM: 1744 return legalizeUnsignedDIV_REM(MI, MRI, B); 1745 case TargetOpcode::G_SDIV: 1746 case TargetOpcode::G_SREM: 1747 case TargetOpcode::G_SDIVREM: 1748 return legalizeSignedDIV_REM(MI, MRI, B); 1749 case TargetOpcode::G_ATOMIC_CMPXCHG: 1750 return legalizeAtomicCmpXChg(MI, MRI, B); 1751 case TargetOpcode::G_FLOG: 1752 return legalizeFlog(MI, B, numbers::ln2f); 1753 case TargetOpcode::G_FLOG10: 1754 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1755 case TargetOpcode::G_FEXP: 1756 return legalizeFExp(MI, B); 1757 case TargetOpcode::G_FPOW: 1758 return legalizeFPow(MI, B); 1759 case TargetOpcode::G_FFLOOR: 1760 return legalizeFFloor(MI, MRI, B); 1761 case TargetOpcode::G_BUILD_VECTOR: 1762 return legalizeBuildVector(MI, MRI, B); 1763 default: 1764 return false; 1765 } 1766 1767 llvm_unreachable("expected switch to return"); 1768 } 1769 1770 Register AMDGPULegalizerInfo::getSegmentAperture( 1771 unsigned AS, 1772 MachineRegisterInfo &MRI, 1773 MachineIRBuilder &B) const { 1774 MachineFunction &MF = B.getMF(); 1775 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 1776 const LLT S32 = LLT::scalar(32); 1777 1778 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1779 1780 if (ST.hasApertureRegs()) { 1781 // FIXME: Use inline constants (src_{shared, private}_base) instead of 1782 // getreg. 1783 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 1784 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 1785 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 1786 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 1787 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 1788 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 1789 unsigned Encoding = 1790 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 1791 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 1792 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 1793 1794 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 1795 1796 B.buildInstr(AMDGPU::S_GETREG_B32) 1797 .addDef(GetReg) 1798 .addImm(Encoding); 1799 MRI.setType(GetReg, S32); 1800 1801 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1802 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 1803 } 1804 1805 Register QueuePtr = MRI.createGenericVirtualRegister( 1806 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 1807 1808 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 1809 return Register(); 1810 1811 // Offset into amd_queue_t for group_segment_aperture_base_hi / 1812 // private_segment_aperture_base_hi. 1813 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 
0x40 : 0x44; 1814 1815 // TODO: can we be smarter about machine pointer info? 1816 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 1817 MachineMemOperand *MMO = MF.getMachineMemOperand( 1818 PtrInfo, 1819 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1820 MachineMemOperand::MOInvariant, 1821 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 1822 1823 Register LoadAddr; 1824 1825 B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1826 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 1827 } 1828 1829 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 1830 MachineInstr &MI, MachineRegisterInfo &MRI, 1831 MachineIRBuilder &B) const { 1832 MachineFunction &MF = B.getMF(); 1833 1834 const LLT S32 = LLT::scalar(32); 1835 Register Dst = MI.getOperand(0).getReg(); 1836 Register Src = MI.getOperand(1).getReg(); 1837 1838 LLT DstTy = MRI.getType(Dst); 1839 LLT SrcTy = MRI.getType(Src); 1840 unsigned DestAS = DstTy.getAddressSpace(); 1841 unsigned SrcAS = SrcTy.getAddressSpace(); 1842 1843 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 1844 // vector element. 1845 assert(!DstTy.isVector()); 1846 1847 const AMDGPUTargetMachine &TM 1848 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 1849 1850 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 1851 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1852 return true; 1853 } 1854 1855 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1856 // Truncate. 1857 B.buildExtract(Dst, Src, 0); 1858 MI.eraseFromParent(); 1859 return true; 1860 } 1861 1862 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1863 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1864 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1865 1866 // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1867 // another. Merge operands are required to be the same type, but creating an 1868 // extra ptrtoint would be kind of pointless. 1869 auto HighAddr = B.buildConstant( 1870 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1871 B.buildMerge(Dst, {Src, HighAddr}); 1872 MI.eraseFromParent(); 1873 return true; 1874 } 1875 1876 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 1877 assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 1878 DestAS == AMDGPUAS::PRIVATE_ADDRESS); 1879 unsigned NullVal = TM.getNullPointerValue(DestAS); 1880 1881 auto SegmentNull = B.buildConstant(DstTy, NullVal); 1882 auto FlatNull = B.buildConstant(SrcTy, 0); 1883 1884 // Extract low 32-bits of the pointer. 1885 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 1886 1887 auto CmpRes = 1888 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 1889 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 1890 1891 MI.eraseFromParent(); 1892 return true; 1893 } 1894 1895 if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1896 return false; 1897 1898 if (!ST.hasFlatAddressSpace()) 1899 return false; 1900 1901 auto SegmentNull = 1902 B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 1903 auto FlatNull = 1904 B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 1905 1906 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1907 if (!ApertureReg.isValid()) 1908 return false; 1909 1910 auto CmpRes = 1911 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 1912 1913 // Coerce the type of the low half of the result so we can use merge_values. 
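  // Illustrative sketch (not the exact MIR emitted): the flat pointer is
  // assembled from the 32-bit segment offset and the 32-bit aperture base,
  //   %lo:_(s32)  = G_PTRTOINT %src
  //   %flat:_(p0) = G_MERGE_VALUES %lo, %aperture
  //   %dst:_(p0)  = G_SELECT %nonnull, %flat, %flatnull
  // with the null segment pointer mapping to the null flat pointer.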
1914 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 1915 1916 // TODO: Should we allow mismatched types but matching sizes in merges to 1917 // avoid the ptrtoint? 1918 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1919 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 1920 1921 MI.eraseFromParent(); 1922 return true; 1923 } 1924 1925 bool AMDGPULegalizerInfo::legalizeFrint( 1926 MachineInstr &MI, MachineRegisterInfo &MRI, 1927 MachineIRBuilder &B) const { 1928 Register Src = MI.getOperand(1).getReg(); 1929 LLT Ty = MRI.getType(Src); 1930 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 1931 1932 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 1933 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 1934 1935 auto C1 = B.buildFConstant(Ty, C1Val); 1936 auto CopySign = B.buildFCopysign(Ty, C1, Src); 1937 1938 // TODO: Should this propagate fast-math-flags? 1939 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1940 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 1941 1942 auto C2 = B.buildFConstant(Ty, C2Val); 1943 auto Fabs = B.buildFAbs(Ty, Src); 1944 1945 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1946 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 1947 MI.eraseFromParent(); 1948 return true; 1949 } 1950 1951 bool AMDGPULegalizerInfo::legalizeFceil( 1952 MachineInstr &MI, MachineRegisterInfo &MRI, 1953 MachineIRBuilder &B) const { 1954 1955 const LLT S1 = LLT::scalar(1); 1956 const LLT S64 = LLT::scalar(64); 1957 1958 Register Src = MI.getOperand(1).getReg(); 1959 assert(MRI.getType(Src) == S64); 1960 1961 // result = trunc(src) 1962 // if (src > 0.0 && src != result) 1963 // result += 1.0 1964 1965 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 1966 1967 const auto Zero = B.buildFConstant(S64, 0.0); 1968 const auto One = B.buildFConstant(S64, 1.0); 1969 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 1970 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 1971 auto And = B.buildAnd(S1, Lt0, NeTrunc); 1972 auto Add = B.buildSelect(S64, And, One, Zero); 1973 1974 // TODO: Should this propagate fast-math-flags? 
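  // Worked example (illustrative): ceil(2.3) gives Trunc = 2.0; since
  // 2.3 > 0.0 and 2.3 != 2.0, Add = 1.0 and the result is 3.0. ceil(-2.3)
  // gives Trunc = -2.0; the condition fails, Add = 0.0, result stays -2.0.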
  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFrem(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();
  return true;
}

static MachineInstrBuilder extractF64Exponent(Register Hi,
                                              MachineIRBuilder &B) {
  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;
  LLT S32 = LLT::scalar(32);

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
                     .addUse(Hi)
                     .addUse(Const0.getReg(0))
                     .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
2073 B.buildSITOFP(S64, Unmerge.getReg(1)) : 2074 B.buildUITOFP(S64, Unmerge.getReg(1)); 2075 2076 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2077 2078 auto ThirtyTwo = B.buildConstant(S32, 32); 2079 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 2080 .addUse(CvtHi.getReg(0)) 2081 .addUse(ThirtyTwo.getReg(0)); 2082 2083 // TODO: Should this propagate fast-math-flags? 2084 B.buildFAdd(Dst, LdExp, CvtLo); 2085 MI.eraseFromParent(); 2086 return true; 2087 } 2088 2089 // TODO: Copied from DAG implementation. Verify logic and document how this 2090 // actually works. 2091 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2092 MachineRegisterInfo &MRI, 2093 MachineIRBuilder &B, 2094 bool Signed) const { 2095 2096 Register Dst = MI.getOperand(0).getReg(); 2097 Register Src = MI.getOperand(1).getReg(); 2098 2099 const LLT S64 = LLT::scalar(64); 2100 const LLT S32 = LLT::scalar(32); 2101 2102 const LLT SrcLT = MRI.getType(Src); 2103 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2104 2105 unsigned Flags = MI.getFlags(); 2106 2107 // The basic idea of converting a floating point number into a pair of 32-bit 2108 // integers is illustrated as follows: 2109 // 2110 // tf := trunc(val); 2111 // hif := floor(tf * 2^-32); 2112 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2113 // hi := fptoi(hif); 2114 // lo := fptoi(lof); 2115 // 2116 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2117 MachineInstrBuilder Sign; 2118 if (Signed && SrcLT == S32) { 2119 // However, a 32-bit floating point number has only 23 bits mantissa and 2120 // it's not enough to hold all the significant bits of `lof` if val is 2121 // negative. To avoid the loss of precision, We need to take the absolute 2122 // value after truncating and flip the result back based on the original 2123 // signedness. 2124 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2125 Trunc = B.buildFAbs(S32, Trunc, Flags); 2126 } 2127 MachineInstrBuilder K0, K1; 2128 if (SrcLT == S64) { 2129 K0 = B.buildFConstant(S64, 2130 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2131 K1 = B.buildFConstant(S64, 2132 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2133 } else { 2134 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); 2135 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); 2136 } 2137 2138 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2139 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2140 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2141 2142 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2143 : B.buildFPTOUI(S32, FloorMul); 2144 auto Lo = B.buildFPTOUI(S32, Fma); 2145 2146 if (Signed && SrcLT == S32) { 2147 // Flip the result based on the signedness, which is either all 0s or 1s. 
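    // Sign is 0 for a non-negative input and -1 (all ones) for a negative
    // one, so (x ^ Sign) - Sign is a conditional two's complement negation,
    // e.g. x = 5, Sign = -1: (5 ^ -1) - (-1) = -6 + 1 = -5.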
2148 Sign = B.buildMerge(S64, {Sign, Sign}); 2149 // r := xor({lo, hi}, sign) - sign; 2150 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); 2151 } else 2152 B.buildMerge(Dst, {Lo, Hi}); 2153 MI.eraseFromParent(); 2154 2155 return true; 2156 } 2157 2158 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2159 MachineInstr &MI) const { 2160 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2161 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2162 2163 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2164 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2165 2166 // With ieee_mode disabled, the instructions have the correct behavior 2167 // already for G_FMINNUM/G_FMAXNUM 2168 if (!MFI->getMode().IEEE) 2169 return !IsIEEEOp; 2170 2171 if (IsIEEEOp) 2172 return true; 2173 2174 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2175 } 2176 2177 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2178 MachineInstr &MI, MachineRegisterInfo &MRI, 2179 MachineIRBuilder &B) const { 2180 // TODO: Should move some of this into LegalizerHelper. 2181 2182 // TODO: Promote dynamic indexing of s16 to s32 2183 2184 // FIXME: Artifact combiner probably should have replaced the truncated 2185 // constant before this, so we shouldn't need 2186 // getConstantVRegValWithLookThrough. 2187 Optional<ValueAndVReg> MaybeIdxVal = 2188 getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2189 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2190 return true; 2191 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2192 2193 Register Dst = MI.getOperand(0).getReg(); 2194 Register Vec = MI.getOperand(1).getReg(); 2195 2196 LLT VecTy = MRI.getType(Vec); 2197 LLT EltTy = VecTy.getElementType(); 2198 assert(EltTy == MRI.getType(Dst)); 2199 2200 if (IdxVal < VecTy.getNumElements()) 2201 B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); 2202 else 2203 B.buildUndef(Dst); 2204 2205 MI.eraseFromParent(); 2206 return true; 2207 } 2208 2209 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2210 MachineInstr &MI, MachineRegisterInfo &MRI, 2211 MachineIRBuilder &B) const { 2212 // TODO: Should move some of this into LegalizerHelper. 2213 2214 // TODO: Promote dynamic indexing of s16 to s32 2215 2216 // FIXME: Artifact combiner probably should have replaced the truncated 2217 // constant before this, so we shouldn't need 2218 // getConstantVRegValWithLookThrough. 2219 Optional<ValueAndVReg> MaybeIdxVal = 2220 getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2221 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 
2222 return true; 2223 2224 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 2225 Register Dst = MI.getOperand(0).getReg(); 2226 Register Vec = MI.getOperand(1).getReg(); 2227 Register Ins = MI.getOperand(2).getReg(); 2228 2229 LLT VecTy = MRI.getType(Vec); 2230 LLT EltTy = VecTy.getElementType(); 2231 assert(EltTy == MRI.getType(Ins)); 2232 2233 if (IdxVal < VecTy.getNumElements()) 2234 B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); 2235 else 2236 B.buildUndef(Dst); 2237 2238 MI.eraseFromParent(); 2239 return true; 2240 } 2241 2242 bool AMDGPULegalizerInfo::legalizeShuffleVector( 2243 MachineInstr &MI, MachineRegisterInfo &MRI, 2244 MachineIRBuilder &B) const { 2245 const LLT V2S16 = LLT::fixed_vector(2, 16); 2246 2247 Register Dst = MI.getOperand(0).getReg(); 2248 Register Src0 = MI.getOperand(1).getReg(); 2249 LLT DstTy = MRI.getType(Dst); 2250 LLT SrcTy = MRI.getType(Src0); 2251 2252 if (SrcTy == V2S16 && DstTy == V2S16 && 2253 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 2254 return true; 2255 2256 MachineIRBuilder HelperBuilder(MI); 2257 GISelObserverWrapper DummyObserver; 2258 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 2259 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 2260 } 2261 2262 bool AMDGPULegalizerInfo::legalizeSinCos( 2263 MachineInstr &MI, MachineRegisterInfo &MRI, 2264 MachineIRBuilder &B) const { 2265 2266 Register DstReg = MI.getOperand(0).getReg(); 2267 Register SrcReg = MI.getOperand(1).getReg(); 2268 LLT Ty = MRI.getType(DstReg); 2269 unsigned Flags = MI.getFlags(); 2270 2271 Register TrigVal; 2272 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2273 if (ST.hasTrigReducedRange()) { 2274 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2275 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 2276 .addUse(MulVal.getReg(0)) 2277 .setMIFlags(Flags).getReg(0); 2278 } else 2279 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2280 2281 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2282 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2283 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 2284 .addUse(TrigVal) 2285 .setMIFlags(Flags); 2286 MI.eraseFromParent(); 2287 return true; 2288 } 2289 2290 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2291 MachineIRBuilder &B, 2292 const GlobalValue *GV, 2293 int64_t Offset, 2294 unsigned GAFlags) const { 2295 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2296 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2297 // to the following code sequence: 2298 // 2299 // For constant address space: 2300 // s_getpc_b64 s[0:1] 2301 // s_add_u32 s0, s0, $symbol 2302 // s_addc_u32 s1, s1, 0 2303 // 2304 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2305 // a fixup or relocation is emitted to replace $symbol with a literal 2306 // constant, which is a pc-relative offset from the encoding of the $symbol 2307 // operand to the global variable. 
2308 // 2309 // For global address space: 2310 // s_getpc_b64 s[0:1] 2311 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2312 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2313 // 2314 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2315 // fixups or relocations are emitted to replace $symbol@*@lo and 2316 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2317 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2318 // operand to the global variable. 2319 // 2320 // What we want here is an offset from the value returned by s_getpc 2321 // (which is the address of the s_add_u32 instruction) to the global 2322 // variable, but since the encoding of $symbol starts 4 bytes after the start 2323 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 2324 // small. This requires us to add 4 to the global variable offset in order to 2325 // compute the correct address. Similarly for the s_addc_u32 instruction, the 2326 // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2327 // instruction. 2328 2329 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2330 2331 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 2332 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2333 2334 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2335 .addDef(PCReg); 2336 2337 MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 2338 if (GAFlags == SIInstrInfo::MO_NONE) 2339 MIB.addImm(0); 2340 else 2341 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 2342 2343 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2344 2345 if (PtrTy.getSizeInBits() == 32) 2346 B.buildExtract(DstReg, PCReg, 0); 2347 return true; 2348 } 2349 2350 bool AMDGPULegalizerInfo::legalizeGlobalValue( 2351 MachineInstr &MI, MachineRegisterInfo &MRI, 2352 MachineIRBuilder &B) const { 2353 Register DstReg = MI.getOperand(0).getReg(); 2354 LLT Ty = MRI.getType(DstReg); 2355 unsigned AS = Ty.getAddressSpace(); 2356 2357 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 2358 MachineFunction &MF = B.getMF(); 2359 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2360 2361 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2362 if (!MFI->isModuleEntryFunction() && 2363 !GV->getName().equals("llvm.amdgcn.module.lds")) { 2364 const Function &Fn = MF.getFunction(); 2365 DiagnosticInfoUnsupported BadLDSDecl( 2366 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2367 DS_Warning); 2368 Fn.getContext().diagnose(BadLDSDecl); 2369 2370 // We currently don't have a way to correctly allocate LDS objects that 2371 // aren't directly associated with a kernel. We do force inlining of 2372 // functions that use local objects. However, if these dead functions are 2373 // not eliminated, we don't want a compile time error. Just emit a warning 2374 // and a trap, since there should be no callable path here. 2375 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2376 B.buildUndef(DstReg); 2377 MI.eraseFromParent(); 2378 return true; 2379 } 2380 2381 // TODO: We could emit code to handle the initialization somewhere. 
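    // The "address" of an LDS global is just its 32-bit offset within the
    // kernel's group-segment allocation: statically sized globals get an
    // offset from allocateLDSGlobal() below, while zero-sized (dynamic)
    // arrays start right after the static allocation, at
    // llvm.amdgcn.groupstaticsize.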
2382 if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 2383 const SITargetLowering *TLI = ST.getTargetLowering(); 2384 if (!TLI->shouldUseLDSConstAddress(GV)) { 2385 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2386 return true; // Leave in place; 2387 } 2388 2389 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2390 Type *Ty = GV->getValueType(); 2391 // HIP uses an unsized array `extern __shared__ T s[]` or similar 2392 // zero-sized type in other languages to declare the dynamic shared 2393 // memory which size is not known at the compile time. They will be 2394 // allocated by the runtime and placed directly after the static 2395 // allocated ones. They all share the same offset. 2396 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2397 // Adjust alignment for that dynamic shared memory array. 2398 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); 2399 LLT S32 = LLT::scalar(32); 2400 auto Sz = 2401 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2402 B.buildIntToPtr(DstReg, Sz); 2403 MI.eraseFromParent(); 2404 return true; 2405 } 2406 } 2407 2408 B.buildConstant( 2409 DstReg, 2410 MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV))); 2411 MI.eraseFromParent(); 2412 return true; 2413 } 2414 2415 const Function &Fn = MF.getFunction(); 2416 DiagnosticInfoUnsupported BadInit( 2417 Fn, "unsupported initializer for address space", MI.getDebugLoc()); 2418 Fn.getContext().diagnose(BadInit); 2419 return true; 2420 } 2421 2422 const SITargetLowering *TLI = ST.getTargetLowering(); 2423 2424 if (TLI->shouldEmitFixup(GV)) { 2425 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 2426 MI.eraseFromParent(); 2427 return true; 2428 } 2429 2430 if (TLI->shouldEmitPCReloc(GV)) { 2431 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 2432 MI.eraseFromParent(); 2433 return true; 2434 } 2435 2436 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2437 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 2438 2439 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty; 2440 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 2441 MachinePointerInfo::getGOT(MF), 2442 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2443 MachineMemOperand::MOInvariant, 2444 LoadTy, Align(8)); 2445 2446 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 2447 2448 if (Ty.getSizeInBits() == 32) { 2449 // Truncate if this is a 32-bit constant adrdess. 
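    // The GOT entry itself is a 64-bit constant-address pointer, so for a
    // 32-bit result we load the full pointer and extract its low half.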
2450 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 2451 B.buildExtract(DstReg, Load, 0); 2452 } else 2453 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 2454 2455 MI.eraseFromParent(); 2456 return true; 2457 } 2458 2459 static LLT widenToNextPowerOf2(LLT Ty) { 2460 if (Ty.isVector()) 2461 return Ty.changeElementCount( 2462 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2463 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2464 } 2465 2466 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2467 MachineInstr &MI) const { 2468 MachineIRBuilder &B = Helper.MIRBuilder; 2469 MachineRegisterInfo &MRI = *B.getMRI(); 2470 GISelChangeObserver &Observer = Helper.Observer; 2471 2472 Register PtrReg = MI.getOperand(1).getReg(); 2473 LLT PtrTy = MRI.getType(PtrReg); 2474 unsigned AddrSpace = PtrTy.getAddressSpace(); 2475 2476 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 2477 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2478 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 2479 Observer.changingInstr(MI); 2480 MI.getOperand(1).setReg(Cast.getReg(0)); 2481 Observer.changedInstr(MI); 2482 return true; 2483 } 2484 2485 if (MI.getOpcode() != AMDGPU::G_LOAD) 2486 return false; 2487 2488 Register ValReg = MI.getOperand(0).getReg(); 2489 LLT ValTy = MRI.getType(ValReg); 2490 2491 MachineMemOperand *MMO = *MI.memoperands_begin(); 2492 const unsigned ValSize = ValTy.getSizeInBits(); 2493 const LLT MemTy = MMO->getMemoryType(); 2494 const Align MemAlign = MMO->getAlign(); 2495 const unsigned MemSize = MemTy.getSizeInBits(); 2496 const unsigned AlignInBits = 8 * MemAlign.value(); 2497 2498 // Widen non-power-of-2 loads to the alignment if needed 2499 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 2500 const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2501 2502 // This was already the correct extending load result type, so just adjust 2503 // the memory type. 2504 if (WideMemSize == ValSize) { 2505 MachineFunction &MF = B.getMF(); 2506 2507 MachineMemOperand *WideMMO = 2508 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2509 Observer.changingInstr(MI); 2510 MI.setMemRefs(MF, {WideMMO}); 2511 Observer.changedInstr(MI); 2512 return true; 2513 } 2514 2515 // Don't bother handling edge case that should probably never be produced. 2516 if (ValSize > WideMemSize) 2517 return false; 2518 2519 LLT WideTy = widenToNextPowerOf2(ValTy); 2520 2521 Register WideLoad; 2522 if (!WideTy.isVector()) { 2523 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2524 B.buildTrunc(ValReg, WideLoad).getReg(0); 2525 } else { 2526 // Extract the subvector. 2527 2528 if (isRegisterType(ValTy)) { 2529 // If this a case where G_EXTRACT is legal, use it. 2530 // (e.g. <3 x s32> -> <4 x s32>) 2531 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2532 B.buildExtract(ValReg, WideLoad, 0); 2533 } else { 2534 // For cases where the widened type isn't a nice register value, unmerge 2535 // from a widened register (e.g. 
<3 x s16> -> <4 x s16>) 2536 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2537 WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2538 B.setInsertPt(B.getMBB(), MI.getIterator()); 2539 B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2540 } 2541 } 2542 2543 MI.eraseFromParent(); 2544 return true; 2545 } 2546 2547 return false; 2548 } 2549 2550 bool AMDGPULegalizerInfo::legalizeFMad( 2551 MachineInstr &MI, MachineRegisterInfo &MRI, 2552 MachineIRBuilder &B) const { 2553 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 2554 assert(Ty.isScalar()); 2555 2556 MachineFunction &MF = B.getMF(); 2557 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2558 2559 // TODO: Always legal with future ftz flag. 2560 // FIXME: Do we need just output? 2561 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 2562 return true; 2563 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 2564 return true; 2565 2566 MachineIRBuilder HelperBuilder(MI); 2567 GISelObserverWrapper DummyObserver; 2568 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 2569 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 2570 } 2571 2572 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2573 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2574 Register DstReg = MI.getOperand(0).getReg(); 2575 Register PtrReg = MI.getOperand(1).getReg(); 2576 Register CmpVal = MI.getOperand(2).getReg(); 2577 Register NewVal = MI.getOperand(3).getReg(); 2578 2579 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2580 "this should not have been custom lowered"); 2581 2582 LLT ValTy = MRI.getType(CmpVal); 2583 LLT VecTy = LLT::fixed_vector(2, ValTy); 2584 2585 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2586 2587 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2588 .addDef(DstReg) 2589 .addUse(PtrReg) 2590 .addUse(PackedVal) 2591 .setMemRefs(MI.memoperands()); 2592 2593 MI.eraseFromParent(); 2594 return true; 2595 } 2596 2597 bool AMDGPULegalizerInfo::legalizeFlog( 2598 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2599 Register Dst = MI.getOperand(0).getReg(); 2600 Register Src = MI.getOperand(1).getReg(); 2601 LLT Ty = B.getMRI()->getType(Dst); 2602 unsigned Flags = MI.getFlags(); 2603 2604 auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2605 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2606 2607 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2608 MI.eraseFromParent(); 2609 return true; 2610 } 2611 2612 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2613 MachineIRBuilder &B) const { 2614 Register Dst = MI.getOperand(0).getReg(); 2615 Register Src = MI.getOperand(1).getReg(); 2616 unsigned Flags = MI.getFlags(); 2617 LLT Ty = B.getMRI()->getType(Dst); 2618 2619 auto K = B.buildFConstant(Ty, numbers::log2e); 2620 auto Mul = B.buildFMul(Ty, Src, K, Flags); 2621 B.buildFExp2(Dst, Mul, Flags); 2622 MI.eraseFromParent(); 2623 return true; 2624 } 2625 2626 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2627 MachineIRBuilder &B) const { 2628 Register Dst = MI.getOperand(0).getReg(); 2629 Register Src0 = MI.getOperand(1).getReg(); 2630 Register Src1 = MI.getOperand(2).getReg(); 2631 unsigned Flags = MI.getFlags(); 2632 LLT Ty = B.getMRI()->getType(Dst); 2633 const LLT S16 = LLT::scalar(16); 2634 const LLT S32 = LLT::scalar(32); 2635 2636 if (Ty == S32) { 2637 auto Log = B.buildFLog2(S32, Src0, Flags); 2638 auto Mul = 
B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2639 .addUse(Log.getReg(0)) 2640 .addUse(Src1) 2641 .setMIFlags(Flags); 2642 B.buildFExp2(Dst, Mul, Flags); 2643 } else if (Ty == S16) { 2644 // There's no f16 fmul_legacy, so we need to convert for it. 2645 auto Log = B.buildFLog2(S16, Src0, Flags); 2646 auto Ext0 = B.buildFPExt(S32, Log, Flags); 2647 auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2648 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2649 .addUse(Ext0.getReg(0)) 2650 .addUse(Ext1.getReg(0)) 2651 .setMIFlags(Flags); 2652 2653 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2654 } else 2655 return false; 2656 2657 MI.eraseFromParent(); 2658 return true; 2659 } 2660 2661 // Find a source register, ignoring any possible source modifiers. 2662 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2663 Register ModSrc = OrigSrc; 2664 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2665 ModSrc = SrcFNeg->getOperand(1).getReg(); 2666 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2667 ModSrc = SrcFAbs->getOperand(1).getReg(); 2668 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2669 ModSrc = SrcFAbs->getOperand(1).getReg(); 2670 return ModSrc; 2671 } 2672 2673 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2674 MachineRegisterInfo &MRI, 2675 MachineIRBuilder &B) const { 2676 2677 const LLT S1 = LLT::scalar(1); 2678 const LLT S64 = LLT::scalar(64); 2679 Register Dst = MI.getOperand(0).getReg(); 2680 Register OrigSrc = MI.getOperand(1).getReg(); 2681 unsigned Flags = MI.getFlags(); 2682 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2683 "this should not have been custom lowered"); 2684 2685 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2686 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2687 // efficient way to implement it is using V_FRACT_F64. The workaround for the 2688 // V_FRACT bug is: 2689 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2690 // 2691 // Convert floor(x) to (x - fract(x)) 2692 2693 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2694 .addUse(OrigSrc) 2695 .setMIFlags(Flags); 2696 2697 // Give source modifier matching some assistance before obscuring a foldable 2698 // pattern. 2699 2700 // TODO: We can avoid the neg on the fract? The input sign to fract 2701 // shouldn't matter? 2702 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2703 2704 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2705 2706 Register Min = MRI.createGenericVirtualRegister(S64); 2707 2708 // We don't need to concern ourselves with the snan handling difference, so 2709 // use the one which will directly select. 
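  // Either way, the min clamps Fract against 0x3fefffffffffffff, the largest
  // f64 strictly less than 1.0 (1 - 2^-53), i.e. the 0.99999999999999999
  // bound from the workaround formula above.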
2710 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2711 if (MFI->getMode().IEEE) 2712 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2713 else 2714 B.buildFMinNum(Min, Fract, Const, Flags); 2715 2716 Register CorrectedFract = Min; 2717 if (!MI.getFlag(MachineInstr::FmNoNans)) { 2718 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 2719 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2720 } 2721 2722 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2723 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2724 2725 MI.eraseFromParent(); 2726 return true; 2727 } 2728 2729 // Turn an illegal packed v2s16 build vector into bit operations. 2730 // TODO: This should probably be a bitcast action in LegalizerHelper. 2731 bool AMDGPULegalizerInfo::legalizeBuildVector( 2732 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2733 Register Dst = MI.getOperand(0).getReg(); 2734 const LLT S32 = LLT::scalar(32); 2735 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 2736 2737 Register Src0 = MI.getOperand(1).getReg(); 2738 Register Src1 = MI.getOperand(2).getReg(); 2739 assert(MRI.getType(Src0) == LLT::scalar(16)); 2740 2741 auto Merge = B.buildMerge(S32, {Src0, Src1}); 2742 B.buildBitcast(Dst, Merge); 2743 2744 MI.eraseFromParent(); 2745 return true; 2746 } 2747 2748 // Check that this is a G_XOR x, -1 2749 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2750 if (MI.getOpcode() != TargetOpcode::G_XOR) 2751 return false; 2752 auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2753 return ConstVal && *ConstVal == -1; 2754 } 2755 2756 // Return the use branch instruction, otherwise null if the usage is invalid. 2757 static MachineInstr * 2758 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2759 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 2760 Register CondDef = MI.getOperand(0).getReg(); 2761 if (!MRI.hasOneNonDBGUse(CondDef)) 2762 return nullptr; 2763 2764 MachineBasicBlock *Parent = MI.getParent(); 2765 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2766 2767 if (isNot(MRI, *UseMI)) { 2768 Register NegatedCond = UseMI->getOperand(0).getReg(); 2769 if (!MRI.hasOneNonDBGUse(NegatedCond)) 2770 return nullptr; 2771 2772 // We're deleting the def of this value, so we need to remove it. 2773 UseMI->eraseFromParent(); 2774 2775 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2776 Negated = true; 2777 } 2778 2779 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2780 return nullptr; 2781 2782 // Make sure the cond br is followed by a G_BR, or is the last instruction. 2783 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 2784 if (Next == Parent->end()) { 2785 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2786 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 
2787 return nullptr; 2788 UncondBrTarget = &*NextMBB; 2789 } else { 2790 if (Next->getOpcode() != AMDGPU::G_BR) 2791 return nullptr; 2792 Br = &*Next; 2793 UncondBrTarget = Br->getOperand(0).getMBB(); 2794 } 2795 2796 return UseMI; 2797 } 2798 2799 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2800 const ArgDescriptor *Arg, 2801 const TargetRegisterClass *ArgRC, 2802 LLT ArgTy) const { 2803 MCRegister SrcReg = Arg->getRegister(); 2804 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 2805 assert(DstReg.isVirtual() && "Virtual register expected"); 2806 2807 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2808 ArgTy); 2809 if (Arg->isMasked()) { 2810 // TODO: Should we try to emit this once in the entry block? 2811 const LLT S32 = LLT::scalar(32); 2812 const unsigned Mask = Arg->getMask(); 2813 const unsigned Shift = countTrailingZeros<unsigned>(Mask); 2814 2815 Register AndMaskSrc = LiveIn; 2816 2817 if (Shift != 0) { 2818 auto ShiftAmt = B.buildConstant(S32, Shift); 2819 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 2820 } 2821 2822 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2823 } else { 2824 B.buildCopy(DstReg, LiveIn); 2825 } 2826 2827 return true; 2828 } 2829 2830 bool AMDGPULegalizerInfo::loadInputValue( 2831 Register DstReg, MachineIRBuilder &B, 2832 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2833 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2834 const ArgDescriptor *Arg; 2835 const TargetRegisterClass *ArgRC; 2836 LLT ArgTy; 2837 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2838 2839 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2840 return false; // TODO: Handle these 2841 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2842 } 2843 2844 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2845 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 2846 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2847 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 2848 return false; 2849 2850 MI.eraseFromParent(); 2851 return true; 2852 } 2853 2854 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 2855 MachineRegisterInfo &MRI, 2856 MachineIRBuilder &B) const { 2857 Register Dst = MI.getOperand(0).getReg(); 2858 LLT DstTy = MRI.getType(Dst); 2859 LLT S16 = LLT::scalar(16); 2860 LLT S32 = LLT::scalar(32); 2861 LLT S64 = LLT::scalar(64); 2862 2863 if (DstTy == S16) 2864 return legalizeFDIV16(MI, MRI, B); 2865 if (DstTy == S32) 2866 return legalizeFDIV32(MI, MRI, B); 2867 if (DstTy == S64) 2868 return legalizeFDIV64(MI, MRI, B); 2869 2870 return false; 2871 } 2872 2873 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 2874 Register DstDivReg, 2875 Register DstRemReg, 2876 Register X, 2877 Register Y) const { 2878 const LLT S1 = LLT::scalar(1); 2879 const LLT S32 = LLT::scalar(32); 2880 2881 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2882 // algorithm used here. 2883 2884 // Initial estimate of inv(y). 2885 auto FloatY = B.buildUITOFP(S32, Y); 2886 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2887 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2888 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2889 auto Z = B.buildFPTOUI(S32, ScaledY); 2890 2891 // One round of UNR. 
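  // (unsigned Newton-Raphson): with Z ~= 2^32 / Y, the update
  //   Z' = Z + umulh(Z, -Y * Z) == Z + Z * (2^32 - Y * Z) / 2^32 (up to
  // rounding), which roughly doubles the number of correct bits in the
  // estimate.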
2892 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2893 auto NegYZ = B.buildMul(S32, NegY, Z); 2894 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2895 2896 // Quotient/remainder estimate. 2897 auto Q = B.buildUMulH(S32, X, Z); 2898 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2899 2900 // First quotient/remainder refinement. 2901 auto One = B.buildConstant(S32, 1); 2902 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2903 if (DstDivReg) 2904 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2905 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2906 2907 // Second quotient/remainder refinement. 2908 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2909 if (DstDivReg) 2910 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 2911 2912 if (DstRemReg) 2913 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 2914 } 2915 2916 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 2917 // 2918 // Return lo, hi of result 2919 // 2920 // %cvt.lo = G_UITOFP Val.lo 2921 // %cvt.hi = G_UITOFP Val.hi 2922 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2923 // %rcp = G_AMDGPU_RCP_IFLAG %mad 2924 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2925 // %mul2 = G_FMUL %mul1, 2**(-32) 2926 // %trunc = G_INTRINSIC_TRUNC %mul2 2927 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2928 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2929 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2930 Register Val) { 2931 const LLT S32 = LLT::scalar(32); 2932 auto Unmerge = B.buildUnmerge(S32, Val); 2933 2934 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2935 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2936 2937 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2938 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2939 2940 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2941 auto Mul1 = 2942 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2943 2944 // 2**(-32) 2945 auto Mul2 = 2946 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2947 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2948 2949 // -(2**32) 2950 auto Mad2 = B.buildFMAD(S32, Trunc, 2951 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2952 2953 auto ResultLo = B.buildFPTOUI(S32, Mad2); 2954 auto ResultHi = B.buildFPTOUI(S32, Trunc); 2955 2956 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2957 } 2958 2959 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 2960 Register DstDivReg, 2961 Register DstRemReg, 2962 Register Numer, 2963 Register Denom) const { 2964 const LLT S32 = LLT::scalar(32); 2965 const LLT S64 = LLT::scalar(64); 2966 const LLT S1 = LLT::scalar(1); 2967 Register RcpLo, RcpHi; 2968 2969 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2970 2971 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2972 2973 auto Zero64 = B.buildConstant(S64, 0); 2974 auto NegDenom = B.buildSub(S64, Zero64, Denom); 2975 2976 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2977 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2978 2979 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2980 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2981 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2982 2983 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2984 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2985 auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2986 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2987 2988 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2989 
auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 2990 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2991 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2992 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2993 2994 auto Zero32 = B.buildConstant(S32, 0); 2995 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2996 auto Add2_HiC = 2997 B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2998 auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2999 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 3000 3001 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 3002 Register NumerLo = UnmergeNumer.getReg(0); 3003 Register NumerHi = UnmergeNumer.getReg(1); 3004 3005 auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 3006 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 3007 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 3008 Register Mul3_Lo = UnmergeMul3.getReg(0); 3009 Register Mul3_Hi = UnmergeMul3.getReg(1); 3010 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 3011 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 3012 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 3013 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 3014 3015 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 3016 Register DenomLo = UnmergeDenom.getReg(0); 3017 Register DenomHi = UnmergeDenom.getReg(1); 3018 3019 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 3020 auto C1 = B.buildSExt(S32, CmpHi); 3021 3022 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 3023 auto C2 = B.buildSExt(S32, CmpLo); 3024 3025 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 3026 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 3027 3028 // TODO: Here and below portions of the code can be enclosed into if/endif. 3029 // Currently control flow is unconditional and we have 4 selects after 3030 // potential endif to substitute PHIs. 3031 3032 // if C3 != 0 ... 
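  // C3 (and C6 below) is an all-ones/zero mask telling whether the current
  // remainder estimate is still >= the denominator; each correction step adds
  // one to the quotient estimate and subtracts the denominator from the
  // remainder.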
3033 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 3034 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 3035 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 3036 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 3037 3038 auto One64 = B.buildConstant(S64, 1); 3039 auto Add3 = B.buildAdd(S64, MulHi3, One64); 3040 3041 auto C4 = 3042 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 3043 auto C5 = 3044 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 3045 auto C6 = B.buildSelect( 3046 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 3047 3048 // if (C6 != 0) 3049 auto Add4 = B.buildAdd(S64, Add3, One64); 3050 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 3051 3052 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 3053 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 3054 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 3055 3056 // endif C6 3057 // endif C3 3058 3059 if (DstDivReg) { 3060 auto Sel1 = B.buildSelect( 3061 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 3062 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3063 Sel1, MulHi3); 3064 } 3065 3066 if (DstRemReg) { 3067 auto Sel2 = B.buildSelect( 3068 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 3069 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 3070 Sel2, Sub1); 3071 } 3072 } 3073 3074 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 3075 MachineRegisterInfo &MRI, 3076 MachineIRBuilder &B) const { 3077 Register DstDivReg, DstRemReg; 3078 switch (MI.getOpcode()) { 3079 default: 3080 llvm_unreachable("Unexpected opcode!"); 3081 case AMDGPU::G_UDIV: { 3082 DstDivReg = MI.getOperand(0).getReg(); 3083 break; 3084 } 3085 case AMDGPU::G_UREM: { 3086 DstRemReg = MI.getOperand(0).getReg(); 3087 break; 3088 } 3089 case AMDGPU::G_UDIVREM: { 3090 DstDivReg = MI.getOperand(0).getReg(); 3091 DstRemReg = MI.getOperand(1).getReg(); 3092 break; 3093 } 3094 } 3095 3096 const LLT S64 = LLT::scalar(64); 3097 const LLT S32 = LLT::scalar(32); 3098 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3099 Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 3100 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3101 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3102 3103 if (Ty == S32) 3104 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 3105 else if (Ty == S64) 3106 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 3107 else 3108 return false; 3109 3110 MI.eraseFromParent(); 3111 return true; 3112 } 3113 3114 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 3115 MachineRegisterInfo &MRI, 3116 MachineIRBuilder &B) const { 3117 const LLT S64 = LLT::scalar(64); 3118 const LLT S32 = LLT::scalar(32); 3119 3120 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3121 if (Ty != S32 && Ty != S64) 3122 return false; 3123 3124 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 3125 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 3126 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 3127 3128 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 3129 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 3130 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 3131 3132 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 3133 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 3134 3135 LHS = 
B.buildXor(Ty, LHS, LHSign).getReg(0); 3136 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 3137 3138 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 3139 switch (MI.getOpcode()) { 3140 default: 3141 llvm_unreachable("Unexpected opcode!"); 3142 case AMDGPU::G_SDIV: { 3143 DstDivReg = MI.getOperand(0).getReg(); 3144 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3145 break; 3146 } 3147 case AMDGPU::G_SREM: { 3148 DstRemReg = MI.getOperand(0).getReg(); 3149 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3150 break; 3151 } 3152 case AMDGPU::G_SDIVREM: { 3153 DstDivReg = MI.getOperand(0).getReg(); 3154 DstRemReg = MI.getOperand(1).getReg(); 3155 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 3156 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 3157 break; 3158 } 3159 } 3160 3161 if (Ty == S32) 3162 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3163 else 3164 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 3165 3166 if (DstDivReg) { 3167 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 3168 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 3169 B.buildSub(DstDivReg, SignXor, Sign); 3170 } 3171 3172 if (DstRemReg) { 3173 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 3174 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 3175 B.buildSub(DstRemReg, SignXor, Sign); 3176 } 3177 3178 MI.eraseFromParent(); 3179 return true; 3180 } 3181 3182 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 3183 MachineRegisterInfo &MRI, 3184 MachineIRBuilder &B) const { 3185 Register Res = MI.getOperand(0).getReg(); 3186 Register LHS = MI.getOperand(1).getReg(); 3187 Register RHS = MI.getOperand(2).getReg(); 3188 uint16_t Flags = MI.getFlags(); 3189 LLT ResTy = MRI.getType(Res); 3190 3191 const MachineFunction &MF = B.getMF(); 3192 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3193 MI.getFlag(MachineInstr::FmAfn); 3194 3195 if (!AllowInaccurateRcp) 3196 return false; 3197 3198 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 3199 // 1 / x -> RCP(x) 3200 if (CLHS->isExactlyValue(1.0)) { 3201 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3202 .addUse(RHS) 3203 .setMIFlags(Flags); 3204 3205 MI.eraseFromParent(); 3206 return true; 3207 } 3208 3209 // -1 / x -> RCP( FNEG(x) ) 3210 if (CLHS->isExactlyValue(-1.0)) { 3211 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 3212 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 3213 .addUse(FNeg.getReg(0)) 3214 .setMIFlags(Flags); 3215 3216 MI.eraseFromParent(); 3217 return true; 3218 } 3219 } 3220 3221 // x / y -> x * (1.0 / y) 3222 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 3223 .addUse(RHS) 3224 .setMIFlags(Flags); 3225 B.buildFMul(Res, LHS, RCP, Flags); 3226 3227 MI.eraseFromParent(); 3228 return true; 3229 } 3230 3231 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 3232 MachineRegisterInfo &MRI, 3233 MachineIRBuilder &B) const { 3234 Register Res = MI.getOperand(0).getReg(); 3235 Register X = MI.getOperand(1).getReg(); 3236 Register Y = MI.getOperand(2).getReg(); 3237 uint16_t Flags = MI.getFlags(); 3238 LLT ResTy = MRI.getType(Res); 3239 3240 const MachineFunction &MF = B.getMF(); 3241 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3242 MI.getFlag(MachineInstr::FmAfn); 3243 3244 if (!AllowInaccurateRcp) 3245 return false; 3246 3247 auto NegY = B.buildFNeg(ResTy, Y); 3248 auto One = B.buildFConstant(ResTy, 1.0); 3249 3250 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, 
false) 3251 .addUse(Y) 3252 .setMIFlags(Flags); 3253 3254 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 3255 R = B.buildFMA(ResTy, Tmp0, R, R); 3256 3257 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 3258 R = B.buildFMA(ResTy, Tmp1, R, R); 3259 3260 auto Ret = B.buildFMul(ResTy, X, R); 3261 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 3262 3263 B.buildFMA(Res, Tmp2, R, Ret); 3264 MI.eraseFromParent(); 3265 return true; 3266 } 3267 3268 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 3269 MachineRegisterInfo &MRI, 3270 MachineIRBuilder &B) const { 3271 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3272 return true; 3273 3274 Register Res = MI.getOperand(0).getReg(); 3275 Register LHS = MI.getOperand(1).getReg(); 3276 Register RHS = MI.getOperand(2).getReg(); 3277 3278 uint16_t Flags = MI.getFlags(); 3279 3280 LLT S16 = LLT::scalar(16); 3281 LLT S32 = LLT::scalar(32); 3282 3283 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 3284 auto RHSExt = B.buildFPExt(S32, RHS, Flags); 3285 3286 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3287 .addUse(RHSExt.getReg(0)) 3288 .setMIFlags(Flags); 3289 3290 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 3291 auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 3292 3293 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3294 .addUse(RDst.getReg(0)) 3295 .addUse(RHS) 3296 .addUse(LHS) 3297 .setMIFlags(Flags); 3298 3299 MI.eraseFromParent(); 3300 return true; 3301 } 3302 3303 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 3304 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 3305 static void toggleSPDenormMode(bool Enable, 3306 MachineIRBuilder &B, 3307 const GCNSubtarget &ST, 3308 AMDGPU::SIModeRegisterDefaults Mode) { 3309 // Set SP denorm mode to this value. 3310 unsigned SPDenormMode = 3311 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 3312 3313 if (ST.hasDenormModeInst()) { 3314 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 3315 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 3316 3317 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 3318 B.buildInstr(AMDGPU::S_DENORM_MODE) 3319 .addImm(NewDenormModeValue); 3320 3321 } else { 3322 // Select FP32 bit field in mode register. 
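    // hwreg(HW_REG_MODE, 4, 2): the two FP32 denorm-control bits of the MODE
    // register (offset 4, WIDTH_M1 == 1, i.e. width 2); the FP64/FP16 denorm
    // bits above them are left untouched.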
3323 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 3324 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 3325 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 3326 3327 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 3328 .addImm(SPDenormMode) 3329 .addImm(SPDenormModeBitField); 3330 } 3331 } 3332 3333 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 3334 MachineRegisterInfo &MRI, 3335 MachineIRBuilder &B) const { 3336 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 3337 return true; 3338 3339 Register Res = MI.getOperand(0).getReg(); 3340 Register LHS = MI.getOperand(1).getReg(); 3341 Register RHS = MI.getOperand(2).getReg(); 3342 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3343 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 3344 3345 uint16_t Flags = MI.getFlags(); 3346 3347 LLT S32 = LLT::scalar(32); 3348 LLT S1 = LLT::scalar(1); 3349 3350 auto One = B.buildFConstant(S32, 1.0f); 3351 3352 auto DenominatorScaled = 3353 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3354 .addUse(LHS) 3355 .addUse(RHS) 3356 .addImm(0) 3357 .setMIFlags(Flags); 3358 auto NumeratorScaled = 3359 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 3360 .addUse(LHS) 3361 .addUse(RHS) 3362 .addImm(1) 3363 .setMIFlags(Flags); 3364 3365 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3366 .addUse(DenominatorScaled.getReg(0)) 3367 .setMIFlags(Flags); 3368 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 3369 3370 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 3371 // aren't modeled as reading it. 3372 if (!Mode.allFP32Denormals()) 3373 toggleSPDenormMode(true, B, ST, Mode); 3374 3375 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 3376 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 3377 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 3378 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 3379 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 3380 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 3381 3382 if (!Mode.allFP32Denormals()) 3383 toggleSPDenormMode(false, B, ST, Mode); 3384 3385 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 3386 .addUse(Fma4.getReg(0)) 3387 .addUse(Fma1.getReg(0)) 3388 .addUse(Fma3.getReg(0)) 3389 .addUse(NumeratorScaled.getReg(1)) 3390 .setMIFlags(Flags); 3391 3392 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 3393 .addUse(Fmas.getReg(0)) 3394 .addUse(RHS) 3395 .addUse(LHS) 3396 .setMIFlags(Flags); 3397 3398 MI.eraseFromParent(); 3399 return true; 3400 } 3401 3402 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3403 MachineRegisterInfo &MRI, 3404 MachineIRBuilder &B) const { 3405 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 3406 return true; 3407 3408 Register Res = MI.getOperand(0).getReg(); 3409 Register LHS = MI.getOperand(1).getReg(); 3410 Register RHS = MI.getOperand(2).getReg(); 3411 3412 uint16_t Flags = MI.getFlags(); 3413 3414 LLT S64 = LLT::scalar(64); 3415 LLT S1 = LLT::scalar(1); 3416 3417 auto One = B.buildFConstant(S64, 1.0); 3418 3419 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3420 .addUse(LHS) 3421 .addUse(RHS) 3422 .addImm(0) 3423 .setMIFlags(Flags); 3424 3425 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3426 3427 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3428 .addUse(DivScale0.getReg(0)) 3429 .setMIFlags(Flags); 3430 3431 auto Fma0 = 
B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3432 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3433 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 3434 3435 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3436 .addUse(LHS) 3437 .addUse(RHS) 3438 .addImm(1) 3439 .setMIFlags(Flags); 3440 3441 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 3442 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 3443 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 3444 3445 Register Scale; 3446 if (!ST.hasUsableDivScaleConditionOutput()) { 3447 // Workaround a hardware bug on SI where the condition output from div_scale 3448 // is not usable. 3449 3450 LLT S32 = LLT::scalar(32); 3451 3452 auto NumUnmerge = B.buildUnmerge(S32, LHS); 3453 auto DenUnmerge = B.buildUnmerge(S32, RHS); 3454 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 3455 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 3456 3457 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 3458 Scale1Unmerge.getReg(1)); 3459 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 3460 Scale0Unmerge.getReg(1)); 3461 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 3462 } else { 3463 Scale = DivScale1.getReg(1); 3464 } 3465 3466 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 3467 .addUse(Fma4.getReg(0)) 3468 .addUse(Fma3.getReg(0)) 3469 .addUse(Mul.getReg(0)) 3470 .addUse(Scale) 3471 .setMIFlags(Flags); 3472 3473 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 3474 .addUse(Fmas.getReg(0)) 3475 .addUse(RHS) 3476 .addUse(LHS) 3477 .setMIFlags(Flags); 3478 3479 MI.eraseFromParent(); 3480 return true; 3481 } 3482 3483 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 3484 MachineRegisterInfo &MRI, 3485 MachineIRBuilder &B) const { 3486 Register Res = MI.getOperand(0).getReg(); 3487 Register LHS = MI.getOperand(2).getReg(); 3488 Register RHS = MI.getOperand(3).getReg(); 3489 uint16_t Flags = MI.getFlags(); 3490 3491 LLT S32 = LLT::scalar(32); 3492 LLT S1 = LLT::scalar(1); 3493 3494 auto Abs = B.buildFAbs(S32, RHS, Flags); 3495 const APFloat C0Val(1.0f); 3496 3497 auto C0 = B.buildConstant(S32, 0x6f800000); 3498 auto C1 = B.buildConstant(S32, 0x2f800000); 3499 auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 3500 3501 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 3502 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 3503 3504 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 3505 3506 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 3507 .addUse(Mul0.getReg(0)) 3508 .setMIFlags(Flags); 3509 3510 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 3511 3512 B.buildFMul(Res, Sel, Mul1, Flags); 3513 3514 MI.eraseFromParent(); 3515 return true; 3516 } 3517 3518 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3519 // FIXME: Why do we handle this one but not other removed instructions? 3520 // 3521 // Reciprocal square root. The clamp prevents infinite results, clamping 3522 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3523 // +-max_float. 
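//
// A rough sketch of the expansion emitted below (illustrative, not the exact
// MIR): the result is
//   fmaxnum(fminnum(amdgcn.rsq(src), +max_float), -max_float)
// with the IEEE min/max variants used when the function's IEEE mode bit is set.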
3524 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3525 MachineRegisterInfo &MRI, 3526 MachineIRBuilder &B) const { 3527 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3528 return true; 3529 3530 Register Dst = MI.getOperand(0).getReg(); 3531 Register Src = MI.getOperand(2).getReg(); 3532 auto Flags = MI.getFlags(); 3533 3534 LLT Ty = MRI.getType(Dst); 3535 3536 const fltSemantics *FltSemantics; 3537 if (Ty == LLT::scalar(32)) 3538 FltSemantics = &APFloat::IEEEsingle(); 3539 else if (Ty == LLT::scalar(64)) 3540 FltSemantics = &APFloat::IEEEdouble(); 3541 else 3542 return false; 3543 3544 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3545 .addUse(Src) 3546 .setMIFlags(Flags); 3547 3548 // We don't need to concern ourselves with the snan handling difference, since 3549 // the rsq quieted (or not) so use the one which will directly select. 3550 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3551 const bool UseIEEE = MFI->getMode().IEEE; 3552 3553 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3554 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3555 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3556 3557 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3558 3559 if (UseIEEE) 3560 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3561 else 3562 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3563 MI.eraseFromParent(); 3564 return true; 3565 } 3566 3567 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3568 switch (IID) { 3569 case Intrinsic::amdgcn_ds_fadd: 3570 return AMDGPU::G_ATOMICRMW_FADD; 3571 case Intrinsic::amdgcn_ds_fmin: 3572 return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3573 case Intrinsic::amdgcn_ds_fmax: 3574 return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3575 default: 3576 llvm_unreachable("not a DS FP intrinsic"); 3577 } 3578 } 3579 3580 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 3581 MachineInstr &MI, 3582 Intrinsic::ID IID) const { 3583 GISelChangeObserver &Observer = Helper.Observer; 3584 Observer.changingInstr(MI); 3585 3586 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 3587 3588 // The remaining operands were used to set fields in the MemOperand on 3589 // construction. 3590 for (int I = 6; I > 3; --I) 3591 MI.RemoveOperand(I); 3592 3593 MI.RemoveOperand(1); // Remove the intrinsic ID. 
3594 Observer.changedInstr(MI); 3595 return true; 3596 } 3597 3598 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 3599 MachineRegisterInfo &MRI, 3600 MachineIRBuilder &B) const { 3601 uint64_t Offset = 3602 ST.getTargetLowering()->getImplicitParameterOffset( 3603 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 3604 LLT DstTy = MRI.getType(DstReg); 3605 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 3606 3607 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 3608 if (!loadInputValue(KernargPtrReg, B, 3609 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 3610 return false; 3611 3612 // FIXME: This should be nuw 3613 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 3614 return true; 3615 } 3616 3617 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 3618 MachineRegisterInfo &MRI, 3619 MachineIRBuilder &B) const { 3620 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3621 if (!MFI->isEntryFunction()) { 3622 return legalizePreloadedArgIntrin(MI, MRI, B, 3623 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 3624 } 3625 3626 Register DstReg = MI.getOperand(0).getReg(); 3627 if (!getImplicitArgPtr(DstReg, MRI, B)) 3628 return false; 3629 3630 MI.eraseFromParent(); 3631 return true; 3632 } 3633 3634 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 3635 MachineRegisterInfo &MRI, 3636 MachineIRBuilder &B, 3637 unsigned AddrSpace) const { 3638 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 3639 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 3640 Register Hi32 = Unmerge.getReg(1); 3641 3642 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 3643 MI.eraseFromParent(); 3644 return true; 3645 } 3646 3647 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 3648 // offset (the offset that is included in bounds checking and swizzling, to be 3649 // split between the instruction's voffset and immoffset fields) and soffset 3650 // (the offset that is excluded from bounds checking and swizzling, to go in 3651 // the instruction's soffset field). This function takes the first kind of 3652 // offset and figures out how to split it between voffset and immoffset. 3653 std::pair<Register, unsigned> 3654 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 3655 Register OrigOffset) const { 3656 const unsigned MaxImm = 4095; 3657 Register BaseReg; 3658 unsigned ImmOffset; 3659 const LLT S32 = LLT::scalar(32); 3660 MachineRegisterInfo &MRI = *B.getMRI(); 3661 3662 std::tie(BaseReg, ImmOffset) = 3663 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 3664 3665 // If BaseReg is a pointer, convert it to int. 3666 if (MRI.getType(BaseReg).isPointer()) 3667 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 3668 3669 // If the immediate value is too big for the immoffset field, put the value 3670 // and -4096 into the immoffset field so that the value that is copied/added 3671 // for the voffset field is a multiple of 4096, and it stands more chance 3672 // of being CSEd with the copy/add for another similar load/store. 3673 // However, do not do that rounding down to a multiple of 4096 if that is a 3674 // negative number, as it appears to be illegal to have a negative offset 3675 // in the vgpr, even if adding the immediate offset makes it positive. 
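  //
  // Worked example (illustrative): with MaxImm = 4095, a constant offset of
  // 8204 splits into Overflow = 8192 (materialized/added into the voffset
  // register below) and ImmOffset = 12, while an offset of 4095 or less stays
  // entirely in ImmOffset.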
3676 unsigned Overflow = ImmOffset & ~MaxImm; 3677 ImmOffset -= Overflow; 3678 if ((int32_t)Overflow < 0) { 3679 Overflow += ImmOffset; 3680 ImmOffset = 0; 3681 } 3682 3683 if (Overflow != 0) { 3684 if (!BaseReg) { 3685 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 3686 } else { 3687 auto OverflowVal = B.buildConstant(S32, Overflow); 3688 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 3689 } 3690 } 3691 3692 if (!BaseReg) 3693 BaseReg = B.buildConstant(S32, 0).getReg(0); 3694 3695 return std::make_pair(BaseReg, ImmOffset); 3696 } 3697 3698 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic. 3699 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, 3700 Register VOffset, Register SOffset, 3701 unsigned ImmOffset, Register VIndex, 3702 MachineRegisterInfo &MRI) const { 3703 Optional<ValueAndVReg> MaybeVOffsetVal = 3704 getConstantVRegValWithLookThrough(VOffset, MRI); 3705 Optional<ValueAndVReg> MaybeSOffsetVal = 3706 getConstantVRegValWithLookThrough(SOffset, MRI); 3707 Optional<ValueAndVReg> MaybeVIndexVal = 3708 getConstantVRegValWithLookThrough(VIndex, MRI); 3709 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant, 3710 // update the MMO with that offset. The stride is unknown so we can only do 3711 // this if VIndex is constant 0. 3712 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && 3713 MaybeVIndexVal->Value == 0) { 3714 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + 3715 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; 3716 MMO->setOffset(TotalOffset); 3717 } else { 3718 // We don't have a constant combined offset to use in the MMO. Give up. 3719 MMO->setValue((Value *)nullptr); 3720 } 3721 } 3722 3723 /// Handle register layout difference for f16 images for some subtargets. 
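/// Loosely (this mirrors the code below rather than stating the full ISA
/// rules): on subtargets with unpacked D16 VMEM every 16-bit element is
/// any-extended into its own 32-bit register, subtargets with the image-store
/// D16 bug get their <N x s16> data padded/bitcast to a dword-sized vector,
/// and everything else keeps the packed <N x s16> layout unchanged.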
3724 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 3725 MachineRegisterInfo &MRI, 3726 Register Reg, 3727 bool ImageStore) const { 3728 const LLT S16 = LLT::scalar(16); 3729 const LLT S32 = LLT::scalar(32); 3730 LLT StoreVT = MRI.getType(Reg); 3731 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 3732 3733 if (ST.hasUnpackedD16VMem()) { 3734 auto Unmerge = B.buildUnmerge(S16, Reg); 3735 3736 SmallVector<Register, 4> WideRegs; 3737 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3738 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 3739 3740 int NumElts = StoreVT.getNumElements(); 3741 3742 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 3743 .getReg(0); 3744 } 3745 3746 if (ImageStore && ST.hasImageStoreD16Bug()) { 3747 if (StoreVT.getNumElements() == 2) { 3748 SmallVector<Register, 4> PackedRegs; 3749 Reg = B.buildBitcast(S32, Reg).getReg(0); 3750 PackedRegs.push_back(Reg); 3751 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 3752 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 3753 .getReg(0); 3754 } 3755 3756 if (StoreVT.getNumElements() == 3) { 3757 SmallVector<Register, 4> PackedRegs; 3758 auto Unmerge = B.buildUnmerge(S16, Reg); 3759 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3760 PackedRegs.push_back(Unmerge.getReg(I)); 3761 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 3762 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 3763 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 3764 } 3765 3766 if (StoreVT.getNumElements() == 4) { 3767 SmallVector<Register, 4> PackedRegs; 3768 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 3769 auto Unmerge = B.buildUnmerge(S32, Reg); 3770 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 3771 PackedRegs.push_back(Unmerge.getReg(I)); 3772 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 3773 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 3774 .getReg(0); 3775 } 3776 3777 llvm_unreachable("invalid data type"); 3778 } 3779 3780 return Reg; 3781 } 3782 3783 Register AMDGPULegalizerInfo::fixStoreSourceType( 3784 MachineIRBuilder &B, Register VData, bool IsFormat) const { 3785 MachineRegisterInfo *MRI = B.getMRI(); 3786 LLT Ty = MRI->getType(VData); 3787 3788 const LLT S16 = LLT::scalar(16); 3789 3790 // Fixup illegal register types for i8 stores. 3791 if (Ty == LLT::scalar(8) || Ty == S16) { 3792 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 3793 return AnyExt; 3794 } 3795 3796 if (Ty.isVector()) { 3797 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 3798 if (IsFormat) 3799 return handleD16VData(B, *MRI, VData); 3800 } 3801 } 3802 3803 return VData; 3804 } 3805 3806 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 3807 MachineRegisterInfo &MRI, 3808 MachineIRBuilder &B, 3809 bool IsTyped, 3810 bool IsFormat) const { 3811 Register VData = MI.getOperand(1).getReg(); 3812 LLT Ty = MRI.getType(VData); 3813 LLT EltTy = Ty.getScalarType(); 3814 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3815 const LLT S32 = LLT::scalar(32); 3816 3817 VData = fixStoreSourceType(B, VData, IsFormat); 3818 Register RSrc = MI.getOperand(2).getReg(); 3819 3820 MachineMemOperand *MMO = *MI.memoperands_begin(); 3821 const int MemSize = MMO->getSize(); 3822 3823 unsigned ImmOffset; 3824 3825 // The typed intrinsics add an immediate after the registers. 
3826 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3827 3828 // The struct intrinsic variants add one additional operand over raw. 3829 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3830 Register VIndex; 3831 int OpOffset = 0; 3832 if (HasVIndex) { 3833 VIndex = MI.getOperand(3).getReg(); 3834 OpOffset = 1; 3835 } else { 3836 VIndex = B.buildConstant(S32, 0).getReg(0); 3837 } 3838 3839 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3840 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3841 3842 unsigned Format = 0; 3843 if (IsTyped) { 3844 Format = MI.getOperand(5 + OpOffset).getImm(); 3845 ++OpOffset; 3846 } 3847 3848 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3849 3850 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 3851 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 3852 3853 unsigned Opc; 3854 if (IsTyped) { 3855 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 3856 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 3857 } else if (IsFormat) { 3858 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 3859 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 3860 } else { 3861 switch (MemSize) { 3862 case 1: 3863 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 3864 break; 3865 case 2: 3866 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 3867 break; 3868 default: 3869 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 3870 break; 3871 } 3872 } 3873 3874 auto MIB = B.buildInstr(Opc) 3875 .addUse(VData) // vdata 3876 .addUse(RSrc) // rsrc 3877 .addUse(VIndex) // vindex 3878 .addUse(VOffset) // voffset 3879 .addUse(SOffset) // soffset 3880 .addImm(ImmOffset); // offset(imm) 3881 3882 if (IsTyped) 3883 MIB.addImm(Format); 3884 3885 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3886 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3887 .addMemOperand(MMO); 3888 3889 MI.eraseFromParent(); 3890 return true; 3891 } 3892 3893 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 3894 MachineRegisterInfo &MRI, 3895 MachineIRBuilder &B, 3896 bool IsFormat, 3897 bool IsTyped) const { 3898 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 3899 MachineMemOperand *MMO = *MI.memoperands_begin(); 3900 const LLT MemTy = MMO->getMemoryType(); 3901 const LLT S32 = LLT::scalar(32); 3902 3903 Register Dst = MI.getOperand(0).getReg(); 3904 Register RSrc = MI.getOperand(2).getReg(); 3905 3906 // The typed intrinsics add an immediate after the registers. 3907 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 3908 3909 // The struct intrinsic variants add one additional operand over raw. 
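  // (That extra operand is the vindex register, so the check below simply
  // compares the total operand count against NumVIndexOps. This note is a
  // reading of the code rather than a statement of the intrinsic definitions.)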
3910 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 3911 Register VIndex; 3912 int OpOffset = 0; 3913 if (HasVIndex) { 3914 VIndex = MI.getOperand(3).getReg(); 3915 OpOffset = 1; 3916 } else { 3917 VIndex = B.buildConstant(S32, 0).getReg(0); 3918 } 3919 3920 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 3921 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 3922 3923 unsigned Format = 0; 3924 if (IsTyped) { 3925 Format = MI.getOperand(5 + OpOffset).getImm(); 3926 ++OpOffset; 3927 } 3928 3929 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 3930 unsigned ImmOffset; 3931 3932 LLT Ty = MRI.getType(Dst); 3933 LLT EltTy = Ty.getScalarType(); 3934 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 3935 const bool Unpacked = ST.hasUnpackedD16VMem(); 3936 3937 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 3938 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI); 3939 3940 unsigned Opc; 3941 3942 if (IsTyped) { 3943 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 3944 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 3945 } else if (IsFormat) { 3946 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : 3947 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 3948 } else { 3949 switch (MemTy.getSizeInBits()) { 3950 case 8: 3951 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 3952 break; 3953 case 16: 3954 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 3955 break; 3956 default: 3957 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 3958 break; 3959 } 3960 } 3961 3962 Register LoadDstReg; 3963 3964 bool IsExtLoad = 3965 (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector()); 3966 LLT UnpackedTy = Ty.changeElementSize(32); 3967 3968 if (IsExtLoad) 3969 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 3970 else if (Unpacked && IsD16 && Ty.isVector()) 3971 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 3972 else 3973 LoadDstReg = Dst; 3974 3975 auto MIB = B.buildInstr(Opc) 3976 .addDef(LoadDstReg) // vdata 3977 .addUse(RSrc) // rsrc 3978 .addUse(VIndex) // vindex 3979 .addUse(VOffset) // voffset 3980 .addUse(SOffset) // soffset 3981 .addImm(ImmOffset); // offset(imm) 3982 3983 if (IsTyped) 3984 MIB.addImm(Format); 3985 3986 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 3987 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 3988 .addMemOperand(MMO); 3989 3990 if (LoadDstReg != Dst) { 3991 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 3992 3993 // Widen result for extending loads was widened. 3994 if (IsExtLoad) 3995 B.buildTrunc(Dst, LoadDstReg); 3996 else { 3997 // Repack to original 16-bit vector result 3998 // FIXME: G_TRUNC should work, but legalization currently fails 3999 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 4000 SmallVector<Register, 4> Repack; 4001 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 4002 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 4003 B.buildMerge(Dst, Repack); 4004 } 4005 } 4006 4007 MI.eraseFromParent(); 4008 return true; 4009 } 4010 4011 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, 4012 MachineIRBuilder &B, 4013 bool IsInc) const { 4014 unsigned Opc = IsInc ? 
AMDGPU::G_AMDGPU_ATOMIC_INC : 4015 AMDGPU::G_AMDGPU_ATOMIC_DEC; 4016 B.buildInstr(Opc) 4017 .addDef(MI.getOperand(0).getReg()) 4018 .addUse(MI.getOperand(2).getReg()) 4019 .addUse(MI.getOperand(3).getReg()) 4020 .cloneMemRefs(MI); 4021 MI.eraseFromParent(); 4022 return true; 4023 } 4024 4025 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 4026 switch (IntrID) { 4027 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 4028 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 4029 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 4030 case Intrinsic::amdgcn_raw_buffer_atomic_add: 4031 case Intrinsic::amdgcn_struct_buffer_atomic_add: 4032 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 4033 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 4034 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 4035 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 4036 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 4037 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 4038 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 4039 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 4040 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 4041 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 4042 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 4043 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 4044 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 4045 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 4046 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 4047 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 4048 case Intrinsic::amdgcn_raw_buffer_atomic_and: 4049 case Intrinsic::amdgcn_struct_buffer_atomic_and: 4050 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 4051 case Intrinsic::amdgcn_raw_buffer_atomic_or: 4052 case Intrinsic::amdgcn_struct_buffer_atomic_or: 4053 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 4054 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 4055 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 4056 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 4057 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 4058 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 4059 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 4060 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 4061 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4062 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 4063 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 4064 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 4065 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 4066 case Intrinsic::amdgcn_buffer_atomic_fadd: 4067 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4068 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 4069 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 4070 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 4071 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 4072 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 4073 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 4074 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 4075 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 4076 default: 4077 llvm_unreachable("unhandled atomic opcode"); 4078 } 4079 } 4080 4081 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 4082 MachineIRBuilder &B, 4083 Intrinsic::ID IID) const { 4084 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 4085 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; 4086 const bool HasReturn = MI.getNumExplicitDefs() != 0; 4087 4088 Register Dst; 4089 4090 int OpOffset = 0; 4091 if (HasReturn) { 4092 // A few FP atomics do not support return values. 
4093 Dst = MI.getOperand(0).getReg(); 4094 } else { 4095 OpOffset = -1; 4096 } 4097 4098 Register VData = MI.getOperand(2 + OpOffset).getReg(); 4099 Register CmpVal; 4100 4101 if (IsCmpSwap) { 4102 CmpVal = MI.getOperand(3 + OpOffset).getReg(); 4103 ++OpOffset; 4104 } 4105 4106 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 4107 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; 4108 4109 // The struct intrinsic variants add one additional operand over raw. 4110 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 4111 Register VIndex; 4112 if (HasVIndex) { 4113 VIndex = MI.getOperand(4 + OpOffset).getReg(); 4114 ++OpOffset; 4115 } else { 4116 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 4117 } 4118 4119 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 4120 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 4121 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 4122 4123 MachineMemOperand *MMO = *MI.memoperands_begin(); 4124 4125 unsigned ImmOffset; 4126 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 4127 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI()); 4128 4129 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); 4130 4131 if (HasReturn) 4132 MIB.addDef(Dst); 4133 4134 MIB.addUse(VData); // vdata 4135 4136 if (IsCmpSwap) 4137 MIB.addReg(CmpVal); 4138 4139 MIB.addUse(RSrc) // rsrc 4140 .addUse(VIndex) // vindex 4141 .addUse(VOffset) // voffset 4142 .addUse(SOffset) // soffset 4143 .addImm(ImmOffset) // offset(imm) 4144 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 4145 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 4146 .addMemOperand(MMO); 4147 4148 MI.eraseFromParent(); 4149 return true; 4150 } 4151 4152 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 4153 /// vector with s16 typed elements. 4154 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 4155 SmallVectorImpl<Register> &PackedAddrs, 4156 unsigned ArgOffset, 4157 const AMDGPU::ImageDimIntrinsicInfo *Intr, 4158 bool IsA16, bool IsG16) { 4159 const LLT S16 = LLT::scalar(16); 4160 const LLT V2S16 = LLT::fixed_vector(2, 16); 4161 auto EndIdx = Intr->VAddrEnd; 4162 4163 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 4164 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4165 if (!SrcOp.isReg()) 4166 continue; // _L to _LZ may have eliminated this. 4167 4168 Register AddrReg = SrcOp.getReg(); 4169 4170 if ((I < Intr->GradientStart) || 4171 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 4172 (I >= Intr->CoordStart && !IsA16)) { 4173 // Handle any gradient or coordinate operands that should not be packed 4174 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 4175 PackedAddrs.push_back(AddrReg); 4176 } else { 4177 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 4178 // derivatives dx/dh and dx/dv are packed with undef. 
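      //
      // Roughly (an illustrative reading of the branch below): consecutive
      // 16-bit operands are packed pairwise into V2S16; an operand with no
      // partner (last odd coordinate, odd gradient-half boundary, or a
      // neighbour removed by the _L -> _LZ fold) is instead paired with undef.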
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}

/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);

  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // Above 8 elements round up to next power of 2 (i.e. 16).
    if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    auto VAddr =
        B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, load/store with 16-bit element data need to be
/// rewritten to use the low half of 32-bit registers, or directly use a packed
/// layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now-unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *Intr) const {

  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
4262 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4263 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 4264 4265 MachineRegisterInfo *MRI = B.getMRI(); 4266 const LLT S32 = LLT::scalar(32); 4267 const LLT S16 = LLT::scalar(16); 4268 const LLT V2S16 = LLT::fixed_vector(2, 16); 4269 4270 unsigned DMask = 0; 4271 4272 // Check for 16 bit addresses and pack if true. 4273 LLT GradTy = 4274 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4275 LLT AddrTy = 4276 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 4277 const bool IsG16 = GradTy == S16; 4278 const bool IsA16 = AddrTy == S16; 4279 4280 int DMaskLanes = 0; 4281 if (!BaseOpcode->Atomic) { 4282 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 4283 if (BaseOpcode->Gather4) { 4284 DMaskLanes = 4; 4285 } else if (DMask != 0) { 4286 DMaskLanes = countPopulation(DMask); 4287 } else if (!IsTFE && !BaseOpcode->Store) { 4288 // If dmask is 0, this is a no-op load. This can be eliminated. 4289 B.buildUndef(MI.getOperand(0)); 4290 MI.eraseFromParent(); 4291 return true; 4292 } 4293 } 4294 4295 Observer.changingInstr(MI); 4296 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 4297 4298 unsigned NewOpcode = NumDefs == 0 ? 4299 AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 4300 4301 // Track that we legalized this 4302 MI.setDesc(B.getTII().get(NewOpcode)); 4303 4304 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 4305 // dmask to be at least 1 otherwise the instruction will fail 4306 if (IsTFE && DMask == 0) { 4307 DMask = 0x1; 4308 DMaskLanes = 1; 4309 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 4310 } 4311 4312 if (BaseOpcode->Atomic) { 4313 Register VData0 = MI.getOperand(2).getReg(); 4314 LLT Ty = MRI->getType(VData0); 4315 4316 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 4317 if (Ty.isVector()) 4318 return false; 4319 4320 if (BaseOpcode->AtomicX2) { 4321 Register VData1 = MI.getOperand(3).getReg(); 4322 // The two values are packed in one register. 4323 LLT PackedTy = LLT::fixed_vector(2, Ty); 4324 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 4325 MI.getOperand(2).setReg(Concat.getReg(0)); 4326 MI.getOperand(3).setReg(AMDGPU::NoRegister); 4327 } 4328 } 4329 4330 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 4331 4332 // Optimize _L to _LZ when _L is zero 4333 if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4334 AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 4335 const ConstantFP *ConstantLod; 4336 4337 if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4338 m_GFCst(ConstantLod))) { 4339 if (ConstantLod->isZero() || ConstantLod->isNegative()) { 4340 // Set new opcode to _lz variant of _l, and change the intrinsic ID. 4341 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4342 AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, 4343 Intr->Dim); 4344 4345 // The starting indexes should remain in the same place. 
4346 --CorrectedNumVAddrs; 4347 4348 MI.getOperand(MI.getNumExplicitDefs()) 4349 .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); 4350 MI.RemoveOperand(ArgOffset + Intr->LodIndex); 4351 Intr = NewImageDimIntr; 4352 } 4353 } 4354 } 4355 4356 // Optimize _mip away, when 'lod' is zero 4357 if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { 4358 int64_t ConstantLod; 4359 if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, 4360 m_ICst(ConstantLod))) { 4361 if (ConstantLod == 0) { 4362 // TODO: Change intrinsic opcode and remove operand instead or replacing 4363 // it with 0, as the _L to _LZ handling is done above. 4364 MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); 4365 --CorrectedNumVAddrs; 4366 } 4367 } 4368 } 4369 4370 // Rewrite the addressing register layout before doing anything else. 4371 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 4372 // 16 bit gradients are supported, but are tied to the A16 control 4373 // so both gradients and addresses must be 16 bit 4374 return false; 4375 } 4376 4377 if (IsA16 && !ST.hasA16()) { 4378 // A16 not supported 4379 return false; 4380 } 4381 4382 if (IsA16 || IsG16) { 4383 if (Intr->NumVAddrs > 1) { 4384 SmallVector<Register, 4> PackedRegs; 4385 4386 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 4387 IsG16); 4388 4389 // See also below in the non-a16 branch 4390 const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 && 4391 PackedRegs.size() <= ST.getNSAMaxSize(); 4392 4393 if (!UseNSA && PackedRegs.size() > 1) { 4394 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 4395 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 4396 PackedRegs[0] = Concat.getReg(0); 4397 PackedRegs.resize(1); 4398 } 4399 4400 const unsigned NumPacked = PackedRegs.size(); 4401 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4402 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 4403 if (!SrcOp.isReg()) { 4404 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 4405 continue; 4406 } 4407 4408 assert(SrcOp.getReg() != AMDGPU::NoRegister); 4409 4410 if (I - Intr->VAddrStart < NumPacked) 4411 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 4412 else 4413 SrcOp.setReg(AMDGPU::NoRegister); 4414 } 4415 } 4416 } else { 4417 // If the register allocator cannot place the address registers contiguously 4418 // without introducing moves, then using the non-sequential address encoding 4419 // is always preferable, since it saves VALU instructions and is usually a 4420 // wash in terms of code size or even better. 4421 // 4422 // However, we currently have no way of hinting to the register allocator 4423 // that MIMG addresses should be placed contiguously when it is possible to 4424 // do so, so force non-NSA for the common 2-address case as a heuristic. 4425 // 4426 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 4427 // allocation when possible. 4428 const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 && 4429 CorrectedNumVAddrs <= ST.getNSAMaxSize(); 4430 4431 if (!UseNSA && Intr->NumVAddrs > 1) 4432 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4433 Intr->NumVAddrs); 4434 } 4435 4436 int Flags = 0; 4437 if (IsA16) 4438 Flags |= 1; 4439 if (IsG16) 4440 Flags |= 2; 4441 MI.addOperand(MachineOperand::CreateImm(Flags)); 4442 4443 if (BaseOpcode->Store) { // No TFE for stores? 
    // TODO: Handle dmask trim
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    Register RepackedReg = handleD16VData(B, *MRI, VData, true);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  const LLT EltTy = Ty.getScalarType();
  const bool IsD16 = Ty.getScalarType() == S16;
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy =
      Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy =
        LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
    TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(
        ElementCount::getFixed(RoundedSize / EltSize), EltSize);
    TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.RemoveOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
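  //
  // Sketch of the repacking below (follows the code; illustrative only):
  //   - unmerge NewResultReg into ResultNumRegs pieces of RegTy (S32 or V2S16),
  //   - drop the trailing TFE dword if present,
  //   - trunc/bitcast the pieces back to the element type where needed,
  //   - pad with undef for dmask-trimmed elements and rebuild the vector.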
4542 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 4543 4544 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 4545 4546 if (ResultNumRegs == 1) { 4547 assert(!IsTFE); 4548 ResultRegs[0] = NewResultReg; 4549 } else { 4550 // We have to repack into a new vector of some kind. 4551 for (int I = 0; I != NumDataRegs; ++I) 4552 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 4553 B.buildUnmerge(ResultRegs, NewResultReg); 4554 4555 // Drop the final TFE element to get the data part. The TFE result is 4556 // directly written to the right place already. 4557 if (IsTFE) 4558 ResultRegs.resize(NumDataRegs); 4559 } 4560 4561 // For an s16 scalar result, we form an s32 result with a truncate regardless 4562 // of packed vs. unpacked. 4563 if (IsD16 && !Ty.isVector()) { 4564 B.buildTrunc(DstReg, ResultRegs[0]); 4565 return true; 4566 } 4567 4568 // Avoid a build/concat_vector of 1 entry. 4569 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 4570 B.buildBitcast(DstReg, ResultRegs[0]); 4571 return true; 4572 } 4573 4574 assert(Ty.isVector()); 4575 4576 if (IsD16) { 4577 // For packed D16 results with TFE enabled, all the data components are 4578 // S32. Cast back to the expected type. 4579 // 4580 // TODO: We don't really need to use load s32 elements. We would only need one 4581 // cast for the TFE result if a multiple of v2s16 was used. 4582 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 4583 for (Register &Reg : ResultRegs) 4584 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 4585 } else if (ST.hasUnpackedD16VMem()) { 4586 for (Register &Reg : ResultRegs) 4587 Reg = B.buildTrunc(S16, Reg).getReg(0); 4588 } 4589 } 4590 4591 auto padWithUndef = [&](LLT Ty, int NumElts) { 4592 if (NumElts == 0) 4593 return; 4594 Register Undef = B.buildUndef(Ty).getReg(0); 4595 for (int I = 0; I != NumElts; ++I) 4596 ResultRegs.push_back(Undef); 4597 }; 4598 4599 // Pad out any elements eliminated due to the dmask. 4600 LLT ResTy = MRI->getType(ResultRegs[0]); 4601 if (!ResTy.isVector()) { 4602 padWithUndef(ResTy, NumElts - ResultRegs.size()); 4603 B.buildBuildVector(DstReg, ResultRegs); 4604 return true; 4605 } 4606 4607 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 4608 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 4609 4610 // Deal with the one annoying legal case. 4611 const LLT V3S16 = LLT::fixed_vector(3, 16); 4612 if (Ty == V3S16) { 4613 padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 4614 auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs); 4615 B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 4616 return true; 4617 } 4618 4619 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 4620 B.buildConcatVectors(DstReg, ResultRegs); 4621 return true; 4622 } 4623 4624 bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4625 LegalizerHelper &Helper, MachineInstr &MI) const { 4626 MachineIRBuilder &B = Helper.MIRBuilder; 4627 GISelChangeObserver &Observer = Helper.Observer; 4628 4629 Register Dst = MI.getOperand(0).getReg(); 4630 LLT Ty = B.getMRI()->getType(Dst); 4631 unsigned Size = Ty.getSizeInBits(); 4632 MachineFunction &MF = B.getMF(); 4633 4634 Observer.changingInstr(MI); 4635 4636 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 4637 Ty = getBitcastRegisterType(Ty); 4638 Helper.bitcastDst(MI, Ty, 0); 4639 Dst = MI.getOperand(0).getReg(); 4640 B.setInsertPt(B.getMBB(), MI); 4641 } 4642 4643 // FIXME: We don't really need this intermediate instruction. 
The intrinsic 4644 // should be fixed to have a memory operand. Since it's readnone, we're not 4645 // allowed to add one. 4646 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 4647 MI.RemoveOperand(1); // Remove intrinsic ID 4648 4649 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 4650 // TODO: Should this use datalayout alignment? 4651 const unsigned MemSize = (Size + 7) / 8; 4652 const Align MemAlign(4); 4653 MachineMemOperand *MMO = MF.getMachineMemOperand( 4654 MachinePointerInfo(), 4655 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 4656 MachineMemOperand::MOInvariant, 4657 MemSize, MemAlign); 4658 MI.addMemOperand(MF, MMO); 4659 4660 // There are no 96-bit result scalar loads, but widening to 128-bit should 4661 // always be legal. We may need to restore this to a 96-bit result if it turns 4662 // out this needs to be converted to a vector load during RegBankSelect. 4663 if (!isPowerOf2_32(Size)) { 4664 if (Ty.isVector()) 4665 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 4666 else 4667 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 4668 } 4669 4670 Observer.changedInstr(MI); 4671 return true; 4672 } 4673 4674 // TODO: Move to selection 4675 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 4676 MachineRegisterInfo &MRI, 4677 MachineIRBuilder &B) const { 4678 if (!ST.isTrapHandlerEnabled() || 4679 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 4680 return legalizeTrapEndpgm(MI, MRI, B); 4681 4682 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 4683 switch (*HsaAbiVer) { 4684 case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 4685 case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 4686 return legalizeTrapHsaQueuePtr(MI, MRI, B); 4687 case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 4688 return ST.supportsGetDoorbellID() ? 4689 legalizeTrapHsa(MI, MRI, B) : 4690 legalizeTrapHsaQueuePtr(MI, MRI, B); 4691 } 4692 } 4693 4694 llvm_unreachable("Unknown trap handler"); 4695 } 4696 4697 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 4698 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4699 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 4700 MI.eraseFromParent(); 4701 return true; 4702 } 4703 4704 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 4705 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4706 // Pass queue pointer to trap handler as input, and insert trap instruction 4707 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 4708 Register LiveIn = 4709 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4710 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 4711 return false; 4712 4713 Register SGPR01(AMDGPU::SGPR0_SGPR1); 4714 B.buildCopy(SGPR01, LiveIn); 4715 B.buildInstr(AMDGPU::S_TRAP) 4716 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 4717 .addReg(SGPR01, RegState::Implicit); 4718 4719 MI.eraseFromParent(); 4720 return true; 4721 } 4722 4723 bool AMDGPULegalizerInfo::legalizeTrapHsa( 4724 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4725 B.buildInstr(AMDGPU::S_TRAP) 4726 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 4727 MI.eraseFromParent(); 4728 return true; 4729 } 4730 4731 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 4732 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 4733 // Is non-HSA path or trap-handler disabled? 
then, report a warning 4734 // accordingly 4735 if (!ST.isTrapHandlerEnabled() || 4736 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 4737 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 4738 "debugtrap handler not supported", 4739 MI.getDebugLoc(), DS_Warning); 4740 LLVMContext &Ctx = B.getMF().getFunction().getContext(); 4741 Ctx.diagnose(NoTrap); 4742 } else { 4743 // Insert debug-trap instruction 4744 B.buildInstr(AMDGPU::S_TRAP) 4745 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 4746 } 4747 4748 MI.eraseFromParent(); 4749 return true; 4750 } 4751 4752 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4753 MachineIRBuilder &B) const { 4754 MachineRegisterInfo &MRI = *B.getMRI(); 4755 const LLT S16 = LLT::scalar(16); 4756 const LLT S32 = LLT::scalar(32); 4757 4758 Register DstReg = MI.getOperand(0).getReg(); 4759 Register NodePtr = MI.getOperand(2).getReg(); 4760 Register RayExtent = MI.getOperand(3).getReg(); 4761 Register RayOrigin = MI.getOperand(4).getReg(); 4762 Register RayDir = MI.getOperand(5).getReg(); 4763 Register RayInvDir = MI.getOperand(6).getReg(); 4764 Register TDescr = MI.getOperand(7).getReg(); 4765 4766 if (!ST.hasGFX10_AEncoding()) { 4767 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 4768 "intrinsic not supported on subtarget", 4769 MI.getDebugLoc()); 4770 B.getMF().getFunction().getContext().diagnose(BadIntrin); 4771 return false; 4772 } 4773 4774 bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4775 bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4776 unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa 4777 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa 4778 : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa 4779 : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; 4780 4781 SmallVector<Register, 12> Ops; 4782 if (Is64) { 4783 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4784 Ops.push_back(Unmerge.getReg(0)); 4785 Ops.push_back(Unmerge.getReg(1)); 4786 } else { 4787 Ops.push_back(NodePtr); 4788 } 4789 Ops.push_back(RayExtent); 4790 4791 auto packLanes = [&Ops, &S32, &B] (Register Src) { 4792 auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); 4793 Ops.push_back(Unmerge.getReg(0)); 4794 Ops.push_back(Unmerge.getReg(1)); 4795 Ops.push_back(Unmerge.getReg(2)); 4796 }; 4797 4798 packLanes(RayOrigin); 4799 if (IsA16) { 4800 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); 4801 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); 4802 Register R1 = MRI.createGenericVirtualRegister(S32); 4803 Register R2 = MRI.createGenericVirtualRegister(S32); 4804 Register R3 = MRI.createGenericVirtualRegister(S32); 4805 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4806 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4807 B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4808 Ops.push_back(R1); 4809 Ops.push_back(R2); 4810 Ops.push_back(R3); 4811 } else { 4812 packLanes(RayDir); 4813 packLanes(RayInvDir); 4814 } 4815 4816 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4817 .addDef(DstReg) 4818 .addImm(Opcode); 4819 4820 for (Register R : Ops) { 4821 MIB.addUse(R); 4822 } 4823 4824 MIB.addUse(TDescr) 4825 .addImm(IsA16 ? 
1 : 0) 4826 .cloneMemRefs(MI); 4827 4828 MI.eraseFromParent(); 4829 return true; 4830 } 4831 4832 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 4833 MachineInstr &MI) const { 4834 MachineIRBuilder &B = Helper.MIRBuilder; 4835 MachineRegisterInfo &MRI = *B.getMRI(); 4836 4837 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 4838 auto IntrID = MI.getIntrinsicID(); 4839 switch (IntrID) { 4840 case Intrinsic::amdgcn_if: 4841 case Intrinsic::amdgcn_else: { 4842 MachineInstr *Br = nullptr; 4843 MachineBasicBlock *UncondBrTarget = nullptr; 4844 bool Negated = false; 4845 if (MachineInstr *BrCond = 4846 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4847 const SIRegisterInfo *TRI 4848 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4849 4850 Register Def = MI.getOperand(1).getReg(); 4851 Register Use = MI.getOperand(3).getReg(); 4852 4853 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4854 4855 if (Negated) 4856 std::swap(CondBrTarget, UncondBrTarget); 4857 4858 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4859 if (IntrID == Intrinsic::amdgcn_if) { 4860 B.buildInstr(AMDGPU::SI_IF) 4861 .addDef(Def) 4862 .addUse(Use) 4863 .addMBB(UncondBrTarget); 4864 } else { 4865 B.buildInstr(AMDGPU::SI_ELSE) 4866 .addDef(Def) 4867 .addUse(Use) 4868 .addMBB(UncondBrTarget); 4869 } 4870 4871 if (Br) { 4872 Br->getOperand(0).setMBB(CondBrTarget); 4873 } else { 4874 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 4875 // since we're swapping branch targets it needs to be reinserted. 4876 // FIXME: IRTranslator should probably not do this 4877 B.buildBr(*CondBrTarget); 4878 } 4879 4880 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 4881 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 4882 MI.eraseFromParent(); 4883 BrCond->eraseFromParent(); 4884 return true; 4885 } 4886 4887 return false; 4888 } 4889 case Intrinsic::amdgcn_loop: { 4890 MachineInstr *Br = nullptr; 4891 MachineBasicBlock *UncondBrTarget = nullptr; 4892 bool Negated = false; 4893 if (MachineInstr *BrCond = 4894 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 4895 const SIRegisterInfo *TRI 4896 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 4897 4898 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4899 Register Reg = MI.getOperand(2).getReg(); 4900 4901 if (Negated) 4902 std::swap(CondBrTarget, UncondBrTarget); 4903 4904 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4905 B.buildInstr(AMDGPU::SI_LOOP) 4906 .addUse(Reg) 4907 .addMBB(UncondBrTarget); 4908 4909 if (Br) 4910 Br->getOperand(0).setMBB(CondBrTarget); 4911 else 4912 B.buildBr(*CondBrTarget); 4913 4914 MI.eraseFromParent(); 4915 BrCond->eraseFromParent(); 4916 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 4917 return true; 4918 } 4919 4920 return false; 4921 } 4922 case Intrinsic::amdgcn_kernarg_segment_ptr: 4923 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 4924 // This only makes sense to call in a kernel, so just lower to null. 
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
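  // All of the raw_/struct_ buffer atomic variants below funnel into the same
  // helper; the intrinsic ID is forwarded so legalizeBufferAtomic can
  // distinguish the individual operations (swap, add/sub, min/max, and/or/xor,
  // inc/dec, cmpswap and the floating-point atomics) itself.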
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}