//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUMemoryUtils.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements up to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the number of bits up to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd-sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
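/// For example, <3 x s16> (48 bits) satisfies this predicate and is widened
/// to <4 x s16>, while <4 x s16> (even element count) and <3 x s32> (32-bit
/// elements) do not.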
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::pair(TypeIdx,
                     LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::pair(TypeIdx, LLT::scalarOrVector(
                                  ElementCount::getFixed(NewNumElts), EltTy));
  };
}

// Increase the number of vector elements so that the total size reaches the
// next multiple of 32 bits.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next legal RegClass.
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const unsigned NumElts = Ty.getNumElements();
    const unsigned EltSize = Ty.getElementType().getSizeInBits();
    const unsigned MaxNumElts = MaxRegisterSize / EltSize;

    assert(EltSize == 32 || EltSize == 64);
    assert(Ty.getSizeInBits() < MaxRegisterSize);

    unsigned NewNumElts;
    // Find the nearest legal RegClass that is larger than the current type.
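    // For example, if a width such as 416 bits (<13 x s32>) has no matching
    // SGPR class, the vector keeps growing until a width that has one is
    // reached.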
    for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
      if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
        break;
    }
    return std::pair(TypeIdx,
                     LLT::fixed_vector(NewNumElts, Ty.getElementType()));
  };
}

static LLT getBufferRsrcScalarType(const LLT Ty) {
  if (!Ty.isVector())
    return LLT::scalar(128);
  const ElementCount NumElems = Ty.getElementCount();
  return LLT::vector(NumElems, LLT::scalar(128));
}

static LLT getBufferRsrcRegisterType(const LLT Ty) {
  if (!Ty.isVector())
    return LLT::fixed_vector(4, LLT::scalar(32));
  const unsigned NumElems = Ty.getElementCount().getFixedValue();
  return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::pair(
        TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size) {
  return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&
         Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

// TODO: replace all uses of isRegisterType with isRegisterClassType
static bool isRegisterType(const GCNSubtarget &ST, LLT Ty) {
  if (!isRegisterSize(ST, Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
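// For example, s32, s64, v2s16 and v4s32 are register types here, while
// v3s16 (48 bits) and s48 are not, because their sizes are not multiples of
// 32 bits.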
static LegalityPredicate isRegisterType(const GCNSubtarget &ST,
                                        unsigned TypeIdx) {
  return [=, &ST](const LegalityQuery &Query) {
    return isRegisterType(ST, Query.Types[TypeIdx]);
  };
}

// RegisterType that doesn't have a corresponding RegClass.
// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
// should be removed.
static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST,
                                               unsigned TypeIdx) {
  return [=, &ST](const LegalityQuery &Query) {
    LLT Ty = Query.Types[TypeIdx];
    return isRegisterType(ST, Ty) &&
           !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

constexpr LLT S1 = LLT::scalar(1);
constexpr LLT S8 = LLT::scalar(8);
constexpr LLT S16 = LLT::scalar(16);
constexpr LLT S32 = LLT::scalar(32);
constexpr LLT F32 = LLT::float32();
constexpr LLT S64 = LLT::scalar(64);
constexpr LLT F64 = LLT::float64();
constexpr LLT S96 = LLT::scalar(96);
constexpr LLT S128 = LLT::scalar(128);
constexpr LLT S160 = LLT::scalar(160);
constexpr LLT S192 = LLT::scalar(192);
constexpr LLT S224 = LLT::scalar(224);
constexpr LLT S256 = LLT::scalar(256);
constexpr LLT S512 = LLT::scalar(512);
constexpr LLT S1024 = LLT::scalar(1024);
constexpr LLT MaxScalar = LLT::scalar(MaxRegisterSize);

constexpr LLT V2S8 = LLT::fixed_vector(2, 8);
constexpr LLT V2S16 = LLT::fixed_vector(2, 16);
constexpr LLT V4S16 = LLT::fixed_vector(4, 16);
constexpr LLT V6S16 = LLT::fixed_vector(6, 16);
constexpr LLT V8S16 = LLT::fixed_vector(8, 16);
constexpr LLT V10S16 = LLT::fixed_vector(10, 16);
constexpr LLT V12S16 = LLT::fixed_vector(12, 16);
constexpr LLT V16S16 = LLT::fixed_vector(16, 16);

constexpr LLT V2F16 = LLT::fixed_vector(2, LLT::float16());
constexpr LLT V2BF16 = V2F16; // FIXME

constexpr LLT V2S32 = LLT::fixed_vector(2, 32);
constexpr LLT V3S32 = LLT::fixed_vector(3, 32);
constexpr LLT V4S32 = LLT::fixed_vector(4, 32);
constexpr LLT V5S32 = LLT::fixed_vector(5, 32);
constexpr LLT V6S32 = LLT::fixed_vector(6, 32);
constexpr LLT V7S32 = LLT::fixed_vector(7, 32);
constexpr LLT V8S32 = LLT::fixed_vector(8, 32);
constexpr LLT V9S32 = LLT::fixed_vector(9, 32);
constexpr LLT V10S32 = LLT::fixed_vector(10, 32);
constexpr LLT V11S32 = LLT::fixed_vector(11, 32);
constexpr LLT V12S32 = LLT::fixed_vector(12, 32);
constexpr LLT V16S32 = LLT::fixed_vector(16, 32);
constexpr LLT V32S32 = LLT::fixed_vector(32, 32);

constexpr LLT V2S64 = LLT::fixed_vector(2, 64);
constexpr LLT V3S64 = LLT::fixed_vector(3, 64);
constexpr LLT V4S64 = LLT::fixed_vector(4, 64);
constexpr LLT V5S64 = LLT::fixed_vector(5, 64);
constexpr LLT V6S64 = LLT::fixed_vector(6, 64);
constexpr LLT V7S64 = LLT::fixed_vector(7, 64);
constexpr LLT V8S64 = LLT::fixed_vector(8, 64);
constexpr LLT V16S64 = LLT::fixed_vector(16, 64);

constexpr LLT V2S128 = LLT::fixed_vector(2, 128);
constexpr LLT V4S128 = LLT::fixed_vector(4, 128);

constexpr std::initializer_list<LLT> AllScalarTypes = {
    S32, S64, S96,
    S128, S160, S192, S224, S256, S512, S1024};

constexpr std::initializer_list<LLT> AllS16Vectors{
    V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};

constexpr std::initializer_list<LLT> AllS32Vectors = {
    V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
    V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};

constexpr std::initializer_list<LLT> AllS64Vectors = {
    V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

constexpr std::initializer_list<LLT> AllVectors{
    V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128,
    V4S128, V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
    V9S32, V10S32, V11S32, V12S32, V16S32, V32S32, V2S64, V3S64,
    V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

// Checks whether a type is in the list of legal register types.
static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty) {
  if (Ty.isPointerOrPointerVector())
    Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));

  return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) ||
         is_contained(AllScalarTypes, Ty) ||
         (ST.useRealTrue16Insts() && Ty == S16) ||
         is_contained(AllS16Vectors, Ty);
}

static LegalityPredicate isRegisterClassType(const GCNSubtarget &ST,
                                             unsigned TypeIdx) {
  return [&ST, TypeIdx](const LegalityQuery &Query) {
    return isRegisterClassType(ST, Query.Types[TypeIdx]);
  };
}

// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad, bool IsAtomic) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
  case AMDGPUAS::BUFFER_RESOURCE:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // FIXME: Flat addresses may contextually need to be split to 32-bit parts
    // if they may alias scratch depending on the subtarget. This needs to be
    // moved to custom handling to use addressMayBeAccessedAsPrivate
    return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ?
        128 : 32;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
                                    Query.MMODescrs[0].Ordering !=
                                        AtomicOrdering::NotAtomic))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The newer buffer intrinsic forms take their resource arguments as
// pointers in address space 8, aka s128 values. However, in order to not break
// SelectionDAG, the underlying operations have to continue to take v4i32
// arguments. Therefore, we convert resource pointers (or vectors of them)
// to integer values here.
static bool hasBufferRsrcWorkaround(const LLT Ty) {
  if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
    return true;
  if (Ty.isVector()) {
    const LLT ElemTy = Ty.getElementType();
    return hasBufferRsrcWorkaround(ElemTy);
  }
  return false;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Ty.isPointerVector())
    return true;
  if (Size <= 64)
    return false;
  // Address space 8 pointers get their own workaround.
  if (hasBufferRsrcWorkaround(Ty))
    return false;
  if (!Ty.isVector())
    return true;

  unsigned EltSize = Ty.getScalarSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(ST, Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(ST, Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(ST, Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd-sized memory
/// access up to the alignment. Note this is the case when the memory access
/// itself changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            uint64_t AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, if we don't
  // have 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  unsigned Fast = 0;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
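/// For a plain p8 operand the new type is <4 x s32>; for an <N x p8> operand
/// it is <4*N x s32> (see getBufferRsrcRegisterType).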
597 static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, 598 MachineRegisterInfo &MRI, unsigned Idx) { 599 MachineOperand &MO = MI.getOperand(Idx); 600 601 const LLT PointerTy = MRI.getType(MO.getReg()); 602 603 // Paranoidly prevent us from doing this multiple times. 604 if (!hasBufferRsrcWorkaround(PointerTy)) 605 return PointerTy; 606 607 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy); 608 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy); 609 if (!PointerTy.isVector()) { 610 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8) 611 const unsigned NumParts = PointerTy.getSizeInBits() / 32; 612 const LLT S32 = LLT::scalar(32); 613 614 Register VectorReg = MRI.createGenericVirtualRegister(VectorTy); 615 std::array<Register, 4> VectorElems; 616 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 617 for (unsigned I = 0; I < NumParts; ++I) 618 VectorElems[I] = 619 B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0); 620 B.buildMergeValues(MO, VectorElems); 621 MO.setReg(VectorReg); 622 return VectorTy; 623 } 624 Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy); 625 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 626 auto Scalar = B.buildBitcast(ScalarTy, BitcastReg); 627 B.buildIntToPtr(MO, Scalar); 628 MO.setReg(BitcastReg); 629 630 return VectorTy; 631 } 632 633 /// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is 634 /// the form in which the value must be in order to be passed to the low-level 635 /// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is 636 /// needed in order to account for the fact that we can't define a register 637 /// class for s128 without breaking SelectionDAG. 638 static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) { 639 MachineRegisterInfo &MRI = *B.getMRI(); 640 const LLT PointerTy = MRI.getType(Pointer); 641 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy); 642 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy); 643 644 if (!PointerTy.isVector()) { 645 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32) 646 SmallVector<Register, 4> PointerParts; 647 const unsigned NumParts = PointerTy.getSizeInBits() / 32; 648 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer); 649 for (unsigned I = 0; I < NumParts; ++I) 650 PointerParts.push_back(Unmerged.getReg(I)); 651 return B.buildBuildVector(VectorTy, PointerParts).getReg(0); 652 } 653 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0); 654 return B.buildBitcast(VectorTy, Scalar).getReg(0); 655 } 656 657 static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, 658 unsigned Idx) { 659 MachineOperand &MO = MI.getOperand(Idx); 660 661 const LLT PointerTy = B.getMRI()->getType(MO.getReg()); 662 // Paranoidly prevent us from doing this multiple times. 
663 if (!hasBufferRsrcWorkaround(PointerTy)) 664 return; 665 MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B)); 666 } 667 668 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 669 const GCNTargetMachine &TM) 670 : ST(ST_) { 671 using namespace TargetOpcode; 672 673 auto GetAddrSpacePtr = [&TM](unsigned AS) { 674 return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 675 }; 676 677 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 678 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 679 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 680 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 681 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 682 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 683 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 684 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER); 685 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE); 686 const LLT BufferStridedPtr = 687 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER); 688 689 const LLT CodePtr = FlatPtr; 690 691 const std::initializer_list<LLT> AddrSpaces64 = { 692 GlobalPtr, ConstantPtr, FlatPtr 693 }; 694 695 const std::initializer_list<LLT> AddrSpaces32 = { 696 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 697 }; 698 699 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr}; 700 701 const std::initializer_list<LLT> FPTypesBase = { 702 S32, S64 703 }; 704 705 const std::initializer_list<LLT> FPTypes16 = { 706 S32, S64, S16 707 }; 708 709 const std::initializer_list<LLT> FPTypesPK16 = { 710 S32, S64, S16, V2S16 711 }; 712 713 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 714 715 // s1 for VCC branches, s32 for SCC branches. 716 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32}); 717 718 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 719 // elements for v3s16 720 getActionDefinitionsBuilder(G_PHI) 721 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 722 .legalFor(AllS32Vectors) 723 .legalFor(AllS64Vectors) 724 .legalFor(AddrSpaces64) 725 .legalFor(AddrSpaces32) 726 .legalFor(AddrSpaces128) 727 .legalIf(isPointer(0)) 728 .clampScalar(0, S16, S256) 729 .widenScalarToNextPow2(0, 32) 730 .clampMaxNumElements(0, S32, 16) 731 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 732 .scalarize(0); 733 734 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 735 // Full set of gfx9 features. 
736 if (ST.hasScalarAddSub64()) { 737 getActionDefinitionsBuilder({G_ADD, G_SUB}) 738 .legalFor({S64, S32, S16, V2S16}) 739 .clampMaxNumElementsStrict(0, S16, 2) 740 .scalarize(0) 741 .minScalar(0, S16) 742 .widenScalarToNextMultipleOf(0, 32) 743 .maxScalar(0, S32); 744 } else { 745 getActionDefinitionsBuilder({G_ADD, G_SUB}) 746 .legalFor({S32, S16, V2S16}) 747 .clampMaxNumElementsStrict(0, S16, 2) 748 .scalarize(0) 749 .minScalar(0, S16) 750 .widenScalarToNextMultipleOf(0, 32) 751 .maxScalar(0, S32); 752 } 753 754 if (ST.hasScalarSMulU64()) { 755 getActionDefinitionsBuilder(G_MUL) 756 .legalFor({S64, S32, S16, V2S16}) 757 .clampMaxNumElementsStrict(0, S16, 2) 758 .scalarize(0) 759 .minScalar(0, S16) 760 .widenScalarToNextMultipleOf(0, 32) 761 .custom(); 762 } else { 763 getActionDefinitionsBuilder(G_MUL) 764 .legalFor({S32, S16, V2S16}) 765 .clampMaxNumElementsStrict(0, S16, 2) 766 .scalarize(0) 767 .minScalar(0, S16) 768 .widenScalarToNextMultipleOf(0, 32) 769 .custom(); 770 } 771 assert(ST.hasMad64_32()); 772 773 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 774 .legalFor({S32, S16, V2S16}) // Clamp modifier 775 .minScalarOrElt(0, S16) 776 .clampMaxNumElementsStrict(0, S16, 2) 777 .scalarize(0) 778 .widenScalarToNextPow2(0, 32) 779 .lower(); 780 } else if (ST.has16BitInsts()) { 781 getActionDefinitionsBuilder({G_ADD, G_SUB}) 782 .legalFor({S32, S16}) 783 .minScalar(0, S16) 784 .widenScalarToNextMultipleOf(0, 32) 785 .maxScalar(0, S32) 786 .scalarize(0); 787 788 getActionDefinitionsBuilder(G_MUL) 789 .legalFor({S32, S16}) 790 .scalarize(0) 791 .minScalar(0, S16) 792 .widenScalarToNextMultipleOf(0, 32) 793 .custom(); 794 assert(ST.hasMad64_32()); 795 796 // Technically the saturating operations require clamp bit support, but this 797 // was introduced at the same time as 16-bit operations. 798 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 799 .legalFor({S32, S16}) // Clamp modifier 800 .minScalar(0, S16) 801 .scalarize(0) 802 .widenScalarToNextPow2(0, 16) 803 .lower(); 804 805 // We're just lowering this, but it helps get a better result to try to 806 // coerce to the desired type first. 807 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 808 .minScalar(0, S16) 809 .scalarize(0) 810 .lower(); 811 } else { 812 getActionDefinitionsBuilder({G_ADD, G_SUB}) 813 .legalFor({S32}) 814 .widenScalarToNextMultipleOf(0, 32) 815 .clampScalar(0, S32, S32) 816 .scalarize(0); 817 818 auto &Mul = getActionDefinitionsBuilder(G_MUL) 819 .legalFor({S32}) 820 .scalarize(0) 821 .minScalar(0, S32) 822 .widenScalarToNextMultipleOf(0, 32); 823 824 if (ST.hasMad64_32()) 825 Mul.custom(); 826 else 827 Mul.maxScalar(0, S32); 828 829 if (ST.hasIntClamp()) { 830 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 831 .legalFor({S32}) // Clamp modifier. 832 .scalarize(0) 833 .minScalarOrElt(0, S32) 834 .lower(); 835 } else { 836 // Clamp bit support was added in VI, along with 16-bit operations. 837 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 838 .minScalar(0, S32) 839 .scalarize(0) 840 .lower(); 841 } 842 843 // FIXME: DAG expansion gets better results. The widening uses the smaller 844 // range values and goes for the min/max lowering directly. 
845 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 846 .minScalar(0, S32) 847 .scalarize(0) 848 .lower(); 849 } 850 851 getActionDefinitionsBuilder( 852 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) 853 .customFor({S32, S64}) 854 .clampScalar(0, S32, S64) 855 .widenScalarToNextPow2(0, 32) 856 .scalarize(0); 857 858 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 859 .legalFor({S32}) 860 .maxScalar(0, S32); 861 862 if (ST.hasVOP3PInsts()) { 863 Mulh 864 .clampMaxNumElements(0, S8, 2) 865 .lowerFor({V2S8}); 866 } 867 868 Mulh 869 .scalarize(0) 870 .lower(); 871 872 // Report legal for any types we can handle anywhere. For the cases only legal 873 // on the SALU, RegBankSelect will be able to re-legalize. 874 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 875 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 876 .clampScalar(0, S32, S64) 877 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 878 .fewerElementsIf( 879 all(vectorWiderThan(0, 64), scalarOrEltNarrowerThan(0, 64)), 880 fewerEltsToSize64Vector(0)) 881 .widenScalarToNextPow2(0) 882 .scalarize(0); 883 884 getActionDefinitionsBuilder( 885 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 886 .legalFor({{S32, S1}, {S32, S32}}) 887 .clampScalar(0, S32, S32) 888 .scalarize(0); 889 890 getActionDefinitionsBuilder(G_BITCAST) 891 // Don't worry about the size constraint. 892 .legalIf(all(isRegisterClassType(ST, 0), isRegisterClassType(ST, 1))) 893 .lower(); 894 895 getActionDefinitionsBuilder(G_CONSTANT) 896 .legalFor({S1, S32, S64, S16, GlobalPtr, 897 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 898 .legalIf(isPointer(0)) 899 .clampScalar(0, S32, S64) 900 .widenScalarToNextPow2(0); 901 902 getActionDefinitionsBuilder(G_FCONSTANT) 903 .legalFor({S32, S64, S16}) 904 .clampScalar(0, S16, S64); 905 906 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 907 .legalIf(isRegisterClassType(ST, 0)) 908 // s1 and s16 are special cases because they have legal operations on 909 // them, but don't really occupy registers in the normal way. 910 .legalFor({S1, S16}) 911 .clampNumElements(0, V16S32, V32S32) 912 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 913 .clampScalarOrElt(0, S32, MaxScalar) 914 .widenScalarToNextPow2(0, 32) 915 .clampMaxNumElements(0, S32, 16); 916 917 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr}); 918 919 // If the amount is divergent, we have to do a wave reduction to get the 920 // maximum value, so this is expanded during RegBankSelect. 
921 getActionDefinitionsBuilder(G_DYN_STACKALLOC) 922 .legalFor({{PrivatePtr, S32}}); 923 924 getActionDefinitionsBuilder(G_STACKSAVE) 925 .customFor({PrivatePtr}); 926 getActionDefinitionsBuilder(G_STACKRESTORE) 927 .legalFor({PrivatePtr}); 928 929 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64}); 930 931 getActionDefinitionsBuilder(G_GLOBAL_VALUE) 932 .customIf(typeIsNot(0, PrivatePtr)); 933 934 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr}); 935 936 auto &FPOpActions = getActionDefinitionsBuilder( 937 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE, 938 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA}) 939 .legalFor({S32, S64}); 940 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 941 .customFor({S32, S64}); 942 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 943 .customFor({S32, S64}); 944 945 if (ST.has16BitInsts()) { 946 if (ST.hasVOP3PInsts()) 947 FPOpActions.legalFor({S16, V2S16}); 948 else 949 FPOpActions.legalFor({S16}); 950 951 TrigActions.customFor({S16}); 952 FDIVActions.customFor({S16}); 953 } 954 955 if (ST.hasPackedFP32Ops()) { 956 FPOpActions.legalFor({V2S32}); 957 FPOpActions.clampMaxNumElementsStrict(0, S32, 2); 958 } 959 960 auto &MinNumMaxNum = getActionDefinitionsBuilder( 961 {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM, G_FMINNUM_IEEE, 962 G_FMAXNUM_IEEE}); 963 964 if (ST.hasVOP3PInsts()) { 965 MinNumMaxNum.customFor(FPTypesPK16) 966 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 967 .clampMaxNumElements(0, S16, 2) 968 .clampScalar(0, S16, S64) 969 .scalarize(0); 970 } else if (ST.has16BitInsts()) { 971 MinNumMaxNum.customFor(FPTypes16) 972 .clampScalar(0, S16, S64) 973 .scalarize(0); 974 } else { 975 MinNumMaxNum.customFor(FPTypesBase) 976 .clampScalar(0, S32, S64) 977 .scalarize(0); 978 } 979 980 if (ST.hasVOP3PInsts()) 981 FPOpActions.clampMaxNumElementsStrict(0, S16, 2); 982 983 FPOpActions 984 .scalarize(0) 985 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 986 987 TrigActions 988 .scalarize(0) 989 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 990 991 FDIVActions 992 .scalarize(0) 993 .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 994 995 getActionDefinitionsBuilder({G_FNEG, G_FABS}) 996 .legalFor(FPTypesPK16) 997 .clampMaxNumElementsStrict(0, S16, 2) 998 .scalarize(0) 999 .clampScalar(0, S16, S64); 1000 1001 if (ST.has16BitInsts()) { 1002 getActionDefinitionsBuilder(G_FSQRT) 1003 .legalFor({S16}) 1004 .customFor({S32, S64}) 1005 .scalarize(0) 1006 .unsupported(); 1007 getActionDefinitionsBuilder(G_FFLOOR) 1008 .legalFor({S32, S64, S16}) 1009 .scalarize(0) 1010 .clampScalar(0, S16, S64); 1011 1012 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP}) 1013 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}}) 1014 .scalarize(0) 1015 .maxScalarIf(typeIs(0, S16), 1, S16) 1016 .clampScalar(1, S32, S32) 1017 .lower(); 1018 1019 getActionDefinitionsBuilder(G_FFREXP) 1020 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}}) 1021 .scalarize(0) 1022 .lower(); 1023 } else { 1024 getActionDefinitionsBuilder(G_FSQRT) 1025 .customFor({S32, S64, S16}) 1026 .scalarize(0) 1027 .unsupported(); 1028 1029 1030 if (ST.hasFractBug()) { 1031 getActionDefinitionsBuilder(G_FFLOOR) 1032 .customFor({S64}) 1033 .legalFor({S32, S64}) 1034 .scalarize(0) 1035 .clampScalar(0, S32, S64); 1036 } else { 1037 getActionDefinitionsBuilder(G_FFLOOR) 1038 .legalFor({S32, S64}) 1039 .scalarize(0) 1040 .clampScalar(0, S32, S64); 1041 } 1042 1043 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP}) 1044 .legalFor({{S32, S32}, {S64, S32}}) 1045 .scalarize(0) 1046 .clampScalar(0, S32, S64) 1047 .clampScalar(1, S32, S32) 1048 .lower(); 1049 1050 getActionDefinitionsBuilder(G_FFREXP) 1051 .customFor({{S32, S32}, {S64, S32}}) 1052 .scalarize(0) 1053 .minScalar(0, S32) 1054 .clampScalar(1, S32, S32) 1055 .lower(); 1056 } 1057 1058 auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC); 1059 if (ST.hasCvtPkF16F32Inst()) { 1060 FPTruncActions.legalFor({{S32, S64}, {S16, S32}, {V2S16, V2S32}}) 1061 .clampMaxNumElements(0, S16, 2); 1062 } else { 1063 FPTruncActions.legalFor({{S32, S64}, {S16, S32}}); 1064 } 1065 FPTruncActions.scalarize(0).lower(); 1066 1067 getActionDefinitionsBuilder(G_FPEXT) 1068 .legalFor({{S64, S32}, {S32, S16}}) 1069 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 1070 .scalarize(0); 1071 1072 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB}); 1073 if (ST.has16BitInsts()) { 1074 FSubActions 1075 // Use actual fsub instruction 1076 .legalFor({S32, S16}) 1077 // Must use fadd + fneg 1078 .lowerFor({S64, V2S16}); 1079 } else { 1080 FSubActions 1081 // Use actual fsub instruction 1082 .legalFor({S32}) 1083 // Must use fadd + fneg 1084 .lowerFor({S64, S16, V2S16}); 1085 } 1086 1087 FSubActions 1088 .scalarize(0) 1089 .clampScalar(0, S32, S64); 1090 1091 // Whether this is legal depends on the floating point mode for the function. 1092 auto &FMad = getActionDefinitionsBuilder(G_FMAD); 1093 if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 1094 FMad.customFor({S32, S16}); 1095 else if (ST.hasMadMacF32Insts()) 1096 FMad.customFor({S32}); 1097 else if (ST.hasMadF16()) 1098 FMad.customFor({S16}); 1099 FMad.scalarize(0) 1100 .lower(); 1101 1102 auto &FRem = getActionDefinitionsBuilder(G_FREM); 1103 if (ST.has16BitInsts()) { 1104 FRem.customFor({S16, S32, S64}); 1105 } else { 1106 FRem.minScalar(0, S32) 1107 .customFor({S32, S64}); 1108 } 1109 FRem.scalarize(0); 1110 1111 // TODO: Do we need to clamp maximum bitwidth? 
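  // As set up below, G_TRUNC to any scalar destination is legal, and
  // <2 x s32> -> <2 x s16> is the only vector form listed as legal directly;
  // other vectors are scalarized when their element type is legal, and the
  // trailing alwaysLegal() accepts whatever remains.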
1112 getActionDefinitionsBuilder(G_TRUNC) 1113 .legalIf(isScalar(0)) 1114 .legalFor({{V2S16, V2S32}}) 1115 .clampMaxNumElements(0, S16, 2) 1116 // Avoid scalarizing in cases that should be truly illegal. In unresolvable 1117 // situations (like an invalid implicit use), we don't want to infinite loop 1118 // in the legalizer. 1119 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 1120 .alwaysLegal(); 1121 1122 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 1123 .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 1124 {S32, S1}, {S64, S1}, {S16, S1}}) 1125 .scalarize(0) 1126 .clampScalar(0, S32, S64) 1127 .widenScalarToNextPow2(1, 32); 1128 1129 // TODO: Split s1->s64 during regbankselect for VALU. 1130 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 1131 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 1132 .lowerIf(typeIs(1, S1)) 1133 .customFor({{S32, S64}, {S64, S64}}); 1134 if (ST.has16BitInsts()) 1135 IToFP.legalFor({{S16, S16}}); 1136 IToFP.clampScalar(1, S32, S64) 1137 .minScalar(0, S32) 1138 .scalarize(0) 1139 .widenScalarToNextPow2(1); 1140 1141 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 1142 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 1143 .customFor({{S64, S32}, {S64, S64}}) 1144 .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 1145 if (ST.has16BitInsts()) 1146 FPToI.legalFor({{S16, S16}}); 1147 else 1148 FPToI.minScalar(1, S32); 1149 1150 FPToI.minScalar(0, S32) 1151 .widenScalarToNextPow2(0, 32) 1152 .scalarize(0) 1153 .lower(); 1154 1155 getActionDefinitionsBuilder({G_LROUND, G_LLROUND}) 1156 .clampScalar(0, S16, S64) 1157 .scalarize(0) 1158 .lower(); 1159 1160 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND) 1161 .legalFor({S16, S32}) 1162 .scalarize(0) 1163 .lower(); 1164 1165 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN 1166 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT}) 1167 .scalarize(0) 1168 .lower(); 1169 1170 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT}) 1171 .clampScalar(0, S16, S64) 1172 .scalarize(0) 1173 .lower(); 1174 1175 if (ST.has16BitInsts()) { 1176 getActionDefinitionsBuilder( 1177 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN}) 1178 .legalFor({S16, S32, S64}) 1179 .clampScalar(0, S16, S64) 1180 .scalarize(0); 1181 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 1182 getActionDefinitionsBuilder( 1183 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN}) 1184 .legalFor({S32, S64}) 1185 .clampScalar(0, S32, S64) 1186 .scalarize(0); 1187 } else { 1188 getActionDefinitionsBuilder( 1189 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN}) 1190 .legalFor({S32}) 1191 .customFor({S64}) 1192 .clampScalar(0, S32, S64) 1193 .scalarize(0); 1194 } 1195 1196 getActionDefinitionsBuilder(G_PTR_ADD) 1197 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr}) 1198 .legalIf(all(isPointer(0), sameSize(0, 1))) 1199 .scalarize(0) 1200 .scalarSameSizeAs(1, 0); 1201 1202 getActionDefinitionsBuilder(G_PTRMASK) 1203 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 1204 .scalarSameSizeAs(1, 0) 1205 .scalarize(0); 1206 1207 auto &CmpBuilder = 1208 getActionDefinitionsBuilder(G_ICMP) 1209 // The compare output type differs based on the register bank of the output, 1210 // so make both s1 and s32 legal. 1211 // 1212 // Scalar compares producing output in scc will be promoted to s32, as that 1213 // is the allocatable register type that will be needed for the copy from 1214 // scc. 
This will be promoted during RegBankSelect, and we assume something 1215 // before that won't try to use s32 result types. 1216 // 1217 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 1218 // bank. 1219 .legalForCartesianProduct( 1220 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 1221 .legalForCartesianProduct( 1222 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 1223 if (ST.has16BitInsts()) { 1224 CmpBuilder.legalFor({{S1, S16}}); 1225 } 1226 1227 CmpBuilder 1228 .widenScalarToNextPow2(1) 1229 .clampScalar(1, S32, S64) 1230 .scalarize(0) 1231 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 1232 1233 auto &FCmpBuilder = 1234 getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct( 1235 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase); 1236 1237 if (ST.hasSALUFloatInsts()) 1238 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32}); 1239 1240 FCmpBuilder 1241 .widenScalarToNextPow2(1) 1242 .clampScalar(1, S32, S64) 1243 .scalarize(0); 1244 1245 // FIXME: fpow has a selection pattern that should move to custom lowering. 1246 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW); 1247 if (ST.has16BitInsts()) 1248 ExpOps.customFor({{S32}, {S16}}); 1249 else 1250 ExpOps.customFor({S32}); 1251 ExpOps.clampScalar(0, MinScalarFPTy, S32) 1252 .scalarize(0); 1253 1254 getActionDefinitionsBuilder(G_FPOWI) 1255 .clampScalar(0, MinScalarFPTy, S32) 1256 .lower(); 1257 1258 auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2}); 1259 Log2Ops.customFor({S32}); 1260 if (ST.has16BitInsts()) 1261 Log2Ops.legalFor({S16}); 1262 else 1263 Log2Ops.customFor({S16}); 1264 Log2Ops.scalarize(0) 1265 .lower(); 1266 1267 auto &LogOps = 1268 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10}); 1269 LogOps.customFor({S32, S16}); 1270 LogOps.clampScalar(0, MinScalarFPTy, S32) 1271 .scalarize(0); 1272 1273 // The 64-bit versions produce 32-bit results, but only on the SALU. 1274 getActionDefinitionsBuilder(G_CTPOP) 1275 .legalFor({{S32, S32}, {S32, S64}}) 1276 .clampScalar(0, S32, S32) 1277 .widenScalarToNextPow2(1, 32) 1278 .clampScalar(1, S32, S64) 1279 .scalarize(0) 1280 .widenScalarToNextPow2(0, 32); 1281 1282 // If no 16 bit instr is available, lower into different instructions. 1283 if (ST.has16BitInsts()) 1284 getActionDefinitionsBuilder(G_IS_FPCLASS) 1285 .legalForCartesianProduct({S1}, FPTypes16) 1286 .widenScalarToNextPow2(1) 1287 .scalarize(0) 1288 .lower(); 1289 else 1290 getActionDefinitionsBuilder(G_IS_FPCLASS) 1291 .legalForCartesianProduct({S1}, FPTypesBase) 1292 .lowerFor({S1, S16}) 1293 .widenScalarToNextPow2(1) 1294 .scalarize(0) 1295 .lower(); 1296 1297 // The hardware instructions return a different result on 0 than the generic 1298 // instructions expect. The hardware produces -1, but these produce the 1299 // bitwidth. 1300 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 1301 .scalarize(0) 1302 .clampScalar(0, S32, S32) 1303 .clampScalar(1, S32, S64) 1304 .widenScalarToNextPow2(0, 32) 1305 .widenScalarToNextPow2(1, 32) 1306 .custom(); 1307 1308 // The 64-bit versions produce 32-bit results, but only on the SALU. 
1309 getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF) 1310 .legalFor({{S32, S32}, {S32, S64}}) 1311 .customIf(scalarNarrowerThan(1, 32)) 1312 .clampScalar(0, S32, S32) 1313 .clampScalar(1, S32, S64) 1314 .scalarize(0) 1315 .widenScalarToNextPow2(0, 32) 1316 .widenScalarToNextPow2(1, 32); 1317 1318 getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF) 1319 .legalFor({{S32, S32}, {S32, S64}}) 1320 .clampScalar(0, S32, S32) 1321 .clampScalar(1, S32, S64) 1322 .scalarize(0) 1323 .widenScalarToNextPow2(0, 32) 1324 .widenScalarToNextPow2(1, 32); 1325 1326 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1327 // RegBankSelect. 1328 getActionDefinitionsBuilder(G_BITREVERSE) 1329 .legalFor({S32, S64}) 1330 .clampScalar(0, S32, S64) 1331 .scalarize(0) 1332 .widenScalarToNextPow2(0); 1333 1334 if (ST.has16BitInsts()) { 1335 getActionDefinitionsBuilder(G_BSWAP) 1336 .legalFor({S16, S32, V2S16}) 1337 .clampMaxNumElementsStrict(0, S16, 2) 1338 // FIXME: Fixing non-power-of-2 before clamp is workaround for 1339 // narrowScalar limitation. 1340 .widenScalarToNextPow2(0) 1341 .clampScalar(0, S16, S32) 1342 .scalarize(0); 1343 1344 if (ST.hasVOP3PInsts()) { 1345 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1346 .legalFor({S32, S16, V2S16}) 1347 .clampMaxNumElements(0, S16, 2) 1348 .minScalar(0, S16) 1349 .widenScalarToNextPow2(0) 1350 .scalarize(0) 1351 .lower(); 1352 } else { 1353 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1354 .legalFor({S32, S16}) 1355 .widenScalarToNextPow2(0) 1356 .minScalar(0, S16) 1357 .scalarize(0) 1358 .lower(); 1359 } 1360 } else { 1361 // TODO: Should have same legality without v_perm_b32 1362 getActionDefinitionsBuilder(G_BSWAP) 1363 .legalFor({S32}) 1364 .lowerIf(scalarNarrowerThan(0, 32)) 1365 // FIXME: Fixing non-power-of-2 before clamp is workaround for 1366 // narrowScalar limitation. 
1367 .widenScalarToNextPow2(0) 1368 .maxScalar(0, S32) 1369 .scalarize(0) 1370 .lower(); 1371 1372 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 1373 .legalFor({S32}) 1374 .minScalar(0, S32) 1375 .widenScalarToNextPow2(0) 1376 .scalarize(0) 1377 .lower(); 1378 } 1379 1380 getActionDefinitionsBuilder(G_INTTOPTR) 1381 // List the common cases 1382 .legalForCartesianProduct(AddrSpaces64, {S64}) 1383 .legalForCartesianProduct(AddrSpaces32, {S32}) 1384 .scalarize(0) 1385 // Accept any address space as long as the size matches 1386 .legalIf(sameSize(0, 1)) 1387 .widenScalarIf(smallerThan(1, 0), 1388 [](const LegalityQuery &Query) { 1389 return std::pair( 1390 1, LLT::scalar(Query.Types[0].getSizeInBits())); 1391 }) 1392 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) { 1393 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 1394 }); 1395 1396 getActionDefinitionsBuilder(G_PTRTOINT) 1397 // List the common cases 1398 .legalForCartesianProduct(AddrSpaces64, {S64}) 1399 .legalForCartesianProduct(AddrSpaces32, {S32}) 1400 .scalarize(0) 1401 // Accept any address space as long as the size matches 1402 .legalIf(sameSize(0, 1)) 1403 .widenScalarIf(smallerThan(0, 1), 1404 [](const LegalityQuery &Query) { 1405 return std::pair( 1406 0, LLT::scalar(Query.Types[1].getSizeInBits())); 1407 }) 1408 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) { 1409 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 1410 }); 1411 1412 getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 1413 .scalarize(0) 1414 .custom(); 1415 1416 const auto needToSplitMemOp = [=](const LegalityQuery &Query, 1417 bool IsLoad) -> bool { 1418 const LLT DstTy = Query.Types[0]; 1419 1420 // Split vector extloads. 1421 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1422 1423 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 1424 return true; 1425 1426 const LLT PtrTy = Query.Types[1]; 1427 unsigned AS = PtrTy.getAddressSpace(); 1428 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad, 1429 Query.MMODescrs[0].Ordering != 1430 AtomicOrdering::NotAtomic)) 1431 return true; 1432 1433 // Catch weird sized loads that don't evenly divide into the access sizes 1434 // TODO: May be able to widen depending on alignment etc. 1435 unsigned NumRegs = (MemSize + 31) / 32; 1436 if (NumRegs == 3) { 1437 if (!ST.hasDwordx3LoadStores()) 1438 return true; 1439 } else { 1440 // If the alignment allows, these should have been widened. 1441 if (!isPowerOf2_32(NumRegs)) 1442 return true; 1443 } 1444 1445 return false; 1446 }; 1447 1448 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; 1449 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1450 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 1451 1452 // TODO: Refine based on subtargets which support unaligned access or 128-bit 1453 // LDS 1454 // TODO: Unsupported flat for SI. 1455 1456 for (unsigned Op : {G_LOAD, G_STORE}) { 1457 const bool IsStore = Op == G_STORE; 1458 1459 auto &Actions = getActionDefinitionsBuilder(Op); 1460 // Explicitly list some common cases. 1461 // TODO: Does this help compile time at all? 
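    // Each entry below is {register type, pointer type, memory type,
    // minimum alignment in bits}.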
1462 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32}, 1463 {V2S32, GlobalPtr, V2S32, GlobalAlign32}, 1464 {V4S32, GlobalPtr, V4S32, GlobalAlign32}, 1465 {S64, GlobalPtr, S64, GlobalAlign32}, 1466 {V2S64, GlobalPtr, V2S64, GlobalAlign32}, 1467 {V2S16, GlobalPtr, V2S16, GlobalAlign32}, 1468 {S32, GlobalPtr, S8, GlobalAlign8}, 1469 {S32, GlobalPtr, S16, GlobalAlign16}, 1470 1471 {S32, LocalPtr, S32, 32}, 1472 {S64, LocalPtr, S64, 32}, 1473 {V2S32, LocalPtr, V2S32, 32}, 1474 {S32, LocalPtr, S8, 8}, 1475 {S32, LocalPtr, S16, 16}, 1476 {V2S16, LocalPtr, S32, 32}, 1477 1478 {S32, PrivatePtr, S32, 32}, 1479 {S32, PrivatePtr, S8, 8}, 1480 {S32, PrivatePtr, S16, 16}, 1481 {V2S16, PrivatePtr, S32, 32}, 1482 1483 {S32, ConstantPtr, S32, GlobalAlign32}, 1484 {V2S32, ConstantPtr, V2S32, GlobalAlign32}, 1485 {V4S32, ConstantPtr, V4S32, GlobalAlign32}, 1486 {S64, ConstantPtr, S64, GlobalAlign32}, 1487 {V2S32, ConstantPtr, V2S32, GlobalAlign32}}); 1488 Actions.legalIf( 1489 [=](const LegalityQuery &Query) -> bool { 1490 return isLoadStoreLegal(ST, Query); 1491 }); 1492 1493 // The custom pointers (fat pointers, buffer resources) don't work with load 1494 // and store at this level. Fat pointers should have been lowered to 1495 // intrinsics before the translation to MIR. 1496 Actions.unsupportedIf( 1497 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr})); 1498 1499 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and 1500 // ptrtoint. This is needed to account for the fact that we can't have i128 1501 // as a register class for SelectionDAG reasons. 1502 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1503 return hasBufferRsrcWorkaround(Query.Types[0]); 1504 }); 1505 1506 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1507 // 64-bits. 1508 // 1509 // TODO: Should generalize bitcast action into coerce, which will also cover 1510 // inserting addrspacecasts. 1511 Actions.customIf(typeIs(1, Constant32Ptr)); 1512 1513 // Turn any illegal element vectors into something easier to deal 1514 // with. These will ultimately produce 32-bit scalar shifts to extract the 1515 // parts anyway. 1516 // 1517 // For odd 16-bit element vectors, prefer to split those into pieces with 1518 // 16-bit vector parts. 1519 Actions.bitcastIf( 1520 [=](const LegalityQuery &Query) -> bool { 1521 return shouldBitcastLoadStoreType(ST, Query.Types[0], 1522 Query.MMODescrs[0].MemoryTy); 1523 }, bitcastToRegisterType(0)); 1524 1525 if (!IsStore) { 1526 // Widen suitably aligned loads by loading extra bytes. The standard 1527 // legalization actions can't properly express widening memory operands. 1528 Actions.customIf([=](const LegalityQuery &Query) -> bool { 1529 return shouldWidenLoad(ST, Query, G_LOAD); 1530 }); 1531 } 1532 1533 // FIXME: load/store narrowing should be moved to lower action 1534 Actions 1535 .narrowScalarIf( 1536 [=](const LegalityQuery &Query) -> bool { 1537 return !Query.Types[0].isVector() && 1538 needToSplitMemOp(Query, Op == G_LOAD); 1539 }, 1540 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1541 const LLT DstTy = Query.Types[0]; 1542 const LLT PtrTy = Query.Types[1]; 1543 1544 const unsigned DstSize = DstTy.getSizeInBits(); 1545 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1546 1547 // Split extloads. 
1548 if (DstSize > MemSize) 1549 return std::pair(0, LLT::scalar(MemSize)); 1550 1551 unsigned MaxSize = maxSizeForAddrSpace( 1552 ST, PtrTy.getAddressSpace(), Op == G_LOAD, 1553 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic); 1554 if (MemSize > MaxSize) 1555 return std::pair(0, LLT::scalar(MaxSize)); 1556 1557 uint64_t Align = Query.MMODescrs[0].AlignInBits; 1558 return std::pair(0, LLT::scalar(Align)); 1559 }) 1560 .fewerElementsIf( 1561 [=](const LegalityQuery &Query) -> bool { 1562 return Query.Types[0].isVector() && 1563 needToSplitMemOp(Query, Op == G_LOAD); 1564 }, 1565 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 1566 const LLT DstTy = Query.Types[0]; 1567 const LLT PtrTy = Query.Types[1]; 1568 1569 LLT EltTy = DstTy.getElementType(); 1570 unsigned MaxSize = maxSizeForAddrSpace( 1571 ST, PtrTy.getAddressSpace(), Op == G_LOAD, 1572 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic); 1573 1574 // FIXME: Handle widened to power of 2 results better. This ends 1575 // up scalarizing. 1576 // FIXME: 3 element stores scalarized on SI 1577 1578 // Split if it's too large for the address space. 1579 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1580 if (MemSize > MaxSize) { 1581 unsigned NumElts = DstTy.getNumElements(); 1582 unsigned EltSize = EltTy.getSizeInBits(); 1583 1584 if (MaxSize % EltSize == 0) { 1585 return std::pair( 1586 0, LLT::scalarOrVector( 1587 ElementCount::getFixed(MaxSize / EltSize), EltTy)); 1588 } 1589 1590 unsigned NumPieces = MemSize / MaxSize; 1591 1592 // FIXME: Refine when odd breakdowns handled 1593 // The scalars will need to be re-legalized. 1594 if (NumPieces == 1 || NumPieces >= NumElts || 1595 NumElts % NumPieces != 0) 1596 return std::pair(0, EltTy); 1597 1598 return std::pair(0, 1599 LLT::fixed_vector(NumElts / NumPieces, EltTy)); 1600 } 1601 1602 // FIXME: We could probably handle weird extending loads better. 1603 if (DstTy.getSizeInBits() > MemSize) 1604 return std::pair(0, EltTy); 1605 1606 unsigned EltSize = EltTy.getSizeInBits(); 1607 unsigned DstSize = DstTy.getSizeInBits(); 1608 if (!isPowerOf2_32(DstSize)) { 1609 // We're probably decomposing an odd sized store. Try to split 1610 // to the widest type. TODO: Account for alignment. As-is it 1611 // should be OK, since the new parts will be further legalized. 1612 unsigned FloorSize = llvm::bit_floor(DstSize); 1613 return std::pair( 1614 0, LLT::scalarOrVector( 1615 ElementCount::getFixed(FloorSize / EltSize), EltTy)); 1616 } 1617 1618 // May need relegalization for the scalars. 1619 return std::pair(0, EltTy); 1620 }) 1621 .minScalar(0, S32) 1622 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) 1623 .widenScalarToNextPow2(0) 1624 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1625 .lower(); 1626 } 1627 1628 // FIXME: Unaligned accesses not lowered. 
1629 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1630 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8}, 1631 {S32, GlobalPtr, S16, 2 * 8}, 1632 {S32, LocalPtr, S8, 8}, 1633 {S32, LocalPtr, S16, 16}, 1634 {S32, PrivatePtr, S8, 8}, 1635 {S32, PrivatePtr, S16, 16}, 1636 {S32, ConstantPtr, S8, 8}, 1637 {S32, ConstantPtr, S16, 2 * 8}}) 1638 .legalIf( 1639 [=](const LegalityQuery &Query) -> bool { 1640 return isLoadStoreLegal(ST, Query); 1641 }); 1642 1643 if (ST.hasFlatAddressSpace()) { 1644 ExtLoads.legalForTypesWithMemDesc( 1645 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}}); 1646 } 1647 1648 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1649 // 64-bits. 1650 // 1651 // TODO: Should generalize bitcast action into coerce, which will also cover 1652 // inserting addrspacecasts. 1653 ExtLoads.customIf(typeIs(1, Constant32Ptr)); 1654 1655 ExtLoads.clampScalar(0, S32, S32) 1656 .widenScalarToNextPow2(0) 1657 .lower(); 1658 1659 auto &Atomics = getActionDefinitionsBuilder( 1660 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 1661 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 1662 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1663 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP}) 1664 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1665 {S64, GlobalPtr}, {S64, LocalPtr}, 1666 {S32, RegionPtr}, {S64, RegionPtr}}); 1667 if (ST.hasFlatAddressSpace()) { 1668 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 1669 } 1670 1671 // TODO: v2bf16 operations, and fat buffer pointer support. 1672 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1673 if (ST.hasLDSFPAtomicAddF32()) { 1674 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1675 if (ST.hasLdsAtomicAddF64()) 1676 Atomic.legalFor({{S64, LocalPtr}}); 1677 if (ST.hasAtomicDsPkAdd16Insts()) 1678 Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}}); 1679 } 1680 if (ST.hasAtomicFaddInsts()) 1681 Atomic.legalFor({{S32, GlobalPtr}}); 1682 if (ST.hasFlatAtomicFaddF32Inst()) 1683 Atomic.legalFor({{S32, FlatPtr}}); 1684 1685 if (ST.hasGFX90AInsts()) { 1686 // These are legal with some caveats, and should have undergone expansion in 1687 // the IR in most situations 1688 // TODO: Move atomic expansion into legalizer 1689 Atomic.legalFor({ 1690 {S32, GlobalPtr}, 1691 {S64, GlobalPtr}, 1692 {S64, FlatPtr} 1693 }); 1694 } 1695 1696 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() || 1697 ST.hasAtomicBufferGlobalPkAddF16Insts()) 1698 Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}}); 1699 if (ST.hasAtomicGlobalPkAddBF16Inst()) 1700 Atomic.legalFor({{V2BF16, GlobalPtr}}); 1701 if (ST.hasAtomicFlatPkAdd16Insts()) 1702 Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}}); 1703 1704 1705 // Most of the legalization work here is done by AtomicExpand. We could 1706 // probably use a simpler legality rule that just assumes anything is OK. 
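// Only the combinations with native FP min/max atomics are enumerated below;
// everything else should already have been expanded (typically into a
// compare-and-swap loop) by AtomicExpand before reaching the legalizer.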
1707 auto &AtomicFMinFMax = 1708 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX}) 1709 .legalFor({{F32, LocalPtr}, {F64, LocalPtr}}); 1710 1711 if (ST.hasAtomicFMinFMaxF32GlobalInsts()) 1712 AtomicFMinFMax.legalFor({{F32, GlobalPtr},{F32, BufferFatPtr}}); 1713 if (ST.hasAtomicFMinFMaxF64GlobalInsts()) 1714 AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}}); 1715 if (ST.hasAtomicFMinFMaxF32FlatInsts()) 1716 AtomicFMinFMax.legalFor({F32, FlatPtr}); 1717 if (ST.hasAtomicFMinFMaxF64FlatInsts()) 1718 AtomicFMinFMax.legalFor({F64, FlatPtr}); 1719 1720 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1721 // demarshalling 1722 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1723 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1724 {S32, FlatPtr}, {S64, FlatPtr}}) 1725 .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1726 {S32, RegionPtr}, {S64, RegionPtr}}); 1727 // TODO: Pointer types, any 32-bit or 64-bit vector 1728 1729 // Condition should be s32 for scalar, s1 for vector. 1730 getActionDefinitionsBuilder(G_SELECT) 1731 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr, 1732 LocalPtr, FlatPtr, PrivatePtr, 1733 LLT::fixed_vector(2, LocalPtr), 1734 LLT::fixed_vector(2, PrivatePtr)}, 1735 {S1, S32}) 1736 .clampScalar(0, S16, S64) 1737 .scalarize(1) 1738 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 1739 .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 1740 .clampMaxNumElements(0, S32, 2) 1741 .clampMaxNumElements(0, LocalPtr, 2) 1742 .clampMaxNumElements(0, PrivatePtr, 2) 1743 .scalarize(0) 1744 .widenScalarToNextPow2(0) 1745 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 1746 1747 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 1748 // be more flexible with the shift amount type. 1749 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 1750 .legalFor({{S32, S32}, {S64, S32}}); 1751 if (ST.has16BitInsts()) { 1752 if (ST.hasVOP3PInsts()) { 1753 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 1754 .clampMaxNumElements(0, S16, 2); 1755 } else 1756 Shifts.legalFor({{S16, S16}}); 1757 1758 // TODO: Support 16-bit shift amounts for all types 1759 Shifts.widenScalarIf( 1760 [=](const LegalityQuery &Query) { 1761 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1762 // 32-bit amount. 1763 const LLT ValTy = Query.Types[0]; 1764 const LLT AmountTy = Query.Types[1]; 1765 return ValTy.isScalar() && ValTy.getSizeInBits() <= 16 && 1766 AmountTy.getSizeInBits() < 16; 1767 }, changeTo(1, S16)); 1768 Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1769 Shifts.clampScalar(1, S32, S32); 1770 Shifts.widenScalarToNextPow2(0, 16); 1771 Shifts.clampScalar(0, S16, S64); 1772 1773 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1774 .minScalar(0, S16) 1775 .scalarize(0) 1776 .lower(); 1777 } else { 1778 // Make sure we legalize the shift amount type first, as the general 1779 // expansion for the shifted type will produce much worse code if it hasn't 1780 // been truncated already. 1781 Shifts.clampScalar(1, S32, S32); 1782 Shifts.widenScalarToNextPow2(0, 32); 1783 Shifts.clampScalar(0, S32, S64); 1784 1785 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1786 .minScalar(0, S32) 1787 .scalarize(0) 1788 .lower(); 1789 } 1790 Shifts.scalarize(0); 1791 1792 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 1793 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 1794 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 
0 : 1; 1795 unsigned IdxTypeIdx = 2; 1796 1797 getActionDefinitionsBuilder(Op) 1798 .customIf([=](const LegalityQuery &Query) { 1799 const LLT EltTy = Query.Types[EltTypeIdx]; 1800 const LLT VecTy = Query.Types[VecTypeIdx]; 1801 const LLT IdxTy = Query.Types[IdxTypeIdx]; 1802 const unsigned EltSize = EltTy.getSizeInBits(); 1803 const bool isLegalVecType = 1804 !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits()); 1805 // Address space 8 pointers are 128-bit wide values, but the logic 1806 // below will try to bitcast them to 2N x s64, which will fail. 1807 // Therefore, as an intermediate step, wrap extracts/insertions from a 1808 // ptrtoint-ing the vector and scalar arguments (or inttoptring the 1809 // extraction result) in order to produce a vector operation that can 1810 // be handled by the logic below. 1811 if (EltTy.isPointer() && EltSize > 64) 1812 return true; 1813 return (EltSize == 32 || EltSize == 64) && 1814 VecTy.getSizeInBits() % 32 == 0 && 1815 VecTy.getSizeInBits() <= MaxRegisterSize && 1816 IdxTy.getSizeInBits() == 32 && 1817 isLegalVecType; 1818 }) 1819 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), 1820 scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1821 bitcastToVectorElement32(VecTypeIdx)) 1822 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1823 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), 1824 scalarOrEltWiderThan(VecTypeIdx, 64)), 1825 [=](const LegalityQuery &Query) { 1826 // For > 64-bit element types, try to turn this into a 1827 // 64-bit element vector since we may be able to do better 1828 // indexing if this is scalar. If not, fall back to 32. 1829 const LLT EltTy = Query.Types[EltTypeIdx]; 1830 const LLT VecTy = Query.Types[VecTypeIdx]; 1831 const unsigned DstEltSize = EltTy.getSizeInBits(); 1832 const unsigned VecSize = VecTy.getSizeInBits(); 1833 1834 const unsigned TargetEltSize = 1835 DstEltSize % 64 == 0 ? 64 : 32; 1836 return std::pair(VecTypeIdx, 1837 LLT::fixed_vector(VecSize / TargetEltSize, 1838 TargetEltSize)); 1839 }) 1840 .clampScalar(EltTypeIdx, S32, S64) 1841 .clampScalar(VecTypeIdx, S32, S64) 1842 .clampScalar(IdxTypeIdx, S32, S32) 1843 .clampMaxNumElements(VecTypeIdx, S32, 32) 1844 // TODO: Clamp elements for 64-bit vectors? 1845 .moreElementsIf(isIllegalRegisterType(ST, VecTypeIdx), 1846 moreElementsToNextExistingRegClass(VecTypeIdx)) 1847 // It should only be necessary with variable indexes. 1848 // As a last resort, lower to the stack 1849 .lower(); 1850 } 1851 1852 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 1853 .unsupportedIf([=](const LegalityQuery &Query) { 1854 const LLT &EltTy = Query.Types[1].getElementType(); 1855 return Query.Types[0] != EltTy; 1856 }); 1857 1858 for (unsigned Op : {G_EXTRACT, G_INSERT}) { 1859 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 1860 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 1861 1862 // FIXME: Doesn't handle extract of illegal sizes. 1863 getActionDefinitionsBuilder(Op) 1864 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 1865 .lowerIf([=](const LegalityQuery &Query) { 1866 // Sub-vector(or single element) insert and extract. 1867 // TODO: verify immediate offset here since lower only works with 1868 // whole elements. 1869 const LLT BigTy = Query.Types[BigTyIdx]; 1870 return BigTy.isVector(); 1871 }) 1872 // FIXME: Multiples of 16 should not be legal. 
1873 .legalIf([=](const LegalityQuery &Query) { 1874 const LLT BigTy = Query.Types[BigTyIdx]; 1875 const LLT LitTy = Query.Types[LitTyIdx]; 1876 return (BigTy.getSizeInBits() % 32 == 0) && 1877 (LitTy.getSizeInBits() % 16 == 0); 1878 }) 1879 .widenScalarIf( 1880 [=](const LegalityQuery &Query) { 1881 const LLT BigTy = Query.Types[BigTyIdx]; 1882 return (BigTy.getScalarSizeInBits() < 16); 1883 }, 1884 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 1885 .widenScalarIf( 1886 [=](const LegalityQuery &Query) { 1887 const LLT LitTy = Query.Types[LitTyIdx]; 1888 return (LitTy.getScalarSizeInBits() < 16); 1889 }, 1890 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 1891 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1892 .widenScalarToNextPow2(BigTyIdx, 32); 1893 1894 } 1895 1896 auto &BuildVector = 1897 getActionDefinitionsBuilder(G_BUILD_VECTOR) 1898 .legalForCartesianProduct(AllS32Vectors, {S32}) 1899 .legalForCartesianProduct(AllS64Vectors, {S64}) 1900 .clampNumElements(0, V16S32, V32S32) 1901 .clampNumElements(0, V2S64, V16S64) 1902 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)) 1903 .moreElementsIf(isIllegalRegisterType(ST, 0), 1904 moreElementsToNextExistingRegClass(0)); 1905 1906 if (ST.hasScalarPackInsts()) { 1907 BuildVector 1908 // FIXME: Should probably widen s1 vectors straight to s32 1909 .minScalarOrElt(0, S16) 1910 .minScalar(1, S16); 1911 1912 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1913 .legalFor({V2S16, S32}) 1914 .lower(); 1915 } else { 1916 BuildVector.customFor({V2S16, S16}); 1917 BuildVector.minScalarOrElt(0, S32); 1918 1919 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 1920 .customFor({V2S16, S32}) 1921 .lower(); 1922 } 1923 1924 BuildVector.legalIf(isRegisterType(ST, 0)); 1925 1926 // FIXME: Clamp maximum size 1927 getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1928 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1))) 1929 .clampMaxNumElements(0, S32, 32) 1930 .clampMaxNumElements(1, S16, 2) // TODO: Make 4? 1931 .clampMaxNumElements(0, S16, 64); 1932 1933 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 1934 1935 // Merge/Unmerge 1936 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 1937 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 1938 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 1939 1940 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 1941 const LLT Ty = Query.Types[TypeIdx]; 1942 if (Ty.isVector()) { 1943 const LLT &EltTy = Ty.getElementType(); 1944 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 1945 return true; 1946 if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits())) 1947 return true; 1948 } 1949 return false; 1950 }; 1951 1952 auto &Builder = 1953 getActionDefinitionsBuilder(Op) 1954 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1))) 1955 .lowerFor({{S16, V2S16}}) 1956 .lowerIf([=](const LegalityQuery &Query) { 1957 const LLT BigTy = Query.Types[BigTyIdx]; 1958 return BigTy.getSizeInBits() == 32; 1959 }) 1960 // Try to widen to s16 first for small types. 1961 // TODO: Only do this on targets with legal s16 shifts 1962 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 1963 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 1964 .moreElementsIf(isSmallOddVector(BigTyIdx), 1965 oneMoreElement(BigTyIdx)) 1966 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1967 elementTypeIs(1, S16)), 1968 changeTo(1, V2S16)) 1969 // Clamp the little scalar to s8-s256 and make it a power of 2. 
It's 1970 // not worth considering the multiples of 64 since 2*192 and 2*384 1971 // are not valid. 1972 .clampScalar(LitTyIdx, S32, S512) 1973 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1974 // Break up vectors with weird elements into scalars 1975 .fewerElementsIf( 1976 [=](const LegalityQuery &Query) { 1977 return notValidElt(Query, LitTyIdx); 1978 }, 1979 scalarize(0)) 1980 .fewerElementsIf( 1981 [=](const LegalityQuery &Query) { 1982 return notValidElt(Query, BigTyIdx); 1983 }, 1984 scalarize(1)) 1985 .clampScalar(BigTyIdx, S32, MaxScalar); 1986 1987 if (Op == G_MERGE_VALUES) { 1988 Builder.widenScalarIf( 1989 // TODO: Use 16-bit shifts if legal for 8-bit values? 1990 [=](const LegalityQuery &Query) { 1991 const LLT Ty = Query.Types[LitTyIdx]; 1992 return Ty.getSizeInBits() < 32; 1993 }, 1994 changeTo(LitTyIdx, S32)); 1995 } 1996 1997 Builder.widenScalarIf( 1998 [=](const LegalityQuery &Query) { 1999 const LLT Ty = Query.Types[BigTyIdx]; 2000 return Ty.getSizeInBits() % 16 != 0; 2001 }, 2002 [=](const LegalityQuery &Query) { 2003 // Pick the next power of 2, or a multiple of 64 over 128, 2004 // whichever is smaller. 2005 const LLT &Ty = Query.Types[BigTyIdx]; 2006 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 2007 if (NewSizeInBits >= 256) { 2008 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 2009 if (RoundedTo < NewSizeInBits) 2010 NewSizeInBits = RoundedTo; 2011 } 2012 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 2013 }) 2014 // Any vectors left are the wrong size. Scalarize them. 2015 .scalarize(0) 2016 .scalarize(1); 2017 } 2018 2019 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 2020 // RegBankSelect. 2021 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 2022 .legalFor({{S32}, {S64}}) 2023 .clampScalar(0, S32, S64); 2024 2025 if (ST.hasVOP3PInsts()) { 2026 SextInReg.lowerFor({{V2S16}}) 2027 // Prefer to reduce vector widths for 16-bit vectors before lowering, to 2028 // get more vector shift opportunities, since we'll get those when 2029 // expanded. 2030 .clampMaxNumElementsStrict(0, S16, 2); 2031 } else if (ST.has16BitInsts()) { 2032 SextInReg.lowerFor({{S32}, {S64}, {S16}}); 2033 } else { 2034 // Prefer to promote to s32 before lowering if we don't have 16-bit 2035 // shifts. This avoids a lot of intermediate truncate and extend operations. 2036 SextInReg.lowerFor({{S32}, {S64}}); 2037 } 2038 2039 SextInReg 2040 .scalarize(0) 2041 .clampScalar(0, S32, S64) 2042 .lower(); 2043 2044 getActionDefinitionsBuilder({G_ROTR, G_ROTL}) 2045 .scalarize(0) 2046 .lower(); 2047 2048 // TODO: Only try to form v2s16 with legal packed instructions.
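// G_FSHR is kept legal for 32-bit operands (it is expected to select to the
// hardware alignbit operation); the packed v2s16 form and all remaining cases
// are lowered, and G_FSHL is always expanded.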
2049 getActionDefinitionsBuilder(G_FSHR) 2050 .legalFor({{S32, S32}}) 2051 .lowerFor({{V2S16, V2S16}}) 2052 .clampMaxNumElementsStrict(0, S16, 2) 2053 .scalarize(0) 2054 .lower(); 2055 2056 if (ST.hasVOP3PInsts()) { 2057 getActionDefinitionsBuilder(G_FSHL) 2058 .lowerFor({{V2S16, V2S16}}) 2059 .clampMaxNumElementsStrict(0, S16, 2) 2060 .scalarize(0) 2061 .lower(); 2062 } else { 2063 getActionDefinitionsBuilder(G_FSHL) 2064 .scalarize(0) 2065 .lower(); 2066 } 2067 2068 getActionDefinitionsBuilder(G_READCYCLECOUNTER) 2069 .legalFor({S64}); 2070 2071 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64}); 2072 2073 getActionDefinitionsBuilder(G_FENCE) 2074 .alwaysLegal(); 2075 2076 getActionDefinitionsBuilder({G_SMULO, G_UMULO}) 2077 .scalarize(0) 2078 .minScalar(0, S32) 2079 .lower(); 2080 2081 getActionDefinitionsBuilder({G_SBFX, G_UBFX}) 2082 .legalFor({{S32, S32}, {S64, S32}}) 2083 .clampScalar(1, S32, S32) 2084 .clampScalar(0, S32, S64) 2085 .widenScalarToNextPow2(0) 2086 .scalarize(0); 2087 2088 getActionDefinitionsBuilder( 2089 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops 2090 G_FCOPYSIGN, 2091 2092 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB, 2093 G_READ_REGISTER, G_WRITE_REGISTER, 2094 2095 G_SADDO, G_SSUBO}) 2096 .lower(); 2097 2098 if (ST.hasIEEEMinimumMaximumInsts()) { 2099 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}) 2100 .legalFor(FPTypesPK16) 2101 .clampMaxNumElements(0, S16, 2) 2102 .scalarize(0); 2103 } else { 2104 // TODO: Implement 2105 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower(); 2106 } 2107 2108 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET}) 2109 .lower(); 2110 2111 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom(); 2112 2113 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 2114 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 2115 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 2116 .unsupported(); 2117 2118 getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal(); 2119 2120 getActionDefinitionsBuilder( 2121 {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX, 2122 G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN, 2123 G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM, 2124 G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR}) 2125 .legalFor(AllVectors) 2126 .scalarize(1) 2127 .lower(); 2128 2129 getLegacyLegalizerInfo().computeTables(); 2130 verify(*ST.getInstrInfo()); 2131 } 2132 2133 bool AMDGPULegalizerInfo::legalizeCustom( 2134 LegalizerHelper &Helper, MachineInstr &MI, 2135 LostDebugLocObserver &LocObserver) const { 2136 MachineIRBuilder &B = Helper.MIRBuilder; 2137 MachineRegisterInfo &MRI = *B.getMRI(); 2138 2139 switch (MI.getOpcode()) { 2140 case TargetOpcode::G_ADDRSPACE_CAST: 2141 return legalizeAddrSpaceCast(MI, MRI, B); 2142 case TargetOpcode::G_INTRINSIC_ROUNDEVEN: 2143 return legalizeFroundeven(MI, MRI, B); 2144 case TargetOpcode::G_FCEIL: 2145 return legalizeFceil(MI, MRI, B); 2146 case TargetOpcode::G_FREM: 2147 return legalizeFrem(MI, MRI, B); 2148 case TargetOpcode::G_INTRINSIC_TRUNC: 2149 return legalizeIntrinsicTrunc(MI, MRI, B); 2150 case TargetOpcode::G_SITOFP: 2151 return legalizeITOFP(MI, MRI, B, true); 2152 case TargetOpcode::G_UITOFP: 2153 return legalizeITOFP(MI, MRI, B, false); 2154 case TargetOpcode::G_FPTOSI: 2155 return legalizeFPTOI(MI, MRI, B, true); 2156 case TargetOpcode::G_FPTOUI: 2157 return legalizeFPTOI(MI, MRI, B, false); 2158 case TargetOpcode::G_FMINNUM: 2159 case 
TargetOpcode::G_FMAXNUM: 2160 case TargetOpcode::G_FMINIMUMNUM: 2161 case TargetOpcode::G_FMAXIMUMNUM: 2162 case TargetOpcode::G_FMINNUM_IEEE: 2163 case TargetOpcode::G_FMAXNUM_IEEE: 2164 return legalizeMinNumMaxNum(Helper, MI); 2165 case TargetOpcode::G_EXTRACT_VECTOR_ELT: 2166 return legalizeExtractVectorElt(MI, MRI, B); 2167 case TargetOpcode::G_INSERT_VECTOR_ELT: 2168 return legalizeInsertVectorElt(MI, MRI, B); 2169 case TargetOpcode::G_FSIN: 2170 case TargetOpcode::G_FCOS: 2171 return legalizeSinCos(MI, MRI, B); 2172 case TargetOpcode::G_GLOBAL_VALUE: 2173 return legalizeGlobalValue(MI, MRI, B); 2174 case TargetOpcode::G_LOAD: 2175 case TargetOpcode::G_SEXTLOAD: 2176 case TargetOpcode::G_ZEXTLOAD: 2177 return legalizeLoad(Helper, MI); 2178 case TargetOpcode::G_STORE: 2179 return legalizeStore(Helper, MI); 2180 case TargetOpcode::G_FMAD: 2181 return legalizeFMad(MI, MRI, B); 2182 case TargetOpcode::G_FDIV: 2183 return legalizeFDIV(MI, MRI, B); 2184 case TargetOpcode::G_FFREXP: 2185 return legalizeFFREXP(MI, MRI, B); 2186 case TargetOpcode::G_FSQRT: 2187 return legalizeFSQRT(MI, MRI, B); 2188 case TargetOpcode::G_UDIV: 2189 case TargetOpcode::G_UREM: 2190 case TargetOpcode::G_UDIVREM: 2191 return legalizeUnsignedDIV_REM(MI, MRI, B); 2192 case TargetOpcode::G_SDIV: 2193 case TargetOpcode::G_SREM: 2194 case TargetOpcode::G_SDIVREM: 2195 return legalizeSignedDIV_REM(MI, MRI, B); 2196 case TargetOpcode::G_ATOMIC_CMPXCHG: 2197 return legalizeAtomicCmpXChg(MI, MRI, B); 2198 case TargetOpcode::G_FLOG2: 2199 return legalizeFlog2(MI, B); 2200 case TargetOpcode::G_FLOG: 2201 case TargetOpcode::G_FLOG10: 2202 return legalizeFlogCommon(MI, B); 2203 case TargetOpcode::G_FEXP2: 2204 return legalizeFExp2(MI, B); 2205 case TargetOpcode::G_FEXP: 2206 case TargetOpcode::G_FEXP10: 2207 return legalizeFExp(MI, B); 2208 case TargetOpcode::G_FPOW: 2209 return legalizeFPow(MI, B); 2210 case TargetOpcode::G_FFLOOR: 2211 return legalizeFFloor(MI, MRI, B); 2212 case TargetOpcode::G_BUILD_VECTOR: 2213 case TargetOpcode::G_BUILD_VECTOR_TRUNC: 2214 return legalizeBuildVector(MI, MRI, B); 2215 case TargetOpcode::G_MUL: 2216 return legalizeMul(Helper, MI); 2217 case TargetOpcode::G_CTLZ: 2218 case TargetOpcode::G_CTTZ: 2219 return legalizeCTLZ_CTTZ(MI, MRI, B); 2220 case TargetOpcode::G_CTLZ_ZERO_UNDEF: 2221 return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B); 2222 case TargetOpcode::G_STACKSAVE: 2223 return legalizeStackSave(MI, B); 2224 case TargetOpcode::G_GET_FPENV: 2225 return legalizeGetFPEnv(MI, MRI, B); 2226 case TargetOpcode::G_SET_FPENV: 2227 return legalizeSetFPEnv(MI, MRI, B); 2228 case TargetOpcode::G_TRAP: 2229 return legalizeTrap(MI, MRI, B); 2230 case TargetOpcode::G_DEBUGTRAP: 2231 return legalizeDebugTrap(MI, MRI, B); 2232 default: 2233 return false; 2234 } 2235 2236 llvm_unreachable("expected switch to return"); 2237 } 2238 2239 Register AMDGPULegalizerInfo::getSegmentAperture( 2240 unsigned AS, 2241 MachineRegisterInfo &MRI, 2242 MachineIRBuilder &B) const { 2243 MachineFunction &MF = B.getMF(); 2244 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 2245 const LLT S32 = LLT::scalar(32); 2246 const LLT S64 = LLT::scalar(64); 2247 2248 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 2249 2250 if (ST.hasApertureRegs()) { 2251 // Note: this register is somewhat broken. When used as a 32-bit operand, 2252 // it only returns zeroes. The real value is in the upper 32 bits. 2253 // Thus, we must emit extract the high 32 bits. 
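// Roughly, what is built here (virtual register names are illustrative):
//   %aperture:sreg_64(s64) = S_MOV_B64 $src_shared_base   ; or $src_private_base
//   %lo:_(s32), %hi:_(s32) = G_UNMERGE_VALUES %aperture
// and %hi, the high half, is returned as the aperture base.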
2254 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS) 2255 ? AMDGPU::SRC_SHARED_BASE 2256 : AMDGPU::SRC_PRIVATE_BASE; 2257 // FIXME: It would be more natural to emit a COPY here, but then copy 2258 // coalescing would kick in and it would think it's okay to use the "HI" 2259 // subregister (instead of extracting the HI 32 bits) which is an artificial 2260 // (unusable) register. 2261 // Register TableGen definitions would need an overhaul to get rid of the 2262 // artificial "HI" aperture registers and prevent this kind of issue from 2263 // happening. 2264 Register Dst = MRI.createGenericVirtualRegister(S64); 2265 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass); 2266 B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)}); 2267 return B.buildUnmerge(S32, Dst).getReg(1); 2268 } 2269 2270 // TODO: can we be smarter about machine pointer info? 2271 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 2272 Register LoadAddr = MRI.createGenericVirtualRegister( 2273 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 2274 // For code object version 5, private_base and shared_base are passed through 2275 // implicit kernargs. 2276 if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >= 2277 AMDGPU::AMDHSA_COV5) { 2278 AMDGPUTargetLowering::ImplicitParameter Param = 2279 AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE 2280 : AMDGPUTargetLowering::PRIVATE_BASE; 2281 uint64_t Offset = 2282 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); 2283 2284 Register KernargPtrReg = MRI.createGenericVirtualRegister( 2285 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 2286 2287 if (!loadInputValue(KernargPtrReg, B, 2288 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 2289 return Register(); 2290 2291 MachineMemOperand *MMO = MF.getMachineMemOperand( 2292 PtrInfo, 2293 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2294 MachineMemOperand::MOInvariant, 2295 LLT::scalar(32), commonAlignment(Align(64), Offset)); 2296 2297 // Pointer address 2298 B.buildPtrAdd(LoadAddr, KernargPtrReg, 2299 B.buildConstant(LLT::scalar(64), Offset).getReg(0)); 2300 // Load address 2301 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 2302 } 2303 2304 Register QueuePtr = MRI.createGenericVirtualRegister( 2305 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 2306 2307 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 2308 return Register(); 2309 2310 // Offset into amd_queue_t for group_segment_aperture_base_hi / 2311 // private_segment_aperture_base_hi. 2312 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 2313 2314 MachineMemOperand *MMO = MF.getMachineMemOperand( 2315 PtrInfo, 2316 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 2317 MachineMemOperand::MOInvariant, 2318 LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 2319 2320 B.buildPtrAdd(LoadAddr, QueuePtr, 2321 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0)); 2322 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 2323 } 2324 2325 /// Return true if the value is a known valid address, such that a null check is 2326 /// not necessary. 
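/// Frame indexes, global values, block addresses and integer constants other
/// than the address space's null value are treated as known non-null.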
2327 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, 2328 const AMDGPUTargetMachine &TM, unsigned AddrSpace) { 2329 MachineInstr *Def = MRI.getVRegDef(Val); 2330 switch (Def->getOpcode()) { 2331 case AMDGPU::G_FRAME_INDEX: 2332 case AMDGPU::G_GLOBAL_VALUE: 2333 case AMDGPU::G_BLOCK_ADDR: 2334 return true; 2335 case AMDGPU::G_CONSTANT: { 2336 const ConstantInt *CI = Def->getOperand(1).getCImm(); 2337 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace); 2338 } 2339 default: 2340 return false; 2341 } 2342 2343 return false; 2344 } 2345 2346 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 2347 MachineInstr &MI, MachineRegisterInfo &MRI, 2348 MachineIRBuilder &B) const { 2349 MachineFunction &MF = B.getMF(); 2350 2351 // MI can either be a G_ADDRSPACE_CAST or a 2352 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull 2353 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST || 2354 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() == 2355 Intrinsic::amdgcn_addrspacecast_nonnull)); 2356 2357 const LLT S32 = LLT::scalar(32); 2358 Register Dst = MI.getOperand(0).getReg(); 2359 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg() 2360 : MI.getOperand(1).getReg(); 2361 LLT DstTy = MRI.getType(Dst); 2362 LLT SrcTy = MRI.getType(Src); 2363 unsigned DestAS = DstTy.getAddressSpace(); 2364 unsigned SrcAS = SrcTy.getAddressSpace(); 2365 2366 // TODO: Avoid reloading from the queue ptr for each cast, or at least each 2367 // vector element. 2368 assert(!DstTy.isVector()); 2369 2370 const AMDGPUTargetMachine &TM 2371 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 2372 2373 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 2374 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 2375 return true; 2376 } 2377 2378 if (SrcAS == AMDGPUAS::FLAT_ADDRESS && 2379 (DestAS == AMDGPUAS::LOCAL_ADDRESS || 2380 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) { 2381 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for 2382 // G_ADDRSPACE_CAST we need to guess. 2383 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) { 2384 // Extract low 32-bits of the pointer. 2385 B.buildExtract(Dst, Src, 0); 2386 MI.eraseFromParent(); 2387 return true; 2388 } 2389 2390 unsigned NullVal = TM.getNullPointerValue(DestAS); 2391 2392 auto SegmentNull = B.buildConstant(DstTy, NullVal); 2393 auto FlatNull = B.buildConstant(SrcTy, 0); 2394 2395 // Extract low 32-bits of the pointer. 2396 auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 2397 2398 auto CmpRes = 2399 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 2400 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 2401 2402 MI.eraseFromParent(); 2403 return true; 2404 } 2405 2406 if (DestAS == AMDGPUAS::FLAT_ADDRESS && 2407 (SrcAS == AMDGPUAS::LOCAL_ADDRESS || 2408 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) { 2409 auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register { 2410 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 2411 if (!ApertureReg.isValid()) 2412 return false; 2413 2414 // Coerce the type of the low half of the result so we can use 2415 // merge_values. 2416 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 2417 2418 // TODO: Should we allow mismatched types but matching sizes in merges to 2419 // avoid the ptrtoint? 2420 return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0); 2421 }; 2422 2423 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for 2424 // G_ADDRSPACE_CAST we need to guess. 
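// Roughly, the guarded cast built below is (names illustrative):
//   %flat    = G_MERGE_VALUES %src_lo32, %aperture_hi
//   %nonnull = G_ICMP ne %src, segment-null
//   %dst     = G_SELECT %nonnull, %flat, flat-null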
2425 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) { 2426 castLocalOrPrivateToFlat(Dst); 2427 MI.eraseFromParent(); 2428 return true; 2429 } 2430 2431 Register BuildPtr = castLocalOrPrivateToFlat(DstTy); 2432 2433 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 2434 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 2435 2436 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, 2437 SegmentNull.getReg(0)); 2438 2439 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 2440 2441 MI.eraseFromParent(); 2442 return true; 2443 } 2444 2445 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 2446 SrcTy.getSizeInBits() == 64) { 2447 // Truncate. 2448 B.buildExtract(Dst, Src, 0); 2449 MI.eraseFromParent(); 2450 return true; 2451 } 2452 2453 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 2454 DstTy.getSizeInBits() == 64) { 2455 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 2456 uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 2457 auto PtrLo = B.buildPtrToInt(S32, Src); 2458 auto HighAddr = B.buildConstant(S32, AddrHiVal); 2459 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr}); 2460 MI.eraseFromParent(); 2461 return true; 2462 } 2463 2464 // Invalid casts are poison. 2465 // TODO: Should return poison 2466 B.buildUndef(Dst); 2467 MI.eraseFromParent(); 2468 return true; 2469 } 2470 2471 bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI, 2472 MachineRegisterInfo &MRI, 2473 MachineIRBuilder &B) const { 2474 Register Src = MI.getOperand(1).getReg(); 2475 LLT Ty = MRI.getType(Src); 2476 assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 2477 2478 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 2479 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 2480 2481 auto C1 = B.buildFConstant(Ty, C1Val); 2482 auto CopySign = B.buildFCopysign(Ty, C1, Src); 2483 2484 // TODO: Should this propagate fast-math-flags? 2485 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 2486 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 2487 2488 auto C2 = B.buildFConstant(Ty, C2Val); 2489 auto Fabs = B.buildFAbs(Ty, Src); 2490 2491 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 2492 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 2493 MI.eraseFromParent(); 2494 return true; 2495 } 2496 2497 bool AMDGPULegalizerInfo::legalizeFceil( 2498 MachineInstr &MI, MachineRegisterInfo &MRI, 2499 MachineIRBuilder &B) const { 2500 2501 const LLT S1 = LLT::scalar(1); 2502 const LLT S64 = LLT::scalar(64); 2503 2504 Register Src = MI.getOperand(1).getReg(); 2505 assert(MRI.getType(Src) == S64); 2506 2507 // result = trunc(src) 2508 // if (src > 0.0 && src != result) 2509 // result += 1.0 2510 2511 auto Trunc = B.buildIntrinsicTrunc(S64, Src); 2512 2513 const auto Zero = B.buildFConstant(S64, 0.0); 2514 const auto One = B.buildFConstant(S64, 1.0); 2515 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 2516 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 2517 auto And = B.buildAnd(S1, Lt0, NeTrunc); 2518 auto Add = B.buildSelect(S64, And, One, Zero); 2519 2520 // TODO: Should this propagate fast-math-flags? 
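// E.g. src = 2.3: trunc = 2.0, src > 0 and src != trunc, so 2.0 + 1.0 = 3.0;
// src = -2.3: trunc = -2.0, the condition fails, so -2.0 + 0.0 = -2.0.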
2521 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 2522 MI.eraseFromParent(); 2523 return true; 2524 } 2525 2526 bool AMDGPULegalizerInfo::legalizeFrem( 2527 MachineInstr &MI, MachineRegisterInfo &MRI, 2528 MachineIRBuilder &B) const { 2529 Register DstReg = MI.getOperand(0).getReg(); 2530 Register Src0Reg = MI.getOperand(1).getReg(); 2531 Register Src1Reg = MI.getOperand(2).getReg(); 2532 auto Flags = MI.getFlags(); 2533 LLT Ty = MRI.getType(DstReg); 2534 2535 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 2536 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 2537 auto Neg = B.buildFNeg(Ty, Trunc, Flags); 2538 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 2539 MI.eraseFromParent(); 2540 return true; 2541 } 2542 2543 static MachineInstrBuilder extractF64Exponent(Register Hi, 2544 MachineIRBuilder &B) { 2545 const unsigned FractBits = 52; 2546 const unsigned ExpBits = 11; 2547 LLT S32 = LLT::scalar(32); 2548 2549 auto Const0 = B.buildConstant(S32, FractBits - 32); 2550 auto Const1 = B.buildConstant(S32, ExpBits); 2551 2552 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}) 2553 .addUse(Hi) 2554 .addUse(Const0.getReg(0)) 2555 .addUse(Const1.getReg(0)); 2556 2557 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 2558 } 2559 2560 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 2561 MachineInstr &MI, MachineRegisterInfo &MRI, 2562 MachineIRBuilder &B) const { 2563 const LLT S1 = LLT::scalar(1); 2564 const LLT S32 = LLT::scalar(32); 2565 const LLT S64 = LLT::scalar(64); 2566 2567 Register Src = MI.getOperand(1).getReg(); 2568 assert(MRI.getType(Src) == S64); 2569 2570 // TODO: Should this use extract since the low half is unused? 2571 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2572 Register Hi = Unmerge.getReg(1); 2573 2574 // Extract the upper half, since this is where we will find the sign and 2575 // exponent. 2576 auto Exp = extractF64Exponent(Hi, B); 2577 2578 const unsigned FractBits = 52; 2579 2580 // Extract the sign bit. 2581 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 2582 auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 2583 2584 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 2585 2586 const auto Zero32 = B.buildConstant(S32, 0); 2587 2588 // Extend back to 64-bits. 2589 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit}); 2590 2591 auto Shr = B.buildAShr(S64, FractMask, Exp); 2592 auto Not = B.buildNot(S64, Shr); 2593 auto Tmp0 = B.buildAnd(S64, Src, Not); 2594 auto FiftyOne = B.buildConstant(S32, FractBits - 1); 2595 2596 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 2597 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 2598 2599 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 2600 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2601 MI.eraseFromParent(); 2602 return true; 2603 } 2604 2605 bool AMDGPULegalizerInfo::legalizeITOFP( 2606 MachineInstr &MI, MachineRegisterInfo &MRI, 2607 MachineIRBuilder &B, bool Signed) const { 2608 2609 Register Dst = MI.getOperand(0).getReg(); 2610 Register Src = MI.getOperand(1).getReg(); 2611 2612 const LLT S64 = LLT::scalar(64); 2613 const LLT S32 = LLT::scalar(32); 2614 2615 assert(MRI.getType(Src) == S64); 2616 2617 auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2618 auto ThirtyTwo = B.buildConstant(S32, 32); 2619 2620 if (MRI.getType(Dst) == S64) { 2621 auto CvtHi = Signed ? 
B.buildSITOFP(S64, Unmerge.getReg(1)) 2622 : B.buildUITOFP(S64, Unmerge.getReg(1)); 2623 2624 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 2625 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo); 2626 2627 // TODO: Should this propagate fast-math-flags? 2628 B.buildFAdd(Dst, LdExp, CvtLo); 2629 MI.eraseFromParent(); 2630 return true; 2631 } 2632 2633 assert(MRI.getType(Dst) == S32); 2634 2635 auto One = B.buildConstant(S32, 1); 2636 2637 MachineInstrBuilder ShAmt; 2638 if (Signed) { 2639 auto ThirtyOne = B.buildConstant(S32, 31); 2640 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); 2641 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); 2642 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); 2643 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}) 2644 .addUse(Unmerge.getReg(1)); 2645 auto LS2 = B.buildSub(S32, LS, One); 2646 ShAmt = B.buildUMin(S32, LS2, MaxShAmt); 2647 } else 2648 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); 2649 auto Norm = B.buildShl(S64, Src, ShAmt); 2650 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); 2651 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); 2652 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); 2653 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); 2654 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); 2655 B.buildFLdexp(Dst, FVal, Scale); 2656 MI.eraseFromParent(); 2657 return true; 2658 } 2659 2660 // TODO: Copied from DAG implementation. Verify logic and document how this 2661 // actually works. 2662 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2663 MachineRegisterInfo &MRI, 2664 MachineIRBuilder &B, 2665 bool Signed) const { 2666 2667 Register Dst = MI.getOperand(0).getReg(); 2668 Register Src = MI.getOperand(1).getReg(); 2669 2670 const LLT S64 = LLT::scalar(64); 2671 const LLT S32 = LLT::scalar(32); 2672 2673 const LLT SrcLT = MRI.getType(Src); 2674 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 2675 2676 unsigned Flags = MI.getFlags(); 2677 2678 // The basic idea of converting a floating point number into a pair of 32-bit 2679 // integers is illustrated as follows: 2680 // 2681 // tf := trunc(val); 2682 // hif := floor(tf * 2^-32); 2683 // lof := tf - hif * 2^32; // lof is always positive due to floor. 2684 // hi := fptoi(hif); 2685 // lo := fptoi(lof); 2686 // 2687 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2688 MachineInstrBuilder Sign; 2689 if (Signed && SrcLT == S32) { 2690 // However, a 32-bit floating point number has only 23 bits mantissa and 2691 // it's not enough to hold all the significant bits of `lof` if val is 2692 // negative. To avoid the loss of precision, We need to take the absolute 2693 // value after truncating and flip the result back based on the original 2694 // signedness. 
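// The arithmetic shift below broadcasts the sign bit (0 for non-negative
// inputs, all ones for negative ones); xor-ing the merged {lo, hi} result with
// it and then subtracting it, (x ^ s) - s, is the usual two's-complement
// conditional negation.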
2695 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2696 Trunc = B.buildFAbs(S32, Trunc, Flags); 2697 } 2698 MachineInstrBuilder K0, K1; 2699 if (SrcLT == S64) { 2700 K0 = B.buildFConstant( 2701 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2702 K1 = B.buildFConstant( 2703 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2704 } else { 2705 K0 = B.buildFConstant( 2706 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000))); 2707 K1 = B.buildFConstant( 2708 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000))); 2709 } 2710 2711 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2712 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2713 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 2714 2715 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2716 : B.buildFPTOUI(S32, FloorMul); 2717 auto Lo = B.buildFPTOUI(S32, Fma); 2718 2719 if (Signed && SrcLT == S32) { 2720 // Flip the result based on the signedness, which is either all 0s or 1s. 2721 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign}); 2722 // r := xor({lo, hi}, sign) - sign; 2723 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign), 2724 Sign); 2725 } else 2726 B.buildMergeLikeInstr(Dst, {Lo, Hi}); 2727 MI.eraseFromParent(); 2728 2729 return true; 2730 } 2731 2732 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 2733 MachineInstr &MI) const { 2734 MachineFunction &MF = Helper.MIRBuilder.getMF(); 2735 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2736 2737 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 2738 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 2739 2740 // With ieee_mode disabled, the instructions have the correct behavior 2741 // already for G_FMINIMUMNUM/G_FMAXIMUMNUM. 2742 // 2743 // FIXME: G_FMINNUM/G_FMAXNUM should match the behavior with ieee_mode 2744 // enabled. 2745 if (!MFI->getMode().IEEE) { 2746 if (MI.getOpcode() == AMDGPU::G_FMINIMUMNUM || 2747 MI.getOpcode() == AMDGPU::G_FMAXIMUMNUM) 2748 return true; 2749 2750 return !IsIEEEOp; 2751 } 2752 2753 if (IsIEEEOp) 2754 return true; 2755 2756 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 2757 } 2758 2759 bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 2760 MachineInstr &MI, MachineRegisterInfo &MRI, 2761 MachineIRBuilder &B) const { 2762 // TODO: Should move some of this into LegalizerHelper. 2763 2764 // TODO: Promote dynamic indexing of s16 to s32 2765 2766 Register Dst = MI.getOperand(0).getReg(); 2767 Register Vec = MI.getOperand(1).getReg(); 2768 2769 LLT VecTy = MRI.getType(Vec); 2770 LLT EltTy = VecTy.getElementType(); 2771 assert(EltTy == MRI.getType(Dst)); 2772 2773 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts 2774 // but we can't go directly to that logic because you can't bitcast a vector 2775 // of pointers to a vector of integers. Therefore, introduce an intermediate 2776 // vector of integers using ptrtoint (and inttoptr on the output) in order to 2777 // drive the legalization forward.
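// Sketch of the rewrite for a vector of wide (> 64-bit) pointers, with
// illustrative names:
//   %iv:_(<N x sM>) = G_PTRTOINT %vec
//   %ie:_(sM)       = G_EXTRACT_VECTOR_ELT %iv, %idx
//   %dst            = G_INTTOPTR %ie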
2778 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) { 2779 LLT IntTy = LLT::scalar(EltTy.getSizeInBits()); 2780 LLT IntVecTy = VecTy.changeElementType(IntTy); 2781 2782 auto IntVec = B.buildPtrToInt(IntVecTy, Vec); 2783 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2)); 2784 B.buildIntToPtr(Dst, IntElt); 2785 2786 MI.eraseFromParent(); 2787 return true; 2788 } 2789 2790 // FIXME: Artifact combiner probably should have replaced the truncated 2791 // constant before this, so we shouldn't need 2792 // getIConstantVRegValWithLookThrough. 2793 std::optional<ValueAndVReg> MaybeIdxVal = 2794 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2795 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 2796 return true; 2797 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue(); 2798 2799 if (IdxVal < VecTy.getNumElements()) { 2800 auto Unmerge = B.buildUnmerge(EltTy, Vec); 2801 B.buildCopy(Dst, Unmerge.getReg(IdxVal)); 2802 } else { 2803 B.buildUndef(Dst); 2804 } 2805 2806 MI.eraseFromParent(); 2807 return true; 2808 } 2809 2810 bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 2811 MachineInstr &MI, MachineRegisterInfo &MRI, 2812 MachineIRBuilder &B) const { 2813 // TODO: Should move some of this into LegalizerHelper. 2814 2815 // TODO: Promote dynamic indexing of s16 to s32 2816 2817 Register Dst = MI.getOperand(0).getReg(); 2818 Register Vec = MI.getOperand(1).getReg(); 2819 Register Ins = MI.getOperand(2).getReg(); 2820 2821 LLT VecTy = MRI.getType(Vec); 2822 LLT EltTy = VecTy.getElementType(); 2823 assert(EltTy == MRI.getType(Ins)); 2824 2825 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts 2826 // but we can't go directly to that logic because you can't bitcast a vector 2827 // of pointers to a vector of integers. Therefore, make the pointer vector 2828 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd 2829 // new value, and then inttoptr the result vector back. This will then allow 2830 // the rest of legalization to take over. 2831 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) { 2832 LLT IntTy = LLT::scalar(EltTy.getSizeInBits()); 2833 LLT IntVecTy = VecTy.changeElementType(IntTy); 2834 2835 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec); 2836 auto IntIns = B.buildPtrToInt(IntTy, Ins); 2837 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns, 2838 MI.getOperand(3)); 2839 B.buildIntToPtr(Dst, IntVecDest); 2840 MI.eraseFromParent(); 2841 return true; 2842 } 2843 2844 // FIXME: Artifact combiner probably should have replaced the truncated 2845 // constant before this, so we shouldn't need 2846 // getIConstantVRegValWithLookThrough. 2847 std::optional<ValueAndVReg> MaybeIdxVal = 2848 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2849 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2850 return true; 2851 2852 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue(); 2853 2854 unsigned NumElts = VecTy.getNumElements(); 2855 if (IdxVal < NumElts) { 2856 SmallVector<Register, 8> SrcRegs; 2857 for (unsigned i = 0; i < NumElts; ++i) 2858 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy)); 2859 B.buildUnmerge(SrcRegs, Vec); 2860 2861 SrcRegs[IdxVal] = MI.getOperand(2).getReg(); 2862 B.buildMergeLikeInstr(Dst, SrcRegs); 2863 } else { 2864 B.buildUndef(Dst); 2865 } 2866 2867 MI.eraseFromParent(); 2868 return true; 2869 } 2870 2871 bool AMDGPULegalizerInfo::legalizeSinCos( 2872 MachineInstr &MI, MachineRegisterInfo &MRI, 2873 MachineIRBuilder &B) const { 2874 2875 Register DstReg = MI.getOperand(0).getReg(); 2876 Register SrcReg = MI.getOperand(1).getReg(); 2877 LLT Ty = MRI.getType(DstReg); 2878 unsigned Flags = MI.getFlags(); 2879 2880 Register TrigVal; 2881 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 2882 if (ST.hasTrigReducedRange()) { 2883 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 2884 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}) 2885 .addUse(MulVal.getReg(0)) 2886 .setMIFlags(Flags) 2887 .getReg(0); 2888 } else 2889 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 2890 2891 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 2892 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2893 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg)) 2894 .addUse(TrigVal) 2895 .setMIFlags(Flags); 2896 MI.eraseFromParent(); 2897 return true; 2898 } 2899 2900 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2901 MachineIRBuilder &B, 2902 const GlobalValue *GV, 2903 int64_t Offset, 2904 unsigned GAFlags) const { 2905 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 2906 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 2907 // to the following code sequence: 2908 // 2909 // For constant address space: 2910 // s_getpc_b64 s[0:1] 2911 // s_add_u32 s0, s0, $symbol 2912 // s_addc_u32 s1, s1, 0 2913 // 2914 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2915 // a fixup or relocation is emitted to replace $symbol with a literal 2916 // constant, which is a pc-relative offset from the encoding of the $symbol 2917 // operand to the global variable. 2918 // 2919 // For global address space: 2920 // s_getpc_b64 s[0:1] 2921 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 2922 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 2923 // 2924 // s_getpc_b64 returns the address of the s_add_u32 instruction and then 2925 // fixups or relocations are emitted to replace $symbol@*@lo and 2926 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 2927 // which is a 64-bit pc-relative offset from the encoding of the $symbol 2928 // operand to the global variable. 2929 2930 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2931 2932 Register PCReg = PtrTy.getSizeInBits() != 32 ? 
DstReg : 2933 B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 2934 2935 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 2936 .addDef(PCReg); 2937 2938 MIB.addGlobalAddress(GV, Offset, GAFlags); 2939 if (GAFlags == SIInstrInfo::MO_NONE) 2940 MIB.addImm(0); 2941 else 2942 MIB.addGlobalAddress(GV, Offset, GAFlags + 1); 2943 2944 if (!B.getMRI()->getRegClassOrNull(PCReg)) 2945 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 2946 2947 if (PtrTy.getSizeInBits() == 32) 2948 B.buildExtract(DstReg, PCReg, 0); 2949 return true; 2950 } 2951 2952 // Emit a ABS32_LO / ABS32_HI relocation stub. 2953 void AMDGPULegalizerInfo::buildAbsGlobalAddress( 2954 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, 2955 MachineRegisterInfo &MRI) const { 2956 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32; 2957 2958 LLT S32 = LLT::scalar(32); 2959 2960 // Use the destination directly, if and only if we store the lower address 2961 // part only and we don't have a register class being set. 2962 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg) 2963 ? DstReg 2964 : MRI.createGenericVirtualRegister(S32); 2965 2966 if (!MRI.getRegClassOrNull(AddrLo)) 2967 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass); 2968 2969 // Write the lower half. 2970 B.buildInstr(AMDGPU::S_MOV_B32) 2971 .addDef(AddrLo) 2972 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO); 2973 2974 // If required, write the upper half as well. 2975 if (RequiresHighHalf) { 2976 assert(PtrTy.getSizeInBits() == 64 && 2977 "Must provide a 64-bit pointer type!"); 2978 2979 Register AddrHi = MRI.createGenericVirtualRegister(S32); 2980 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass); 2981 2982 B.buildInstr(AMDGPU::S_MOV_B32) 2983 .addDef(AddrHi) 2984 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI); 2985 2986 // Use the destination directly, if and only if we don't have a register 2987 // class being set. 2988 Register AddrDst = !MRI.getRegClassOrNull(DstReg) 2989 ? DstReg 2990 : MRI.createGenericVirtualRegister(LLT::scalar(64)); 2991 2992 if (!MRI.getRegClassOrNull(AddrDst)) 2993 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass); 2994 2995 B.buildMergeValues(AddrDst, {AddrLo, AddrHi}); 2996 2997 // If we created a new register for the destination, cast the result into 2998 // the final output. 2999 if (AddrDst != DstReg) 3000 B.buildCast(DstReg, AddrDst); 3001 } else if (AddrLo != DstReg) { 3002 // If we created a new register for the destination, cast the result into 3003 // the final output. 
3004 B.buildCast(DstReg, AddrLo); 3005 } 3006 } 3007 3008 bool AMDGPULegalizerInfo::legalizeGlobalValue( 3009 MachineInstr &MI, MachineRegisterInfo &MRI, 3010 MachineIRBuilder &B) const { 3011 Register DstReg = MI.getOperand(0).getReg(); 3012 LLT Ty = MRI.getType(DstReg); 3013 unsigned AS = Ty.getAddressSpace(); 3014 3015 const GlobalValue *GV = MI.getOperand(1).getGlobal(); 3016 MachineFunction &MF = B.getMF(); 3017 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 3018 3019 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 3020 if (!MFI->isModuleEntryFunction() && 3021 GV->getName() != "llvm.amdgcn.module.lds" && 3022 !AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV))) { 3023 const Function &Fn = MF.getFunction(); 3024 Fn.getContext().diagnose(DiagnosticInfoUnsupported( 3025 Fn, "local memory global used by non-kernel function", 3026 MI.getDebugLoc(), DS_Warning)); 3027 3028 // We currently don't have a way to correctly allocate LDS objects that 3029 // aren't directly associated with a kernel. We do force inlining of 3030 // functions that use local objects. However, if these dead functions are 3031 // not eliminated, we don't want a compile time error. Just emit a warning 3032 // and a trap, since there should be no callable path here. 3033 B.buildTrap(); 3034 B.buildUndef(DstReg); 3035 MI.eraseFromParent(); 3036 return true; 3037 } 3038 3039 // TODO: We could emit code to handle the initialization somewhere. 3040 // We ignore the initializer for now and legalize it to allow selection. 3041 // The initializer will be rejected during assembly emission anyway. 3042 const SITargetLowering *TLI = ST.getTargetLowering(); 3043 if (!TLI->shouldUseLDSConstAddress(GV)) { 3044 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 3045 return true; // Leave in place. 3046 } 3047 3048 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 3049 Type *Ty = GV->getValueType(); 3050 // HIP uses an unsized array `extern __shared__ T s[]` or a similar 3051 // zero-sized type in other languages to declare dynamic shared memory 3052 // whose size is not known at compile time. Such arrays are allocated by 3053 // the runtime and placed directly after the statically allocated ones. 3054 // They all share the same offset. 3055 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 3056 // Adjust alignment for that dynamic shared memory array.
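// The dynamic array starts right after all statically allocated LDS, so its
// address is simply the total static size: the @llvm.amdgcn.groupstaticsize
// intrinsic built below, converted to an LDS pointer.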
3057 MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV)); 3058 LLT S32 = LLT::scalar(32); 3059 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}); 3060 B.buildIntToPtr(DstReg, Sz); 3061 MI.eraseFromParent(); 3062 return true; 3063 } 3064 } 3065 3066 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), 3067 *cast<GlobalVariable>(GV))); 3068 MI.eraseFromParent(); 3069 return true; 3070 } 3071 3072 if (ST.isAmdPalOS() || ST.isMesa3DOS()) { 3073 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI); 3074 MI.eraseFromParent(); 3075 return true; 3076 } 3077 3078 const SITargetLowering *TLI = ST.getTargetLowering(); 3079 3080 if (TLI->shouldEmitFixup(GV)) { 3081 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 3082 MI.eraseFromParent(); 3083 return true; 3084 } 3085 3086 if (TLI->shouldEmitPCReloc(GV)) { 3087 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 3088 MI.eraseFromParent(); 3089 return true; 3090 } 3091 3092 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 3093 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 3094 3095 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty; 3096 MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 3097 MachinePointerInfo::getGOT(MF), 3098 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 3099 MachineMemOperand::MOInvariant, 3100 LoadTy, Align(8)); 3101 3102 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 3103 3104 if (Ty.getSizeInBits() == 32) { 3105 // Truncate if this is a 32-bit constant address. 3106 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 3107 B.buildExtract(DstReg, Load, 0); 3108 } else 3109 B.buildLoad(DstReg, GOTAddr, *GOTMMO); 3110 3111 MI.eraseFromParent(); 3112 return true; 3113 } 3114 3115 static LLT widenToNextPowerOf2(LLT Ty) { 3116 if (Ty.isVector()) 3117 return Ty.changeElementCount( 3118 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 3119 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 3120 } 3121 3122 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 3123 MachineInstr &MI) const { 3124 MachineIRBuilder &B = Helper.MIRBuilder; 3125 MachineRegisterInfo &MRI = *B.getMRI(); 3126 GISelChangeObserver &Observer = Helper.Observer; 3127 3128 Register PtrReg = MI.getOperand(1).getReg(); 3129 LLT PtrTy = MRI.getType(PtrReg); 3130 unsigned AddrSpace = PtrTy.getAddressSpace(); 3131 3132 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 3133 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 3134 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 3135 Observer.changingInstr(MI); 3136 MI.getOperand(1).setReg(Cast.getReg(0)); 3137 Observer.changedInstr(MI); 3138 return true; 3139 } 3140 3141 if (MI.getOpcode() != AMDGPU::G_LOAD) 3142 return false; 3143 3144 Register ValReg = MI.getOperand(0).getReg(); 3145 LLT ValTy = MRI.getType(ValReg); 3146 3147 if (hasBufferRsrcWorkaround(ValTy)) { 3148 Observer.changingInstr(MI); 3149 castBufferRsrcFromV4I32(MI, B, MRI, 0); 3150 Observer.changedInstr(MI); 3151 return true; 3152 } 3153 3154 MachineMemOperand *MMO = *MI.memoperands_begin(); 3155 const unsigned ValSize = ValTy.getSizeInBits(); 3156 const LLT MemTy = MMO->getMemoryType(); 3157 const Align MemAlign = MMO->getAlign(); 3158 const unsigned MemSize = MemTy.getSizeInBits(); 3159 const uint64_t AlignInBits = 8 * MemAlign.value(); 3160 3161 // Widen non-power-of-2 loads to the alignment if needed 3162 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) { 3163 const 
unsigned WideMemSize = PowerOf2Ceil(MemSize); 3164 3165 // This was already the correct extending load result type, so just adjust 3166 // the memory type. 3167 if (WideMemSize == ValSize) { 3168 MachineFunction &MF = B.getMF(); 3169 3170 MachineMemOperand *WideMMO = 3171 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 3172 Observer.changingInstr(MI); 3173 MI.setMemRefs(MF, {WideMMO}); 3174 Observer.changedInstr(MI); 3175 return true; 3176 } 3177 3178 // Don't bother handling edge case that should probably never be produced. 3179 if (ValSize > WideMemSize) 3180 return false; 3181 3182 LLT WideTy = widenToNextPowerOf2(ValTy); 3183 3184 Register WideLoad; 3185 if (!WideTy.isVector()) { 3186 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 3187 B.buildTrunc(ValReg, WideLoad).getReg(0); 3188 } else { 3189 // Extract the subvector. 3190 3191 if (isRegisterType(ST, ValTy)) { 3192 // If this a case where G_EXTRACT is legal, use it. 3193 // (e.g. <3 x s32> -> <4 x s32>) 3194 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 3195 B.buildExtract(ValReg, WideLoad, 0); 3196 } else { 3197 // For cases where the widened type isn't a nice register value, unmerge 3198 // from a widened register (e.g. <3 x s16> -> <4 x s16>) 3199 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 3200 B.buildDeleteTrailingVectorElements(ValReg, WideLoad); 3201 } 3202 } 3203 3204 MI.eraseFromParent(); 3205 return true; 3206 } 3207 3208 return false; 3209 } 3210 3211 bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper, 3212 MachineInstr &MI) const { 3213 MachineIRBuilder &B = Helper.MIRBuilder; 3214 MachineRegisterInfo &MRI = *B.getMRI(); 3215 GISelChangeObserver &Observer = Helper.Observer; 3216 3217 Register DataReg = MI.getOperand(0).getReg(); 3218 LLT DataTy = MRI.getType(DataReg); 3219 3220 if (hasBufferRsrcWorkaround(DataTy)) { 3221 Observer.changingInstr(MI); 3222 castBufferRsrcArgToV4I32(MI, B, 0); 3223 Observer.changedInstr(MI); 3224 return true; 3225 } 3226 return false; 3227 } 3228 3229 bool AMDGPULegalizerInfo::legalizeFMad( 3230 MachineInstr &MI, MachineRegisterInfo &MRI, 3231 MachineIRBuilder &B) const { 3232 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 3233 assert(Ty.isScalar()); 3234 3235 MachineFunction &MF = B.getMF(); 3236 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 3237 3238 // TODO: Always legal with future ftz flag. 3239 // FIXME: Do we need just output? 
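// G_FMAD is only kept legal when denormals are flushed (preserve-sign) for
// the result type, since the hardware mad instructions flush denormal
// results; otherwise it is expanded to fmul + fadd by the generic lowerFMad
// below.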
3240 if (Ty == LLT::float32() && 3241 MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()) 3242 return true; 3243 if (Ty == LLT::float16() && 3244 MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign()) 3245 return true; 3246 3247 MachineIRBuilder HelperBuilder(MI); 3248 GISelObserverWrapper DummyObserver; 3249 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 3250 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 3251 } 3252 3253 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 3254 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 3255 Register DstReg = MI.getOperand(0).getReg(); 3256 Register PtrReg = MI.getOperand(1).getReg(); 3257 Register CmpVal = MI.getOperand(2).getReg(); 3258 Register NewVal = MI.getOperand(3).getReg(); 3259 3260 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 3261 "this should not have been custom lowered"); 3262 3263 LLT ValTy = MRI.getType(CmpVal); 3264 LLT VecTy = LLT::fixed_vector(2, ValTy); 3265 3266 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 3267 3268 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 3269 .addDef(DstReg) 3270 .addUse(PtrReg) 3271 .addUse(PackedVal) 3272 .setMemRefs(MI.memoperands()); 3273 3274 MI.eraseFromParent(); 3275 return true; 3276 } 3277 3278 /// Return true if it's known that \p Src can never be an f32 denormal value. 3279 static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI, 3280 Register Src) { 3281 const MachineInstr *DefMI = MRI.getVRegDef(Src); 3282 switch (DefMI->getOpcode()) { 3283 case TargetOpcode::G_INTRINSIC: { 3284 switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) { 3285 case Intrinsic::amdgcn_frexp_mant: 3286 return true; 3287 default: 3288 break; 3289 } 3290 3291 break; 3292 } 3293 case TargetOpcode::G_FFREXP: { 3294 if (DefMI->getOperand(0).getReg() == Src) 3295 return true; 3296 break; 3297 } 3298 case TargetOpcode::G_FPEXT: { 3299 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16); 3300 } 3301 default: 3302 return false; 3303 } 3304 3305 return false; 3306 } 3307 3308 static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) { 3309 if (Flags & MachineInstr::FmAfn) 3310 return true; 3311 const auto &Options = MF.getTarget().Options; 3312 return Options.UnsafeFPMath || Options.ApproxFuncFPMath; 3313 } 3314 3315 static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, 3316 unsigned Flags) { 3317 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) && 3318 MF.getDenormalMode(APFloat::IEEEsingle()).Input != 3319 DenormalMode::PreserveSign; 3320 } 3321 3322 std::pair<Register, Register> 3323 AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src, 3324 unsigned Flags) const { 3325 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) 3326 return {}; 3327 3328 const LLT F32 = LLT::scalar(32); 3329 auto SmallestNormal = B.buildFConstant( 3330 F32, APFloat::getSmallestNormalized(APFloat::IEEEsingle())); 3331 auto IsLtSmallestNormal = 3332 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal); 3333 3334 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32); 3335 auto One = B.buildFConstant(F32, 1.0); 3336 auto ScaleFactor = 3337 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags); 3338 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags); 3339 3340 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)}; 3341 } 3342 3343 bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI, 3344 
MachineIRBuilder &B) const { 3345 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals. 3346 // If we have to handle denormals, scale up the input and adjust the result. 3347 3348 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0) 3349 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0) 3350 3351 Register Dst = MI.getOperand(0).getReg(); 3352 Register Src = MI.getOperand(1).getReg(); 3353 LLT Ty = B.getMRI()->getType(Dst); 3354 unsigned Flags = MI.getFlags(); 3355 3356 if (Ty == LLT::scalar(16)) { 3357 const LLT F32 = LLT::scalar(32); 3358 // Nothing in half is a denormal when promoted to f32. 3359 auto Ext = B.buildFPExt(F32, Src, Flags); 3360 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32}) 3361 .addUse(Ext.getReg(0)) 3362 .setMIFlags(Flags); 3363 B.buildFPTrunc(Dst, Log2, Flags); 3364 MI.eraseFromParent(); 3365 return true; 3366 } 3367 3368 assert(Ty == LLT::scalar(32)); 3369 3370 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags); 3371 if (!ScaledInput) { 3372 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)}) 3373 .addUse(Src) 3374 .setMIFlags(Flags); 3375 MI.eraseFromParent(); 3376 return true; 3377 } 3378 3379 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}) 3380 .addUse(ScaledInput) 3381 .setMIFlags(Flags); 3382 3383 auto ThirtyTwo = B.buildFConstant(Ty, 32.0); 3384 auto Zero = B.buildFConstant(Ty, 0.0); 3385 auto ResultOffset = 3386 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags); 3387 B.buildFSub(Dst, Log2, ResultOffset, Flags); 3388 3389 MI.eraseFromParent(); 3390 return true; 3391 } 3392 3393 static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y, 3394 Register Z, unsigned Flags) { 3395 auto FMul = B.buildFMul(Ty, X, Y, Flags); 3396 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0); 3397 } 3398 3399 bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI, 3400 MachineIRBuilder &B) const { 3401 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10; 3402 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG); 3403 3404 MachineRegisterInfo &MRI = *B.getMRI(); 3405 Register Dst = MI.getOperand(0).getReg(); 3406 Register X = MI.getOperand(1).getReg(); 3407 unsigned Flags = MI.getFlags(); 3408 const LLT Ty = MRI.getType(X); 3409 MachineFunction &MF = B.getMF(); 3410 3411 const LLT F32 = LLT::scalar(32); 3412 const LLT F16 = LLT::scalar(16); 3413 3414 const AMDGPUTargetMachine &TM = 3415 static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 3416 3417 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) || 3418 TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) { 3419 if (Ty == F16 && !ST.has16BitInsts()) { 3420 Register LogVal = MRI.createGenericVirtualRegister(F32); 3421 auto PromoteSrc = B.buildFPExt(F32, X); 3422 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags); 3423 B.buildFPTrunc(Dst, LogVal); 3424 } else { 3425 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags); 3426 } 3427 3428 MI.eraseFromParent(); 3429 return true; 3430 } 3431 3432 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags); 3433 if (ScaledInput) 3434 X = ScaledInput; 3435 3436 auto Y = 3437 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags); 3438 3439 Register R; 3440 if (ST.hasFastFMAF32()) { 3441 // c+cc are ln(2)/ln(10) to more than 49 bits 3442 const float c_log10 = 0x1.344134p-2f; 3443 const float cc_log10 = 0x1.09f79ep-26f; 3444 3445 // c + cc is ln(2) to more than 49 bits 3446 const float c_log = 0x1.62e42ep-1f; 3447 const float cc_log = 
0x1.efa39ep-25f; 3448 3449 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log); 3450 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log); 3451 3452 R = B.buildFMul(Ty, Y, C, Flags).getReg(0); 3453 auto NegR = B.buildFNeg(Ty, R, Flags); 3454 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags); 3455 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags); 3456 R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0); 3457 } else { 3458 // ch+ct is ln(2)/ln(10) to more than 36 bits 3459 const float ch_log10 = 0x1.344000p-2f; 3460 const float ct_log10 = 0x1.3509f6p-18f; 3461 3462 // ch + ct is ln(2) to more than 36 bits 3463 const float ch_log = 0x1.62e000p-1f; 3464 const float ct_log = 0x1.0bfbe8p-15f; 3465 3466 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log); 3467 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log); 3468 3469 auto MaskConst = B.buildConstant(Ty, 0xfffff000); 3470 auto YH = B.buildAnd(Ty, Y, MaskConst); 3471 auto YT = B.buildFSub(Ty, Y, YH, Flags); 3472 auto YTCT = B.buildFMul(Ty, YT, CT, Flags); 3473 3474 Register Mad0 = 3475 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags); 3476 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags); 3477 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags); 3478 } 3479 3480 const bool IsFiniteOnly = 3481 (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) && 3482 (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath); 3483 3484 if (!IsFiniteOnly) { 3485 // Expand isfinite(x) => fabs(x) < inf 3486 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle())); 3487 auto Fabs = B.buildFAbs(Ty, Y); 3488 auto IsFinite = 3489 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags); 3490 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0); 3491 } 3492 3493 if (ScaledInput) { 3494 auto Zero = B.buildFConstant(Ty, 0.0); 3495 auto ShiftK = 3496 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f); 3497 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags); 3498 B.buildFSub(Dst, R, Shift, Flags); 3499 } else { 3500 B.buildCopy(Dst, R); 3501 } 3502 3503 MI.eraseFromParent(); 3504 return true; 3505 } 3506 3507 bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, 3508 Register Src, bool IsLog10, 3509 unsigned Flags) const { 3510 const double Log2BaseInverted = 3511 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2; 3512 3513 LLT Ty = B.getMRI()->getType(Dst); 3514 3515 if (Ty == LLT::scalar(32)) { 3516 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags); 3517 if (ScaledInput) { 3518 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}) 3519 .addUse(Src) 3520 .setMIFlags(Flags); 3521 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted); 3522 auto Zero = B.buildFConstant(Ty, 0.0); 3523 auto ResultOffset = 3524 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags); 3525 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted); 3526 3527 if (ST.hasFastFMAF32()) 3528 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags); 3529 else { 3530 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags); 3531 B.buildFAdd(Dst, Mul, ResultOffset, Flags); 3532 } 3533 3534 return true; 3535 } 3536 } 3537 3538 auto Log2Operand = Ty == LLT::scalar(16) 3539 ? 
B.buildFLog2(Ty, Src, Flags) 3540 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}) 3541 .addUse(Src) 3542 .setMIFlags(Flags); 3543 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 3544 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 3545 return true; 3546 } 3547 3548 bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI, 3549 MachineIRBuilder &B) const { 3550 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals. 3551 // If we have to handle denormals, scale up the input and adjust the result. 3552 3553 Register Dst = MI.getOperand(0).getReg(); 3554 Register Src = MI.getOperand(1).getReg(); 3555 unsigned Flags = MI.getFlags(); 3556 LLT Ty = B.getMRI()->getType(Dst); 3557 const LLT F16 = LLT::scalar(16); 3558 const LLT F32 = LLT::scalar(32); 3559 3560 if (Ty == F16) { 3561 // Nothing in half is a denormal when promoted to f32. 3562 auto Ext = B.buildFPExt(F32, Src, Flags); 3563 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32}) 3564 .addUse(Ext.getReg(0)) 3565 .setMIFlags(Flags); 3566 B.buildFPTrunc(Dst, Log2, Flags); 3567 MI.eraseFromParent(); 3568 return true; 3569 } 3570 3571 assert(Ty == F32); 3572 3573 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) { 3574 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst}) 3575 .addUse(Src) 3576 .setMIFlags(Flags); 3577 MI.eraseFromParent(); 3578 return true; 3579 } 3580 3581 // bool needs_scaling = x < -0x1.f80000p+6f; 3582 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f); 3583 3584 // -nextafter(128.0, -1) 3585 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f); 3586 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, 3587 RangeCheckConst, Flags); 3588 3589 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f); 3590 auto Zero = B.buildFConstant(Ty, 0.0); 3591 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags); 3592 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags); 3593 3594 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}) 3595 .addUse(AddInput.getReg(0)) 3596 .setMIFlags(Flags); 3597 3598 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f); 3599 auto One = B.buildFConstant(Ty, 1.0); 3600 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags); 3601 B.buildFMul(Dst, Exp2, ResultScale, Flags); 3602 MI.eraseFromParent(); 3603 return true; 3604 } 3605 3606 bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, 3607 Register X, unsigned Flags) const { 3608 LLT Ty = B.getMRI()->getType(Dst); 3609 LLT F32 = LLT::scalar(32); 3610 3611 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) { 3612 auto Log2E = B.buildFConstant(Ty, numbers::log2e); 3613 auto Mul = B.buildFMul(Ty, X, Log2E, Flags); 3614 3615 if (Ty == F32) { 3616 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst}) 3617 .addUse(Mul.getReg(0)) 3618 .setMIFlags(Flags); 3619 } else { 3620 B.buildFExp2(Dst, Mul.getReg(0), Flags); 3621 } 3622 3623 return true; 3624 } 3625 3626 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f); 3627 auto NeedsScaling = 3628 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags); 3629 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f); 3630 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags); 3631 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags); 3632 3633 auto Log2E = B.buildFConstant(Ty, numbers::log2e); 3634 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags); 3635 3636 auto Exp2 = 
B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}) 3637 .addUse(ExpInput.getReg(0)) 3638 .setMIFlags(Flags); 3639 3640 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f); 3641 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags); 3642 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags); 3643 return true; 3644 } 3645 3646 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 3647 MachineIRBuilder &B) const { 3648 Register Dst = MI.getOperand(0).getReg(); 3649 Register X = MI.getOperand(1).getReg(); 3650 const unsigned Flags = MI.getFlags(); 3651 MachineFunction &MF = B.getMF(); 3652 MachineRegisterInfo &MRI = *B.getMRI(); 3653 LLT Ty = MRI.getType(Dst); 3654 const LLT F16 = LLT::scalar(16); 3655 const LLT F32 = LLT::scalar(32); 3656 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10; 3657 3658 if (Ty == F16) { 3659 // v_exp_f16 (fmul x, log2e) 3660 if (allowApproxFunc(MF, Flags)) { 3661 // TODO: Does this really require fast? 3662 legalizeFExpUnsafe(B, Dst, X, Flags); 3663 MI.eraseFromParent(); 3664 return true; 3665 } 3666 3667 // exp(f16 x) -> 3668 // fptrunc (v_exp_f32 (fmul (fpext x), log2e)) 3669 3670 // Nothing in half is a denormal when promoted to f32. 3671 auto Ext = B.buildFPExt(F32, X, Flags); 3672 Register Lowered = MRI.createGenericVirtualRegister(F32); 3673 legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags); 3674 B.buildFPTrunc(Dst, Lowered, Flags); 3675 MI.eraseFromParent(); 3676 return true; 3677 } 3678 3679 assert(Ty == F32); 3680 3681 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying 3682 // library behavior. Also, is known-not-daz source sufficient? 3683 if (allowApproxFunc(MF, Flags)) { 3684 legalizeFExpUnsafe(B, Dst, X, Flags); 3685 MI.eraseFromParent(); 3686 return true; 3687 } 3688 3689 // Algorithm: 3690 // 3691 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64) 3692 // 3693 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer 3694 // n = 64*m + j, 0 <= j < 64 3695 // 3696 // e^x = 2^((64*m + j + f)/64) 3697 // = (2^m) * (2^(j/64)) * 2^(f/64) 3698 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64)) 3699 // 3700 // f = x*(64/ln(2)) - n 3701 // r = f*(ln(2)/64) = x - n*(ln(2)/64) 3702 // 3703 // e^x = (2^m) * (2^(j/64)) * e^r 3704 // 3705 // (2^(j/64)) is precomputed 3706 // 3707 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! 3708 // e^r = 1 + q 3709 // 3710 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! 3711 // 3712 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) ) 3713 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract; 3714 Register PH, PL; 3715 3716 if (ST.hasFastFMAF32()) { 3717 const float c_exp = numbers::log2ef; 3718 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits 3719 const float c_exp10 = 0x1.a934f0p+1f; 3720 const float cc_exp10 = 0x1.2f346ep-24f; 3721 3722 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp); 3723 PH = B.buildFMul(Ty, X, C, Flags).getReg(0); 3724 auto NegPH = B.buildFNeg(Ty, PH, Flags); 3725 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags); 3726 3727 auto CC = B.buildFConstant(Ty, IsExp10 ? 
cc_exp10 : cc_exp); 3728 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0); 3729 } else { 3730 const float ch_exp = 0x1.714000p+0f; 3731 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits 3732 3733 const float ch_exp10 = 0x1.a92000p+1f; 3734 const float cl_exp10 = 0x1.4f0978p-11f; 3735 3736 auto MaskConst = B.buildConstant(Ty, 0xfffff000); 3737 auto XH = B.buildAnd(Ty, X, MaskConst); 3738 auto XL = B.buildFSub(Ty, X, XH, Flags); 3739 3740 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp); 3741 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0); 3742 3743 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp); 3744 auto XLCL = B.buildFMul(Ty, XL, CL, Flags); 3745 3746 Register Mad0 = 3747 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags); 3748 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags); 3749 } 3750 3751 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags); 3752 3753 // It is unsafe to contract this fsub into the PH multiply. 3754 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract); 3755 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags); 3756 auto IntE = B.buildFPTOSI(LLT::scalar(32), E); 3757 3758 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}) 3759 .addUse(A.getReg(0)) 3760 .setMIFlags(Flags); 3761 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags); 3762 3763 auto UnderflowCheckConst = 3764 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f); 3765 auto Zero = B.buildFConstant(Ty, 0.0); 3766 auto Underflow = 3767 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst); 3768 3769 R = B.buildSelect(Ty, Underflow, Zero, R); 3770 3771 const auto &Options = MF.getTarget().Options; 3772 3773 if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) { 3774 auto OverflowCheckConst = 3775 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f); 3776 3777 auto Overflow = 3778 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst); 3779 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle())); 3780 R = B.buildSelect(Ty, Overflow, Inf, R, Flags); 3781 } 3782 3783 B.buildCopy(Dst, R); 3784 MI.eraseFromParent(); 3785 return true; 3786 } 3787 3788 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 3789 MachineIRBuilder &B) const { 3790 Register Dst = MI.getOperand(0).getReg(); 3791 Register Src0 = MI.getOperand(1).getReg(); 3792 Register Src1 = MI.getOperand(2).getReg(); 3793 unsigned Flags = MI.getFlags(); 3794 LLT Ty = B.getMRI()->getType(Dst); 3795 const LLT F16 = LLT::float16(); 3796 const LLT F32 = LLT::float32(); 3797 3798 if (Ty == F32) { 3799 auto Log = B.buildFLog2(F32, Src0, Flags); 3800 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32}) 3801 .addUse(Log.getReg(0)) 3802 .addUse(Src1) 3803 .setMIFlags(Flags); 3804 B.buildFExp2(Dst, Mul, Flags); 3805 } else if (Ty == F16) { 3806 // There's no f16 fmul_legacy, so we need to convert for it. 3807 auto Log = B.buildFLog2(F16, Src0, Flags); 3808 auto Ext0 = B.buildFPExt(F32, Log, Flags); 3809 auto Ext1 = B.buildFPExt(F32, Src1, Flags); 3810 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32}) 3811 .addUse(Ext0.getReg(0)) 3812 .addUse(Ext1.getReg(0)) 3813 .setMIFlags(Flags); 3814 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags); 3815 } else 3816 return false; 3817 3818 MI.eraseFromParent(); 3819 return true; 3820 } 3821 3822 // Find a source register, ignoring any possible source modifiers. 
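// For example (illustrative register names), given
//   %a:_(s64) = G_FABS %x
//   %b:_(s64) = G_FNEG %a
// both %a and %b map back to %x.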
3823 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 3824 Register ModSrc = OrigSrc; 3825 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 3826 ModSrc = SrcFNeg->getOperand(1).getReg(); 3827 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 3828 ModSrc = SrcFAbs->getOperand(1).getReg(); 3829 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 3830 ModSrc = SrcFAbs->getOperand(1).getReg(); 3831 return ModSrc; 3832 } 3833 3834 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 3835 MachineRegisterInfo &MRI, 3836 MachineIRBuilder &B) const { 3837 3838 const LLT S1 = LLT::scalar(1); 3839 const LLT F64 = LLT::float64(); 3840 Register Dst = MI.getOperand(0).getReg(); 3841 Register OrigSrc = MI.getOperand(1).getReg(); 3842 unsigned Flags = MI.getFlags(); 3843 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 && 3844 "this should not have been custom lowered"); 3845 3846 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 3847 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 3848 // efficient way to implement it is using V_FRACT_F64. The workaround for the 3849 // V_FRACT bug is: 3850 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 3851 // 3852 // Convert floor(x) to (x - fract(x)) 3853 3854 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64}) 3855 .addUse(OrigSrc) 3856 .setMIFlags(Flags); 3857 3858 // Give source modifier matching some assistance before obscuring a foldable 3859 // pattern. 3860 3861 // TODO: We can avoid the neg on the fract? The input sign to fract 3862 // shouldn't matter? 3863 Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 3864 3865 auto Const = 3866 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff)); 3867 3868 Register Min = MRI.createGenericVirtualRegister(F64); 3869 3870 // We don't need to concern ourselves with the snan handling difference, so 3871 // use the one which will directly select. 3872 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3873 if (MFI->getMode().IEEE) 3874 B.buildFMinNumIEEE(Min, Fract, Const, Flags); 3875 else 3876 B.buildFMinNum(Min, Fract, Const, Flags); 3877 3878 Register CorrectedFract = Min; 3879 if (!MI.getFlag(MachineInstr::FmNoNans)) { 3880 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 3881 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0); 3882 } 3883 3884 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags); 3885 B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 3886 3887 MI.eraseFromParent(); 3888 return true; 3889 } 3890 3891 // Turn an illegal packed v2s16 build vector into bit operations. 3892 // TODO: This should probably be a bitcast action in LegalizerHelper. 
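// Roughly, an illegal
//   %v:_(<2 x s16>) = G_BUILD_VECTOR %lo:_(s16), %hi:_(s16)
// becomes (illustrative register names)
//   %m:_(s32) = G_MERGE_VALUES %lo, %hi
//   %v:_(<2 x s16>) = G_BITCAST %m
// with the sources first truncated from s32 to s16 for G_BUILD_VECTOR_TRUNC.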
3893 bool AMDGPULegalizerInfo::legalizeBuildVector( 3894 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 3895 Register Dst = MI.getOperand(0).getReg(); 3896 const LLT S32 = LLT::scalar(32); 3897 const LLT S16 = LLT::scalar(16); 3898 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 3899 3900 Register Src0 = MI.getOperand(1).getReg(); 3901 Register Src1 = MI.getOperand(2).getReg(); 3902 3903 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) { 3904 assert(MRI.getType(Src0) == S32); 3905 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0); 3906 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0); 3907 } 3908 3909 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1}); 3910 B.buildBitcast(Dst, Merge); 3911 3912 MI.eraseFromParent(); 3913 return true; 3914 } 3915 3916 // Build a big integer multiply or multiply-add using MAD_64_32 instructions. 3917 // 3918 // Source and accumulation registers must all be 32-bits. 3919 // 3920 // TODO: When the multiply is uniform, we should produce a code sequence 3921 // that is better suited to instruction selection on the SALU. Instead of 3922 // the outer loop going over parts of the result, the outer loop should go 3923 // over parts of one of the factors. This should result in instruction 3924 // selection that makes full use of S_ADDC_U32 instructions. 3925 void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper, 3926 MutableArrayRef<Register> Accum, 3927 ArrayRef<Register> Src0, 3928 ArrayRef<Register> Src1, 3929 bool UsePartialMad64_32, 3930 bool SeparateOddAlignedProducts) const { 3931 // Use (possibly empty) vectors of S1 registers to represent the set of 3932 // carries from one pair of positions to the next. 3933 using Carry = SmallVector<Register, 2>; 3934 3935 MachineIRBuilder &B = Helper.MIRBuilder; 3936 GISelValueTracking &VT = *Helper.getValueTracking(); 3937 3938 const LLT S1 = LLT::scalar(1); 3939 const LLT S32 = LLT::scalar(32); 3940 const LLT S64 = LLT::scalar(64); 3941 3942 Register Zero32; 3943 Register Zero64; 3944 3945 auto getZero32 = [&]() -> Register { 3946 if (!Zero32) 3947 Zero32 = B.buildConstant(S32, 0).getReg(0); 3948 return Zero32; 3949 }; 3950 auto getZero64 = [&]() -> Register { 3951 if (!Zero64) 3952 Zero64 = B.buildConstant(S64, 0).getReg(0); 3953 return Zero64; 3954 }; 3955 3956 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros; 3957 for (unsigned i = 0; i < Src0.size(); ++i) { 3958 Src0KnownZeros.push_back(VT.getKnownBits(Src0[i]).isZero()); 3959 Src1KnownZeros.push_back(VT.getKnownBits(Src1[i]).isZero()); 3960 } 3961 3962 // Merge the given carries into the 32-bit LocalAccum, which is modified 3963 // in-place. 3964 // 3965 // Returns the carry-out, which is a single S1 register or null. 
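  //
  // Roughly: a single carry with no accumulator just becomes a zero-extend;
  // otherwise the carries are folded in with a chain of G_UADDE adds, and the
  // carry-out of the final add is returned (or a null Register when the sum
  // is known not to overflow).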
3966 auto mergeCarry = 3967 [&](Register &LocalAccum, const Carry &CarryIn) -> Register { 3968 if (CarryIn.empty()) 3969 return Register(); 3970 3971 bool HaveCarryOut = true; 3972 Register CarryAccum; 3973 if (CarryIn.size() == 1) { 3974 if (!LocalAccum) { 3975 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); 3976 return Register(); 3977 } 3978 3979 CarryAccum = getZero32(); 3980 } else { 3981 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); 3982 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) { 3983 CarryAccum = 3984 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i]) 3985 .getReg(0); 3986 } 3987 3988 if (!LocalAccum) { 3989 LocalAccum = getZero32(); 3990 HaveCarryOut = false; 3991 } 3992 } 3993 3994 auto Add = 3995 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back()); 3996 LocalAccum = Add.getReg(0); 3997 return HaveCarryOut ? Add.getReg(1) : Register(); 3998 }; 3999 4000 // Build a multiply-add chain to compute 4001 // 4002 // LocalAccum + (partial products at DstIndex) 4003 // + (opportunistic subset of CarryIn) 4004 // 4005 // LocalAccum is an array of one or two 32-bit registers that are updated 4006 // in-place. The incoming registers may be null. 4007 // 4008 // In some edge cases, carry-ins can be consumed "for free". In that case, 4009 // the consumed carry bits are removed from CarryIn in-place. 4010 auto buildMadChain = 4011 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn) 4012 -> Carry { 4013 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) || 4014 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1)); 4015 4016 Carry CarryOut; 4017 unsigned j0 = 0; 4018 4019 // Use plain 32-bit multiplication for the most significant part of the 4020 // result by default. 4021 if (LocalAccum.size() == 1 && 4022 (!UsePartialMad64_32 || !CarryIn.empty())) { 4023 do { 4024 // Skip multiplication if one of the operands is 0 4025 unsigned j1 = DstIndex - j0; 4026 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) { 4027 ++j0; 4028 continue; 4029 } 4030 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]); 4031 if (!LocalAccum[0] || VT.getKnownBits(LocalAccum[0]).isZero()) { 4032 LocalAccum[0] = Mul.getReg(0); 4033 } else { 4034 if (CarryIn.empty()) { 4035 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0); 4036 } else { 4037 LocalAccum[0] = 4038 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back()) 4039 .getReg(0); 4040 CarryIn.pop_back(); 4041 } 4042 } 4043 ++j0; 4044 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty())); 4045 } 4046 4047 // Build full 64-bit multiplies. 
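    // Each iteration below is effectively (with j1 = DstIndex - j0):
    //   {Tmp:_(s64), Carry:_(s1)} =
    //       G_AMDGPU_MAD_U64_U32 Src0[j0]:_(s32), Src1[j1]:_(s32), Tmp:_(s64)
    // i.e. Tmp = zext(Src0[j0]) * zext(Src1[j1]) + Tmp, where the carry-out
    // of the 64-bit accumulate is only recorded when the existing accumulator
    // may already occupy the full 64 bits.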
4048 if (j0 <= DstIndex) { 4049 bool HaveSmallAccum = false; 4050 Register Tmp; 4051 4052 if (LocalAccum[0]) { 4053 if (LocalAccum.size() == 1) { 4054 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0); 4055 HaveSmallAccum = true; 4056 } else if (LocalAccum[1]) { 4057 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0); 4058 HaveSmallAccum = false; 4059 } else { 4060 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0); 4061 HaveSmallAccum = true; 4062 } 4063 } else { 4064 assert(LocalAccum.size() == 1 || !LocalAccum[1]); 4065 Tmp = getZero64(); 4066 HaveSmallAccum = true; 4067 } 4068 4069 do { 4070 unsigned j1 = DstIndex - j0; 4071 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) { 4072 ++j0; 4073 continue; 4074 } 4075 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1}, 4076 {Src0[j0], Src1[j1], Tmp}); 4077 Tmp = Mad.getReg(0); 4078 if (!HaveSmallAccum) 4079 CarryOut.push_back(Mad.getReg(1)); 4080 HaveSmallAccum = false; 4081 4082 ++j0; 4083 } while (j0 <= DstIndex); 4084 4085 auto Unmerge = B.buildUnmerge(S32, Tmp); 4086 LocalAccum[0] = Unmerge.getReg(0); 4087 if (LocalAccum.size() > 1) 4088 LocalAccum[1] = Unmerge.getReg(1); 4089 } 4090 4091 return CarryOut; 4092 }; 4093 4094 // Outer multiply loop, iterating over destination parts from least 4095 // significant to most significant parts. 4096 // 4097 // The columns of the following diagram correspond to the destination parts 4098 // affected by one iteration of the outer loop (ignoring boundary 4099 // conditions). 4100 // 4101 // Dest index relative to 2 * i: 1 0 -1 4102 // ------ 4103 // Carries from previous iteration: e o 4104 // Even-aligned partial product sum: E E . 4105 // Odd-aligned partial product sum: O O 4106 // 4107 // 'o' is OddCarry, 'e' is EvenCarry. 4108 // EE and OO are computed from partial products via buildMadChain and use 4109 // accumulation where possible and appropriate. 4110 // 4111 Register SeparateOddCarry; 4112 Carry EvenCarry; 4113 Carry OddCarry; 4114 4115 for (unsigned i = 0; i <= Accum.size() / 2; ++i) { 4116 Carry OddCarryIn = std::move(OddCarry); 4117 Carry EvenCarryIn = std::move(EvenCarry); 4118 OddCarry.clear(); 4119 EvenCarry.clear(); 4120 4121 // Partial products at offset 2 * i. 4122 if (2 * i < Accum.size()) { 4123 auto LocalAccum = Accum.drop_front(2 * i).take_front(2); 4124 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn); 4125 } 4126 4127 // Partial products at offset 2 * i - 1. 4128 if (i > 0) { 4129 if (!SeparateOddAlignedProducts) { 4130 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2); 4131 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn); 4132 } else { 4133 bool IsHighest = 2 * i >= Accum.size(); 4134 Register SeparateOddOut[2]; 4135 auto LocalAccum = MutableArrayRef(SeparateOddOut) 4136 .take_front(IsHighest ? 
1 : 2); 4137 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn); 4138 4139 MachineInstr *Lo; 4140 4141 if (i == 1) { 4142 if (!IsHighest) 4143 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]); 4144 else 4145 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]); 4146 } else { 4147 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0], 4148 SeparateOddCarry); 4149 } 4150 Accum[2 * i - 1] = Lo->getOperand(0).getReg(); 4151 4152 if (!IsHighest) { 4153 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1], 4154 Lo->getOperand(1).getReg()); 4155 Accum[2 * i] = Hi.getReg(0); 4156 SeparateOddCarry = Hi.getReg(1); 4157 } 4158 } 4159 } 4160 4161 // Add in the carries from the previous iteration 4162 if (i > 0) { 4163 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn)) 4164 EvenCarryIn.push_back(CarryOut); 4165 4166 if (2 * i < Accum.size()) { 4167 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn)) 4168 OddCarry.push_back(CarryOut); 4169 } 4170 } 4171 } 4172 } 4173 4174 // Custom narrowing of wide multiplies using wide multiply-add instructions. 4175 // 4176 // TODO: If the multiply is followed by an addition, we should attempt to 4177 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities. 4178 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper, 4179 MachineInstr &MI) const { 4180 assert(ST.hasMad64_32()); 4181 assert(MI.getOpcode() == TargetOpcode::G_MUL); 4182 4183 MachineIRBuilder &B = Helper.MIRBuilder; 4184 MachineRegisterInfo &MRI = *B.getMRI(); 4185 4186 Register DstReg = MI.getOperand(0).getReg(); 4187 Register Src0 = MI.getOperand(1).getReg(); 4188 Register Src1 = MI.getOperand(2).getReg(); 4189 4190 LLT Ty = MRI.getType(DstReg); 4191 assert(Ty.isScalar()); 4192 4193 unsigned Size = Ty.getSizeInBits(); 4194 unsigned NumParts = Size / 32; 4195 assert((Size % 32) == 0); 4196 assert(NumParts >= 2); 4197 4198 // Whether to use MAD_64_32 for partial products whose high half is 4199 // discarded. This avoids some ADD instructions but risks false dependency 4200 // stalls on some subtargets in some cases. 4201 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10; 4202 4203 // Whether to compute odd-aligned partial products separately. This is 4204 // advisable on subtargets where the accumulator of MAD_64_32 must be placed 4205 // in an even-aligned VGPR. 4206 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops(); 4207 4208 LLT S32 = LLT::scalar(32); 4209 SmallVector<Register, 2> Src0Parts, Src1Parts; 4210 for (unsigned i = 0; i < NumParts; ++i) { 4211 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32)); 4212 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32)); 4213 } 4214 B.buildUnmerge(Src0Parts, Src0); 4215 B.buildUnmerge(Src1Parts, Src1); 4216 4217 SmallVector<Register, 2> AccumRegs(NumParts); 4218 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32, 4219 SeparateOddAlignedProducts); 4220 4221 B.buildMergeLikeInstr(DstReg, AccumRegs); 4222 MI.eraseFromParent(); 4223 return true; 4224 } 4225 4226 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to 4227 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input 4228 // case with a single min instruction instead of a compare+select. 
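// For a 32-bit G_CTLZ this produces, roughly:
//   %ffbh:_(s32) = G_AMDGPU_FFBH_U32 %src
//   %dst:_(s32)  = G_UMIN %ffbh, 32
// FFBH/FFBL return -1 for a zero input, so the unsigned min clamps that case
// to the source bit width.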
4229 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, 4230 MachineRegisterInfo &MRI, 4231 MachineIRBuilder &B) const { 4232 Register Dst = MI.getOperand(0).getReg(); 4233 Register Src = MI.getOperand(1).getReg(); 4234 LLT DstTy = MRI.getType(Dst); 4235 LLT SrcTy = MRI.getType(Src); 4236 4237 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ 4238 ? AMDGPU::G_AMDGPU_FFBH_U32 4239 : AMDGPU::G_AMDGPU_FFBL_B32; 4240 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); 4241 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); 4242 4243 MI.eraseFromParent(); 4244 return true; 4245 } 4246 4247 bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, 4248 MachineRegisterInfo &MRI, 4249 MachineIRBuilder &B) const { 4250 Register Dst = MI.getOperand(0).getReg(); 4251 Register Src = MI.getOperand(1).getReg(); 4252 LLT SrcTy = MRI.getType(Src); 4253 TypeSize NumBits = SrcTy.getSizeInBits(); 4254 4255 assert(NumBits < 32u); 4256 4257 auto ShiftAmt = B.buildConstant(S32, 32u - NumBits); 4258 auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u); 4259 auto Shift = B.buildShl(S32, Extend, ShiftAmt); 4260 auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift}); 4261 B.buildTrunc(Dst, Ctlz); 4262 MI.eraseFromParent(); 4263 return true; 4264 } 4265 4266 // Check that this is a G_XOR x, -1 4267 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 4268 if (MI.getOpcode() != TargetOpcode::G_XOR) 4269 return false; 4270 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 4271 return ConstVal == -1; 4272 } 4273 4274 // Return the use branch instruction, otherwise null if the usage is invalid. 4275 static MachineInstr * 4276 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 4277 MachineBasicBlock *&UncondBrTarget, bool &Negated) { 4278 Register CondDef = MI.getOperand(0).getReg(); 4279 if (!MRI.hasOneNonDBGUse(CondDef)) 4280 return nullptr; 4281 4282 MachineBasicBlock *Parent = MI.getParent(); 4283 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 4284 4285 if (isNot(MRI, *UseMI)) { 4286 Register NegatedCond = UseMI->getOperand(0).getReg(); 4287 if (!MRI.hasOneNonDBGUse(NegatedCond)) 4288 return nullptr; 4289 4290 // We're deleting the def of this value, so we need to remove it. 4291 eraseInstr(*UseMI, MRI); 4292 4293 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 4294 Negated = true; 4295 } 4296 4297 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 4298 return nullptr; 4299 4300 // Make sure the cond br is followed by a G_BR, or is the last instruction. 4301 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 4302 if (Next == Parent->end()) { 4303 MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 4304 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 
4305 return nullptr; 4306 UncondBrTarget = &*NextMBB; 4307 } else { 4308 if (Next->getOpcode() != AMDGPU::G_BR) 4309 return nullptr; 4310 Br = &*Next; 4311 UncondBrTarget = Br->getOperand(0).getMBB(); 4312 } 4313 4314 return UseMI; 4315 } 4316 4317 void AMDGPULegalizerInfo::buildLoadInputValue(Register DstReg, 4318 MachineIRBuilder &B, 4319 const ArgDescriptor *Arg, 4320 const TargetRegisterClass *ArgRC, 4321 LLT ArgTy) const { 4322 MCRegister SrcReg = Arg->getRegister(); 4323 assert(SrcReg.isPhysical() && "Physical register expected"); 4324 assert(DstReg.isVirtual() && "Virtual register expected"); 4325 4326 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, 4327 *ArgRC, B.getDebugLoc(), ArgTy); 4328 if (Arg->isMasked()) { 4329 // TODO: Should we try to emit this once in the entry block? 4330 const LLT S32 = LLT::scalar(32); 4331 const unsigned Mask = Arg->getMask(); 4332 const unsigned Shift = llvm::countr_zero<unsigned>(Mask); 4333 4334 Register AndMaskSrc = LiveIn; 4335 4336 // TODO: Avoid clearing the high bits if we know workitem id y/z are always 4337 // 0. 4338 if (Shift != 0) { 4339 auto ShiftAmt = B.buildConstant(S32, Shift); 4340 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 4341 } 4342 4343 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 4344 } else { 4345 B.buildCopy(DstReg, LiveIn); 4346 } 4347 } 4348 4349 bool AMDGPULegalizerInfo::loadInputValue( 4350 Register DstReg, MachineIRBuilder &B, 4351 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 4352 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4353 const ArgDescriptor *Arg = nullptr; 4354 const TargetRegisterClass *ArgRC; 4355 LLT ArgTy; 4356 4357 CallingConv::ID CC = B.getMF().getFunction().getCallingConv(); 4358 const ArgDescriptor WorkGroupIDX = 4359 ArgDescriptor::createRegister(AMDGPU::TTMP9); 4360 // If GridZ is not programmed in an entry function then the hardware will set 4361 // it to all zeros, so there is no need to mask the GridY value in the low 4362 // order bits. 4363 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister( 4364 AMDGPU::TTMP7, 4365 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu); 4366 const ArgDescriptor WorkGroupIDZ = 4367 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u); 4368 if (ST.hasArchitectedSGPRs() && 4369 (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) { 4370 switch (ArgType) { 4371 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X: 4372 Arg = &WorkGroupIDX; 4373 ArgRC = &AMDGPU::SReg_32RegClass; 4374 ArgTy = LLT::scalar(32); 4375 break; 4376 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y: 4377 Arg = &WorkGroupIDY; 4378 ArgRC = &AMDGPU::SReg_32RegClass; 4379 ArgTy = LLT::scalar(32); 4380 break; 4381 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z: 4382 Arg = &WorkGroupIDZ; 4383 ArgRC = &AMDGPU::SReg_32RegClass; 4384 ArgTy = LLT::scalar(32); 4385 break; 4386 default: 4387 break; 4388 } 4389 } 4390 4391 if (!Arg) 4392 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 4393 4394 if (!Arg) { 4395 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { 4396 // The intrinsic may appear when we have a 0 sized kernarg segment, in which 4397 // case the pointer argument may be missing and we use null. 4398 B.buildConstant(DstReg, 0); 4399 return true; 4400 } 4401 4402 // It's undefined behavior if a function marked with the amdgpu-no-* 4403 // attributes uses the corresponding intrinsic. 
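    // (For example, "amdgpu-no-workgroup-id-x" combined with a query of the X
    // workgroup ID.) Returning an implicit_def here is therefore acceptable.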
4404 B.buildUndef(DstReg); 4405 return true; 4406 } 4407 4408 if (!Arg->isRegister() || !Arg->getRegister().isValid()) 4409 return false; // TODO: Handle these 4410 buildLoadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 4411 return true; 4412 } 4413 4414 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 4415 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 4416 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 4417 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 4418 return false; 4419 4420 MI.eraseFromParent(); 4421 return true; 4422 } 4423 4424 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, 4425 int64_t C) { 4426 B.buildConstant(MI.getOperand(0).getReg(), C); 4427 MI.eraseFromParent(); 4428 return true; 4429 } 4430 4431 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic( 4432 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 4433 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 4434 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim); 4435 if (MaxID == 0) 4436 return replaceWithConstant(B, MI, 0); 4437 4438 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4439 const ArgDescriptor *Arg; 4440 const TargetRegisterClass *ArgRC; 4441 LLT ArgTy; 4442 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 4443 4444 Register DstReg = MI.getOperand(0).getReg(); 4445 if (!Arg) { 4446 // It's undefined behavior if a function marked with the amdgpu-no-* 4447 // attributes uses the corresponding intrinsic. 4448 B.buildUndef(DstReg); 4449 MI.eraseFromParent(); 4450 return true; 4451 } 4452 4453 if (Arg->isMasked()) { 4454 // Don't bother inserting AssertZext for packed IDs since we're emitting the 4455 // masking operations anyway. 4456 // 4457 // TODO: We could assert the top bit is 0 for the source copy. 4458 if (!loadInputValue(DstReg, B, ArgType)) 4459 return false; 4460 } else { 4461 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); 4462 if (!loadInputValue(TmpReg, B, ArgType)) 4463 return false; 4464 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID)); 4465 } 4466 4467 MI.eraseFromParent(); 4468 return true; 4469 } 4470 4471 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B, 4472 int64_t Offset) const { 4473 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 4474 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy); 4475 4476 // TODO: If we passed in the base kernel offset we could have a better 4477 // alignment than 4, but we don't really need it. 4478 if (!loadInputValue(KernArgReg, B, 4479 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 4480 llvm_unreachable("failed to find kernarg segment ptr"); 4481 4482 auto COffset = B.buildConstant(LLT::scalar(64), Offset); 4483 // TODO: Should get nuw 4484 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0); 4485 } 4486 4487 /// Legalize a value that's loaded from kernel arguments. This is only used by 4488 /// legacy intrinsics. 
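/// The parameter is fetched with an invariant, dereferenceable, 4-byte
/// aligned load from the kernarg segment pointer plus \p Offset.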
4489 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI, 4490 MachineIRBuilder &B, 4491 uint64_t Offset, 4492 Align Alignment) const { 4493 Register DstReg = MI.getOperand(0).getReg(); 4494 4495 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) && 4496 "unexpected kernarg parameter type"); 4497 4498 Register Ptr = getKernargParameterPtr(B, Offset); 4499 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 4500 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4), 4501 MachineMemOperand::MODereferenceable | 4502 MachineMemOperand::MOInvariant); 4503 MI.eraseFromParent(); 4504 return true; 4505 } 4506 4507 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 4508 MachineRegisterInfo &MRI, 4509 MachineIRBuilder &B) const { 4510 Register Dst = MI.getOperand(0).getReg(); 4511 LLT DstTy = MRI.getType(Dst); 4512 LLT S16 = LLT::scalar(16); 4513 LLT S32 = LLT::scalar(32); 4514 LLT S64 = LLT::scalar(64); 4515 4516 if (DstTy == S16) 4517 return legalizeFDIV16(MI, MRI, B); 4518 if (DstTy == S32) 4519 return legalizeFDIV32(MI, MRI, B); 4520 if (DstTy == S64) 4521 return legalizeFDIV64(MI, MRI, B); 4522 4523 return false; 4524 } 4525 4526 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 4527 Register DstDivReg, 4528 Register DstRemReg, 4529 Register X, 4530 Register Y) const { 4531 const LLT S1 = LLT::scalar(1); 4532 const LLT S32 = LLT::scalar(32); 4533 4534 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 4535 // algorithm used here. 4536 4537 // Initial estimate of inv(y). 4538 auto FloatY = B.buildUITOFP(S32, Y); 4539 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 4540 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe)); 4541 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 4542 auto Z = B.buildFPTOUI(S32, ScaledY); 4543 4544 // One round of UNR. 4545 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 4546 auto NegYZ = B.buildMul(S32, NegY, Z); 4547 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 4548 4549 // Quotient/remainder estimate. 4550 auto Q = B.buildUMulH(S32, X, Z); 4551 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 4552 4553 // First quotient/remainder refinement. 4554 auto One = B.buildConstant(S32, 1); 4555 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 4556 if (DstDivReg) 4557 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 4558 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 4559 4560 // Second quotient/remainder refinement. 
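  // As with the first refinement, this is effectively
  //   if (R >= Y) { Q += 1; R -= Y; }
  // The initial estimate may undershoot the true quotient (by up to two), so
  // the two conditional corrections make the result exact.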
4561 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 4562 if (DstDivReg) 4563 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 4564 4565 if (DstRemReg) 4566 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 4567 } 4568 4569 // Build integer reciprocal sequence around V_RCP_IFLAG_F32 4570 // 4571 // Return lo, hi of result 4572 // 4573 // %cvt.lo = G_UITOFP Val.lo 4574 // %cvt.hi = G_UITOFP Val.hi 4575 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 4576 // %rcp = G_AMDGPU_RCP_IFLAG %mad 4577 // %mul1 = G_FMUL %rcp, 0x5f7ffffc 4578 // %mul2 = G_FMUL %mul1, 2**(-32) 4579 // %trunc = G_INTRINSIC_TRUNC %mul2 4580 // %mad2 = G_FMAD %trunc, -(2**32), %mul1 4581 // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 4582 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 4583 Register Val) { 4584 const LLT S32 = LLT::scalar(32); 4585 auto Unmerge = B.buildUnmerge(S32, Val); 4586 4587 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 4588 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 4589 4590 auto Mad = B.buildFMAD( 4591 S32, CvtHi, // 2**32 4592 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo); 4593 4594 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 4595 auto Mul1 = B.buildFMul( 4596 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc))); 4597 4598 // 2**(-32) 4599 auto Mul2 = B.buildFMul( 4600 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000))); 4601 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 4602 4603 // -(2**32) 4604 auto Mad2 = B.buildFMAD( 4605 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)), 4606 Mul1); 4607 4608 auto ResultLo = B.buildFPTOUI(S32, Mad2); 4609 auto ResultHi = B.buildFPTOUI(S32, Trunc); 4610 4611 return {ResultLo.getReg(0), ResultHi.getReg(0)}; 4612 } 4613 4614 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 4615 Register DstDivReg, 4616 Register DstRemReg, 4617 Register Numer, 4618 Register Denom) const { 4619 const LLT S32 = LLT::scalar(32); 4620 const LLT S64 = LLT::scalar(64); 4621 const LLT S1 = LLT::scalar(1); 4622 Register RcpLo, RcpHi; 4623 4624 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 4625 4626 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi}); 4627 4628 auto Zero64 = B.buildConstant(S64, 0); 4629 auto NegDenom = B.buildSub(S64, Zero64, Denom); 4630 4631 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 4632 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 4633 4634 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 4635 Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 4636 Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 4637 4638 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 4639 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 4640 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi}); 4641 4642 auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 4643 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 4644 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 4645 Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 4646 Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 4647 4648 auto Zero32 = B.buildConstant(S32, 0); 4649 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 4650 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1)); 4651 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi}); 4652 4653 auto UnmergeNumer = B.buildUnmerge(S32, Numer); 4654 Register NumerLo = UnmergeNumer.getReg(0); 4655 Register NumerHi = UnmergeNumer.getReg(1); 4656 4657 auto 
MulHi3 = B.buildUMulH(S64, Numer, Add2); 4658 auto Mul3 = B.buildMul(S64, Denom, MulHi3); 4659 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 4660 Register Mul3_Lo = UnmergeMul3.getReg(0); 4661 Register Mul3_Hi = UnmergeMul3.getReg(1); 4662 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 4663 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 4664 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 4665 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi}); 4666 4667 auto UnmergeDenom = B.buildUnmerge(S32, Denom); 4668 Register DenomLo = UnmergeDenom.getReg(0); 4669 Register DenomHi = UnmergeDenom.getReg(1); 4670 4671 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 4672 auto C1 = B.buildSExt(S32, CmpHi); 4673 4674 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 4675 auto C2 = B.buildSExt(S32, CmpLo); 4676 4677 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 4678 auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 4679 4680 // TODO: Here and below portions of the code can be enclosed into if/endif. 4681 // Currently control flow is unconditional and we have 4 selects after 4682 // potential endif to substitute PHIs. 4683 4684 // if C3 != 0 ... 4685 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 4686 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 4687 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 4688 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi}); 4689 4690 auto One64 = B.buildConstant(S64, 1); 4691 auto Add3 = B.buildAdd(S64, MulHi3, One64); 4692 4693 auto C4 = 4694 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 4695 auto C5 = 4696 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 4697 auto C6 = B.buildSelect( 4698 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 4699 4700 // if (C6 != 0) 4701 auto Add4 = B.buildAdd(S64, Add3, One64); 4702 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 4703 4704 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 4705 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 4706 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi}); 4707 4708 // endif C6 4709 // endif C3 4710 4711 if (DstDivReg) { 4712 auto Sel1 = B.buildSelect( 4713 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 4714 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 4715 Sel1, MulHi3); 4716 } 4717 4718 if (DstRemReg) { 4719 auto Sel2 = B.buildSelect( 4720 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 4721 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 4722 Sel2, Sub1); 4723 } 4724 } 4725 4726 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 4727 MachineRegisterInfo &MRI, 4728 MachineIRBuilder &B) const { 4729 Register DstDivReg, DstRemReg; 4730 switch (MI.getOpcode()) { 4731 default: 4732 llvm_unreachable("Unexpected opcode!"); 4733 case AMDGPU::G_UDIV: { 4734 DstDivReg = MI.getOperand(0).getReg(); 4735 break; 4736 } 4737 case AMDGPU::G_UREM: { 4738 DstRemReg = MI.getOperand(0).getReg(); 4739 break; 4740 } 4741 case AMDGPU::G_UDIVREM: { 4742 DstDivReg = MI.getOperand(0).getReg(); 4743 DstRemReg = MI.getOperand(1).getReg(); 4744 break; 4745 } 4746 } 4747 4748 const LLT S64 = LLT::scalar(64); 4749 const LLT S32 = LLT::scalar(32); 4750 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 4751 Register Num = 
MI.getOperand(FirstSrcOpIdx).getReg(); 4752 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 4753 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 4754 4755 if (Ty == S32) 4756 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 4757 else if (Ty == S64) 4758 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 4759 else 4760 return false; 4761 4762 MI.eraseFromParent(); 4763 return true; 4764 } 4765 4766 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 4767 MachineRegisterInfo &MRI, 4768 MachineIRBuilder &B) const { 4769 const LLT S64 = LLT::scalar(64); 4770 const LLT S32 = LLT::scalar(32); 4771 4772 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 4773 if (Ty != S32 && Ty != S64) 4774 return false; 4775 4776 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 4777 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 4778 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 4779 4780 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 4781 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 4782 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 4783 4784 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 4785 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 4786 4787 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 4788 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 4789 4790 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 4791 switch (MI.getOpcode()) { 4792 default: 4793 llvm_unreachable("Unexpected opcode!"); 4794 case AMDGPU::G_SDIV: { 4795 DstDivReg = MI.getOperand(0).getReg(); 4796 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 4797 break; 4798 } 4799 case AMDGPU::G_SREM: { 4800 DstRemReg = MI.getOperand(0).getReg(); 4801 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 4802 break; 4803 } 4804 case AMDGPU::G_SDIVREM: { 4805 DstDivReg = MI.getOperand(0).getReg(); 4806 DstRemReg = MI.getOperand(1).getReg(); 4807 TmpDivReg = MRI.createGenericVirtualRegister(Ty); 4808 TmpRemReg = MRI.createGenericVirtualRegister(Ty); 4809 break; 4810 } 4811 } 4812 4813 if (Ty == S32) 4814 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 4815 else 4816 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 4817 4818 if (DstDivReg) { 4819 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 4820 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 4821 B.buildSub(DstDivReg, SignXor, Sign); 4822 } 4823 4824 if (DstRemReg) { 4825 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 4826 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 4827 B.buildSub(DstRemReg, SignXor, Sign); 4828 } 4829 4830 MI.eraseFromParent(); 4831 return true; 4832 } 4833 4834 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 4835 MachineRegisterInfo &MRI, 4836 MachineIRBuilder &B) const { 4837 Register Res = MI.getOperand(0).getReg(); 4838 Register LHS = MI.getOperand(1).getReg(); 4839 Register RHS = MI.getOperand(2).getReg(); 4840 uint16_t Flags = MI.getFlags(); 4841 LLT ResTy = MRI.getType(Res); 4842 4843 const MachineFunction &MF = B.getMF(); 4844 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) || 4845 MF.getTarget().Options.UnsafeFPMath; 4846 4847 if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) { 4848 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16)) 4849 return false; 4850 4851 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to 4852 // the CI documentation has a worst case error of 1 ulp. 
4853 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to 4854 // use it as long as we aren't trying to use denormals. 4855 // 4856 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp. 4857 4858 // 1 / x -> RCP(x) 4859 if (CLHS->isExactlyValue(1.0)) { 4860 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res) 4861 .addUse(RHS) 4862 .setMIFlags(Flags); 4863 4864 MI.eraseFromParent(); 4865 return true; 4866 } 4867 4868 // -1 / x -> RCP( FNEG(x) ) 4869 if (CLHS->isExactlyValue(-1.0)) { 4870 auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 4871 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res) 4872 .addUse(FNeg.getReg(0)) 4873 .setMIFlags(Flags); 4874 4875 MI.eraseFromParent(); 4876 return true; 4877 } 4878 } 4879 4880 // For f16 require afn or arcp. 4881 // For f32 require afn. 4882 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) || 4883 !MI.getFlag(MachineInstr::FmArcp))) 4884 return false; 4885 4886 // x / y -> x * (1.0 / y) 4887 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}) 4888 .addUse(RHS) 4889 .setMIFlags(Flags); 4890 B.buildFMul(Res, LHS, RCP, Flags); 4891 4892 MI.eraseFromParent(); 4893 return true; 4894 } 4895 4896 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, 4897 MachineRegisterInfo &MRI, 4898 MachineIRBuilder &B) const { 4899 Register Res = MI.getOperand(0).getReg(); 4900 Register X = MI.getOperand(1).getReg(); 4901 Register Y = MI.getOperand(2).getReg(); 4902 uint16_t Flags = MI.getFlags(); 4903 LLT ResTy = MRI.getType(Res); 4904 4905 const MachineFunction &MF = B.getMF(); 4906 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 4907 MI.getFlag(MachineInstr::FmAfn); 4908 4909 if (!AllowInaccurateRcp) 4910 return false; 4911 4912 auto NegY = B.buildFNeg(ResTy, Y); 4913 auto One = B.buildFConstant(ResTy, 1.0); 4914 4915 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}) 4916 .addUse(Y) 4917 .setMIFlags(Flags); 4918 4919 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 4920 R = B.buildFMA(ResTy, Tmp0, R, R); 4921 4922 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 4923 R = B.buildFMA(ResTy, Tmp1, R, R); 4924 4925 auto Ret = B.buildFMul(ResTy, X, R); 4926 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 4927 4928 B.buildFMA(Res, Tmp2, R, Ret); 4929 MI.eraseFromParent(); 4930 return true; 4931 } 4932 4933 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 4934 MachineRegisterInfo &MRI, 4935 MachineIRBuilder &B) const { 4936 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 4937 return true; 4938 4939 Register Res = MI.getOperand(0).getReg(); 4940 Register LHS = MI.getOperand(1).getReg(); 4941 Register RHS = MI.getOperand(2).getReg(); 4942 4943 uint16_t Flags = MI.getFlags(); 4944 4945 LLT S16 = LLT::scalar(16); 4946 LLT S32 = LLT::scalar(32); 4947 4948 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32 4949 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32 4950 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d 4951 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp 4952 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n 4953 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp 4954 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n 4955 // tmp.u = opx(V_MUL_F32, e32.u, r32.u); 4956 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000) 4957 // q32.u = opx(V_ADD_F32, tmp.u, q32.u); 4958 // q16.u = opx(V_CVT_F16_F32, q32.u); 4959 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n) 4960 4961 auto LHSExt = B.buildFPExt(S32, LHS, Flags); 4962 auto RHSExt = 
B.buildFPExt(S32, RHS, Flags); 4963 auto NegRHSExt = B.buildFNeg(S32, RHSExt); 4964 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}) 4965 .addUse(RHSExt.getReg(0)) 4966 .setMIFlags(Flags); 4967 auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags); 4968 MachineInstrBuilder Err; 4969 if (ST.hasMadMacF32Insts()) { 4970 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags); 4971 Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags); 4972 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags); 4973 } else { 4974 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags); 4975 Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags); 4976 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags); 4977 } 4978 auto Tmp = B.buildFMul(S32, Err, Rcp, Flags); 4979 Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000)); 4980 Quot = B.buildFAdd(S32, Tmp, Quot, Flags); 4981 auto RDst = B.buildFPTrunc(S16, Quot, Flags); 4982 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res) 4983 .addUse(RDst.getReg(0)) 4984 .addUse(RHS) 4985 .addUse(LHS) 4986 .setMIFlags(Flags); 4987 4988 MI.eraseFromParent(); 4989 return true; 4990 } 4991 4992 static constexpr unsigned SPDenormModeBitField = 4993 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 4, 2); 4994 4995 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 4996 // to enable denorm mode. When 'Enable' is false, disable denorm mode. 4997 static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, 4998 const GCNSubtarget &ST, 4999 SIModeRegisterDefaults Mode) { 5000 // Set SP denorm mode to this value. 5001 unsigned SPDenormMode = 5002 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 5003 5004 if (ST.hasDenormModeInst()) { 5005 // Preserve default FP64FP16 denorm mode while updating FP32 mode. 
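// As a sketch of the packing assumed by the code below: the 4-bit immediate
// taken by S_DENORM_MODE holds the FP32 setting in bits [1:0] and the
// FP64/FP16 setting in bits [3:2], hence SPDenormMode | (DPDenormModeDefault << 2).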
5006 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 5007 5008 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 5009 B.buildInstr(AMDGPU::S_DENORM_MODE) 5010 .addImm(NewDenormModeValue); 5011 5012 } else { 5013 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 5014 .addImm(SPDenormMode) 5015 .addImm(SPDenormModeBitField); 5016 } 5017 } 5018 5019 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 5020 MachineRegisterInfo &MRI, 5021 MachineIRBuilder &B) const { 5022 if (legalizeFastUnsafeFDIV(MI, MRI, B)) 5023 return true; 5024 5025 Register Res = MI.getOperand(0).getReg(); 5026 Register LHS = MI.getOperand(1).getReg(); 5027 Register RHS = MI.getOperand(2).getReg(); 5028 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 5029 SIModeRegisterDefaults Mode = MFI->getMode(); 5030 5031 uint16_t Flags = MI.getFlags(); 5032 5033 LLT S32 = LLT::scalar(32); 5034 LLT S1 = LLT::scalar(1); 5035 5036 auto One = B.buildFConstant(S32, 1.0f); 5037 5038 auto DenominatorScaled = 5039 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}) 5040 .addUse(LHS) 5041 .addUse(RHS) 5042 .addImm(0) 5043 .setMIFlags(Flags); 5044 auto NumeratorScaled = 5045 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}) 5046 .addUse(LHS) 5047 .addUse(RHS) 5048 .addImm(1) 5049 .setMIFlags(Flags); 5050 5051 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}) 5052 .addUse(DenominatorScaled.getReg(0)) 5053 .setMIFlags(Flags); 5054 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 5055 5056 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE(); 5057 const bool HasDynamicDenormals = 5058 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) || 5059 (Mode.FP32Denormals.Output == DenormalMode::Dynamic); 5060 5061 Register SavedSPDenormMode; 5062 if (!PreservesDenormals) { 5063 if (HasDynamicDenormals) { 5064 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 5065 B.buildInstr(AMDGPU::S_GETREG_B32) 5066 .addDef(SavedSPDenormMode) 5067 .addImm(SPDenormModeBitField); 5068 } 5069 toggleSPDenormMode(true, B, ST, Mode); 5070 } 5071 5072 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 5073 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 5074 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 5075 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 5076 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 5077 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 5078 5079 if (!PreservesDenormals) { 5080 if (HasDynamicDenormals) { 5081 assert(SavedSPDenormMode); 5082 B.buildInstr(AMDGPU::S_SETREG_B32) 5083 .addReg(SavedSPDenormMode) 5084 .addImm(SPDenormModeBitField); 5085 } else 5086 toggleSPDenormMode(false, B, ST, Mode); 5087 } 5088 5089 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}) 5090 .addUse(Fma4.getReg(0)) 5091 .addUse(Fma1.getReg(0)) 5092 .addUse(Fma3.getReg(0)) 5093 .addUse(NumeratorScaled.getReg(1)) 5094 .setMIFlags(Flags); 5095 5096 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res) 5097 .addUse(Fmas.getReg(0)) 5098 .addUse(RHS) 5099 .addUse(LHS) 5100 .setMIFlags(Flags); 5101 5102 MI.eraseFromParent(); 5103 return true; 5104 } 5105 5106 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 5107 MachineRegisterInfo &MRI, 5108 MachineIRBuilder &B) const { 5109 if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 5110 return true; 5111 5112 Register Res = MI.getOperand(0).getReg(); 5113 Register LHS = 
MI.getOperand(1).getReg(); 5114 Register RHS = MI.getOperand(2).getReg(); 5115 5116 uint16_t Flags = MI.getFlags(); 5117 5118 LLT S64 = LLT::scalar(64); 5119 LLT S1 = LLT::scalar(1); 5120 5121 auto One = B.buildFConstant(S64, 1.0); 5122 5123 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}) 5124 .addUse(LHS) 5125 .addUse(RHS) 5126 .addImm(0) 5127 .setMIFlags(Flags); 5128 5129 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 5130 5131 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}) 5132 .addUse(DivScale0.getReg(0)) 5133 .setMIFlags(Flags); 5134 5135 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 5136 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 5137 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 5138 5139 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}) 5140 .addUse(LHS) 5141 .addUse(RHS) 5142 .addImm(1) 5143 .setMIFlags(Flags); 5144 5145 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 5146 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 5147 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 5148 5149 Register Scale; 5150 if (!ST.hasUsableDivScaleConditionOutput()) { 5151 // Workaround a hardware bug on SI where the condition output from div_scale 5152 // is not usable. 5153 5154 LLT S32 = LLT::scalar(32); 5155 5156 auto NumUnmerge = B.buildUnmerge(S32, LHS); 5157 auto DenUnmerge = B.buildUnmerge(S32, RHS); 5158 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 5159 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 5160 5161 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 5162 Scale1Unmerge.getReg(1)); 5163 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 5164 Scale0Unmerge.getReg(1)); 5165 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 5166 } else { 5167 Scale = DivScale1.getReg(1); 5168 } 5169 5170 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}) 5171 .addUse(Fma4.getReg(0)) 5172 .addUse(Fma3.getReg(0)) 5173 .addUse(Mul.getReg(0)) 5174 .addUse(Scale) 5175 .setMIFlags(Flags); 5176 5177 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res)) 5178 .addUse(Fmas.getReg(0)) 5179 .addUse(RHS) 5180 .addUse(LHS) 5181 .setMIFlags(Flags); 5182 5183 MI.eraseFromParent(); 5184 return true; 5185 } 5186 5187 bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI, 5188 MachineRegisterInfo &MRI, 5189 MachineIRBuilder &B) const { 5190 Register Res0 = MI.getOperand(0).getReg(); 5191 Register Res1 = MI.getOperand(1).getReg(); 5192 Register Val = MI.getOperand(2).getReg(); 5193 uint16_t Flags = MI.getFlags(); 5194 5195 LLT Ty = MRI.getType(Res0); 5196 LLT InstrExpTy = Ty == LLT::scalar(16) ? 
LLT::scalar(16) : LLT::scalar(32); 5197 5198 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty}) 5199 .addUse(Val) 5200 .setMIFlags(Flags); 5201 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy}) 5202 .addUse(Val) 5203 .setMIFlags(Flags); 5204 5205 if (ST.hasFractBug()) { 5206 auto Fabs = B.buildFAbs(Ty, Val); 5207 auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty))); 5208 auto IsFinite = 5209 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags); 5210 auto Zero = B.buildConstant(InstrExpTy, 0); 5211 Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero); 5212 Mant = B.buildSelect(Ty, IsFinite, Mant, Val); 5213 } 5214 5215 B.buildCopy(Res0, Mant); 5216 B.buildSExtOrTrunc(Res1, Exp); 5217 5218 MI.eraseFromParent(); 5219 return true; 5220 } 5221 5222 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 5223 MachineRegisterInfo &MRI, 5224 MachineIRBuilder &B) const { 5225 Register Res = MI.getOperand(0).getReg(); 5226 Register LHS = MI.getOperand(2).getReg(); 5227 Register RHS = MI.getOperand(3).getReg(); 5228 uint16_t Flags = MI.getFlags(); 5229 5230 LLT S32 = LLT::scalar(32); 5231 LLT S1 = LLT::scalar(1); 5232 5233 auto Abs = B.buildFAbs(S32, RHS, Flags); 5234 const APFloat C0Val(1.0f); 5235 5236 auto C0 = B.buildFConstant(S32, 0x1p+96f); 5237 auto C1 = B.buildFConstant(S32, 0x1p-32f); 5238 auto C2 = B.buildFConstant(S32, 1.0f); 5239 5240 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 5241 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 5242 5243 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 5244 5245 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}) 5246 .addUse(Mul0.getReg(0)) 5247 .setMIFlags(Flags); 5248 5249 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 5250 5251 B.buildFMul(Res, Sel, Mul1, Flags); 5252 5253 MI.eraseFromParent(); 5254 return true; 5255 } 5256 5257 bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI, 5258 MachineRegisterInfo &MRI, 5259 MachineIRBuilder &B) const { 5260 // Bypass the correct expansion that a standard promotion through G_FSQRT would 5261 // get. The f32 op is accurate enough for the f16 case.
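// Rough sketch of the sequence emitted below: extend the f16 operand to f32,
// evaluate amdgcn.sqrt on the f32 value, then truncate the result back to f16.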
5262 unsigned Flags = MI.getFlags(); 5263 assert(!ST.has16BitInsts()); 5264 const LLT F32 = LLT::scalar(32); 5265 auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags); 5266 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32}) 5267 .addUse(Ext.getReg(0)) 5268 .setMIFlags(Flags); 5269 B.buildFPTrunc(MI.getOperand(0), Log2, Flags); 5270 MI.eraseFromParent(); 5271 return true; 5272 } 5273 5274 bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI, 5275 MachineRegisterInfo &MRI, 5276 MachineIRBuilder &B) const { 5277 MachineFunction &MF = B.getMF(); 5278 Register Dst = MI.getOperand(0).getReg(); 5279 Register X = MI.getOperand(1).getReg(); 5280 const unsigned Flags = MI.getFlags(); 5281 const LLT S1 = LLT::scalar(1); 5282 const LLT F32 = LLT::scalar(32); 5283 const LLT I32 = LLT::scalar(32); 5284 5285 if (allowApproxFunc(MF, Flags)) { 5286 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst})) 5287 .addUse(X) 5288 .setMIFlags(Flags); 5289 MI.eraseFromParent(); 5290 return true; 5291 } 5292 5293 auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f); 5294 auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags); 5295 auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f); 5296 auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags); 5297 auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags); 5298 5299 Register SqrtS = MRI.createGenericVirtualRegister(F32); 5300 if (needsDenormHandlingF32(MF, X, Flags)) { 5301 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS})) 5302 .addUse(SqrtX.getReg(0)) 5303 .setMIFlags(Flags); 5304 5305 auto NegOne = B.buildConstant(I32, -1); 5306 auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne); 5307 5308 auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags); 5309 auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags); 5310 5311 auto PosOne = B.buildConstant(I32, 1); 5312 auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne); 5313 5314 auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags); 5315 auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags); 5316 5317 auto Zero = B.buildFConstant(F32, 0.0f); 5318 auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags); 5319 5320 SqrtS = 5321 B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0); 5322 5323 auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags); 5324 SqrtS = 5325 B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0); 5326 } else { 5327 auto SqrtR = 5328 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0)); 5329 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags); 5330 5331 auto Half = B.buildFConstant(F32, 0.5f); 5332 auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags); 5333 auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags); 5334 auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags); 5335 SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags); 5336 SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0); 5337 auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags); 5338 auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags); 5339 SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0); 5340 } 5341 5342 auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f); 5343 5344 auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags); 5345 5346 SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0); 5347 5348 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf); 5349 B.buildSelect(Dst, 
IsZeroOrInf, SqrtX, SqrtS, Flags); 5350 5351 MI.eraseFromParent(); 5352 return true; 5353 } 5354 5355 bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI, 5356 MachineRegisterInfo &MRI, 5357 MachineIRBuilder &B) const { 5358 // For double type, the SQRT and RSQ instructions don't have required 5359 // precision, we apply Goldschmidt's algorithm to improve the result: 5360 // 5361 // y0 = rsq(x) 5362 // g0 = x * y0 5363 // h0 = 0.5 * y0 5364 // 5365 // r0 = 0.5 - h0 * g0 5366 // g1 = g0 * r0 + g0 5367 // h1 = h0 * r0 + h0 5368 // 5369 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1 5370 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1 5371 // h2 = h1 * r1 + h1 5372 // 5373 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2 5374 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2 5375 // 5376 // sqrt(x) = g3 5377 5378 const LLT S1 = LLT::scalar(1); 5379 const LLT S32 = LLT::scalar(32); 5380 const LLT F64 = LLT::scalar(64); 5381 5382 Register Dst = MI.getOperand(0).getReg(); 5383 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt"); 5384 5385 Register X = MI.getOperand(1).getReg(); 5386 unsigned Flags = MI.getFlags(); 5387 5388 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767); 5389 5390 auto ZeroInt = B.buildConstant(S32, 0); 5391 auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant); 5392 5393 // Scale up input if it is too small. 5394 auto ScaleUpFactor = B.buildConstant(S32, 256); 5395 auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt); 5396 auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags); 5397 5398 auto SqrtY = 5399 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0)); 5400 5401 auto Half = B.buildFConstant(F64, 0.5); 5402 auto SqrtH0 = B.buildFMul(F64, SqrtY, Half); 5403 auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY); 5404 5405 auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0); 5406 auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half); 5407 5408 auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0); 5409 auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0); 5410 5411 auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1); 5412 auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX); 5413 5414 auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1); 5415 5416 auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2); 5417 auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX); 5418 5419 auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2); 5420 5421 // Scale down the result. 5422 auto ScaleDownFactor = B.buildConstant(S32, -128); 5423 auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt); 5424 SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags); 5425 5426 // TODO: Switch to fcmp oeq 0 for finite only. 
Can't fully remove this check 5427 // with finite only or nsz because rsq(+/-0) = +/-inf 5428 5429 // TODO: Check for DAZ and expand to subnormals 5430 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf); 5431 5432 // If x is +INF, +0, or -0, use its original value 5433 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags); 5434 5435 MI.eraseFromParent(); 5436 return true; 5437 } 5438 5439 bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI, 5440 MachineRegisterInfo &MRI, 5441 MachineIRBuilder &B) const { 5442 LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 5443 if (Ty == LLT::scalar(32)) 5444 return legalizeFSQRTF32(MI, MRI, B); 5445 if (Ty == LLT::scalar(64)) 5446 return legalizeFSQRTF64(MI, MRI, B); 5447 if (Ty == LLT::scalar(16)) 5448 return legalizeFSQRTF16(MI, MRI, B); 5449 return false; 5450 } 5451 5452 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 5453 // FIXME: Why do we handle this one but not other removed instructions? 5454 // 5455 // Reciprocal square root. The clamp prevents infinite results, clamping 5456 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 5457 // +-max_float. 5458 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 5459 MachineRegisterInfo &MRI, 5460 MachineIRBuilder &B) const { 5461 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 5462 return true; 5463 5464 Register Dst = MI.getOperand(0).getReg(); 5465 Register Src = MI.getOperand(2).getReg(); 5466 auto Flags = MI.getFlags(); 5467 5468 LLT Ty = MRI.getType(Dst); 5469 5470 const fltSemantics *FltSemantics; 5471 if (Ty == LLT::scalar(32)) 5472 FltSemantics = &APFloat::IEEEsingle(); 5473 else if (Ty == LLT::scalar(64)) 5474 FltSemantics = &APFloat::IEEEdouble(); 5475 else 5476 return false; 5477 5478 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}) 5479 .addUse(Src) 5480 .setMIFlags(Flags); 5481 5482 // We don't need to concern ourselves with the snan handling difference, since 5483 // the rsq quieted (or not) so use the one which will directly select. 5484 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 5485 const bool UseIEEE = MFI->getMode().IEEE; 5486 5487 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 5488 auto ClampMax = UseIEEE ? 
B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 5489 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 5490 5491 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 5492 5493 if (UseIEEE) 5494 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 5495 else 5496 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 5497 MI.eraseFromParent(); 5498 return true; 5499 } 5500 5501 // TODO: Fix pointer type handling 5502 bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, 5503 MachineInstr &MI, 5504 Intrinsic::ID IID) const { 5505 5506 MachineIRBuilder &B = Helper.MIRBuilder; 5507 MachineRegisterInfo &MRI = *B.getMRI(); 5508 5509 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 || 5510 IID == Intrinsic::amdgcn_permlanex16; 5511 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive || 5512 IID == Intrinsic::amdgcn_set_inactive_chain_arg; 5513 5514 auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1, 5515 Register Src2, LLT VT) -> Register { 5516 auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0); 5517 switch (IID) { 5518 case Intrinsic::amdgcn_readfirstlane: 5519 case Intrinsic::amdgcn_permlane64: 5520 return LaneOp.getReg(0); 5521 case Intrinsic::amdgcn_readlane: 5522 case Intrinsic::amdgcn_set_inactive: 5523 case Intrinsic::amdgcn_set_inactive_chain_arg: 5524 return LaneOp.addUse(Src1).getReg(0); 5525 case Intrinsic::amdgcn_writelane: 5526 return LaneOp.addUse(Src1).addUse(Src2).getReg(0); 5527 case Intrinsic::amdgcn_permlane16: 5528 case Intrinsic::amdgcn_permlanex16: { 5529 Register Src3 = MI.getOperand(5).getReg(); 5530 int64_t Src4 = MI.getOperand(6).getImm(); 5531 int64_t Src5 = MI.getOperand(7).getImm(); 5532 return LaneOp.addUse(Src1) 5533 .addUse(Src2) 5534 .addUse(Src3) 5535 .addImm(Src4) 5536 .addImm(Src5) 5537 .getReg(0); 5538 } 5539 case Intrinsic::amdgcn_mov_dpp8: 5540 return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0); 5541 case Intrinsic::amdgcn_update_dpp: 5542 return LaneOp.addUse(Src1) 5543 .addImm(MI.getOperand(4).getImm()) 5544 .addImm(MI.getOperand(5).getImm()) 5545 .addImm(MI.getOperand(6).getImm()) 5546 .addImm(MI.getOperand(7).getImm()) 5547 .getReg(0); 5548 default: 5549 llvm_unreachable("unhandled lane op"); 5550 } 5551 }; 5552 5553 Register DstReg = MI.getOperand(0).getReg(); 5554 Register Src0 = MI.getOperand(2).getReg(); 5555 Register Src1, Src2; 5556 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane || 5557 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) { 5558 Src1 = MI.getOperand(3).getReg(); 5559 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) { 5560 Src2 = MI.getOperand(4).getReg(); 5561 } 5562 } 5563 5564 LLT Ty = MRI.getType(DstReg); 5565 unsigned Size = Ty.getSizeInBits(); 5566 5567 unsigned SplitSize = 32; 5568 if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) && 5569 ST.hasDPALU_DPP() && 5570 AMDGPU::isLegalDPALU_DPPControl(MI.getOperand(4).getImm())) 5571 SplitSize = 64; 5572 5573 if (Size == SplitSize) { 5574 // Already legal 5575 return true; 5576 } 5577 5578 if (Size < 32) { 5579 Src0 = B.buildAnyExt(S32, Src0).getReg(0); 5580 5581 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) 5582 Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0); 5583 5584 if (IID == Intrinsic::amdgcn_writelane) 5585 Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0); 5586 5587 Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32); 5588 B.buildTrunc(DstReg, LaneOpDst); 5589 MI.eraseFromParent(); 5590 return true; 5591 } 5592 5593 if 
(Size % SplitSize != 0) 5594 return false; 5595 5596 LLT PartialResTy = LLT::scalar(SplitSize); 5597 bool NeedsBitcast = false; 5598 if (Ty.isVector()) { 5599 LLT EltTy = Ty.getElementType(); 5600 unsigned EltSize = EltTy.getSizeInBits(); 5601 if (EltSize == SplitSize) { 5602 PartialResTy = EltTy; 5603 } else if (EltSize == 16 || EltSize == 32) { 5604 unsigned NElem = SplitSize / EltSize; 5605 PartialResTy = Ty.changeElementCount(ElementCount::getFixed(NElem)); 5606 } else { 5607 // Handle all other cases via S32/S64 pieces 5608 NeedsBitcast = true; 5609 } 5610 } 5611 5612 SmallVector<Register, 4> PartialRes; 5613 unsigned NumParts = Size / SplitSize; 5614 MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0); 5615 MachineInstrBuilder Src1Parts, Src2Parts; 5616 5617 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) 5618 Src1Parts = B.buildUnmerge(PartialResTy, Src1); 5619 5620 if (IID == Intrinsic::amdgcn_writelane) 5621 Src2Parts = B.buildUnmerge(PartialResTy, Src2); 5622 5623 for (unsigned i = 0; i < NumParts; ++i) { 5624 Src0 = Src0Parts.getReg(i); 5625 5626 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) 5627 Src1 = Src1Parts.getReg(i); 5628 5629 if (IID == Intrinsic::amdgcn_writelane) 5630 Src2 = Src2Parts.getReg(i); 5631 5632 PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy)); 5633 } 5634 5635 if (NeedsBitcast) 5636 B.buildBitcast(DstReg, B.buildMergeLikeInstr( 5637 LLT::scalar(Ty.getSizeInBits()), PartialRes)); 5638 else 5639 B.buildMergeLikeInstr(DstReg, PartialRes); 5640 5641 MI.eraseFromParent(); 5642 return true; 5643 } 5644 5645 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 5646 MachineRegisterInfo &MRI, 5647 MachineIRBuilder &B) const { 5648 uint64_t Offset = 5649 ST.getTargetLowering()->getImplicitParameterOffset( 5650 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 5651 LLT DstTy = MRI.getType(DstReg); 5652 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 5653 5654 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 5655 if (!loadInputValue(KernargPtrReg, B, 5656 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 5657 return false; 5658 5659 // FIXME: This should be nuw 5660 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 5661 return true; 5662 } 5663 5664 /// To create a buffer resource from a 64-bit pointer, mask off the upper 32 5665 /// bits of the pointer and replace them with the stride argument, then 5666 /// merge_values everything together. In the common case of a raw buffer (the 5667 /// stride component is 0), we can just AND off the upper half. 
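/// Illustrative sketch of the resulting descriptor words, assuming the buffer
/// base is a 48-bit address held in the low bits of the pointer:
///   word0 = ptr[31:0]
///   word1 = (ptr[63:32] & 0xffff) | (stride << 16)
///   word2 = NumRecords
///   word3 = Flags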
5668 bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin( 5669 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 5670 Register Result = MI.getOperand(0).getReg(); 5671 Register Pointer = MI.getOperand(2).getReg(); 5672 Register Stride = MI.getOperand(3).getReg(); 5673 Register NumRecords = MI.getOperand(4).getReg(); 5674 Register Flags = MI.getOperand(5).getReg(); 5675 5676 LLT S32 = LLT::scalar(32); 5677 5678 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 5679 auto Unmerge = B.buildUnmerge(S32, Pointer); 5680 Register LowHalf = Unmerge.getReg(0); 5681 Register HighHalf = Unmerge.getReg(1); 5682 5683 auto AndMask = B.buildConstant(S32, 0x0000ffff); 5684 auto Masked = B.buildAnd(S32, HighHalf, AndMask); 5685 5686 MachineInstrBuilder NewHighHalf = Masked; 5687 std::optional<ValueAndVReg> StrideConst = 5688 getIConstantVRegValWithLookThrough(Stride, MRI); 5689 if (!StrideConst || !StrideConst->Value.isZero()) { 5690 MachineInstrBuilder ShiftedStride; 5691 if (StrideConst) { 5692 uint32_t StrideVal = StrideConst->Value.getZExtValue(); 5693 uint32_t ShiftedStrideVal = StrideVal << 16; 5694 ShiftedStride = B.buildConstant(S32, ShiftedStrideVal); 5695 } else { 5696 auto ExtStride = B.buildAnyExt(S32, Stride); 5697 auto ShiftConst = B.buildConstant(S32, 16); 5698 ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst); 5699 } 5700 NewHighHalf = B.buildOr(S32, Masked, ShiftedStride); 5701 } 5702 Register NewHighHalfReg = NewHighHalf.getReg(0); 5703 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags}); 5704 MI.eraseFromParent(); 5705 return true; 5706 } 5707 5708 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 5709 MachineRegisterInfo &MRI, 5710 MachineIRBuilder &B) const { 5711 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 5712 if (!MFI->isEntryFunction()) { 5713 return legalizePreloadedArgIntrin(MI, MRI, B, 5714 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 5715 } 5716 5717 Register DstReg = MI.getOperand(0).getReg(); 5718 if (!getImplicitArgPtr(DstReg, MRI, B)) 5719 return false; 5720 5721 MI.eraseFromParent(); 5722 return true; 5723 } 5724 5725 bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg, 5726 MachineRegisterInfo &MRI, 5727 MachineIRBuilder &B) const { 5728 Function &F = B.getMF().getFunction(); 5729 std::optional<uint32_t> KnownSize = 5730 AMDGPUMachineFunction::getLDSKernelIdMetadata(F); 5731 if (KnownSize.has_value()) 5732 B.buildConstant(DstReg, *KnownSize); 5733 return false; 5734 } 5735 5736 bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI, 5737 MachineRegisterInfo &MRI, 5738 MachineIRBuilder &B) const { 5739 5740 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 5741 if (!MFI->isEntryFunction()) { 5742 return legalizePreloadedArgIntrin(MI, MRI, B, 5743 AMDGPUFunctionArgInfo::LDS_KERNEL_ID); 5744 } 5745 5746 Register DstReg = MI.getOperand(0).getReg(); 5747 if (!getLDSKernelId(DstReg, MRI, B)) 5748 return false; 5749 5750 MI.eraseFromParent(); 5751 return true; 5752 } 5753 5754 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 5755 MachineRegisterInfo &MRI, 5756 MachineIRBuilder &B, 5757 unsigned AddrSpace) const { 5758 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 5759 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 5760 Register Hi32 = Unmerge.getReg(1); 5761 5762 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 5763 MI.eraseFromParent(); 5764 return true; 5765 } 5766 
5767 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 5768 // offset (the offset that is included in bounds checking and swizzling, to be 5769 // split between the instruction's voffset and immoffset fields) and soffset 5770 // (the offset that is excluded from bounds checking and swizzling, to go in 5771 // the instruction's soffset field). This function takes the first kind of 5772 // offset and figures out how to split it between voffset and immoffset. 5773 std::pair<Register, unsigned> 5774 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 5775 Register OrigOffset) const { 5776 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST); 5777 Register BaseReg; 5778 unsigned ImmOffset; 5779 const LLT S32 = LLT::scalar(32); 5780 MachineRegisterInfo &MRI = *B.getMRI(); 5781 5782 std::tie(BaseReg, ImmOffset) = 5783 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 5784 5785 // If BaseReg is a pointer, convert it to int. 5786 if (MRI.getType(BaseReg).isPointer()) 5787 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 5788 5789 // If the immediate value is too big for the immoffset field, put only bits 5790 // that would normally fit in the immoffset field. The remaining value that 5791 // is copied/added for the voffset field is a large power of 2, and it 5792 // stands more chance of being CSEd with the copy/add for another similar 5793 // load/store. 5794 // However, do not do that rounding down if that is a negative 5795 // number, as it appears to be illegal to have a negative offset in the 5796 // vgpr, even if adding the immediate offset makes it positive. 5797 unsigned Overflow = ImmOffset & ~MaxImm; 5798 ImmOffset -= Overflow; 5799 if ((int32_t)Overflow < 0) { 5800 Overflow += ImmOffset; 5801 ImmOffset = 0; 5802 } 5803 5804 if (Overflow != 0) { 5805 if (!BaseReg) { 5806 BaseReg = B.buildConstant(S32, Overflow).getReg(0); 5807 } else { 5808 auto OverflowVal = B.buildConstant(S32, Overflow); 5809 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 5810 } 5811 } 5812 5813 if (!BaseReg) 5814 BaseReg = B.buildConstant(S32, 0).getReg(0); 5815 5816 return std::pair(BaseReg, ImmOffset); 5817 } 5818 5819 /// Handle register layout difference for f16 images for some subtargets. 
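/// As a sketch: subtargets with unpacked D16 VMEM widen every s16 element into
/// its own s32 register, so a <4 x s16> source becomes a <4 x s32> build_vector
/// of any-extended elements. Packed subtargets keep the data in s16 vectors,
/// e.g. padding <3 x s16> out to <4 x s16>, with extra repacking on subtargets
/// that have the image-store D16 bug.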
5820 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 5821 MachineRegisterInfo &MRI, 5822 Register Reg, 5823 bool ImageStore) const { 5824 const LLT S16 = LLT::scalar(16); 5825 const LLT S32 = LLT::scalar(32); 5826 LLT StoreVT = MRI.getType(Reg); 5827 assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 5828 5829 if (ST.hasUnpackedD16VMem()) { 5830 auto Unmerge = B.buildUnmerge(S16, Reg); 5831 5832 SmallVector<Register, 4> WideRegs; 5833 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 5834 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 5835 5836 int NumElts = StoreVT.getNumElements(); 5837 5838 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 5839 .getReg(0); 5840 } 5841 5842 if (ImageStore && ST.hasImageStoreD16Bug()) { 5843 if (StoreVT.getNumElements() == 2) { 5844 SmallVector<Register, 4> PackedRegs; 5845 Reg = B.buildBitcast(S32, Reg).getReg(0); 5846 PackedRegs.push_back(Reg); 5847 PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 5848 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 5849 .getReg(0); 5850 } 5851 5852 if (StoreVT.getNumElements() == 3) { 5853 SmallVector<Register, 4> PackedRegs; 5854 auto Unmerge = B.buildUnmerge(S16, Reg); 5855 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 5856 PackedRegs.push_back(Unmerge.getReg(I)); 5857 PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 5858 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 5859 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 5860 } 5861 5862 if (StoreVT.getNumElements() == 4) { 5863 SmallVector<Register, 4> PackedRegs; 5864 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 5865 auto Unmerge = B.buildUnmerge(S32, Reg); 5866 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 5867 PackedRegs.push_back(Unmerge.getReg(I)); 5868 PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 5869 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 5870 .getReg(0); 5871 } 5872 5873 llvm_unreachable("invalid data type"); 5874 } 5875 5876 if (StoreVT == LLT::fixed_vector(3, S16)) { 5877 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) 5878 .getReg(0); 5879 } 5880 return Reg; 5881 } 5882 5883 Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B, 5884 Register VData, LLT MemTy, 5885 bool IsFormat) const { 5886 MachineRegisterInfo *MRI = B.getMRI(); 5887 LLT Ty = MRI->getType(VData); 5888 5889 const LLT S16 = LLT::scalar(16); 5890 5891 // Fixup buffer resources themselves needing to be v4i128. 5892 if (hasBufferRsrcWorkaround(Ty)) 5893 return castBufferRsrcToV4I32(VData, B); 5894 5895 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) { 5896 Ty = getBitcastRegisterType(Ty); 5897 VData = B.buildBitcast(Ty, VData).getReg(0); 5898 } 5899 // Fixup illegal register types for i8 stores. 
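// Sketch of the reasoning: an s8 or s16 source still occupies a 32-bit VGPR,
// so any-extend it here and let the caller pick a byte/short store opcode
// based on the memory size, which only writes the low bits.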
5900 if (Ty == LLT::scalar(8) || Ty == S16) { 5901 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 5902 return AnyExt; 5903 } 5904 5905 if (Ty.isVector()) { 5906 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 5907 if (IsFormat) 5908 return handleD16VData(B, *MRI, VData); 5909 } 5910 } 5911 5912 return VData; 5913 } 5914 5915 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 5916 LegalizerHelper &Helper, 5917 bool IsTyped, 5918 bool IsFormat) const { 5919 MachineIRBuilder &B = Helper.MIRBuilder; 5920 MachineRegisterInfo &MRI = *B.getMRI(); 5921 5922 Register VData = MI.getOperand(1).getReg(); 5923 LLT Ty = MRI.getType(VData); 5924 LLT EltTy = Ty.getScalarType(); 5925 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 5926 const LLT S32 = LLT::scalar(32); 5927 5928 MachineMemOperand *MMO = *MI.memoperands_begin(); 5929 const int MemSize = MMO->getSize().getValue(); 5930 LLT MemTy = MMO->getMemoryType(); 5931 5932 VData = fixStoreSourceType(B, VData, MemTy, IsFormat); 5933 5934 castBufferRsrcArgToV4I32(MI, B, 2); 5935 Register RSrc = MI.getOperand(2).getReg(); 5936 5937 unsigned ImmOffset; 5938 5939 // The typed intrinsics add an immediate after the registers. 5940 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 5941 5942 // The struct intrinsic variants add one additional operand over raw. 5943 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 5944 Register VIndex; 5945 int OpOffset = 0; 5946 if (HasVIndex) { 5947 VIndex = MI.getOperand(3).getReg(); 5948 OpOffset = 1; 5949 } else { 5950 VIndex = B.buildConstant(S32, 0).getReg(0); 5951 } 5952 5953 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 5954 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 5955 5956 unsigned Format = 0; 5957 if (IsTyped) { 5958 Format = MI.getOperand(5 + OpOffset).getImm(); 5959 ++OpOffset; 5960 } 5961 5962 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 5963 5964 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 5965 5966 unsigned Opc; 5967 if (IsTyped) { 5968 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 5969 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 5970 } else if (IsFormat) { 5971 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 5972 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 5973 } else { 5974 switch (MemSize) { 5975 case 1: 5976 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 5977 break; 5978 case 2: 5979 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 5980 break; 5981 default: 5982 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 5983 break; 5984 } 5985 } 5986 5987 auto MIB = B.buildInstr(Opc) 5988 .addUse(VData) // vdata 5989 .addUse(RSrc) // rsrc 5990 .addUse(VIndex) // vindex 5991 .addUse(VOffset) // voffset 5992 .addUse(SOffset) // soffset 5993 .addImm(ImmOffset); // offset(imm) 5994 5995 if (IsTyped) 5996 MIB.addImm(Format); 5997 5998 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 5999 .addImm(HasVIndex ? 
-1 : 0) // idxen(imm) 6000 .addMemOperand(MMO); 6001 6002 MI.eraseFromParent(); 6003 return true; 6004 } 6005 6006 static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, 6007 Register VIndex, Register VOffset, Register SOffset, 6008 unsigned ImmOffset, unsigned Format, 6009 unsigned AuxiliaryData, MachineMemOperand *MMO, 6010 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) { 6011 auto MIB = B.buildInstr(Opc) 6012 .addDef(LoadDstReg) // vdata 6013 .addUse(RSrc) // rsrc 6014 .addUse(VIndex) // vindex 6015 .addUse(VOffset) // voffset 6016 .addUse(SOffset) // soffset 6017 .addImm(ImmOffset); // offset(imm) 6018 6019 if (IsTyped) 6020 MIB.addImm(Format); 6021 6022 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 6023 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 6024 .addMemOperand(MMO); 6025 } 6026 6027 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 6028 LegalizerHelper &Helper, 6029 bool IsFormat, 6030 bool IsTyped) const { 6031 MachineIRBuilder &B = Helper.MIRBuilder; 6032 MachineRegisterInfo &MRI = *B.getMRI(); 6033 GISelChangeObserver &Observer = Helper.Observer; 6034 6035 // FIXME: Verifier should enforce 1 MMO for these intrinsics. 6036 MachineMemOperand *MMO = *MI.memoperands_begin(); 6037 const LLT MemTy = MMO->getMemoryType(); 6038 const LLT S32 = LLT::scalar(32); 6039 6040 Register Dst = MI.getOperand(0).getReg(); 6041 6042 Register StatusDst; 6043 int OpOffset = 0; 6044 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2); 6045 bool IsTFE = MI.getNumExplicitDefs() == 2; 6046 if (IsTFE) { 6047 StatusDst = MI.getOperand(1).getReg(); 6048 ++OpOffset; 6049 } 6050 6051 castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset); 6052 Register RSrc = MI.getOperand(2 + OpOffset).getReg(); 6053 6054 // The typed intrinsics add an immediate after the registers. 6055 const unsigned NumVIndexOps = IsTyped ? 8 : 7; 6056 6057 // The struct intrinsic variants add one additional operand over raw. 6058 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset; 6059 Register VIndex; 6060 if (HasVIndex) { 6061 VIndex = MI.getOperand(3 + OpOffset).getReg(); 6062 ++OpOffset; 6063 } else { 6064 VIndex = B.buildConstant(S32, 0).getReg(0); 6065 } 6066 6067 Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 6068 Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 6069 6070 unsigned Format = 0; 6071 if (IsTyped) { 6072 Format = MI.getOperand(5 + OpOffset).getImm(); 6073 ++OpOffset; 6074 } 6075 6076 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 6077 unsigned ImmOffset; 6078 6079 LLT Ty = MRI.getType(Dst); 6080 // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the 6081 // logic doesn't have to handle that case. 
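// (Sketch: the p8 result is rewritten to <4 x s32> here, so the buffer-load
// lowering below only ever sees integer and vector types; the original p8
// value is rebuilt from the loaded dwords by castBufferRsrcFromV4I32.)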
6082 if (hasBufferRsrcWorkaround(Ty)) { 6083 Observer.changingInstr(MI); 6084 Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0); 6085 Observer.changedInstr(MI); 6086 Dst = MI.getOperand(0).getReg(); 6087 B.setInsertPt(B.getMBB(), MI); 6088 } 6089 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) { 6090 Ty = getBitcastRegisterType(Ty); 6091 Observer.changingInstr(MI); 6092 Helper.bitcastDst(MI, Ty, 0); 6093 Observer.changedInstr(MI); 6094 Dst = MI.getOperand(0).getReg(); 6095 B.setInsertPt(B.getMBB(), MI); 6096 } 6097 6098 LLT EltTy = Ty.getScalarType(); 6099 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 6100 const bool Unpacked = ST.hasUnpackedD16VMem(); 6101 6102 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 6103 6104 unsigned Opc; 6105 6106 // TODO: Support TFE for typed and narrow loads. 6107 if (IsTyped) { 6108 if (IsTFE) 6109 return false; 6110 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 6111 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 6112 } else if (IsFormat) { 6113 if (IsD16) { 6114 if (IsTFE) 6115 return false; 6116 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16; 6117 } else { 6118 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE 6119 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 6120 } 6121 } else { 6122 switch (MemTy.getSizeInBits()) { 6123 case 8: 6124 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE 6125 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 6126 break; 6127 case 16: 6128 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE 6129 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 6130 break; 6131 default: 6132 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE 6133 : AMDGPU::G_AMDGPU_BUFFER_LOAD; 6134 break; 6135 } 6136 } 6137 6138 if (IsTFE) { 6139 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32); 6140 unsigned NumLoadDWords = NumValueDWords + 1; 6141 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32); 6142 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy); 6143 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, 6144 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); 6145 if (MemTy.getSizeInBits() < 32) { 6146 Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32); 6147 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg); 6148 B.buildTrunc(Dst, ExtDst); 6149 } else if (NumValueDWords == 1) { 6150 B.buildUnmerge({Dst, StatusDst}, LoadDstReg); 6151 } else { 6152 SmallVector<Register, 5> LoadElts; 6153 for (unsigned I = 0; I != NumValueDWords; ++I) 6154 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32)); 6155 LoadElts.push_back(StatusDst); 6156 B.buildUnmerge(LoadElts, LoadDstReg); 6157 LoadElts.truncate(NumValueDWords); 6158 B.buildMergeLikeInstr(Dst, LoadElts); 6159 } 6160 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) || 6161 (IsD16 && !Ty.isVector())) { 6162 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 6163 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, 6164 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); 6165 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 6166 B.buildTrunc(Dst, LoadDstReg); 6167 } else if (Unpacked && IsD16 && Ty.isVector()) { 6168 LLT UnpackedTy = Ty.changeElementSize(32); 6169 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 6170 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, 6171 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); 6172 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 6173 // FIXME: G_TRUNC should work, but legalization currently fails 
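// Repack manually instead (sketch): unmerge the unpacked 32-bit result into
// dwords, truncate each dword to the 16-bit element type, then re-merge the
// pieces into the destination vector.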
6174 auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 6175 SmallVector<Register, 4> Repack; 6176 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 6177 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 6178 B.buildMergeLikeInstr(Dst, Repack); 6179 } else { 6180 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format, 6181 AuxiliaryData, MMO, IsTyped, HasVIndex, B); 6182 } 6183 6184 MI.eraseFromParent(); 6185 return true; 6186 } 6187 6188 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 6189 switch (IntrID) { 6190 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 6191 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap: 6192 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 6193 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap: 6194 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 6195 case Intrinsic::amdgcn_raw_buffer_atomic_add: 6196 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add: 6197 case Intrinsic::amdgcn_struct_buffer_atomic_add: 6198 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add: 6199 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 6200 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 6201 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub: 6202 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 6203 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub: 6204 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 6205 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 6206 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin: 6207 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 6208 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin: 6209 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 6210 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 6211 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin: 6212 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 6213 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin: 6214 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 6215 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 6216 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax: 6217 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 6218 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax: 6219 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 6220 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 6221 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax: 6222 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 6223 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax: 6224 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 6225 case Intrinsic::amdgcn_raw_buffer_atomic_and: 6226 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and: 6227 case Intrinsic::amdgcn_struct_buffer_atomic_and: 6228 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and: 6229 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 6230 case Intrinsic::amdgcn_raw_buffer_atomic_or: 6231 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or: 6232 case Intrinsic::amdgcn_struct_buffer_atomic_or: 6233 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or: 6234 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 6235 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 6236 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor: 6237 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 6238 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor: 6239 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 6240 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 6241 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc: 6242 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 6243 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc: 6244 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 6245 case 
Intrinsic::amdgcn_raw_buffer_atomic_dec: 6246 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec: 6247 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 6248 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec: 6249 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 6250 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 6251 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: 6252 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 6253 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: 6254 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 6255 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 6256 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd: 6257 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 6258 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: 6259 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 6260 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 6261 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin: 6262 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 6263 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin: 6264 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 6265 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 6266 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax: 6267 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 6268 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax: 6269 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX; 6270 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32: 6271 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32: 6272 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32; 6273 default: 6274 llvm_unreachable("unhandled atomic opcode"); 6275 } 6276 } 6277 6278 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, 6279 MachineIRBuilder &B, 6280 Intrinsic::ID IID) const { 6281 const bool IsCmpSwap = 6282 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || 6283 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap || 6284 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap || 6285 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap; 6286 6287 Register Dst = MI.getOperand(0).getReg(); 6288 // Since we don't have 128-bit atomics, we don't need to handle the case of 6289 // p8 argmunents to the atomic itself 6290 Register VData = MI.getOperand(2).getReg(); 6291 6292 Register CmpVal; 6293 int OpOffset = 0; 6294 6295 if (IsCmpSwap) { 6296 CmpVal = MI.getOperand(3).getReg(); 6297 ++OpOffset; 6298 } 6299 6300 castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset); 6301 Register RSrc = MI.getOperand(3 + OpOffset).getReg(); 6302 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; 6303 6304 // The struct intrinsic variants add one additional operand over raw. 
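// Operand-count sketch: a raw atomic is (dst, intrinsic id, vdata[, cmp],
// rsrc, voffset, soffset, aux), i.e. 7 or 8 operands; the struct form also
// carries vindex, which is what NumVIndexOps above accounts for.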
6305 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 6306 Register VIndex; 6307 if (HasVIndex) { 6308 VIndex = MI.getOperand(4 + OpOffset).getReg(); 6309 ++OpOffset; 6310 } else { 6311 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 6312 } 6313 6314 Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 6315 Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 6316 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 6317 6318 MachineMemOperand *MMO = *MI.memoperands_begin(); 6319 6320 unsigned ImmOffset; 6321 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 6322 6323 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 6324 .addDef(Dst) 6325 .addUse(VData); // vdata 6326 6327 if (IsCmpSwap) 6328 MIB.addReg(CmpVal); 6329 6330 MIB.addUse(RSrc) // rsrc 6331 .addUse(VIndex) // vindex 6332 .addUse(VOffset) // voffset 6333 .addUse(SOffset) // soffset 6334 .addImm(ImmOffset) // offset(imm) 6335 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 6336 .addImm(HasVIndex ? -1 : 0) // idxen(imm) 6337 .addMemOperand(MMO); 6338 6339 MI.eraseFromParent(); 6340 return true; 6341 } 6342 6343 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 6344 /// vector with s16 typed elements. 6345 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 6346 SmallVectorImpl<Register> &PackedAddrs, 6347 unsigned ArgOffset, 6348 const AMDGPU::ImageDimIntrinsicInfo *Intr, 6349 bool IsA16, bool IsG16) { 6350 const LLT S16 = LLT::scalar(16); 6351 const LLT V2S16 = LLT::fixed_vector(2, 16); 6352 auto EndIdx = Intr->VAddrEnd; 6353 6354 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 6355 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 6356 if (!SrcOp.isReg()) 6357 continue; // _L to _LZ may have eliminated this. 6358 6359 Register AddrReg = SrcOp.getReg(); 6360 6361 if ((I < Intr->GradientStart) || 6362 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 6363 (I >= Intr->CoordStart && !IsA16)) { 6364 if ((I < Intr->GradientStart) && IsA16 && 6365 (B.getMRI()->getType(AddrReg) == S16)) { 6366 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument"); 6367 // Special handling of bias when A16 is on. Bias is of type half but 6368 // occupies full 32-bit. 6369 PackedAddrs.push_back( 6370 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 6371 .getReg(0)); 6372 } else { 6373 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) && 6374 "Bias needs to be converted to 16 bit in A16 mode"); 6375 // Handle any gradient or coordinate operands that should not be packed 6376 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 6377 PackedAddrs.push_back(AddrReg); 6378 } 6379 } else { 6380 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 6381 // derivatives dx/dh and dx/dv are packed with undef. 
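// E.g. (sketch) a 1D sample with gradients packs as (dx/dh, undef),
// (dx/dv, undef), (x, undef) when only the single coordinate follows.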
6382 if (((I + 1) >= EndIdx) || 6383 ((Intr->NumGradients / 2) % 2 == 1 && 6384 (I == static_cast<unsigned>(Intr->GradientStart + 6385 (Intr->NumGradients / 2) - 1) || 6386 I == static_cast<unsigned>(Intr->GradientStart + 6387 Intr->NumGradients - 1))) || 6388 // Check for _L to _LZ optimization 6389 !MI.getOperand(ArgOffset + I + 1).isReg()) { 6390 PackedAddrs.push_back( 6391 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 6392 .getReg(0)); 6393 } else { 6394 PackedAddrs.push_back( 6395 B.buildBuildVector( 6396 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) 6397 .getReg(0)); 6398 ++I; 6399 } 6400 } 6401 } 6402 } 6403 6404 /// Convert from separate vaddr components to a single vector address register, 6405 /// and replace the remaining operands with $noreg. 6406 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 6407 int DimIdx, int NumVAddrs) { 6408 const LLT S32 = LLT::scalar(32); 6409 (void)S32; 6410 SmallVector<Register, 8> AddrRegs; 6411 for (int I = 0; I != NumVAddrs; ++I) { 6412 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 6413 if (SrcOp.isReg()) { 6414 AddrRegs.push_back(SrcOp.getReg()); 6415 assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 6416 } 6417 } 6418 6419 int NumAddrRegs = AddrRegs.size(); 6420 if (NumAddrRegs != 1) { 6421 auto VAddr = 6422 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs); 6423 MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 6424 } 6425 6426 for (int I = 1; I != NumVAddrs; ++I) { 6427 MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 6428 if (SrcOp.isReg()) 6429 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 6430 } 6431 } 6432 6433 /// Rewrite image intrinsics to use register layouts expected by the subtarget. 6434 /// 6435 /// Depending on the subtarget, load/store with 16-bit element data need to be 6436 /// rewritten to use the low half of 32-bit registers, or directly use a packed 6437 /// layout. 16-bit addresses should also sometimes be packed into 32-bit 6438 /// registers. 6439 /// 6440 /// We don't want to directly select image instructions just yet, but also want 6441 /// to exposes all register repacking to the legalizer/combiners. We also don't 6442 /// want a selected instruction entering RegBankSelect. In order to avoid 6443 /// defining a multitude of intermediate image instructions, directly hack on 6444 /// the intrinsic's arguments. In cases like a16 addresses, this requires 6445 /// padding now unnecessary arguments with $noreg. 6446 bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 6447 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 6448 const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 6449 6450 const MachineFunction &MF = *MI.getMF(); 6451 const unsigned NumDefs = MI.getNumExplicitDefs(); 6452 const unsigned ArgOffset = NumDefs + 1; 6453 bool IsTFE = NumDefs == 2; 6454 // We are only processing the operands of d16 image operations on subtargets 6455 // that use the unpacked register layout, or need to repack the TFE result. 6456 6457 // TODO: Do we need to guard against already legalized intrinsics? 
6458 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 6459 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 6460 6461 MachineRegisterInfo *MRI = B.getMRI(); 6462 const LLT S32 = LLT::scalar(32); 6463 const LLT S16 = LLT::scalar(16); 6464 const LLT V2S16 = LLT::fixed_vector(2, 16); 6465 6466 unsigned DMask = 0; 6467 Register VData; 6468 LLT Ty; 6469 6470 if (!BaseOpcode->NoReturn || BaseOpcode->Store) { 6471 VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg(); 6472 Ty = MRI->getType(VData); 6473 } 6474 6475 const bool IsAtomicPacked16Bit = 6476 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 || 6477 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16); 6478 6479 // Check for 16 bit addresses and pack if true. 6480 LLT GradTy = 6481 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 6482 LLT AddrTy = 6483 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 6484 const bool IsG16 = 6485 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16; 6486 const bool IsA16 = AddrTy == S16; 6487 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16; 6488 6489 int DMaskLanes = 0; 6490 if (!BaseOpcode->Atomic) { 6491 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 6492 if (BaseOpcode->Gather4) { 6493 DMaskLanes = 4; 6494 } else if (DMask != 0) { 6495 DMaskLanes = llvm::popcount(DMask); 6496 } else if (!IsTFE && !BaseOpcode->Store) { 6497 // If dmask is 0, this is a no-op load. This can be eliminated. 6498 B.buildUndef(MI.getOperand(0)); 6499 MI.eraseFromParent(); 6500 return true; 6501 } 6502 } 6503 6504 Observer.changingInstr(MI); 6505 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 6506 6507 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16 6508 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE; 6509 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 6510 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 6511 unsigned NewOpcode = LoadOpcode; 6512 if (BaseOpcode->Store) 6513 NewOpcode = StoreOpcode; 6514 else if (BaseOpcode->NoReturn) 6515 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET; 6516 6517 // Track that we legalized this 6518 MI.setDesc(B.getTII().get(NewOpcode)); 6519 6520 // Expecting to get an error flag since TFC is on - and dmask is 0 Force 6521 // dmask to be at least 1 otherwise the instruction will fail 6522 if (IsTFE && DMask == 0) { 6523 DMask = 0x1; 6524 DMaskLanes = 1; 6525 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 6526 } 6527 6528 if (BaseOpcode->Atomic) { 6529 Register VData0 = MI.getOperand(2).getReg(); 6530 LLT Ty = MRI->getType(VData0); 6531 6532 // TODO: Allow atomic swap and bit ops for v2s16/v4s16 6533 if (Ty.isVector() && !IsAtomicPacked16Bit) 6534 return false; 6535 6536 if (BaseOpcode->AtomicX2) { 6537 Register VData1 = MI.getOperand(3).getReg(); 6538 // The two values are packed in one register. 6539 LLT PackedTy = LLT::fixed_vector(2, Ty); 6540 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 6541 MI.getOperand(2).setReg(Concat.getReg(0)); 6542 MI.getOperand(3).setReg(AMDGPU::NoRegister); 6543 } 6544 } 6545 6546 unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 6547 6548 // Rewrite the addressing register layout before doing anything else. 
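// First reject 16-bit address configurations the subtarget cannot encode, then either pack the 16-bit components into dwords or collapse the separate dword components into a single vector register, depending on whether the NSA encoding is usable.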
6549 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 6550 // 16 bit gradients are supported, but are tied to the A16 control 6551 // so both gradients and addresses must be 16 bit 6552 return false; 6553 } 6554 6555 if (IsA16 && !ST.hasA16()) { 6556 // A16 not supported 6557 return false; 6558 } 6559 6560 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler); 6561 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding(); 6562 6563 if (IsA16 || IsG16) { 6564 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the 6565 // instructions expect VGPR_32 6566 SmallVector<Register, 4> PackedRegs; 6567 6568 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16); 6569 6570 // See also below in the non-a16 branch 6571 const bool UseNSA = ST.hasNSAEncoding() && 6572 PackedRegs.size() >= ST.getNSAThreshold(MF) && 6573 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA); 6574 const bool UsePartialNSA = 6575 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize; 6576 6577 if (UsePartialNSA) { 6578 // Pack registers that would go over NSAMaxSize into last VAddr register 6579 LLT PackedAddrTy = 6580 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16); 6581 auto Concat = B.buildConcatVectors( 6582 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1)); 6583 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0); 6584 PackedRegs.resize(NSAMaxSize); 6585 } else if (!UseNSA && PackedRegs.size() > 1) { 6586 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 6587 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 6588 PackedRegs[0] = Concat.getReg(0); 6589 PackedRegs.resize(1); 6590 } 6591 6592 const unsigned NumPacked = PackedRegs.size(); 6593 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 6594 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 6595 if (!SrcOp.isReg()) { 6596 assert(SrcOp.isImm() && SrcOp.getImm() == 0); 6597 continue; 6598 } 6599 6600 assert(SrcOp.getReg() != AMDGPU::NoRegister); 6601 6602 if (I - Intr->VAddrStart < NumPacked) 6603 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 6604 else 6605 SrcOp.setReg(AMDGPU::NoRegister); 6606 } 6607 } else { 6608 // If the register allocator cannot place the address registers contiguously 6609 // without introducing moves, then using the non-sequential address encoding 6610 // is always preferable, since it saves VALU instructions and is usually a 6611 // wash in terms of code size or even better. 6612 // 6613 // However, we currently have no way of hinting to the register allocator 6614 // that MIMG addresses should be placed contiguously when it is possible to 6615 // do so, so force non-NSA for the common 2-address case as a heuristic. 6616 // 6617 // SIShrinkInstructions will convert NSA encodings to non-NSA after register 6618 // allocation when possible. 6619 // 6620 // Partial NSA is allowed on GFX11+ where the final register is a contiguous 6621 // set of the remaining addresses. 
6622 const bool UseNSA = ST.hasNSAEncoding() && 6623 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) && 6624 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA); 6625 const bool UsePartialNSA = 6626 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize; 6627 6628 if (UsePartialNSA) { 6629 convertImageAddrToPacked(B, MI, 6630 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1, 6631 Intr->NumVAddrs - NSAMaxSize + 1); 6632 } else if (!UseNSA && Intr->NumVAddrs > 1) { 6633 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 6634 Intr->NumVAddrs); 6635 } 6636 } 6637 6638 int Flags = 0; 6639 if (IsA16) 6640 Flags |= 1; 6641 if (IsG16) 6642 Flags |= 2; 6643 MI.addOperand(MachineOperand::CreateImm(Flags)); 6644 6645 if (BaseOpcode->NoReturn) { // No TFE for stores? 6646 // TODO: Handle dmask trim 6647 if (!Ty.isVector() || !IsD16) 6648 return true; 6649 6650 Register RepackedReg = handleD16VData(B, *MRI, VData, true); 6651 if (RepackedReg != VData) { 6652 MI.getOperand(1).setReg(RepackedReg); 6653 } 6654 6655 return true; 6656 } 6657 6658 Register DstReg = MI.getOperand(0).getReg(); 6659 const LLT EltTy = Ty.getScalarType(); 6660 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 6661 6662 // Confirm that the return type is large enough for the dmask specified 6663 if (NumElts < DMaskLanes) 6664 return false; 6665 6666 if (NumElts > 4 || DMaskLanes > 4) 6667 return false; 6668 6669 // Image atomic instructions are using DMask to specify how many bits 6670 // input/output data will have. 32-bits (s32, v2s16) or 64-bits (s64, v4s16). 6671 // DMaskLanes for image atomic has default value '0'. 6672 // We must be sure that atomic variants (especially packed) will not be 6673 // truncated from v2s16 or v4s16 to s16 type. 6674 // 6675 // ChangeElementCount will be needed for image load where Ty is always scalar. 6676 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 6677 const LLT AdjustedTy = 6678 DMaskLanes == 0 6679 ? Ty 6680 : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); 6681 6682 // The raw dword aligned data component of the load. The only legal cases 6683 // where this matters should be when using the packed D16 format, for 6684 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 6685 LLT RoundedTy; 6686 6687 // S32 vector to cover all data, plus TFE result element. 6688 LLT TFETy; 6689 6690 // Register type to use for each loaded component. Will be S32 or V2S16. 6691 LLT RegTy; 6692 6693 if (IsD16 && ST.hasUnpackedD16VMem()) { 6694 RoundedTy = 6695 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32); 6696 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); 6697 RegTy = S32; 6698 } else { 6699 unsigned EltSize = EltTy.getSizeInBits(); 6700 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 6701 unsigned RoundedSize = 32 * RoundedElts; 6702 RoundedTy = LLT::scalarOrVector( 6703 ElementCount::getFixed(RoundedSize / EltSize), EltSize); 6704 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); 6705 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 6706 } 6707 6708 // The return type does not need adjustment. 6709 // TODO: Should we change s16 case to s32 or <2 x s16>? 6710 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 6711 return true; 6712 6713 Register Dst1Reg; 6714 6715 // Insert after the instruction. 6716 B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 6717 6718 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 6719 // s16> instead of s32, we would only need 1 bitcast instead of multiple. 
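// From here on the instruction is given a single wide result register, covering the loaded data plus the TFE dword when present, and the unmerges, truncates, and bitcasts needed to reconstruct the original result values are emitted after it.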
6720 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 6721 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 6722 6723 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 6724 6725 MI.getOperand(0).setReg(NewResultReg); 6726 6727 // In the IR, TFE is supposed to be used with a 2 element struct return 6728 // type. The instruction really returns these two values in one contiguous 6729 // register, with one additional dword beyond the loaded data. Rewrite the 6730 // return type to use a single register result. 6731 6732 if (IsTFE) { 6733 Dst1Reg = MI.getOperand(1).getReg(); 6734 if (MRI->getType(Dst1Reg) != S32) 6735 return false; 6736 6737 // TODO: Make sure the TFE operand bit is set. 6738 MI.removeOperand(1); 6739 6740 // Handle the easy case that requires no repack instructions. 6741 if (Ty == S32) { 6742 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 6743 return true; 6744 } 6745 } 6746 6747 // Now figure out how to copy the new result register back into the old 6748 // result. 6749 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 6750 6751 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 6752 6753 if (ResultNumRegs == 1) { 6754 assert(!IsTFE); 6755 ResultRegs[0] = NewResultReg; 6756 } else { 6757 // We have to repack into a new vector of some kind. 6758 for (int I = 0; I != NumDataRegs; ++I) 6759 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 6760 B.buildUnmerge(ResultRegs, NewResultReg); 6761 6762 // Drop the final TFE element to get the data part. The TFE result is 6763 // directly written to the right place already. 6764 if (IsTFE) 6765 ResultRegs.resize(NumDataRegs); 6766 } 6767 6768 // For an s16 scalar result, we form an s32 result with a truncate regardless 6769 // of packed vs. unpacked. 6770 if (IsD16 && !Ty.isVector()) { 6771 B.buildTrunc(DstReg, ResultRegs[0]); 6772 return true; 6773 } 6774 6775 // Avoid a build/concat_vector of 1 entry. 6776 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 6777 B.buildBitcast(DstReg, ResultRegs[0]); 6778 return true; 6779 } 6780 6781 assert(Ty.isVector()); 6782 6783 if (IsD16) { 6784 // For packed D16 results with TFE enabled, all the data components are 6785 // S32. Cast back to the expected type. 6786 // 6787 // TODO: We don't really need to use load s32 elements. We would only need one 6788 // cast for the TFE result if a multiple of v2s16 was used. 6789 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 6790 for (Register &Reg : ResultRegs) 6791 Reg = B.buildBitcast(V2S16, Reg).getReg(0); 6792 } else if (ST.hasUnpackedD16VMem()) { 6793 for (Register &Reg : ResultRegs) 6794 Reg = B.buildTrunc(S16, Reg).getReg(0); 6795 } 6796 } 6797 6798 auto padWithUndef = [&](LLT Ty, int NumElts) { 6799 if (NumElts == 0) 6800 return; 6801 Register Undef = B.buildUndef(Ty).getReg(0); 6802 for (int I = 0; I != NumElts; ++I) 6803 ResultRegs.push_back(Undef); 6804 }; 6805 6806 // Pad out any elements eliminated due to the dmask. 6807 LLT ResTy = MRI->getType(ResultRegs[0]); 6808 if (!ResTy.isVector()) { 6809 padWithUndef(ResTy, NumElts - ResultRegs.size()); 6810 B.buildBuildVector(DstReg, ResultRegs); 6811 return true; 6812 } 6813 6814 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 6815 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 6816 6817 // Deal with the one annoying legal case. 
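// A <3 x s16> destination cannot be assembled directly from <2 x s16> pieces, so form a <4 x s16> (or reuse the single piece) and then drop or pad trailing elements to match the destination type.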
6818 const LLT V3S16 = LLT::fixed_vector(3, 16); 6819 if (Ty == V3S16) { 6820 if (IsTFE) { 6821 if (ResultRegs.size() == 1) { 6822 NewResultReg = ResultRegs[0]; 6823 } else if (ResultRegs.size() == 2) { 6824 LLT V4S16 = LLT::fixed_vector(4, 16); 6825 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); 6826 } else { 6827 return false; 6828 } 6829 } 6830 6831 if (MRI->getType(DstReg).getNumElements() < 6832 MRI->getType(NewResultReg).getNumElements()) { 6833 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg); 6834 } else { 6835 B.buildPadVectorWithUndefElements(DstReg, NewResultReg); 6836 } 6837 return true; 6838 } 6839 6840 padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 6841 B.buildConcatVectors(DstReg, ResultRegs); 6842 return true; 6843 } 6844 6845 bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper, 6846 MachineInstr &MI) const { 6847 MachineIRBuilder &B = Helper.MIRBuilder; 6848 GISelChangeObserver &Observer = Helper.Observer; 6849 6850 Register OrigDst = MI.getOperand(0).getReg(); 6851 Register Dst; 6852 LLT Ty = B.getMRI()->getType(OrigDst); 6853 unsigned Size = Ty.getSizeInBits(); 6854 MachineFunction &MF = B.getMF(); 6855 unsigned Opc = 0; 6856 if (Size < 32 && ST.hasScalarSubwordLoads()) { 6857 assert(Size == 8 || Size == 16); 6858 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE 6859 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT; 6860 // The 8-bit and 16-bit scalar buffer load instructions have 32-bit 6861 // destination register. 6862 Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32)); 6863 } else { 6864 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD; 6865 Dst = OrigDst; 6866 } 6867 6868 Observer.changingInstr(MI); 6869 6870 // Handle needing to s.buffer.load() a p8 value. 6871 if (hasBufferRsrcWorkaround(Ty)) { 6872 Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0); 6873 B.setInsertPt(B.getMBB(), MI); 6874 } 6875 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 6876 Ty = getBitcastRegisterType(Ty); 6877 Helper.bitcastDst(MI, Ty, 0); 6878 B.setInsertPt(B.getMBB(), MI); 6879 } 6880 6881 // FIXME: We don't really need this intermediate instruction. The intrinsic 6882 // should be fixed to have a memory operand. Since it's readnone, we're not 6883 // allowed to add one. 6884 MI.setDesc(B.getTII().get(Opc)); 6885 MI.removeOperand(1); // Remove intrinsic ID 6886 6887 // FIXME: When intrinsic definition is fixed, this should have an MMO already. 6888 const unsigned MemSize = (Size + 7) / 8; 6889 const Align MemAlign = B.getDataLayout().getABITypeAlign( 6890 getTypeForLLT(Ty, MF.getFunction().getContext())); 6891 MachineMemOperand *MMO = MF.getMachineMemOperand( 6892 MachinePointerInfo(), 6893 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 6894 MachineMemOperand::MOInvariant, 6895 MemSize, MemAlign); 6896 MI.addMemOperand(MF, MMO); 6897 if (Dst != OrigDst) { 6898 MI.getOperand(0).setReg(Dst); 6899 B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 6900 B.buildTrunc(OrigDst, Dst); 6901 } 6902 6903 // If we don't have 96-bit result scalar loads, widening to 128-bit should 6904 // always be legal. We may need to restore this to a 96-bit result if it turns 6905 // out this needs to be converted to a vector load during RegBankSelect. 
6906 if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) { 6907 if (Ty.isVector()) 6908 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 6909 else 6910 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 6911 } 6912 6913 Observer.changedInstr(MI); 6914 return true; 6915 } 6916 6917 bool AMDGPULegalizerInfo::legalizeSBufferPrefetch(LegalizerHelper &Helper, 6918 MachineInstr &MI) const { 6919 MachineIRBuilder &B = Helper.MIRBuilder; 6920 GISelChangeObserver &Observer = Helper.Observer; 6921 Observer.changingInstr(MI); 6922 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH)); 6923 MI.removeOperand(0); // Remove intrinsic ID 6924 castBufferRsrcArgToV4I32(MI, B, 0); 6925 Observer.changedInstr(MI); 6926 return true; 6927 } 6928 6929 // TODO: Move to selection 6930 bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI, 6931 MachineRegisterInfo &MRI, 6932 MachineIRBuilder &B) const { 6933 if (!ST.isTrapHandlerEnabled() || 6934 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 6935 return legalizeTrapEndpgm(MI, MRI, B); 6936 6937 return ST.supportsGetDoorbellID() ? 6938 legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B); 6939 } 6940 6941 bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 6942 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 6943 const DebugLoc &DL = MI.getDebugLoc(); 6944 MachineBasicBlock &BB = B.getMBB(); 6945 MachineFunction *MF = BB.getParent(); 6946 6947 if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) { 6948 BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM)) 6949 .addImm(0); 6950 MI.eraseFromParent(); 6951 return true; 6952 } 6953 6954 // We need a block split to make the real endpgm a terminator. We also don't 6955 // want to break phis in successor blocks, so we can't just delete to the 6956 // end of the block. 6957 BB.splitAt(MI, false /*UpdateLiveIns*/); 6958 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 6959 MF->push_back(TrapBB); 6960 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM)) 6961 .addImm(0); 6962 BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ)) 6963 .addMBB(TrapBB); 6964 6965 BB.addSuccessor(TrapBB); 6966 MI.eraseFromParent(); 6967 return true; 6968 } 6969 6970 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 6971 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 6972 MachineFunction &MF = B.getMF(); 6973 const LLT S64 = LLT::scalar(64); 6974 6975 Register SGPR01(AMDGPU::SGPR0_SGPR1); 6976 // For code object version 5, queue_ptr is passed through implicit kernarg. 6977 if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >= 6978 AMDGPU::AMDHSA_COV5) { 6979 AMDGPUTargetLowering::ImplicitParameter Param = 6980 AMDGPUTargetLowering::QUEUE_PTR; 6981 uint64_t Offset = 6982 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); 6983 6984 Register KernargPtrReg = MRI.createGenericVirtualRegister( 6985 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 6986 6987 if (!loadInputValue(KernargPtrReg, B, 6988 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 6989 return false; 6990 6991 // TODO: can we be smarter about machine pointer info? 
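// Load the 64-bit queue pointer from its offset in the implicit kernarg segment, then copy it into SGPR0_SGPR1 so the trap handler can find it.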
6992 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 6993 MachineMemOperand *MMO = MF.getMachineMemOperand( 6994 PtrInfo, 6995 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 6996 MachineMemOperand::MOInvariant, 6997 LLT::scalar(64), commonAlignment(Align(64), Offset)); 6998 6999 // Pointer address 7000 Register LoadAddr = MRI.createGenericVirtualRegister( 7001 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 7002 B.buildPtrAdd(LoadAddr, KernargPtrReg, 7003 B.buildConstant(LLT::scalar(64), Offset).getReg(0)); 7004 // Load address 7005 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0); 7006 B.buildCopy(SGPR01, Temp); 7007 B.buildInstr(AMDGPU::S_TRAP) 7008 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 7009 .addReg(SGPR01, RegState::Implicit); 7010 MI.eraseFromParent(); 7011 return true; 7012 } 7013 7014 // Pass queue pointer to trap handler as input, and insert trap instruction 7015 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 7016 Register LiveIn = 7017 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 7018 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 7019 return false; 7020 7021 B.buildCopy(SGPR01, LiveIn); 7022 B.buildInstr(AMDGPU::S_TRAP) 7023 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 7024 .addReg(SGPR01, RegState::Implicit); 7025 7026 MI.eraseFromParent(); 7027 return true; 7028 } 7029 7030 bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI, 7031 MachineRegisterInfo &MRI, 7032 MachineIRBuilder &B) const { 7033 // We need to simulate the 's_trap 2' instruction on targets that run in 7034 // PRIV=1 (where it is treated as a nop). 7035 if (ST.hasPrivEnabledTrap2NopBug()) { 7036 ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI, 7037 MI.getDebugLoc()); 7038 MI.eraseFromParent(); 7039 return true; 7040 } 7041 7042 B.buildInstr(AMDGPU::S_TRAP) 7043 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 7044 MI.eraseFromParent(); 7045 return true; 7046 } 7047 7048 bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI, 7049 MachineRegisterInfo &MRI, 7050 MachineIRBuilder &B) const { 7051 // Is non-HSA path or trap-handler disabled? 
Then, report a warning 7052 // accordingly 7053 if (!ST.isTrapHandlerEnabled() || 7054 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 7055 Function &Fn = B.getMF().getFunction(); 7056 Fn.getContext().diagnose(DiagnosticInfoUnsupported( 7057 Fn, "debugtrap handler not supported", MI.getDebugLoc(), DS_Warning)); 7058 } else { 7059 // Insert debug-trap instruction 7060 B.buildInstr(AMDGPU::S_TRAP) 7061 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 7062 } 7063 7064 MI.eraseFromParent(); 7065 return true; 7066 } 7067 7068 bool AMDGPULegalizerInfo::legalizeBVHIntersectRayIntrinsic( 7069 MachineInstr &MI, MachineIRBuilder &B) const { 7070 MachineRegisterInfo &MRI = *B.getMRI(); 7071 const LLT S16 = LLT::scalar(16); 7072 const LLT S32 = LLT::scalar(32); 7073 const LLT V2S16 = LLT::fixed_vector(2, 16); 7074 const LLT V3S32 = LLT::fixed_vector(3, 32); 7075 7076 Register DstReg = MI.getOperand(0).getReg(); 7077 Register NodePtr = MI.getOperand(2).getReg(); 7078 Register RayExtent = MI.getOperand(3).getReg(); 7079 Register RayOrigin = MI.getOperand(4).getReg(); 7080 Register RayDir = MI.getOperand(5).getReg(); 7081 Register RayInvDir = MI.getOperand(6).getReg(); 7082 Register TDescr = MI.getOperand(7).getReg(); 7083 7084 if (!ST.hasGFX10_AEncoding()) { 7085 Function &Fn = B.getMF().getFunction(); 7086 Fn.getContext().diagnose(DiagnosticInfoUnsupported( 7087 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc())); 7088 return false; 7089 } 7090 7091 const bool IsGFX11 = AMDGPU::isGFX11(ST); 7092 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST); 7093 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST); 7094 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 7095 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 7096 const unsigned NumVDataDwords = 4; 7097 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); 7098 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords; 7099 const bool UseNSA = 7100 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize()); 7101 7102 const unsigned BaseOpcodes[2][2] = { 7103 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, 7104 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, 7105 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; 7106 int Opcode; 7107 if (UseNSA) { 7108 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 7109 IsGFX12Plus ? AMDGPU::MIMGEncGfx12 7110 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA 7111 : AMDGPU::MIMGEncGfx10NSA, 7112 NumVDataDwords, NumVAddrDwords); 7113 } else { 7114 assert(!IsGFX12Plus); 7115 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 7116 IsGFX11 ? 
AMDGPU::MIMGEncGfx11Default 7117 : AMDGPU::MIMGEncGfx10Default, 7118 NumVDataDwords, NumVAddrDwords); 7119 } 7120 assert(Opcode != -1); 7121 7122 SmallVector<Register, 12> Ops; 7123 if (UseNSA && IsGFX11Plus) { 7124 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) { 7125 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 7126 auto Merged = B.buildMergeLikeInstr( 7127 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)}); 7128 Ops.push_back(Merged.getReg(0)); 7129 }; 7130 7131 Ops.push_back(NodePtr); 7132 Ops.push_back(RayExtent); 7133 packLanes(RayOrigin); 7134 7135 if (IsA16) { 7136 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 7137 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 7138 auto MergedDir = B.buildMergeLikeInstr( 7139 V3S32, 7140 {B.buildBitcast( 7141 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0), 7142 UnmergeRayDir.getReg(0)})) 7143 .getReg(0), 7144 B.buildBitcast( 7145 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1), 7146 UnmergeRayDir.getReg(1)})) 7147 .getReg(0), 7148 B.buildBitcast( 7149 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2), 7150 UnmergeRayDir.getReg(2)})) 7151 .getReg(0)}); 7152 Ops.push_back(MergedDir.getReg(0)); 7153 } else { 7154 packLanes(RayDir); 7155 packLanes(RayInvDir); 7156 } 7157 } else { 7158 if (Is64) { 7159 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 7160 Ops.push_back(Unmerge.getReg(0)); 7161 Ops.push_back(Unmerge.getReg(1)); 7162 } else { 7163 Ops.push_back(NodePtr); 7164 } 7165 Ops.push_back(RayExtent); 7166 7167 auto packLanes = [&Ops, &S32, &B](Register Src) { 7168 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 7169 Ops.push_back(Unmerge.getReg(0)); 7170 Ops.push_back(Unmerge.getReg(1)); 7171 Ops.push_back(Unmerge.getReg(2)); 7172 }; 7173 7174 packLanes(RayOrigin); 7175 if (IsA16) { 7176 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 7177 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 7178 Register R1 = MRI.createGenericVirtualRegister(S32); 7179 Register R2 = MRI.createGenericVirtualRegister(S32); 7180 Register R3 = MRI.createGenericVirtualRegister(S32); 7181 B.buildMergeLikeInstr(R1, 7182 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 7183 B.buildMergeLikeInstr( 7184 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 7185 B.buildMergeLikeInstr( 7186 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 7187 Ops.push_back(R1); 7188 Ops.push_back(R2); 7189 Ops.push_back(R3); 7190 } else { 7191 packLanes(RayDir); 7192 packLanes(RayInvDir); 7193 } 7194 } 7195 7196 if (!UseNSA) { 7197 // Build a single vector containing all the operands so far prepared. 7198 LLT OpTy = LLT::fixed_vector(Ops.size(), 32); 7199 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0); 7200 Ops.clear(); 7201 Ops.push_back(MergedOps); 7202 } 7203 7204 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY) 7205 .addDef(DstReg) 7206 .addImm(Opcode); 7207 7208 for (Register R : Ops) { 7209 MIB.addUse(R); 7210 } 7211 7212 MIB.addUse(TDescr) 7213 .addImm(IsA16 ? 
1 : 0) 7214 .cloneMemRefs(MI); 7215 7216 MI.eraseFromParent(); 7217 return true; 7218 } 7219 7220 bool AMDGPULegalizerInfo::legalizeBVHDualOrBVH8IntersectRayIntrinsic( 7221 MachineInstr &MI, MachineIRBuilder &B) const { 7222 const LLT S32 = LLT::scalar(32); 7223 const LLT V2S32 = LLT::fixed_vector(2, 32); 7224 7225 Register DstReg = MI.getOperand(0).getReg(); 7226 Register DstOrigin = MI.getOperand(1).getReg(); 7227 Register DstDir = MI.getOperand(2).getReg(); 7228 Register NodePtr = MI.getOperand(4).getReg(); 7229 Register RayExtent = MI.getOperand(5).getReg(); 7230 Register InstanceMask = MI.getOperand(6).getReg(); 7231 Register RayOrigin = MI.getOperand(7).getReg(); 7232 Register RayDir = MI.getOperand(8).getReg(); 7233 Register Offsets = MI.getOperand(9).getReg(); 7234 Register TDescr = MI.getOperand(10).getReg(); 7235 7236 if (!ST.hasBVHDualAndBVH8Insts()) { 7237 Function &Fn = B.getMF().getFunction(); 7238 Fn.getContext().diagnose(DiagnosticInfoUnsupported( 7239 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc())); 7240 return false; 7241 } 7242 7243 bool IsBVH8 = cast<GIntrinsic>(MI).getIntrinsicID() == 7244 Intrinsic::amdgcn_image_bvh8_intersect_ray; 7245 const unsigned NumVDataDwords = 10; 7246 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12; 7247 int Opcode = AMDGPU::getMIMGOpcode( 7248 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY 7249 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY, 7250 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords); 7251 assert(Opcode != -1); 7252 7253 auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr( 7254 V2S32, {RayExtent, B.buildAnyExt(S32, InstanceMask)}); 7255 7256 B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY 7257 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY) 7258 .addDef(DstReg) 7259 .addDef(DstOrigin) 7260 .addDef(DstDir) 7261 .addImm(Opcode) 7262 .addUse(NodePtr) 7263 .addUse(RayExtentInstanceMaskVec.getReg(0)) 7264 .addUse(RayOrigin) 7265 .addUse(RayDir) 7266 .addUse(Offsets) 7267 .addUse(TDescr) 7268 .cloneMemRefs(MI); 7269 7270 MI.eraseFromParent(); 7271 return true; 7272 } 7273 7274 bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI, 7275 MachineIRBuilder &B) const { 7276 const SITargetLowering *TLI = ST.getTargetLowering(); 7277 Register StackPtr = TLI->getStackPointerRegisterToSaveRestore(); 7278 Register DstReg = MI.getOperand(0).getReg(); 7279 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr}); 7280 MI.eraseFromParent(); 7281 return true; 7282 } 7283 7284 bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI, 7285 MachineIRBuilder &B) const { 7286 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25]. 
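// Without architected SGPRs there is no register holding the wave ID, so fail; otherwise extract the 5-bit field at bit 25 of TTMP8 with an unsigned bitfield extract.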
7287 if (!ST.hasArchitectedSGPRs()) 7288 return false; 7289 LLT S32 = LLT::scalar(32); 7290 Register DstReg = MI.getOperand(0).getReg(); 7291 auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8)); 7292 auto LSB = B.buildConstant(S32, 25); 7293 auto Width = B.buildConstant(S32, 5); 7294 B.buildUbfx(DstReg, TTMP8, LSB, Width); 7295 MI.eraseFromParent(); 7296 return true; 7297 } 7298 7299 static constexpr unsigned FPEnvModeBitField = 7300 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23); 7301 7302 static constexpr unsigned FPEnvTrapBitField = 7303 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5); 7304 7305 bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI, 7306 MachineRegisterInfo &MRI, 7307 MachineIRBuilder &B) const { 7308 Register Src = MI.getOperand(0).getReg(); 7309 if (MRI.getType(Src) != S64) 7310 return false; 7311 7312 auto ModeReg = 7313 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32}, 7314 /*HasSideEffects=*/true, /*isConvergent=*/false) 7315 .addImm(FPEnvModeBitField); 7316 auto TrapReg = 7317 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32}, 7318 /*HasSideEffects=*/true, /*isConvergent=*/false) 7319 .addImm(FPEnvTrapBitField); 7320 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg}); 7321 MI.eraseFromParent(); 7322 return true; 7323 } 7324 7325 bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI, 7326 MachineRegisterInfo &MRI, 7327 MachineIRBuilder &B) const { 7328 Register Src = MI.getOperand(0).getReg(); 7329 if (MRI.getType(Src) != S64) 7330 return false; 7331 7332 auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0)); 7333 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(), 7334 /*HasSideEffects=*/true, /*isConvergent=*/false) 7335 .addImm(static_cast<int16_t>(FPEnvModeBitField)) 7336 .addReg(Unmerge.getReg(0)); 7337 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(), 7338 /*HasSideEffects=*/true, /*isConvergent=*/false) 7339 .addImm(static_cast<int16_t>(FPEnvTrapBitField)) 7340 .addReg(Unmerge.getReg(1)); 7341 MI.eraseFromParent(); 7342 return true; 7343 } 7344 7345 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 7346 MachineInstr &MI) const { 7347 MachineIRBuilder &B = Helper.MIRBuilder; 7348 MachineRegisterInfo &MRI = *B.getMRI(); 7349 7350 // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
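// amdgcn.if/else/loop produce a condition that is consumed by a G_BRCOND; verifyCFIntrinsic locates that branch so it can be rewritten into the SI_IF/SI_ELSE/SI_LOOP pseudos, which take the branch target directly.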
7351 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID(); 7352 switch (IntrID) { 7353 case Intrinsic::amdgcn_if: 7354 case Intrinsic::amdgcn_else: { 7355 MachineInstr *Br = nullptr; 7356 MachineBasicBlock *UncondBrTarget = nullptr; 7357 bool Negated = false; 7358 if (MachineInstr *BrCond = 7359 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 7360 const SIRegisterInfo *TRI 7361 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 7362 7363 Register Def = MI.getOperand(1).getReg(); 7364 Register Use = MI.getOperand(3).getReg(); 7365 7366 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 7367 7368 if (Negated) 7369 std::swap(CondBrTarget, UncondBrTarget); 7370 7371 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 7372 if (IntrID == Intrinsic::amdgcn_if) { 7373 B.buildInstr(AMDGPU::SI_IF) 7374 .addDef(Def) 7375 .addUse(Use) 7376 .addMBB(UncondBrTarget); 7377 } else { 7378 B.buildInstr(AMDGPU::SI_ELSE) 7379 .addDef(Def) 7380 .addUse(Use) 7381 .addMBB(UncondBrTarget); 7382 } 7383 7384 if (Br) { 7385 Br->getOperand(0).setMBB(CondBrTarget); 7386 } else { 7387 // The IRTranslator skips inserting the G_BR for fallthrough cases, but 7388 // since we're swapping branch targets it needs to be reinserted. 7389 // FIXME: IRTranslator should probably not do this 7390 B.buildBr(*CondBrTarget); 7391 } 7392 7393 MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 7394 MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 7395 MI.eraseFromParent(); 7396 BrCond->eraseFromParent(); 7397 return true; 7398 } 7399 7400 return false; 7401 } 7402 case Intrinsic::amdgcn_loop: { 7403 MachineInstr *Br = nullptr; 7404 MachineBasicBlock *UncondBrTarget = nullptr; 7405 bool Negated = false; 7406 if (MachineInstr *BrCond = 7407 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 7408 const SIRegisterInfo *TRI 7409 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 7410 7411 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 7412 Register Reg = MI.getOperand(2).getReg(); 7413 7414 if (Negated) 7415 std::swap(CondBrTarget, UncondBrTarget); 7416 7417 B.setInsertPt(B.getMBB(), BrCond->getIterator()); 7418 B.buildInstr(AMDGPU::SI_LOOP) 7419 .addUse(Reg) 7420 .addMBB(UncondBrTarget); 7421 7422 if (Br) 7423 Br->getOperand(0).setMBB(CondBrTarget); 7424 else 7425 B.buildBr(*CondBrTarget); 7426 7427 MI.eraseFromParent(); 7428 BrCond->eraseFromParent(); 7429 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 7430 return true; 7431 } 7432 7433 return false; 7434 } 7435 case Intrinsic::amdgcn_addrspacecast_nonnull: 7436 return legalizeAddrSpaceCast(MI, MRI, B); 7437 case Intrinsic::amdgcn_make_buffer_rsrc: 7438 return legalizePointerAsRsrcIntrin(MI, MRI, B); 7439 case Intrinsic::amdgcn_kernarg_segment_ptr: 7440 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 7441 // This only makes sense to call in a kernel, so just lower to null. 
7442 B.buildConstant(MI.getOperand(0).getReg(), 0); 7443 MI.eraseFromParent(); 7444 return true; 7445 } 7446 7447 return legalizePreloadedArgIntrin( 7448 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 7449 case Intrinsic::amdgcn_implicitarg_ptr: 7450 return legalizeImplicitArgPtr(MI, MRI, B); 7451 case Intrinsic::amdgcn_workitem_id_x: 7452 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0, 7453 AMDGPUFunctionArgInfo::WORKITEM_ID_X); 7454 case Intrinsic::amdgcn_workitem_id_y: 7455 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1, 7456 AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 7457 case Intrinsic::amdgcn_workitem_id_z: 7458 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2, 7459 AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 7460 case Intrinsic::amdgcn_workgroup_id_x: 7461 return legalizePreloadedArgIntrin(MI, MRI, B, 7462 AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 7463 case Intrinsic::amdgcn_workgroup_id_y: 7464 return legalizePreloadedArgIntrin(MI, MRI, B, 7465 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 7466 case Intrinsic::amdgcn_workgroup_id_z: 7467 return legalizePreloadedArgIntrin(MI, MRI, B, 7468 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 7469 case Intrinsic::amdgcn_wave_id: 7470 return legalizeWaveID(MI, B); 7471 case Intrinsic::amdgcn_lds_kernel_id: 7472 return legalizePreloadedArgIntrin(MI, MRI, B, 7473 AMDGPUFunctionArgInfo::LDS_KERNEL_ID); 7474 case Intrinsic::amdgcn_dispatch_ptr: 7475 return legalizePreloadedArgIntrin(MI, MRI, B, 7476 AMDGPUFunctionArgInfo::DISPATCH_PTR); 7477 case Intrinsic::amdgcn_queue_ptr: 7478 return legalizePreloadedArgIntrin(MI, MRI, B, 7479 AMDGPUFunctionArgInfo::QUEUE_PTR); 7480 case Intrinsic::amdgcn_implicit_buffer_ptr: 7481 return legalizePreloadedArgIntrin( 7482 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 7483 case Intrinsic::amdgcn_dispatch_id: 7484 return legalizePreloadedArgIntrin(MI, MRI, B, 7485 AMDGPUFunctionArgInfo::DISPATCH_ID); 7486 case Intrinsic::r600_read_ngroups_x: 7487 // TODO: Emit error for hsa 7488 return legalizeKernargMemParameter(MI, B, 7489 SI::KernelInputOffsets::NGROUPS_X); 7490 case Intrinsic::r600_read_ngroups_y: 7491 return legalizeKernargMemParameter(MI, B, 7492 SI::KernelInputOffsets::NGROUPS_Y); 7493 case Intrinsic::r600_read_ngroups_z: 7494 return legalizeKernargMemParameter(MI, B, 7495 SI::KernelInputOffsets::NGROUPS_Z); 7496 case Intrinsic::r600_read_local_size_x: 7497 // TODO: Could insert G_ASSERT_ZEXT from s16 7498 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X); 7499 case Intrinsic::r600_read_local_size_y: 7500 // TODO: Could insert G_ASSERT_ZEXT from s16 7501 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y); 7502 // TODO: Could insert G_ASSERT_ZEXT from s16 7503 case Intrinsic::r600_read_local_size_z: 7504 return legalizeKernargMemParameter(MI, B, 7505 SI::KernelInputOffsets::LOCAL_SIZE_Z); 7506 case Intrinsic::amdgcn_fdiv_fast: 7507 return legalizeFDIVFastIntrin(MI, MRI, B); 7508 case Intrinsic::amdgcn_is_shared: 7509 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 7510 case Intrinsic::amdgcn_is_private: 7511 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 7512 case Intrinsic::amdgcn_wavefrontsize: { 7513 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 7514 MI.eraseFromParent(); 7515 return true; 7516 } 7517 case Intrinsic::amdgcn_s_buffer_load: 7518 return legalizeSBufferLoad(Helper, MI); 7519 case Intrinsic::amdgcn_raw_buffer_store: 7520 case Intrinsic::amdgcn_raw_ptr_buffer_store: 7521 case 
Intrinsic::amdgcn_struct_buffer_store: 7522 case Intrinsic::amdgcn_struct_ptr_buffer_store: 7523 return legalizeBufferStore(MI, Helper, false, false); 7524 case Intrinsic::amdgcn_raw_buffer_store_format: 7525 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: 7526 case Intrinsic::amdgcn_struct_buffer_store_format: 7527 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: 7528 return legalizeBufferStore(MI, Helper, false, true); 7529 case Intrinsic::amdgcn_raw_tbuffer_store: 7530 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: 7531 case Intrinsic::amdgcn_struct_tbuffer_store: 7532 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: 7533 return legalizeBufferStore(MI, Helper, true, true); 7534 case Intrinsic::amdgcn_raw_buffer_load: 7535 case Intrinsic::amdgcn_raw_ptr_buffer_load: 7536 case Intrinsic::amdgcn_raw_atomic_buffer_load: 7537 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load: 7538 case Intrinsic::amdgcn_struct_buffer_load: 7539 case Intrinsic::amdgcn_struct_ptr_buffer_load: 7540 case Intrinsic::amdgcn_struct_atomic_buffer_load: 7541 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: 7542 return legalizeBufferLoad(MI, Helper, false, false); 7543 case Intrinsic::amdgcn_raw_buffer_load_format: 7544 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: 7545 case Intrinsic::amdgcn_struct_buffer_load_format: 7546 case Intrinsic::amdgcn_struct_ptr_buffer_load_format: 7547 return legalizeBufferLoad(MI, Helper, true, false); 7548 case Intrinsic::amdgcn_raw_tbuffer_load: 7549 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: 7550 case Intrinsic::amdgcn_struct_tbuffer_load: 7551 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: 7552 return legalizeBufferLoad(MI, Helper, true, true); 7553 case Intrinsic::amdgcn_raw_buffer_atomic_swap: 7554 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap: 7555 case Intrinsic::amdgcn_struct_buffer_atomic_swap: 7556 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap: 7557 case Intrinsic::amdgcn_raw_buffer_atomic_add: 7558 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add: 7559 case Intrinsic::amdgcn_struct_buffer_atomic_add: 7560 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add: 7561 case Intrinsic::amdgcn_raw_buffer_atomic_sub: 7562 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub: 7563 case Intrinsic::amdgcn_struct_buffer_atomic_sub: 7564 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub: 7565 case Intrinsic::amdgcn_raw_buffer_atomic_smin: 7566 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin: 7567 case Intrinsic::amdgcn_struct_buffer_atomic_smin: 7568 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin: 7569 case Intrinsic::amdgcn_raw_buffer_atomic_umin: 7570 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin: 7571 case Intrinsic::amdgcn_struct_buffer_atomic_umin: 7572 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin: 7573 case Intrinsic::amdgcn_raw_buffer_atomic_smax: 7574 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax: 7575 case Intrinsic::amdgcn_struct_buffer_atomic_smax: 7576 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax: 7577 case Intrinsic::amdgcn_raw_buffer_atomic_umax: 7578 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax: 7579 case Intrinsic::amdgcn_struct_buffer_atomic_umax: 7580 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax: 7581 case Intrinsic::amdgcn_raw_buffer_atomic_and: 7582 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and: 7583 case Intrinsic::amdgcn_struct_buffer_atomic_and: 7584 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and: 7585 case Intrinsic::amdgcn_raw_buffer_atomic_or: 7586 case 
Intrinsic::amdgcn_raw_ptr_buffer_atomic_or: 7587 case Intrinsic::amdgcn_struct_buffer_atomic_or: 7588 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or: 7589 case Intrinsic::amdgcn_raw_buffer_atomic_xor: 7590 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor: 7591 case Intrinsic::amdgcn_struct_buffer_atomic_xor: 7592 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor: 7593 case Intrinsic::amdgcn_raw_buffer_atomic_inc: 7594 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc: 7595 case Intrinsic::amdgcn_struct_buffer_atomic_inc: 7596 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc: 7597 case Intrinsic::amdgcn_raw_buffer_atomic_dec: 7598 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec: 7599 case Intrinsic::amdgcn_struct_buffer_atomic_dec: 7600 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec: 7601 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 7602 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: 7603 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 7604 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: 7605 case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 7606 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin: 7607 case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 7608 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin: 7609 case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 7610 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax: 7611 case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 7612 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax: 7613 case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 7614 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd: 7615 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 7616 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: 7617 return legalizeBufferAtomic(MI, B, IntrID); 7618 case Intrinsic::amdgcn_rsq_clamp: 7619 return legalizeRsqClampIntrinsic(MI, MRI, B); 7620 case Intrinsic::amdgcn_image_bvh_intersect_ray: 7621 return legalizeBVHIntersectRayIntrinsic(MI, B); 7622 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray: 7623 case Intrinsic::amdgcn_image_bvh8_intersect_ray: 7624 return legalizeBVHDualOrBVH8IntersectRayIntrinsic(MI, B); 7625 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16: 7626 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16: 7627 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16: 7628 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16: 7629 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8: 7630 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8: 7631 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8: 7632 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: { 7633 Register Index = MI.getOperand(5).getReg(); 7634 LLT S32 = LLT::scalar(32); 7635 if (MRI.getType(Index) != S32) 7636 MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0)); 7637 return true; 7638 } 7639 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: 7640 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: 7641 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: { 7642 Register Index = MI.getOperand(7).getReg(); 7643 LLT S32 = LLT::scalar(32); 7644 if (MRI.getType(Index) != S32) 7645 MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0)); 7646 return true; 7647 } 7648 case Intrinsic::amdgcn_fmed3: { 7649 GISelChangeObserver &Observer = Helper.Observer; 7650 7651 // FIXME: This is to workaround the inability of tablegen match combiners to 7652 // match intrinsics in patterns. 
7653 Observer.changingInstr(MI); 7654 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3)); 7655 MI.removeOperand(1); 7656 Observer.changedInstr(MI); 7657 return true; 7658 } 7659 case Intrinsic::amdgcn_readlane: 7660 case Intrinsic::amdgcn_writelane: 7661 case Intrinsic::amdgcn_readfirstlane: 7662 case Intrinsic::amdgcn_permlane16: 7663 case Intrinsic::amdgcn_permlanex16: 7664 case Intrinsic::amdgcn_permlane64: 7665 case Intrinsic::amdgcn_set_inactive: 7666 case Intrinsic::amdgcn_set_inactive_chain_arg: 7667 case Intrinsic::amdgcn_mov_dpp8: 7668 case Intrinsic::amdgcn_update_dpp: 7669 return legalizeLaneOp(Helper, MI, IntrID); 7670 case Intrinsic::amdgcn_s_buffer_prefetch_data: 7671 return legalizeSBufferPrefetch(Helper, MI); 7672 case Intrinsic::amdgcn_dead: { 7673 // TODO: Use poison instead of undef 7674 for (const MachineOperand &Def : MI.defs()) 7675 B.buildUndef(Def); 7676 MI.eraseFromParent(); 7677 return true; 7678 } 7679 default: { 7680 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 7681 AMDGPU::getImageDimIntrinsicInfo(IntrID)) 7682 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 7683 return true; 7684 } 7685 } 7686 7687 return true; 7688 } 7689