1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the Machinelegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13
14 #include "AMDGPULegalizerInfo.h"
15
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21 #include "SIInstrInfo.h"
22 #include "SIMachineFunctionInfo.h"
23 #include "SIRegisterInfo.h"
24 #include "Utils/AMDGPUBaseInfo.h"
25 #include "llvm/ADT/ScopeExit.h"
26 #include "llvm/BinaryFormat/ELF.h"
27 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
28 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
31 #include "llvm/CodeGen/GlobalISel/Utils.h"
32 #include "llvm/CodeGen/TargetOpcodes.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/IntrinsicsAMDGPU.h"
35 #include "llvm/IR/IntrinsicsR600.h"
36
37 #define DEBUG_TYPE "amdgpu-legalinfo"
38
39 using namespace llvm;
40 using namespace LegalizeActions;
41 using namespace LegalizeMutations;
42 using namespace LegalityPredicates;
43 using namespace MIPatternMatch;
44
45 // Hack until load/store selection patterns support any tuple of legal types.
46 static cl::opt<bool> EnableNewLegality(
47 "amdgpu-global-isel-new-legality",
48 cl::desc("Use GlobalISel desired legality, rather than try to use"
49 "rules compatible with selection patterns"),
50 cl::init(false),
51 cl::ReallyHidden);
52
53 static constexpr unsigned MaxRegisterSize = 1024;
54
55 // Round the number of elements to the next power of two elements
getPow2VectorType(LLT Ty)56 static LLT getPow2VectorType(LLT Ty) {
57 unsigned NElts = Ty.getNumElements();
58 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
59 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
60 }
61
62 // Round the number of bits to the next power of two bits
getPow2ScalarType(LLT Ty)63 static LLT getPow2ScalarType(LLT Ty) {
64 unsigned Bits = Ty.getSizeInBits();
65 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
66 return LLT::scalar(Pow2Bits);
67 }
68
69 /// \returns true if this is an odd sized vector which should widen by adding an
70 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
71 /// excludes s1 vectors, which should always be scalarized.
isSmallOddVector(unsigned TypeIdx)72 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
73 return [=](const LegalityQuery &Query) {
74 const LLT Ty = Query.Types[TypeIdx];
75 if (!Ty.isVector())
76 return false;
77
78 const LLT EltTy = Ty.getElementType();
79 const unsigned EltSize = EltTy.getSizeInBits();
80 return Ty.getNumElements() % 2 != 0 &&
81 EltSize > 1 && EltSize < 32 &&
82 Ty.getSizeInBits() % 32 != 0;
83 };
84 }
85
sizeIsMultipleOf32(unsigned TypeIdx)86 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
87 return [=](const LegalityQuery &Query) {
88 const LLT Ty = Query.Types[TypeIdx];
89 return Ty.getSizeInBits() % 32 == 0;
90 };
91 }
92
isWideVec16(unsigned TypeIdx)93 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
94 return [=](const LegalityQuery &Query) {
95 const LLT Ty = Query.Types[TypeIdx];
96 const LLT EltTy = Ty.getScalarType();
97 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
98 };
99 }
100
oneMoreElement(unsigned TypeIdx)101 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
102 return [=](const LegalityQuery &Query) {
103 const LLT Ty = Query.Types[TypeIdx];
104 const LLT EltTy = Ty.getElementType();
105 return std::pair(TypeIdx,
106 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
107 };
108 }
109
fewerEltsToSize64Vector(unsigned TypeIdx)110 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
111 return [=](const LegalityQuery &Query) {
112 const LLT Ty = Query.Types[TypeIdx];
113 const LLT EltTy = Ty.getElementType();
114 unsigned Size = Ty.getSizeInBits();
115 unsigned Pieces = (Size + 63) / 64;
116 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
117 return std::pair(TypeIdx, LLT::scalarOrVector(
118 ElementCount::getFixed(NewNumElts), EltTy));
119 };
120 }
121
122 // Increase the number of vector elements to reach the next multiple of 32-bit
123 // type.
moreEltsToNext32Bit(unsigned TypeIdx)124 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
125 return [=](const LegalityQuery &Query) {
126 const LLT Ty = Query.Types[TypeIdx];
127
128 const LLT EltTy = Ty.getElementType();
129 const int Size = Ty.getSizeInBits();
130 const int EltSize = EltTy.getSizeInBits();
131 const int NextMul32 = (Size + 31) / 32;
132
133 assert(EltSize < 32);
134
135 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
136 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
137 };
138 }
139
140 // Increase the number of vector elements to reach the next legal RegClass.
moreElementsToNextExistingRegClass(unsigned TypeIdx)141 static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
142 return [=](const LegalityQuery &Query) {
143 const LLT Ty = Query.Types[TypeIdx];
144 const unsigned NumElts = Ty.getNumElements();
145 const unsigned EltSize = Ty.getElementType().getSizeInBits();
146 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
147
148 assert(EltSize == 32 || EltSize == 64);
149 assert(Ty.getSizeInBits() < MaxRegisterSize);
150
151 unsigned NewNumElts;
152 // Find the nearest legal RegClass that is larger than the current type.
153 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
154 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
155 break;
156 }
157
158 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize));
159 };
160 }
161
getBufferRsrcScalarType(const LLT Ty)162 static LLT getBufferRsrcScalarType(const LLT Ty) {
163 if (!Ty.isVector())
164 return LLT::scalar(128);
165 const ElementCount NumElems = Ty.getElementCount();
166 return LLT::vector(NumElems, LLT::scalar(128));
167 }
168
getBufferRsrcRegisterType(const LLT Ty)169 static LLT getBufferRsrcRegisterType(const LLT Ty) {
170 if (!Ty.isVector())
171 return LLT::fixed_vector(4, LLT::scalar(32));
172 const unsigned NumElems = Ty.getElementCount().getFixedValue();
173 return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
174 }
175
getBitcastRegisterType(const LLT Ty)176 static LLT getBitcastRegisterType(const LLT Ty) {
177 const unsigned Size = Ty.getSizeInBits();
178
179 if (Size <= 32) {
180 // <2 x s8> -> s16
181 // <4 x s8> -> s32
182 return LLT::scalar(Size);
183 }
184
185 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
186 }
187
bitcastToRegisterType(unsigned TypeIdx)188 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
189 return [=](const LegalityQuery &Query) {
190 const LLT Ty = Query.Types[TypeIdx];
191 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
192 };
193 }
194
bitcastToVectorElement32(unsigned TypeIdx)195 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
196 return [=](const LegalityQuery &Query) {
197 const LLT Ty = Query.Types[TypeIdx];
198 unsigned Size = Ty.getSizeInBits();
199 assert(Size % 32 == 0);
200 return std::pair(
201 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
202 };
203 }
204
vectorSmallerThan(unsigned TypeIdx,unsigned Size)205 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
206 return [=](const LegalityQuery &Query) {
207 const LLT QueryTy = Query.Types[TypeIdx];
208 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
209 };
210 }
211
vectorWiderThan(unsigned TypeIdx,unsigned Size)212 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
213 return [=](const LegalityQuery &Query) {
214 const LLT QueryTy = Query.Types[TypeIdx];
215 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
216 };
217 }
218
numElementsNotEven(unsigned TypeIdx)219 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
220 return [=](const LegalityQuery &Query) {
221 const LLT QueryTy = Query.Types[TypeIdx];
222 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
223 };
224 }
225
isRegisterSize(unsigned Size)226 static bool isRegisterSize(unsigned Size) {
227 return Size % 32 == 0 && Size <= MaxRegisterSize;
228 }
229
isRegisterVectorElementType(LLT EltTy)230 static bool isRegisterVectorElementType(LLT EltTy) {
231 const int EltSize = EltTy.getSizeInBits();
232 return EltSize == 16 || EltSize % 32 == 0;
233 }
234
isRegisterVectorType(LLT Ty)235 static bool isRegisterVectorType(LLT Ty) {
236 const int EltSize = Ty.getElementType().getSizeInBits();
237 return EltSize == 32 || EltSize == 64 ||
238 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
239 EltSize == 128 || EltSize == 256;
240 }
241
242 // TODO: replace all uses of isRegisterType with isRegisterClassType
isRegisterType(LLT Ty)243 static bool isRegisterType(LLT Ty) {
244 if (!isRegisterSize(Ty.getSizeInBits()))
245 return false;
246
247 if (Ty.isVector())
248 return isRegisterVectorType(Ty);
249
250 return true;
251 }
252
253 // Any combination of 32 or 64-bit elements up the maximum register size, and
254 // multiples of v2s16.
isRegisterType(unsigned TypeIdx)255 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
256 return [=](const LegalityQuery &Query) {
257 return isRegisterType(Query.Types[TypeIdx]);
258 };
259 }
260
261 // RegisterType that doesn't have a corresponding RegClass.
262 // TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
263 // should be removed.
isIllegalRegisterType(unsigned TypeIdx)264 static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
265 return [=](const LegalityQuery &Query) {
266 LLT Ty = Query.Types[TypeIdx];
267 return isRegisterType(Ty) &&
268 !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
269 };
270 }
271
elementTypeIsLegal(unsigned TypeIdx)272 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
273 return [=](const LegalityQuery &Query) {
274 const LLT QueryTy = Query.Types[TypeIdx];
275 if (!QueryTy.isVector())
276 return false;
277 const LLT EltTy = QueryTy.getElementType();
278 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
279 };
280 }
281
282 static const LLT S1 = LLT::scalar(1);
283 static const LLT S8 = LLT::scalar(8);
284 static const LLT S16 = LLT::scalar(16);
285 static const LLT S32 = LLT::scalar(32);
286 static const LLT F32 = LLT::float32();
287 static const LLT S64 = LLT::scalar(64);
288 static const LLT F64 = LLT::float64();
289 static const LLT S96 = LLT::scalar(96);
290 static const LLT S128 = LLT::scalar(128);
291 static const LLT S160 = LLT::scalar(160);
292 static const LLT S224 = LLT::scalar(224);
293 static const LLT S256 = LLT::scalar(256);
294 static const LLT S512 = LLT::scalar(512);
295 static const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
296
297 static const LLT V2S8 = LLT::fixed_vector(2, 8);
298 static const LLT V2S16 = LLT::fixed_vector(2, 16);
299 static const LLT V4S16 = LLT::fixed_vector(4, 16);
300 static const LLT V6S16 = LLT::fixed_vector(6, 16);
301 static const LLT V8S16 = LLT::fixed_vector(8, 16);
302 static const LLT V10S16 = LLT::fixed_vector(10, 16);
303 static const LLT V12S16 = LLT::fixed_vector(12, 16);
304 static const LLT V16S16 = LLT::fixed_vector(16, 16);
305
306 static const LLT V2F16 = LLT::fixed_vector(2, LLT::float16());
307 static const LLT V2BF16 = V2F16; // FIXME
308
309 static const LLT V2S32 = LLT::fixed_vector(2, 32);
310 static const LLT V3S32 = LLT::fixed_vector(3, 32);
311 static const LLT V4S32 = LLT::fixed_vector(4, 32);
312 static const LLT V5S32 = LLT::fixed_vector(5, 32);
313 static const LLT V6S32 = LLT::fixed_vector(6, 32);
314 static const LLT V7S32 = LLT::fixed_vector(7, 32);
315 static const LLT V8S32 = LLT::fixed_vector(8, 32);
316 static const LLT V9S32 = LLT::fixed_vector(9, 32);
317 static const LLT V10S32 = LLT::fixed_vector(10, 32);
318 static const LLT V11S32 = LLT::fixed_vector(11, 32);
319 static const LLT V12S32 = LLT::fixed_vector(12, 32);
320 static const LLT V16S32 = LLT::fixed_vector(16, 32);
321 static const LLT V32S32 = LLT::fixed_vector(32, 32);
322
323 static const LLT V2S64 = LLT::fixed_vector(2, 64);
324 static const LLT V3S64 = LLT::fixed_vector(3, 64);
325 static const LLT V4S64 = LLT::fixed_vector(4, 64);
326 static const LLT V5S64 = LLT::fixed_vector(5, 64);
327 static const LLT V6S64 = LLT::fixed_vector(6, 64);
328 static const LLT V7S64 = LLT::fixed_vector(7, 64);
329 static const LLT V8S64 = LLT::fixed_vector(8, 64);
330 static const LLT V16S64 = LLT::fixed_vector(16, 64);
331
332 static const LLT V2S128 = LLT::fixed_vector(2, 128);
333 static const LLT V4S128 = LLT::fixed_vector(4, 128);
334
335 static std::initializer_list<LLT> AllScalarTypes = {S32, S64, S96, S128,
336 S160, S224, S256, S512};
337
338 static std::initializer_list<LLT> AllS16Vectors{
339 V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};
340
341 static std::initializer_list<LLT> AllS32Vectors = {
342 V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
343 V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};
344
345 static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64,
346 V6S64, V7S64, V8S64, V16S64};
347
348 // Checks whether a type is in the list of legal register types.
isRegisterClassType(LLT Ty)349 static bool isRegisterClassType(LLT Ty) {
350 if (Ty.isPointerOrPointerVector())
351 Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
352
353 return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) ||
354 is_contained(AllScalarTypes, Ty) || is_contained(AllS16Vectors, Ty);
355 }
356
isRegisterClassType(unsigned TypeIdx)357 static LegalityPredicate isRegisterClassType(unsigned TypeIdx) {
358 return [TypeIdx](const LegalityQuery &Query) {
359 return isRegisterClassType(Query.Types[TypeIdx]);
360 };
361 }
362
363 // If we have a truncating store or an extending load with a data size larger
364 // than 32-bits, we need to reduce to a 32-bit type.
isWideScalarExtLoadTruncStore(unsigned TypeIdx)365 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
366 return [=](const LegalityQuery &Query) {
367 const LLT Ty = Query.Types[TypeIdx];
368 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
369 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
370 };
371 }
372
373 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
374 // handle some operations by just promoting the register during
375 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
maxSizeForAddrSpace(const GCNSubtarget & ST,unsigned AS,bool IsLoad,bool IsAtomic)376 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
377 bool IsLoad, bool IsAtomic) {
378 switch (AS) {
379 case AMDGPUAS::PRIVATE_ADDRESS:
380 // FIXME: Private element size.
381 return ST.enableFlatScratch() ? 128 : 32;
382 case AMDGPUAS::LOCAL_ADDRESS:
383 return ST.useDS128() ? 128 : 64;
384 case AMDGPUAS::GLOBAL_ADDRESS:
385 case AMDGPUAS::CONSTANT_ADDRESS:
386 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
387 case AMDGPUAS::BUFFER_RESOURCE:
388 // Treat constant and global as identical. SMRD loads are sometimes usable for
389 // global loads (ideally constant address space should be eliminated)
390 // depending on the context. Legality cannot be context dependent, but
391 // RegBankSelect can split the load as necessary depending on the pointer
392 // register bank/uniformity and if the memory is invariant or not written in a
393 // kernel.
394 return IsLoad ? 512 : 128;
395 default:
396 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
397 // if they may alias scratch depending on the subtarget. This needs to be
398 // moved to custom handling to use addressMayBeAccessedAsPrivate
399 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
400 }
401 }
402
isLoadStoreSizeLegal(const GCNSubtarget & ST,const LegalityQuery & Query)403 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
404 const LegalityQuery &Query) {
405 const LLT Ty = Query.Types[0];
406
407 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
408 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
409
410 unsigned RegSize = Ty.getSizeInBits();
411 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
412 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
413 unsigned AS = Query.Types[1].getAddressSpace();
414
415 // All of these need to be custom lowered to cast the pointer operand.
416 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
417 return false;
418
419 // Do not handle extending vector loads.
420 if (Ty.isVector() && MemSize != RegSize)
421 return false;
422
423 // TODO: We should be able to widen loads if the alignment is high enough, but
424 // we also need to modify the memory access size.
425 #if 0
426 // Accept widening loads based on alignment.
427 if (IsLoad && MemSize < Size)
428 MemSize = std::max(MemSize, Align);
429 #endif
430
431 // Only 1-byte and 2-byte to 32-bit extloads are valid.
432 if (MemSize != RegSize && RegSize != 32)
433 return false;
434
435 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
436 Query.MMODescrs[0].Ordering !=
437 AtomicOrdering::NotAtomic))
438 return false;
439
440 switch (MemSize) {
441 case 8:
442 case 16:
443 case 32:
444 case 64:
445 case 128:
446 break;
447 case 96:
448 if (!ST.hasDwordx3LoadStores())
449 return false;
450 break;
451 case 256:
452 case 512:
453 // These may contextually need to be broken down.
454 break;
455 default:
456 return false;
457 }
458
459 assert(RegSize >= MemSize);
460
461 if (AlignBits < MemSize) {
462 const SITargetLowering *TLI = ST.getTargetLowering();
463 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
464 Align(AlignBits / 8)))
465 return false;
466 }
467
468 return true;
469 }
470
471 // The newer buffer intrinsic forms take their resource arguments as
472 // pointers in address space 8, aka s128 values. However, in order to not break
473 // SelectionDAG, the underlying operations have to continue to take v4i32
474 // arguments. Therefore, we convert resource pointers - or vectors of them
475 // to integer values here.
hasBufferRsrcWorkaround(const LLT Ty)476 static bool hasBufferRsrcWorkaround(const LLT Ty) {
477 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
478 return true;
479 if (Ty.isVector()) {
480 const LLT ElemTy = Ty.getElementType();
481 return hasBufferRsrcWorkaround(ElemTy);
482 }
483 return false;
484 }
485
486 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
487 // workaround this. Eventually it should ignore the type for loads and only care
488 // about the size. Return true in cases where we will workaround this for now by
489 // bitcasting.
loadStoreBitcastWorkaround(const LLT Ty)490 static bool loadStoreBitcastWorkaround(const LLT Ty) {
491 if (EnableNewLegality)
492 return false;
493
494 const unsigned Size = Ty.getSizeInBits();
495 if (Size <= 64)
496 return false;
497 // Address space 8 pointers get their own workaround.
498 if (hasBufferRsrcWorkaround(Ty))
499 return false;
500 if (!Ty.isVector())
501 return true;
502
503 if (Ty.isPointerVector())
504 return true;
505
506 unsigned EltSize = Ty.getScalarSizeInBits();
507 return EltSize != 32 && EltSize != 64;
508 }
509
isLoadStoreLegal(const GCNSubtarget & ST,const LegalityQuery & Query)510 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
511 const LLT Ty = Query.Types[0];
512 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
513 !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
514 }
515
516 /// Return true if a load or store of the type should be lowered with a bitcast
517 /// to a different type.
shouldBitcastLoadStoreType(const GCNSubtarget & ST,const LLT Ty,const LLT MemTy)518 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
519 const LLT MemTy) {
520 const unsigned MemSizeInBits = MemTy.getSizeInBits();
521 const unsigned Size = Ty.getSizeInBits();
522 if (Size != MemSizeInBits)
523 return Size <= 32 && Ty.isVector();
524
525 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
526 return true;
527
528 // Don't try to handle bitcasting vector ext loads for now.
529 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
530 (Size <= 32 || isRegisterSize(Size)) &&
531 !isRegisterVectorElementType(Ty.getElementType());
532 }
533
534 /// Return true if we should legalize a load by widening an odd sized memory
535 /// access up to the alignment. Note this case when the memory access itself
536 /// changes, not the size of the result register.
shouldWidenLoad(const GCNSubtarget & ST,LLT MemoryTy,uint64_t AlignInBits,unsigned AddrSpace,unsigned Opcode)537 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
538 uint64_t AlignInBits, unsigned AddrSpace,
539 unsigned Opcode) {
540 unsigned SizeInBits = MemoryTy.getSizeInBits();
541 // We don't want to widen cases that are naturally legal.
542 if (isPowerOf2_32(SizeInBits))
543 return false;
544
545 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
546 // end up widening these for a scalar load during RegBankSelect, if we don't
547 // have 96-bit scalar loads.
548 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
549 return false;
550
551 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
552 return false;
553
554 // A load is known dereferenceable up to the alignment, so it's legal to widen
555 // to it.
556 //
557 // TODO: Could check dereferenceable for less aligned cases.
558 unsigned RoundedSize = NextPowerOf2(SizeInBits);
559 if (AlignInBits < RoundedSize)
560 return false;
561
562 // Do not widen if it would introduce a slow unaligned load.
563 const SITargetLowering *TLI = ST.getTargetLowering();
564 unsigned Fast = 0;
565 return TLI->allowsMisalignedMemoryAccessesImpl(
566 RoundedSize, AddrSpace, Align(AlignInBits / 8),
567 MachineMemOperand::MOLoad, &Fast) &&
568 Fast;
569 }
570
shouldWidenLoad(const GCNSubtarget & ST,const LegalityQuery & Query,unsigned Opcode)571 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
572 unsigned Opcode) {
573 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
574 return false;
575
576 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
577 Query.MMODescrs[0].AlignInBits,
578 Query.Types[1].getAddressSpace(), Opcode);
579 }
580
581 /// Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial
582 /// type of the operand `idx` and then to transform it to a `p8` via bitcasts
583 /// and inttoptr. In addition, handle vectors of p8. Returns the new type.
castBufferRsrcFromV4I32(MachineInstr & MI,MachineIRBuilder & B,MachineRegisterInfo & MRI,unsigned Idx)584 static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
585 MachineRegisterInfo &MRI, unsigned Idx) {
586 MachineOperand &MO = MI.getOperand(Idx);
587
588 const LLT PointerTy = MRI.getType(MO.getReg());
589
590 // Paranoidly prevent us from doing this multiple times.
591 if (!hasBufferRsrcWorkaround(PointerTy))
592 return PointerTy;
593
594 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
595 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
596 if (!PointerTy.isVector()) {
597 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
598 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
599 const LLT S32 = LLT::scalar(32);
600
601 Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
602 std::array<Register, 4> VectorElems;
603 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
604 for (unsigned I = 0; I < NumParts; ++I)
605 VectorElems[I] =
606 B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
607 B.buildMergeValues(MO, VectorElems);
608 MO.setReg(VectorReg);
609 return VectorTy;
610 }
611 Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
612 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
613 auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
614 B.buildIntToPtr(MO, Scalar);
615 MO.setReg(BitcastReg);
616
617 return VectorTy;
618 }
619
620 /// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
621 /// the form in which the value must be in order to be passed to the low-level
622 /// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
623 /// needed in order to account for the fact that we can't define a register
624 /// class for s128 without breaking SelectionDAG.
castBufferRsrcToV4I32(Register Pointer,MachineIRBuilder & B)625 static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
626 MachineRegisterInfo &MRI = *B.getMRI();
627 const LLT PointerTy = MRI.getType(Pointer);
628 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
629 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
630
631 if (!PointerTy.isVector()) {
632 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
633 SmallVector<Register, 4> PointerParts;
634 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
635 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
636 for (unsigned I = 0; I < NumParts; ++I)
637 PointerParts.push_back(Unmerged.getReg(I));
638 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
639 }
640 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
641 return B.buildBitcast(VectorTy, Scalar).getReg(0);
642 }
643
castBufferRsrcArgToV4I32(MachineInstr & MI,MachineIRBuilder & B,unsigned Idx)644 static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
645 unsigned Idx) {
646 MachineOperand &MO = MI.getOperand(Idx);
647
648 const LLT PointerTy = B.getMRI()->getType(MO.getReg());
649 // Paranoidly prevent us from doing this multiple times.
650 if (!hasBufferRsrcWorkaround(PointerTy))
651 return;
652 MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
653 }
654
AMDGPULegalizerInfo(const GCNSubtarget & ST_,const GCNTargetMachine & TM)655 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
656 const GCNTargetMachine &TM)
657 : ST(ST_) {
658 using namespace TargetOpcode;
659
660 auto GetAddrSpacePtr = [&TM](unsigned AS) {
661 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
662 };
663
664 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
665 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
666 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
667 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
668 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
669 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
670 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
671 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
672 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
673 const LLT BufferStridedPtr =
674 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
675
676 const LLT CodePtr = FlatPtr;
677
678 const std::initializer_list<LLT> AddrSpaces64 = {
679 GlobalPtr, ConstantPtr, FlatPtr
680 };
681
682 const std::initializer_list<LLT> AddrSpaces32 = {
683 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
684 };
685
686 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
687
688 const std::initializer_list<LLT> FPTypesBase = {
689 S32, S64
690 };
691
692 const std::initializer_list<LLT> FPTypes16 = {
693 S32, S64, S16
694 };
695
696 const std::initializer_list<LLT> FPTypesPK16 = {
697 S32, S64, S16, V2S16
698 };
699
700 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
701
702 // s1 for VCC branches, s32 for SCC branches.
703 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
704
705 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
706 // elements for v3s16
707 getActionDefinitionsBuilder(G_PHI)
708 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
709 .legalFor(AllS32Vectors)
710 .legalFor(AllS64Vectors)
711 .legalFor(AddrSpaces64)
712 .legalFor(AddrSpaces32)
713 .legalFor(AddrSpaces128)
714 .legalIf(isPointer(0))
715 .clampScalar(0, S16, S256)
716 .widenScalarToNextPow2(0, 32)
717 .clampMaxNumElements(0, S32, 16)
718 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
719 .scalarize(0);
720
721 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
722 // Full set of gfx9 features.
723 if (ST.hasScalarAddSub64()) {
724 getActionDefinitionsBuilder({G_ADD, G_SUB})
725 .legalFor({S64, S32, S16, V2S16})
726 .clampMaxNumElementsStrict(0, S16, 2)
727 .scalarize(0)
728 .minScalar(0, S16)
729 .widenScalarToNextMultipleOf(0, 32)
730 .maxScalar(0, S32);
731 } else {
732 getActionDefinitionsBuilder({G_ADD, G_SUB})
733 .legalFor({S32, S16, V2S16})
734 .clampMaxNumElementsStrict(0, S16, 2)
735 .scalarize(0)
736 .minScalar(0, S16)
737 .widenScalarToNextMultipleOf(0, 32)
738 .maxScalar(0, S32);
739 }
740
741 if (ST.hasScalarSMulU64()) {
742 getActionDefinitionsBuilder(G_MUL)
743 .legalFor({S64, S32, S16, V2S16})
744 .clampMaxNumElementsStrict(0, S16, 2)
745 .scalarize(0)
746 .minScalar(0, S16)
747 .widenScalarToNextMultipleOf(0, 32)
748 .custom();
749 } else {
750 getActionDefinitionsBuilder(G_MUL)
751 .legalFor({S32, S16, V2S16})
752 .clampMaxNumElementsStrict(0, S16, 2)
753 .scalarize(0)
754 .minScalar(0, S16)
755 .widenScalarToNextMultipleOf(0, 32)
756 .custom();
757 }
758 assert(ST.hasMad64_32());
759
760 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
761 .legalFor({S32, S16, V2S16}) // Clamp modifier
762 .minScalarOrElt(0, S16)
763 .clampMaxNumElementsStrict(0, S16, 2)
764 .scalarize(0)
765 .widenScalarToNextPow2(0, 32)
766 .lower();
767 } else if (ST.has16BitInsts()) {
768 getActionDefinitionsBuilder({G_ADD, G_SUB})
769 .legalFor({S32, S16})
770 .minScalar(0, S16)
771 .widenScalarToNextMultipleOf(0, 32)
772 .maxScalar(0, S32)
773 .scalarize(0);
774
775 getActionDefinitionsBuilder(G_MUL)
776 .legalFor({S32, S16})
777 .scalarize(0)
778 .minScalar(0, S16)
779 .widenScalarToNextMultipleOf(0, 32)
780 .custom();
781 assert(ST.hasMad64_32());
782
783 // Technically the saturating operations require clamp bit support, but this
784 // was introduced at the same time as 16-bit operations.
785 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
786 .legalFor({S32, S16}) // Clamp modifier
787 .minScalar(0, S16)
788 .scalarize(0)
789 .widenScalarToNextPow2(0, 16)
790 .lower();
791
792 // We're just lowering this, but it helps get a better result to try to
793 // coerce to the desired type first.
794 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
795 .minScalar(0, S16)
796 .scalarize(0)
797 .lower();
798 } else {
799 getActionDefinitionsBuilder({G_ADD, G_SUB})
800 .legalFor({S32})
801 .widenScalarToNextMultipleOf(0, 32)
802 .clampScalar(0, S32, S32)
803 .scalarize(0);
804
805 auto &Mul = getActionDefinitionsBuilder(G_MUL)
806 .legalFor({S32})
807 .scalarize(0)
808 .minScalar(0, S32)
809 .widenScalarToNextMultipleOf(0, 32);
810
811 if (ST.hasMad64_32())
812 Mul.custom();
813 else
814 Mul.maxScalar(0, S32);
815
816 if (ST.hasIntClamp()) {
817 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
818 .legalFor({S32}) // Clamp modifier.
819 .scalarize(0)
820 .minScalarOrElt(0, S32)
821 .lower();
822 } else {
823 // Clamp bit support was added in VI, along with 16-bit operations.
824 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
825 .minScalar(0, S32)
826 .scalarize(0)
827 .lower();
828 }
829
830 // FIXME: DAG expansion gets better results. The widening uses the smaller
831 // range values and goes for the min/max lowering directly.
832 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
833 .minScalar(0, S32)
834 .scalarize(0)
835 .lower();
836 }
837
838 getActionDefinitionsBuilder(
839 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
840 .customFor({S32, S64})
841 .clampScalar(0, S32, S64)
842 .widenScalarToNextPow2(0, 32)
843 .scalarize(0);
844
845 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
846 .legalFor({S32})
847 .maxScalar(0, S32);
848
849 if (ST.hasVOP3PInsts()) {
850 Mulh
851 .clampMaxNumElements(0, S8, 2)
852 .lowerFor({V2S8});
853 }
854
855 Mulh
856 .scalarize(0)
857 .lower();
858
859 // Report legal for any types we can handle anywhere. For the cases only legal
860 // on the SALU, RegBankSelect will be able to re-legalize.
861 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
862 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
863 .clampScalar(0, S32, S64)
864 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
865 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
866 .widenScalarToNextPow2(0)
867 .scalarize(0);
868
869 getActionDefinitionsBuilder(
870 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
871 .legalFor({{S32, S1}, {S32, S32}})
872 .clampScalar(0, S32, S32)
873 .scalarize(0);
874
875 getActionDefinitionsBuilder(G_BITCAST)
876 // Don't worry about the size constraint.
877 .legalIf(all(isRegisterClassType(0), isRegisterClassType(1)))
878 .lower();
879
880 getActionDefinitionsBuilder(G_CONSTANT)
881 .legalFor({S1, S32, S64, S16, GlobalPtr,
882 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
883 .legalIf(isPointer(0))
884 .clampScalar(0, S32, S64)
885 .widenScalarToNextPow2(0);
886
887 getActionDefinitionsBuilder(G_FCONSTANT)
888 .legalFor({S32, S64, S16})
889 .clampScalar(0, S16, S64);
890
891 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
892 .legalIf(isRegisterType(0))
893 // s1 and s16 are special cases because they have legal operations on
894 // them, but don't really occupy registers in the normal way.
895 .legalFor({S1, S16})
896 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
897 .clampScalarOrElt(0, S32, MaxScalar)
898 .widenScalarToNextPow2(0, 32)
899 .clampMaxNumElements(0, S32, 16);
900
901 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
902
903 // If the amount is divergent, we have to do a wave reduction to get the
904 // maximum value, so this is expanded during RegBankSelect.
905 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
906 .legalFor({{PrivatePtr, S32}});
907
908 getActionDefinitionsBuilder(G_STACKSAVE)
909 .customFor({PrivatePtr});
910 getActionDefinitionsBuilder(G_STACKRESTORE)
911 .legalFor({PrivatePtr});
912
913 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
914
915 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
916 .customIf(typeIsNot(0, PrivatePtr));
917
918 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
919
920 auto &FPOpActions = getActionDefinitionsBuilder(
921 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
922 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
923 .legalFor({S32, S64});
924 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
925 .customFor({S32, S64});
926 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
927 .customFor({S32, S64});
928
929 if (ST.has16BitInsts()) {
930 if (ST.hasVOP3PInsts())
931 FPOpActions.legalFor({S16, V2S16});
932 else
933 FPOpActions.legalFor({S16});
934
935 TrigActions.customFor({S16});
936 FDIVActions.customFor({S16});
937 }
938
939 if (ST.hasPackedFP32Ops()) {
940 FPOpActions.legalFor({V2S32});
941 FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
942 }
943
944 auto &MinNumMaxNum = getActionDefinitionsBuilder({
945 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
946
947 if (ST.hasVOP3PInsts()) {
948 MinNumMaxNum.customFor(FPTypesPK16)
949 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
950 .clampMaxNumElements(0, S16, 2)
951 .clampScalar(0, S16, S64)
952 .scalarize(0);
953 } else if (ST.has16BitInsts()) {
954 MinNumMaxNum.customFor(FPTypes16)
955 .clampScalar(0, S16, S64)
956 .scalarize(0);
957 } else {
958 MinNumMaxNum.customFor(FPTypesBase)
959 .clampScalar(0, S32, S64)
960 .scalarize(0);
961 }
962
963 if (ST.hasVOP3PInsts())
964 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
965
966 FPOpActions
967 .scalarize(0)
968 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
969
970 TrigActions
971 .scalarize(0)
972 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
973
974 FDIVActions
975 .scalarize(0)
976 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
977
978 getActionDefinitionsBuilder({G_FNEG, G_FABS})
979 .legalFor(FPTypesPK16)
980 .clampMaxNumElementsStrict(0, S16, 2)
981 .scalarize(0)
982 .clampScalar(0, S16, S64);
983
984 if (ST.has16BitInsts()) {
985 getActionDefinitionsBuilder(G_FSQRT)
986 .legalFor({S16})
987 .customFor({S32, S64})
988 .scalarize(0)
989 .unsupported();
990 getActionDefinitionsBuilder(G_FFLOOR)
991 .legalFor({S32, S64, S16})
992 .scalarize(0)
993 .clampScalar(0, S16, S64);
994
995 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
996 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
997 .scalarize(0)
998 .maxScalarIf(typeIs(0, S16), 1, S16)
999 .clampScalar(1, S32, S32)
1000 .lower();
1001
1002 getActionDefinitionsBuilder(G_FFREXP)
1003 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1004 .scalarize(0)
1005 .lower();
1006 } else {
1007 getActionDefinitionsBuilder(G_FSQRT)
1008 .customFor({S32, S64, S16})
1009 .scalarize(0)
1010 .unsupported();
1011
1012
1013 if (ST.hasFractBug()) {
1014 getActionDefinitionsBuilder(G_FFLOOR)
1015 .customFor({S64})
1016 .legalFor({S32, S64})
1017 .scalarize(0)
1018 .clampScalar(0, S32, S64);
1019 } else {
1020 getActionDefinitionsBuilder(G_FFLOOR)
1021 .legalFor({S32, S64})
1022 .scalarize(0)
1023 .clampScalar(0, S32, S64);
1024 }
1025
1026 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1027 .legalFor({{S32, S32}, {S64, S32}})
1028 .scalarize(0)
1029 .clampScalar(0, S32, S64)
1030 .clampScalar(1, S32, S32)
1031 .lower();
1032
1033 getActionDefinitionsBuilder(G_FFREXP)
1034 .customFor({{S32, S32}, {S64, S32}})
1035 .scalarize(0)
1036 .minScalar(0, S32)
1037 .clampScalar(1, S32, S32)
1038 .lower();
1039 }
1040
1041 getActionDefinitionsBuilder(G_FPTRUNC)
1042 .legalFor({{S32, S64}, {S16, S32}})
1043 .scalarize(0)
1044 .lower();
1045
1046 getActionDefinitionsBuilder(G_FPEXT)
1047 .legalFor({{S64, S32}, {S32, S16}})
1048 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1049 .scalarize(0);
1050
1051 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1052 if (ST.has16BitInsts()) {
1053 FSubActions
1054 // Use actual fsub instruction
1055 .legalFor({S32, S16})
1056 // Must use fadd + fneg
1057 .lowerFor({S64, V2S16});
1058 } else {
1059 FSubActions
1060 // Use actual fsub instruction
1061 .legalFor({S32})
1062 // Must use fadd + fneg
1063 .lowerFor({S64, S16, V2S16});
1064 }
1065
1066 FSubActions
1067 .scalarize(0)
1068 .clampScalar(0, S32, S64);
1069
1070 // Whether this is legal depends on the floating point mode for the function.
1071 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1072 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1073 FMad.customFor({S32, S16});
1074 else if (ST.hasMadMacF32Insts())
1075 FMad.customFor({S32});
1076 else if (ST.hasMadF16())
1077 FMad.customFor({S16});
1078 FMad.scalarize(0)
1079 .lower();
1080
1081 auto &FRem = getActionDefinitionsBuilder(G_FREM);
1082 if (ST.has16BitInsts()) {
1083 FRem.customFor({S16, S32, S64});
1084 } else {
1085 FRem.minScalar(0, S32)
1086 .customFor({S32, S64});
1087 }
1088 FRem.scalarize(0);
1089
1090 // TODO: Do we need to clamp maximum bitwidth?
1091 getActionDefinitionsBuilder(G_TRUNC)
1092 .legalIf(isScalar(0))
1093 .legalFor({{V2S16, V2S32}})
1094 .clampMaxNumElements(0, S16, 2)
1095 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1096 // situations (like an invalid implicit use), we don't want to infinite loop
1097 // in the legalizer.
1098 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
1099 .alwaysLegal();
1100
1101 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1102 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1103 {S32, S1}, {S64, S1}, {S16, S1}})
1104 .scalarize(0)
1105 .clampScalar(0, S32, S64)
1106 .widenScalarToNextPow2(1, 32);
1107
1108 // TODO: Split s1->s64 during regbankselect for VALU.
1109 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1110 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1111 .lowerIf(typeIs(1, S1))
1112 .customFor({{S32, S64}, {S64, S64}});
1113 if (ST.has16BitInsts())
1114 IToFP.legalFor({{S16, S16}});
1115 IToFP.clampScalar(1, S32, S64)
1116 .minScalar(0, S32)
1117 .scalarize(0)
1118 .widenScalarToNextPow2(1);
1119
1120 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1121 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1122 .customFor({{S64, S32}, {S64, S64}})
1123 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1124 if (ST.has16BitInsts())
1125 FPToI.legalFor({{S16, S16}});
1126 else
1127 FPToI.minScalar(1, S32);
1128
1129 FPToI.minScalar(0, S32)
1130 .widenScalarToNextPow2(0, 32)
1131 .scalarize(0)
1132 .lower();
1133
1134 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1135 .customFor({S16, S32})
1136 .scalarize(0)
1137 .lower();
1138
1139 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1140 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1141 .scalarize(0)
1142 .lower();
1143
1144 if (ST.has16BitInsts()) {
1145 getActionDefinitionsBuilder(
1146 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1147 .legalFor({S16, S32, S64})
1148 .clampScalar(0, S16, S64)
1149 .scalarize(0);
1150 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1151 getActionDefinitionsBuilder(
1152 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1153 .legalFor({S32, S64})
1154 .clampScalar(0, S32, S64)
1155 .scalarize(0);
1156 } else {
1157 getActionDefinitionsBuilder(
1158 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1159 .legalFor({S32})
1160 .customFor({S64})
1161 .clampScalar(0, S32, S64)
1162 .scalarize(0);
1163 }
1164
1165 getActionDefinitionsBuilder(G_PTR_ADD)
1166 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1167 .legalIf(all(isPointer(0), sameSize(0, 1)))
1168 .scalarize(0)
1169 .scalarSameSizeAs(1, 0);
1170
1171 getActionDefinitionsBuilder(G_PTRMASK)
1172 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1173 .scalarSameSizeAs(1, 0)
1174 .scalarize(0);
1175
1176 auto &CmpBuilder =
1177 getActionDefinitionsBuilder(G_ICMP)
1178 // The compare output type differs based on the register bank of the output,
1179 // so make both s1 and s32 legal.
1180 //
1181 // Scalar compares producing output in scc will be promoted to s32, as that
1182 // is the allocatable register type that will be needed for the copy from
1183 // scc. This will be promoted during RegBankSelect, and we assume something
1184 // before that won't try to use s32 result types.
1185 //
1186 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1187 // bank.
1188 .legalForCartesianProduct(
1189 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1190 .legalForCartesianProduct(
1191 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1192 if (ST.has16BitInsts()) {
1193 CmpBuilder.legalFor({{S1, S16}});
1194 }
1195
1196 CmpBuilder
1197 .widenScalarToNextPow2(1)
1198 .clampScalar(1, S32, S64)
1199 .scalarize(0)
1200 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1201
1202 auto &FCmpBuilder =
1203 getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
1204 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1205
1206 if (ST.hasSALUFloatInsts())
1207 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1208
1209 FCmpBuilder
1210 .widenScalarToNextPow2(1)
1211 .clampScalar(1, S32, S64)
1212 .scalarize(0);
1213
1214 // FIXME: fpow has a selection pattern that should move to custom lowering.
1215 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1216 if (ST.has16BitInsts())
1217 ExpOps.customFor({{S32}, {S16}});
1218 else
1219 ExpOps.customFor({S32});
1220 ExpOps.clampScalar(0, MinScalarFPTy, S32)
1221 .scalarize(0);
1222
1223 getActionDefinitionsBuilder(G_FPOWI)
1224 .clampScalar(0, MinScalarFPTy, S32)
1225 .lower();
1226
1227 auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
1228 Log2Ops.customFor({S32});
1229 if (ST.has16BitInsts())
1230 Log2Ops.legalFor({S16});
1231 else
1232 Log2Ops.customFor({S16});
1233 Log2Ops.scalarize(0)
1234 .lower();
1235
1236 auto &LogOps =
1237 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1238 LogOps.customFor({S32, S16});
1239 LogOps.clampScalar(0, MinScalarFPTy, S32)
1240 .scalarize(0);
1241
1242 // The 64-bit versions produce 32-bit results, but only on the SALU.
1243 getActionDefinitionsBuilder(G_CTPOP)
1244 .legalFor({{S32, S32}, {S32, S64}})
1245 .clampScalar(0, S32, S32)
1246 .widenScalarToNextPow2(1, 32)
1247 .clampScalar(1, S32, S64)
1248 .scalarize(0)
1249 .widenScalarToNextPow2(0, 32);
1250
1251 // If no 16 bit instr is available, lower into different instructions.
1252 if (ST.has16BitInsts())
1253 getActionDefinitionsBuilder(G_IS_FPCLASS)
1254 .legalForCartesianProduct({S1}, FPTypes16)
1255 .widenScalarToNextPow2(1)
1256 .scalarize(0)
1257 .lower();
1258 else
1259 getActionDefinitionsBuilder(G_IS_FPCLASS)
1260 .legalForCartesianProduct({S1}, FPTypesBase)
1261 .lowerFor({S1, S16})
1262 .widenScalarToNextPow2(1)
1263 .scalarize(0)
1264 .lower();
1265
1266 // The hardware instructions return a different result on 0 than the generic
1267 // instructions expect. The hardware produces -1, but these produce the
1268 // bitwidth.
1269 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1270 .scalarize(0)
1271 .clampScalar(0, S32, S32)
1272 .clampScalar(1, S32, S64)
1273 .widenScalarToNextPow2(0, 32)
1274 .widenScalarToNextPow2(1, 32)
1275 .custom();
1276
1277 // The 64-bit versions produce 32-bit results, but only on the SALU.
1278 getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
1279 .legalFor({{S32, S32}, {S32, S64}})
1280 .customIf(scalarNarrowerThan(1, 32))
1281 .clampScalar(0, S32, S32)
1282 .clampScalar(1, S32, S64)
1283 .scalarize(0)
1284 .widenScalarToNextPow2(0, 32)
1285 .widenScalarToNextPow2(1, 32);
1286
1287 getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
1288 .legalFor({{S32, S32}, {S32, S64}})
1289 .clampScalar(0, S32, S32)
1290 .clampScalar(1, S32, S64)
1291 .scalarize(0)
1292 .widenScalarToNextPow2(0, 32)
1293 .widenScalarToNextPow2(1, 32);
1294
1295 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1296 // RegBankSelect.
1297 getActionDefinitionsBuilder(G_BITREVERSE)
1298 .legalFor({S32, S64})
1299 .clampScalar(0, S32, S64)
1300 .scalarize(0)
1301 .widenScalarToNextPow2(0);
1302
1303 if (ST.has16BitInsts()) {
1304 getActionDefinitionsBuilder(G_BSWAP)
1305 .legalFor({S16, S32, V2S16})
1306 .clampMaxNumElementsStrict(0, S16, 2)
1307 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1308 // narrowScalar limitation.
1309 .widenScalarToNextPow2(0)
1310 .clampScalar(0, S16, S32)
1311 .scalarize(0);
1312
1313 if (ST.hasVOP3PInsts()) {
1314 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1315 .legalFor({S32, S16, V2S16})
1316 .clampMaxNumElements(0, S16, 2)
1317 .minScalar(0, S16)
1318 .widenScalarToNextPow2(0)
1319 .scalarize(0)
1320 .lower();
1321 } else {
1322 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1323 .legalFor({S32, S16})
1324 .widenScalarToNextPow2(0)
1325 .minScalar(0, S16)
1326 .scalarize(0)
1327 .lower();
1328 }
1329 } else {
1330 // TODO: Should have same legality without v_perm_b32
1331 getActionDefinitionsBuilder(G_BSWAP)
1332 .legalFor({S32})
1333 .lowerIf(scalarNarrowerThan(0, 32))
1334 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1335 // narrowScalar limitation.
1336 .widenScalarToNextPow2(0)
1337 .maxScalar(0, S32)
1338 .scalarize(0)
1339 .lower();
1340
1341 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1342 .legalFor({S32})
1343 .minScalar(0, S32)
1344 .widenScalarToNextPow2(0)
1345 .scalarize(0)
1346 .lower();
1347 }
1348
1349 getActionDefinitionsBuilder(G_INTTOPTR)
1350 // List the common cases
1351 .legalForCartesianProduct(AddrSpaces64, {S64})
1352 .legalForCartesianProduct(AddrSpaces32, {S32})
1353 .scalarize(0)
1354 // Accept any address space as long as the size matches
1355 .legalIf(sameSize(0, 1))
1356 .widenScalarIf(smallerThan(1, 0),
1357 [](const LegalityQuery &Query) {
1358 return std::pair(
1359 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1360 })
1361 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1362 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1363 });
1364
1365 getActionDefinitionsBuilder(G_PTRTOINT)
1366 // List the common cases
1367 .legalForCartesianProduct(AddrSpaces64, {S64})
1368 .legalForCartesianProduct(AddrSpaces32, {S32})
1369 .scalarize(0)
1370 // Accept any address space as long as the size matches
1371 .legalIf(sameSize(0, 1))
1372 .widenScalarIf(smallerThan(0, 1),
1373 [](const LegalityQuery &Query) {
1374 return std::pair(
1375 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1376 })
1377 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1378 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1379 });
1380
1381 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1382 .scalarize(0)
1383 .custom();
1384
1385 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1386 bool IsLoad) -> bool {
1387 const LLT DstTy = Query.Types[0];
1388
1389 // Split vector extloads.
1390 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1391
1392 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1393 return true;
1394
1395 const LLT PtrTy = Query.Types[1];
1396 unsigned AS = PtrTy.getAddressSpace();
1397 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1398 Query.MMODescrs[0].Ordering !=
1399 AtomicOrdering::NotAtomic))
1400 return true;
1401
1402 // Catch weird sized loads that don't evenly divide into the access sizes
1403 // TODO: May be able to widen depending on alignment etc.
1404 unsigned NumRegs = (MemSize + 31) / 32;
1405 if (NumRegs == 3) {
1406 if (!ST.hasDwordx3LoadStores())
1407 return true;
1408 } else {
1409 // If the alignment allows, these should have been widened.
1410 if (!isPowerOf2_32(NumRegs))
1411 return true;
1412 }
1413
1414 return false;
1415 };
1416
1417 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1418 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1419 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1420
1421 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1422 // LDS
1423 // TODO: Unsupported flat for SI.
1424
1425 for (unsigned Op : {G_LOAD, G_STORE}) {
1426 const bool IsStore = Op == G_STORE;
1427
1428 auto &Actions = getActionDefinitionsBuilder(Op);
1429 // Explicitly list some common cases.
1430 // TODO: Does this help compile time at all?
1431 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1432 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1433 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1434 {S64, GlobalPtr, S64, GlobalAlign32},
1435 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1436 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1437 {S32, GlobalPtr, S8, GlobalAlign8},
1438 {S32, GlobalPtr, S16, GlobalAlign16},
1439
1440 {S32, LocalPtr, S32, 32},
1441 {S64, LocalPtr, S64, 32},
1442 {V2S32, LocalPtr, V2S32, 32},
1443 {S32, LocalPtr, S8, 8},
1444 {S32, LocalPtr, S16, 16},
1445 {V2S16, LocalPtr, S32, 32},
1446
1447 {S32, PrivatePtr, S32, 32},
1448 {S32, PrivatePtr, S8, 8},
1449 {S32, PrivatePtr, S16, 16},
1450 {V2S16, PrivatePtr, S32, 32},
1451
1452 {S32, ConstantPtr, S32, GlobalAlign32},
1453 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1454 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1455 {S64, ConstantPtr, S64, GlobalAlign32},
1456 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1457 Actions.legalIf(
1458 [=](const LegalityQuery &Query) -> bool {
1459 return isLoadStoreLegal(ST, Query);
1460 });
1461
1462 // The custom pointers (fat pointers, buffer resources) don't work with load
1463 // and store at this level. Fat pointers should have been lowered to
1464 // intrinsics before the translation to MIR.
1465 Actions.unsupportedIf(
1466 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1467
1468 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1469 // ptrtoint. This is needed to account for the fact that we can't have i128
1470 // as a register class for SelectionDAG reasons.
1471 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1472 return hasBufferRsrcWorkaround(Query.Types[0]);
1473 });
1474
1475 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1476 // 64-bits.
1477 //
1478 // TODO: Should generalize bitcast action into coerce, which will also cover
1479 // inserting addrspacecasts.
1480 Actions.customIf(typeIs(1, Constant32Ptr));
1481
1482 // Turn any illegal element vectors into something easier to deal
1483 // with. These will ultimately produce 32-bit scalar shifts to extract the
1484 // parts anyway.
1485 //
1486 // For odd 16-bit element vectors, prefer to split those into pieces with
1487 // 16-bit vector parts.
1488 Actions.bitcastIf(
1489 [=](const LegalityQuery &Query) -> bool {
1490 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1491 Query.MMODescrs[0].MemoryTy);
1492 }, bitcastToRegisterType(0));
1493
1494 if (!IsStore) {
1495 // Widen suitably aligned loads by loading extra bytes. The standard
1496 // legalization actions can't properly express widening memory operands.
1497 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1498 return shouldWidenLoad(ST, Query, G_LOAD);
1499 });
1500 }
1501
1502 // FIXME: load/store narrowing should be moved to lower action
1503 Actions
1504 .narrowScalarIf(
1505 [=](const LegalityQuery &Query) -> bool {
1506 return !Query.Types[0].isVector() &&
1507 needToSplitMemOp(Query, Op == G_LOAD);
1508 },
1509 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1510 const LLT DstTy = Query.Types[0];
1511 const LLT PtrTy = Query.Types[1];
1512
1513 const unsigned DstSize = DstTy.getSizeInBits();
1514 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1515
1516 // Split extloads.
1517 if (DstSize > MemSize)
1518 return std::pair(0, LLT::scalar(MemSize));
1519
1520 unsigned MaxSize = maxSizeForAddrSpace(
1521 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1522 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1523 if (MemSize > MaxSize)
1524 return std::pair(0, LLT::scalar(MaxSize));
1525
1526 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1527 return std::pair(0, LLT::scalar(Align));
1528 })
1529 .fewerElementsIf(
1530 [=](const LegalityQuery &Query) -> bool {
1531 return Query.Types[0].isVector() &&
1532 needToSplitMemOp(Query, Op == G_LOAD);
1533 },
1534 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1535 const LLT DstTy = Query.Types[0];
1536 const LLT PtrTy = Query.Types[1];
1537
1538 LLT EltTy = DstTy.getElementType();
1539 unsigned MaxSize = maxSizeForAddrSpace(
1540 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1541 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1542
1543 // FIXME: Handle widened to power of 2 results better. This ends
1544 // up scalarizing.
1545 // FIXME: 3 element stores scalarized on SI
1546
1547 // Split if it's too large for the address space.
1548 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1549 if (MemSize > MaxSize) {
1550 unsigned NumElts = DstTy.getNumElements();
1551 unsigned EltSize = EltTy.getSizeInBits();
1552
1553 if (MaxSize % EltSize == 0) {
1554 return std::pair(
1555 0, LLT::scalarOrVector(
1556 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1557 }
1558
1559 unsigned NumPieces = MemSize / MaxSize;
1560
1561 // FIXME: Refine when odd breakdowns handled
1562 // The scalars will need to be re-legalized.
1563 if (NumPieces == 1 || NumPieces >= NumElts ||
1564 NumElts % NumPieces != 0)
1565 return std::pair(0, EltTy);
1566
1567 return std::pair(0,
1568 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1569 }
1570
1571 // FIXME: We could probably handle weird extending loads better.
1572 if (DstTy.getSizeInBits() > MemSize)
1573 return std::pair(0, EltTy);
1574
1575 unsigned EltSize = EltTy.getSizeInBits();
1576 unsigned DstSize = DstTy.getSizeInBits();
1577 if (!isPowerOf2_32(DstSize)) {
1578 // We're probably decomposing an odd sized store. Try to split
1579 // to the widest type. TODO: Account for alignment. As-is it
1580 // should be OK, since the new parts will be further legalized.
1581 unsigned FloorSize = llvm::bit_floor(DstSize);
1582 return std::pair(
1583 0, LLT::scalarOrVector(
1584 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1585 }
1586
1587 // May need relegalization for the scalars.
1588 return std::pair(0, EltTy);
1589 })
1590 .minScalar(0, S32)
1591 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1592 .widenScalarToNextPow2(0)
1593 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1594 .lower();
1595 }
1596
1597 // FIXME: Unaligned accesses not lowered.
1598 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1599 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1600 {S32, GlobalPtr, S16, 2 * 8},
1601 {S32, LocalPtr, S8, 8},
1602 {S32, LocalPtr, S16, 16},
1603 {S32, PrivatePtr, S8, 8},
1604 {S32, PrivatePtr, S16, 16},
1605 {S32, ConstantPtr, S8, 8},
1606 {S32, ConstantPtr, S16, 2 * 8}})
1607 .legalIf(
1608 [=](const LegalityQuery &Query) -> bool {
1609 return isLoadStoreLegal(ST, Query);
1610 });
1611
1612 if (ST.hasFlatAddressSpace()) {
1613 ExtLoads.legalForTypesWithMemDesc(
1614 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1615 }
1616
1617 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1618 // 64-bits.
1619 //
1620 // TODO: Should generalize bitcast action into coerce, which will also cover
1621 // inserting addrspacecasts.
1622 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1623
1624 ExtLoads.clampScalar(0, S32, S32)
1625 .widenScalarToNextPow2(0)
1626 .lower();
1627
1628 auto &Atomics = getActionDefinitionsBuilder(
1629 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1630 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1631 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1632 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1633 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1634 {S64, GlobalPtr}, {S64, LocalPtr},
1635 {S32, RegionPtr}, {S64, RegionPtr}});
1636 if (ST.hasFlatAddressSpace()) {
1637 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1638 }
1639
1640 // TODO: v2bf16 operations, and fat buffer pointer support.
1641 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1642 if (ST.hasLDSFPAtomicAddF32()) {
1643 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1644 if (ST.hasLdsAtomicAddF64())
1645 Atomic.legalFor({{S64, LocalPtr}});
1646 if (ST.hasAtomicDsPkAdd16Insts())
1647 Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1648 }
1649 if (ST.hasAtomicFaddInsts())
1650 Atomic.legalFor({{S32, GlobalPtr}});
1651 if (ST.hasFlatAtomicFaddF32Inst())
1652 Atomic.legalFor({{S32, FlatPtr}});
1653
1654 if (ST.hasGFX90AInsts()) {
1655 // These are legal with some caveats, and should have undergone expansion in
1656     // the IR in most situations.
1657 // TODO: Move atomic expansion into legalizer
1658 Atomic.legalFor({
1659 {S32, GlobalPtr},
1660 {S64, GlobalPtr},
1661 {S64, FlatPtr}
1662 });
1663 }
1664
1665 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1666 ST.hasAtomicBufferGlobalPkAddF16Insts())
1667 Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1668 if (ST.hasAtomicGlobalPkAddBF16Inst())
1669 Atomic.legalFor({{V2BF16, GlobalPtr}});
1670 if (ST.hasAtomicFlatPkAdd16Insts())
1671 Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1672
1673
1674 // Most of the legalization work here is done by AtomicExpand. We could
1675 // probably use a simpler legality rule that just assumes anything is OK.
1676 auto &AtomicFMinFMax =
1677 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1678 .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
1679
1680 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1681 AtomicFMinFMax.legalFor({{F32, GlobalPtr},{F32, BufferFatPtr}});
1682 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1683 AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
1684 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1685     AtomicFMinFMax.legalFor({{F32, FlatPtr}});
1686 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1687     AtomicFMinFMax.legalFor({{F64, FlatPtr}});
1688
1689 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1690 // demarshalling
1691 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1692 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1693 {S32, FlatPtr}, {S64, FlatPtr}})
1694 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1695 {S32, RegionPtr}, {S64, RegionPtr}});
1696 // TODO: Pointer types, any 32-bit or 64-bit vector
1697
1698 // Condition should be s32 for scalar, s1 for vector.
1699 getActionDefinitionsBuilder(G_SELECT)
1700 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1701 LocalPtr, FlatPtr, PrivatePtr,
1702 LLT::fixed_vector(2, LocalPtr),
1703 LLT::fixed_vector(2, PrivatePtr)},
1704 {S1, S32})
1705 .clampScalar(0, S16, S64)
1706 .scalarize(1)
1707 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1708 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1709 .clampMaxNumElements(0, S32, 2)
1710 .clampMaxNumElements(0, LocalPtr, 2)
1711 .clampMaxNumElements(0, PrivatePtr, 2)
1712 .scalarize(0)
1713 .widenScalarToNextPow2(0)
1714 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1715
1716 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1717 // be more flexible with the shift amount type.
1718 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1719 .legalFor({{S32, S32}, {S64, S32}});
1720 if (ST.has16BitInsts()) {
1721 if (ST.hasVOP3PInsts()) {
1722 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1723 .clampMaxNumElements(0, S16, 2);
1724 } else
1725 Shifts.legalFor({{S16, S16}});
1726
1727 // TODO: Support 16-bit shift amounts for all types
1728 Shifts.widenScalarIf(
1729 [=](const LegalityQuery &Query) {
1730 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1731 // 32-bit amount.
1732 const LLT ValTy = Query.Types[0];
1733 const LLT AmountTy = Query.Types[1];
1734 return ValTy.getSizeInBits() <= 16 &&
1735 AmountTy.getSizeInBits() < 16;
1736 }, changeTo(1, S16));
1737 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1738 Shifts.clampScalar(1, S32, S32);
1739 Shifts.widenScalarToNextPow2(0, 16);
1740 Shifts.clampScalar(0, S16, S64);
1741
1742 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1743 .minScalar(0, S16)
1744 .scalarize(0)
1745 .lower();
1746 } else {
1747 // Make sure we legalize the shift amount type first, as the general
1748 // expansion for the shifted type will produce much worse code if it hasn't
1749 // been truncated already.
1750 Shifts.clampScalar(1, S32, S32);
1751 Shifts.widenScalarToNextPow2(0, 32);
1752 Shifts.clampScalar(0, S32, S64);
1753
1754 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1755 .minScalar(0, S32)
1756 .scalarize(0)
1757 .lower();
1758 }
1759 Shifts.scalarize(0);
1760
1761 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1762 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1763 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1764 unsigned IdxTypeIdx = 2;
1765
1766 getActionDefinitionsBuilder(Op)
1767 .customIf([=](const LegalityQuery &Query) {
1768 const LLT EltTy = Query.Types[EltTypeIdx];
1769 const LLT VecTy = Query.Types[VecTypeIdx];
1770 const LLT IdxTy = Query.Types[IdxTypeIdx];
1771 const unsigned EltSize = EltTy.getSizeInBits();
1772 const bool isLegalVecType =
1773 !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits());
1774 // Address space 8 pointers are 128-bit wide values, but the logic
1775 // below will try to bitcast them to 2N x s64, which will fail.
1776           // Therefore, as an intermediate step, ptrtoint the vector and
1777           // scalar arguments (and inttoptr the extraction result) so that
1778           // the operation becomes a vector operation over integers that
1779           // the logic below can handle.
1780 if (EltTy.isPointer() && EltSize > 64)
1781 return true;
1782 return (EltSize == 32 || EltSize == 64) &&
1783 VecTy.getSizeInBits() % 32 == 0 &&
1784 VecTy.getSizeInBits() <= MaxRegisterSize &&
1785 IdxTy.getSizeInBits() == 32 &&
1786 isLegalVecType;
1787 })
1788 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1789 bitcastToVectorElement32(VecTypeIdx))
1790 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1791 .bitcastIf(
1792 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1793 [=](const LegalityQuery &Query) {
1794 // For > 64-bit element types, try to turn this into a 64-bit
1795 // element vector since we may be able to do better indexing
1796 // if this is scalar. If not, fall back to 32.
1797 const LLT EltTy = Query.Types[EltTypeIdx];
1798 const LLT VecTy = Query.Types[VecTypeIdx];
1799 const unsigned DstEltSize = EltTy.getSizeInBits();
1800 const unsigned VecSize = VecTy.getSizeInBits();
1801
1802 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1803 return std::pair(
1804 VecTypeIdx,
1805 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1806 })
1807 .clampScalar(EltTypeIdx, S32, S64)
1808 .clampScalar(VecTypeIdx, S32, S64)
1809 .clampScalar(IdxTypeIdx, S32, S32)
1810 .clampMaxNumElements(VecTypeIdx, S32, 32)
1811 // TODO: Clamp elements for 64-bit vectors?
1812 .moreElementsIf(
1813 isIllegalRegisterType(VecTypeIdx),
1814 moreElementsToNextExistingRegClass(VecTypeIdx))
1815 // It should only be necessary with variable indexes.
1816 // As a last resort, lower to the stack
1817 .lower();
1818 }
1819
1820 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1821 .unsupportedIf([=](const LegalityQuery &Query) {
1822 const LLT &EltTy = Query.Types[1].getElementType();
1823 return Query.Types[0] != EltTy;
1824 });
1825
1826 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1827 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1828 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1829
1830 // FIXME: Doesn't handle extract of illegal sizes.
1831 getActionDefinitionsBuilder(Op)
1832 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1833 .lowerIf([=](const LegalityQuery &Query) {
1834             // Sub-vector (or single-element) insert and extract.
1835 // TODO: verify immediate offset here since lower only works with
1836 // whole elements.
1837 const LLT BigTy = Query.Types[BigTyIdx];
1838 return BigTy.isVector();
1839 })
1840 // FIXME: Multiples of 16 should not be legal.
1841 .legalIf([=](const LegalityQuery &Query) {
1842 const LLT BigTy = Query.Types[BigTyIdx];
1843 const LLT LitTy = Query.Types[LitTyIdx];
1844 return (BigTy.getSizeInBits() % 32 == 0) &&
1845 (LitTy.getSizeInBits() % 16 == 0);
1846 })
1847 .widenScalarIf(
1848 [=](const LegalityQuery &Query) {
1849 const LLT BigTy = Query.Types[BigTyIdx];
1850 return (BigTy.getScalarSizeInBits() < 16);
1851 },
1852 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1853 .widenScalarIf(
1854 [=](const LegalityQuery &Query) {
1855 const LLT LitTy = Query.Types[LitTyIdx];
1856 return (LitTy.getScalarSizeInBits() < 16);
1857 },
1858 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1859 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1860 .widenScalarToNextPow2(BigTyIdx, 32);
1861
1862 }
1863
1864 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1865 .legalForCartesianProduct(AllS32Vectors, {S32})
1866 .legalForCartesianProduct(AllS64Vectors, {S64})
1867 .clampNumElements(0, V16S32, V32S32)
1868 .clampNumElements(0, V2S64, V16S64)
1869 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
1870 .moreElementsIf(
1871 isIllegalRegisterType(0),
1872 moreElementsToNextExistingRegClass(0));
1873
1874 if (ST.hasScalarPackInsts()) {
1875 BuildVector
1876 // FIXME: Should probably widen s1 vectors straight to s32
1877 .minScalarOrElt(0, S16)
1878 .minScalar(1, S16);
1879
1880 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1881 .legalFor({V2S16, S32})
1882 .lower();
1883 } else {
1884 BuildVector.customFor({V2S16, S16});
1885 BuildVector.minScalarOrElt(0, S32);
1886
1887 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1888 .customFor({V2S16, S32})
1889 .lower();
1890 }
1891
1892 BuildVector.legalIf(isRegisterType(0));
1893
1894 // FIXME: Clamp maximum size
1895 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1896 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1897 .clampMaxNumElements(0, S32, 32)
1898 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1899 .clampMaxNumElements(0, S16, 64);
1900
1901 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1902
1903 // Merge/Unmerge
1904 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1905 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1906 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1907
1908 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1909 const LLT Ty = Query.Types[TypeIdx];
1910 if (Ty.isVector()) {
1911 const LLT &EltTy = Ty.getElementType();
1912 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1913 return true;
1914 if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits()))
1915 return true;
1916 }
1917 return false;
1918 };
1919
1920 auto &Builder = getActionDefinitionsBuilder(Op)
1921 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1922 .lowerFor({{S16, V2S16}})
1923 .lowerIf([=](const LegalityQuery &Query) {
1924 const LLT BigTy = Query.Types[BigTyIdx];
1925 return BigTy.getSizeInBits() == 32;
1926 })
1927 // Try to widen to s16 first for small types.
1928 // TODO: Only do this on targets with legal s16 shifts
1929 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1930 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1931 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1932 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1933 elementTypeIs(1, S16)),
1934 changeTo(1, V2S16))
1935 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1936 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1937 // valid.
1938 .clampScalar(LitTyIdx, S32, S512)
1939 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1940 // Break up vectors with weird elements into scalars
1941 .fewerElementsIf(
1942 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1943 scalarize(0))
1944 .fewerElementsIf(
1945 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1946 scalarize(1))
1947 .clampScalar(BigTyIdx, S32, MaxScalar);
1948
1949 if (Op == G_MERGE_VALUES) {
1950 Builder.widenScalarIf(
1951 // TODO: Use 16-bit shifts if legal for 8-bit values?
1952 [=](const LegalityQuery &Query) {
1953 const LLT Ty = Query.Types[LitTyIdx];
1954 return Ty.getSizeInBits() < 32;
1955 },
1956 changeTo(LitTyIdx, S32));
1957 }
1958
1959 Builder.widenScalarIf(
1960 [=](const LegalityQuery &Query) {
1961 const LLT Ty = Query.Types[BigTyIdx];
1962 return Ty.getSizeInBits() % 16 != 0;
1963 },
1964 [=](const LegalityQuery &Query) {
1965           // Pick the next power of 2, or a multiple of 64 over 128,
1966           // whichever is smaller.
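          // For example, an s90 big type widens to s128, while s264 rounds to
          // s320 (the next multiple of 64) rather than all the way to s512.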
1967 const LLT &Ty = Query.Types[BigTyIdx];
1968 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1969 if (NewSizeInBits >= 256) {
1970 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1971 if (RoundedTo < NewSizeInBits)
1972 NewSizeInBits = RoundedTo;
1973 }
1974 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1975 })
1976 // Any vectors left are the wrong size. Scalarize them.
1977 .scalarize(0)
1978 .scalarize(1);
1979 }
1980
1981 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1982 // RegBankSelect.
1983 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1984 .legalFor({{S32}, {S64}});
1985
1986 if (ST.hasVOP3PInsts()) {
1987 SextInReg.lowerFor({{V2S16}})
1988 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1989 // get more vector shift opportunities, since we'll get those when
1990 // expanded.
1991 .clampMaxNumElementsStrict(0, S16, 2);
1992 } else if (ST.has16BitInsts()) {
1993 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1994 } else {
1995 // Prefer to promote to s32 before lowering if we don't have 16-bit
1996     // shifts. This avoids a lot of intermediate truncate and extend operations.
1997 SextInReg.lowerFor({{S32}, {S64}});
1998 }
1999
2000 SextInReg
2001 .scalarize(0)
2002 .clampScalar(0, S32, S64)
2003 .lower();
2004
2005 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2006 .scalarize(0)
2007 .lower();
2008
2009   // TODO: Only try to form v2s16 with legal packed instructions.
2010 getActionDefinitionsBuilder(G_FSHR)
2011 .legalFor({{S32, S32}})
2012 .lowerFor({{V2S16, V2S16}})
2013 .clampMaxNumElementsStrict(0, S16, 2)
2014 .scalarize(0)
2015 .lower();
2016
2017 if (ST.hasVOP3PInsts()) {
2018 getActionDefinitionsBuilder(G_FSHL)
2019 .lowerFor({{V2S16, V2S16}})
2020 .clampMaxNumElementsStrict(0, S16, 2)
2021 .scalarize(0)
2022 .lower();
2023 } else {
2024 getActionDefinitionsBuilder(G_FSHL)
2025 .scalarize(0)
2026 .lower();
2027 }
2028
2029 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2030 .legalFor({S64});
2031
2032 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
2033
2034 getActionDefinitionsBuilder(G_FENCE)
2035 .alwaysLegal();
2036
2037 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2038 .scalarize(0)
2039 .minScalar(0, S32)
2040 .lower();
2041
2042 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2043 .legalFor({{S32, S32}, {S64, S32}})
2044 .clampScalar(1, S32, S32)
2045 .clampScalar(0, S32, S64)
2046 .widenScalarToNextPow2(0)
2047 .scalarize(0);
2048
2049 getActionDefinitionsBuilder(
2050 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2051 G_FCOPYSIGN,
2052
2053 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2054 G_READ_REGISTER, G_WRITE_REGISTER,
2055
2056 G_SADDO, G_SSUBO})
2057 .lower();
2058
2059 if (ST.hasIEEEMinMax()) {
2060 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2061 .legalFor(FPTypesPK16)
2062 .clampMaxNumElements(0, S16, 2)
2063 .scalarize(0);
2064 } else {
2065 // TODO: Implement
2066 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
2067 }
2068
2069 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2070 .lower();
2071
2072 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2073
2074 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2075 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2076 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2077 .unsupported();
2078
2079 getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
2080
2081 getLegacyLegalizerInfo().computeTables();
2082 verify(*ST.getInstrInfo());
2083 }
2084
2085 bool AMDGPULegalizerInfo::legalizeCustom(
2086 LegalizerHelper &Helper, MachineInstr &MI,
2087 LostDebugLocObserver &LocObserver) const {
2088 MachineIRBuilder &B = Helper.MIRBuilder;
2089 MachineRegisterInfo &MRI = *B.getMRI();
2090
2091 switch (MI.getOpcode()) {
2092 case TargetOpcode::G_ADDRSPACE_CAST:
2093 return legalizeAddrSpaceCast(MI, MRI, B);
2094 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2095 return legalizeFroundeven(MI, MRI, B);
2096 case TargetOpcode::G_FCEIL:
2097 return legalizeFceil(MI, MRI, B);
2098 case TargetOpcode::G_FREM:
2099 return legalizeFrem(MI, MRI, B);
2100 case TargetOpcode::G_INTRINSIC_TRUNC:
2101 return legalizeIntrinsicTrunc(MI, MRI, B);
2102 case TargetOpcode::G_SITOFP:
2103 return legalizeITOFP(MI, MRI, B, true);
2104 case TargetOpcode::G_UITOFP:
2105 return legalizeITOFP(MI, MRI, B, false);
2106 case TargetOpcode::G_FPTOSI:
2107 return legalizeFPTOI(MI, MRI, B, true);
2108 case TargetOpcode::G_FPTOUI:
2109 return legalizeFPTOI(MI, MRI, B, false);
2110 case TargetOpcode::G_FMINNUM:
2111 case TargetOpcode::G_FMAXNUM:
2112 case TargetOpcode::G_FMINNUM_IEEE:
2113 case TargetOpcode::G_FMAXNUM_IEEE:
2114 return legalizeMinNumMaxNum(Helper, MI);
2115 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2116 return legalizeExtractVectorElt(MI, MRI, B);
2117 case TargetOpcode::G_INSERT_VECTOR_ELT:
2118 return legalizeInsertVectorElt(MI, MRI, B);
2119 case TargetOpcode::G_FSIN:
2120 case TargetOpcode::G_FCOS:
2121 return legalizeSinCos(MI, MRI, B);
2122 case TargetOpcode::G_GLOBAL_VALUE:
2123 return legalizeGlobalValue(MI, MRI, B);
2124 case TargetOpcode::G_LOAD:
2125 case TargetOpcode::G_SEXTLOAD:
2126 case TargetOpcode::G_ZEXTLOAD:
2127 return legalizeLoad(Helper, MI);
2128 case TargetOpcode::G_STORE:
2129 return legalizeStore(Helper, MI);
2130 case TargetOpcode::G_FMAD:
2131 return legalizeFMad(MI, MRI, B);
2132 case TargetOpcode::G_FDIV:
2133 return legalizeFDIV(MI, MRI, B);
2134 case TargetOpcode::G_FFREXP:
2135 return legalizeFFREXP(MI, MRI, B);
2136 case TargetOpcode::G_FSQRT:
2137 return legalizeFSQRT(MI, MRI, B);
2138 case TargetOpcode::G_UDIV:
2139 case TargetOpcode::G_UREM:
2140 case TargetOpcode::G_UDIVREM:
2141 return legalizeUnsignedDIV_REM(MI, MRI, B);
2142 case TargetOpcode::G_SDIV:
2143 case TargetOpcode::G_SREM:
2144 case TargetOpcode::G_SDIVREM:
2145 return legalizeSignedDIV_REM(MI, MRI, B);
2146 case TargetOpcode::G_ATOMIC_CMPXCHG:
2147 return legalizeAtomicCmpXChg(MI, MRI, B);
2148 case TargetOpcode::G_FLOG2:
2149 return legalizeFlog2(MI, B);
2150 case TargetOpcode::G_FLOG:
2151 case TargetOpcode::G_FLOG10:
2152 return legalizeFlogCommon(MI, B);
2153 case TargetOpcode::G_FEXP2:
2154 return legalizeFExp2(MI, B);
2155 case TargetOpcode::G_FEXP:
2156 case TargetOpcode::G_FEXP10:
2157 return legalizeFExp(MI, B);
2158 case TargetOpcode::G_FPOW:
2159 return legalizeFPow(MI, B);
2160 case TargetOpcode::G_FFLOOR:
2161 return legalizeFFloor(MI, MRI, B);
2162 case TargetOpcode::G_BUILD_VECTOR:
2163 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2164 return legalizeBuildVector(MI, MRI, B);
2165 case TargetOpcode::G_MUL:
2166 return legalizeMul(Helper, MI);
2167 case TargetOpcode::G_CTLZ:
2168 case TargetOpcode::G_CTTZ:
2169 return legalizeCTLZ_CTTZ(MI, MRI, B);
2170 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2171 return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
2172 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
2173 return legalizeFPTruncRound(MI, B);
2174 case TargetOpcode::G_STACKSAVE:
2175 return legalizeStackSave(MI, B);
2176 case TargetOpcode::G_GET_FPENV:
2177 return legalizeGetFPEnv(MI, MRI, B);
2178 case TargetOpcode::G_SET_FPENV:
2179 return legalizeSetFPEnv(MI, MRI, B);
2180 case TargetOpcode::G_TRAP:
2181 return legalizeTrap(MI, MRI, B);
2182 case TargetOpcode::G_DEBUGTRAP:
2183 return legalizeDebugTrap(MI, MRI, B);
2184 default:
2185 return false;
2186 }
2187
2188 llvm_unreachable("expected switch to return");
2189 }
2190
2191 Register AMDGPULegalizerInfo::getSegmentAperture(
2192 unsigned AS,
2193 MachineRegisterInfo &MRI,
2194 MachineIRBuilder &B) const {
2195 MachineFunction &MF = B.getMF();
2196 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2197 const LLT S32 = LLT::scalar(32);
2198 const LLT S64 = LLT::scalar(64);
2199
2200 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
2201
2202 if (ST.hasApertureRegs()) {
2203 // Note: this register is somewhat broken. When used as a 32-bit operand,
2204 // it only returns zeroes. The real value is in the upper 32 bits.
2205     // Thus, we must extract the high 32 bits.
2206 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2207 ? AMDGPU::SRC_SHARED_BASE
2208 : AMDGPU::SRC_PRIVATE_BASE;
2209 // FIXME: It would be more natural to emit a COPY here, but then copy
2210 // coalescing would kick in and it would think it's okay to use the "HI"
2211 // subregister (instead of extracting the HI 32 bits) which is an artificial
2212 // (unusable) register.
2213 // Register TableGen definitions would need an overhaul to get rid of the
2214 // artificial "HI" aperture registers and prevent this kind of issue from
2215 // happening.
2216 Register Dst = MRI.createGenericVirtualRegister(S64);
2217 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2218 B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
2219 return B.buildUnmerge(S32, Dst).getReg(1);
2220 }
2221
2222 // TODO: can we be smarter about machine pointer info?
2223 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2224 Register LoadAddr = MRI.createGenericVirtualRegister(
2225 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2226 // For code object version 5, private_base and shared_base are passed through
2227 // implicit kernargs.
2228 if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
2229 AMDGPU::AMDHSA_COV5) {
2230 AMDGPUTargetLowering::ImplicitParameter Param =
2231 AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
2232 : AMDGPUTargetLowering::PRIVATE_BASE;
2233 uint64_t Offset =
2234 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2235
2236 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2237 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2238
2239 if (!loadInputValue(KernargPtrReg, B,
2240 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
2241 return Register();
2242
2243 MachineMemOperand *MMO = MF.getMachineMemOperand(
2244 PtrInfo,
2245 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2246 MachineMemOperand::MOInvariant,
2247 LLT::scalar(32), commonAlignment(Align(64), Offset));
2248
2249 // Pointer address
2250 B.buildPtrAdd(LoadAddr, KernargPtrReg,
2251 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2252 // Load address
2253 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2254 }
2255
2256 Register QueuePtr = MRI.createGenericVirtualRegister(
2257 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2258
2259 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
2260 return Register();
2261
2262 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2263 // private_segment_aperture_base_hi.
2264 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2265
2266 MachineMemOperand *MMO = MF.getMachineMemOperand(
2267 PtrInfo,
2268 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2269 MachineMemOperand::MOInvariant,
2270 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2271
2272 B.buildPtrAdd(LoadAddr, QueuePtr,
2273 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2274 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2275 }
2276
2277 /// Return true if the value is a known valid address, such that a null check is
2278 /// not necessary.
2279 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
2280 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2281 MachineInstr *Def = MRI.getVRegDef(Val);
2282 switch (Def->getOpcode()) {
2283 case AMDGPU::G_FRAME_INDEX:
2284 case AMDGPU::G_GLOBAL_VALUE:
2285 case AMDGPU::G_BLOCK_ADDR:
2286 return true;
2287 case AMDGPU::G_CONSTANT: {
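    // Note that on AMDGPU the null value for the local and private address
    // spaces is all-ones rather than zero, so a literal 0 can still be a
    // valid, non-null address here.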
2288 const ConstantInt *CI = Def->getOperand(1).getCImm();
2289 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2290 }
2291 default:
2292 return false;
2293 }
2294
2295 return false;
2296 }
2297
2298 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
2299 MachineInstr &MI, MachineRegisterInfo &MRI,
2300 MachineIRBuilder &B) const {
2301 MachineFunction &MF = B.getMF();
2302
2303 // MI can either be a G_ADDRSPACE_CAST or a
2304 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2305 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2306 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2307 Intrinsic::amdgcn_addrspacecast_nonnull));
2308
2309 const LLT S32 = LLT::scalar(32);
2310 Register Dst = MI.getOperand(0).getReg();
2311 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
2312 : MI.getOperand(1).getReg();
2313 LLT DstTy = MRI.getType(Dst);
2314 LLT SrcTy = MRI.getType(Src);
2315 unsigned DestAS = DstTy.getAddressSpace();
2316 unsigned SrcAS = SrcTy.getAddressSpace();
2317
2318 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2319 // vector element.
2320 assert(!DstTy.isVector());
2321
2322 const AMDGPUTargetMachine &TM
2323 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2324
2325 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2326 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2327 return true;
2328 }
2329
2330 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2331 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2332 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2333 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2334 // G_ADDRSPACE_CAST we need to guess.
2335 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2336 // Extract low 32-bits of the pointer.
2337 B.buildExtract(Dst, Src, 0);
2338 MI.eraseFromParent();
2339 return true;
2340 }
2341
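    // Without the nonnull hint, guard the truncation with a null check;
    // roughly: dst = (src != flat_null) ? lo32(src) : segment_null.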
2342 unsigned NullVal = TM.getNullPointerValue(DestAS);
2343
2344 auto SegmentNull = B.buildConstant(DstTy, NullVal);
2345 auto FlatNull = B.buildConstant(SrcTy, 0);
2346
2347 // Extract low 32-bits of the pointer.
2348 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
2349
2350 auto CmpRes =
2351 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2352 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2353
2354 MI.eraseFromParent();
2355 return true;
2356 }
2357
2358 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2359 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2360 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2361 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2362 if (!ApertureReg.isValid())
2363 return false;
2364
2365 // Coerce the type of the low half of the result so we can use merge_values.
2366 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2367
2368 // TODO: Should we allow mismatched types but matching sizes in merges to
2369 // avoid the ptrtoint?
2370 auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});
2371
2372 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2373 // G_ADDRSPACE_CAST we need to guess.
2374 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2375 B.buildCopy(Dst, BuildPtr);
2376 MI.eraseFromParent();
2377 return true;
2378 }
2379
2380 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2381 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2382
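    // Roughly: dst = (src != segment_null) ? {lo32(src), aperture_hi} : flat_null.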
2383 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2384 SegmentNull.getReg(0));
2385
2386 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2387
2388 MI.eraseFromParent();
2389 return true;
2390 }
2391
2392 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2393 SrcTy.getSizeInBits() == 64) {
2394 // Truncate.
2395 B.buildExtract(Dst, Src, 0);
2396 MI.eraseFromParent();
2397 return true;
2398 }
2399
2400 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2401 DstTy.getSizeInBits() == 64) {
2402 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2403 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2404 auto PtrLo = B.buildPtrToInt(S32, Src);
2405 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2406 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2407 MI.eraseFromParent();
2408 return true;
2409 }
2410
2411 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2412 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2413
2414 LLVMContext &Ctx = MF.getFunction().getContext();
2415 Ctx.diagnose(InvalidAddrSpaceCast);
2416 B.buildUndef(Dst);
2417 MI.eraseFromParent();
2418 return true;
2419 }
2420
2421 bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
2422 MachineRegisterInfo &MRI,
2423 MachineIRBuilder &B) const {
2424 Register Src = MI.getOperand(1).getReg();
2425 LLT Ty = MRI.getType(Src);
2426 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2427
2428 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2429 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
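  // C1 is 2^52: adding and then subtracting it (with Src's sign) rounds to
  // nearest even, since doubles with magnitude >= 2^52 have no fractional
  // bits. C2 is the largest double below 2^52; inputs with |Src| > C2 are
  // already integral and are returned unchanged by the select below.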
2430
2431 auto C1 = B.buildFConstant(Ty, C1Val);
2432 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2433
2434 // TODO: Should this propagate fast-math-flags?
2435 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2436 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2437
2438 auto C2 = B.buildFConstant(Ty, C2Val);
2439 auto Fabs = B.buildFAbs(Ty, Src);
2440
2441 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2442 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2443 MI.eraseFromParent();
2444 return true;
2445 }
2446
2447 bool AMDGPULegalizerInfo::legalizeFceil(
2448 MachineInstr &MI, MachineRegisterInfo &MRI,
2449 MachineIRBuilder &B) const {
2450
2451 const LLT S1 = LLT::scalar(1);
2452 const LLT S64 = LLT::scalar(64);
2453
2454 Register Src = MI.getOperand(1).getReg();
2455 assert(MRI.getType(Src) == S64);
2456
2457 // result = trunc(src)
2458 // if (src > 0.0 && src != result)
2459 // result += 1.0
2460
2461 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2462
2463 const auto Zero = B.buildFConstant(S64, 0.0);
2464 const auto One = B.buildFConstant(S64, 1.0);
2465 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2466 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2467 auto And = B.buildAnd(S1, Lt0, NeTrunc);
2468 auto Add = B.buildSelect(S64, And, One, Zero);
2469
2470 // TODO: Should this propagate fast-math-flags?
2471 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2472 MI.eraseFromParent();
2473 return true;
2474 }
2475
2476 bool AMDGPULegalizerInfo::legalizeFrem(
2477 MachineInstr &MI, MachineRegisterInfo &MRI,
2478 MachineIRBuilder &B) const {
2479 Register DstReg = MI.getOperand(0).getReg();
2480 Register Src0Reg = MI.getOperand(1).getReg();
2481 Register Src1Reg = MI.getOperand(2).getReg();
2482 auto Flags = MI.getFlags();
2483 LLT Ty = MRI.getType(DstReg);
2484
2485 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2486 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2487 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2488 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2489 MI.eraseFromParent();
2490 return true;
2491 }
2492
2493 static MachineInstrBuilder extractF64Exponent(Register Hi,
2494 MachineIRBuilder &B) {
2495 const unsigned FractBits = 52;
2496 const unsigned ExpBits = 11;
2497 LLT S32 = LLT::scalar(32);
2498
2499 auto Const0 = B.buildConstant(S32, FractBits - 32);
2500 auto Const1 = B.buildConstant(S32, ExpBits);
2501
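  // The f64 exponent field occupies bits [52, 62], i.e. bits [20, 30] of the
  // high 32-bit word, so extract 11 bits starting at offset 52 - 32 = 20 and
  // subtract the bias of 1023 below.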
2502 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2503 .addUse(Hi)
2504 .addUse(Const0.getReg(0))
2505 .addUse(Const1.getReg(0));
2506
2507 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2508 }
2509
2510 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2511 MachineInstr &MI, MachineRegisterInfo &MRI,
2512 MachineIRBuilder &B) const {
2513 const LLT S1 = LLT::scalar(1);
2514 const LLT S32 = LLT::scalar(32);
2515 const LLT S64 = LLT::scalar(64);
2516
2517 Register Src = MI.getOperand(1).getReg();
2518 assert(MRI.getType(Src) == S64);
2519
2520 // TODO: Should this use extract since the low half is unused?
2521 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2522 Register Hi = Unmerge.getReg(1);
2523
2524 // Extract the upper half, since this is where we will find the sign and
2525 // exponent.
2526 auto Exp = extractF64Exponent(Hi, B);
2527
2528 const unsigned FractBits = 52;
2529
2530 // Extract the sign bit.
2531 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2532 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2533
2534 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2535
2536 const auto Zero32 = B.buildConstant(S32, 0);
2537
2538 // Extend back to 64-bits.
2539 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2540
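  // Shifting the mantissa mask right by the unbiased exponent leaves a mask
  // of the fractional bits; inverting it and ANDing with the source clears
  // everything below the binary point.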
2541 auto Shr = B.buildAShr(S64, FractMask, Exp);
2542 auto Not = B.buildNot(S64, Shr);
2543 auto Tmp0 = B.buildAnd(S64, Src, Not);
2544 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2545
2546 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2547 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2548
2549 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2550 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2551 MI.eraseFromParent();
2552 return true;
2553 }
2554
2555 bool AMDGPULegalizerInfo::legalizeITOFP(
2556 MachineInstr &MI, MachineRegisterInfo &MRI,
2557 MachineIRBuilder &B, bool Signed) const {
2558
2559 Register Dst = MI.getOperand(0).getReg();
2560 Register Src = MI.getOperand(1).getReg();
2561
2562 const LLT S64 = LLT::scalar(64);
2563 const LLT S32 = LLT::scalar(32);
2564
2565 assert(MRI.getType(Src) == S64);
2566
2567 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2568 auto ThirtyTwo = B.buildConstant(S32, 32);
2569
2570 if (MRI.getType(Dst) == S64) {
2571 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2572 : B.buildUITOFP(S64, Unmerge.getReg(1));
2573
2574 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2575 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2576
2577 // TODO: Should this propagate fast-math-flags?
2578 B.buildFAdd(Dst, LdExp, CvtLo);
2579 MI.eraseFromParent();
2580 return true;
2581 }
2582
2583 assert(MRI.getType(Dst) == S32);
2584
2585 auto One = B.buildConstant(S32, 1);
2586
2587 MachineInstrBuilder ShAmt;
2588 if (Signed) {
2589 auto ThirtyOne = B.buildConstant(S32, 31);
2590 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2591 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2592 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2593 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2594 .addUse(Unmerge.getReg(1));
2595 auto LS2 = B.buildSub(S32, LS, One);
2596 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2597 } else
2598 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
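  // Normalize the value so its significant bits land in the high word,
  // convert that word, and rescale with ldexp (by 32 - ShAmt) below.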
2599 auto Norm = B.buildShl(S64, Src, ShAmt);
2600 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2601 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2602 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2603 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2604 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2605 B.buildFLdexp(Dst, FVal, Scale);
2606 MI.eraseFromParent();
2607 return true;
2608 }
2609
2610 // TODO: Copied from DAG implementation. Verify logic and document how this
2611 // actually works.
2612 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2613 MachineRegisterInfo &MRI,
2614 MachineIRBuilder &B,
2615 bool Signed) const {
2616
2617 Register Dst = MI.getOperand(0).getReg();
2618 Register Src = MI.getOperand(1).getReg();
2619
2620 const LLT S64 = LLT::scalar(64);
2621 const LLT S32 = LLT::scalar(32);
2622
2623 const LLT SrcLT = MRI.getType(Src);
2624 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2625
2626 unsigned Flags = MI.getFlags();
2627
2628 // The basic idea of converting a floating point number into a pair of 32-bit
2629 // integers is illustrated as follows:
2630 //
2631 // tf := trunc(val);
2632 // hif := floor(tf * 2^-32);
2633 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2634 // hi := fptoi(hif);
2635 // lo := fptoi(lof);
2636 //
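  // (K0 and K1 below are 2^-32 and -2^32, so lof is formed with a single
  //  fma: lof = fma(hif, -2^32, tf).)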
2637 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2638 MachineInstrBuilder Sign;
2639 if (Signed && SrcLT == S32) {
2640 // However, a 32-bit floating point number has only 23 bits mantissa and
2641 // it's not enough to hold all the significant bits of `lof` if val is
2642     // negative. To avoid the loss of precision, we need to take the absolute
2643 // value after truncating and flip the result back based on the original
2644 // signedness.
2645 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2646 Trunc = B.buildFAbs(S32, Trunc, Flags);
2647 }
2648 MachineInstrBuilder K0, K1;
2649 if (SrcLT == S64) {
2650 K0 = B.buildFConstant(
2651 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2652 K1 = B.buildFConstant(
2653 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2654 } else {
2655 K0 = B.buildFConstant(
2656 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2657 K1 = B.buildFConstant(
2658 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2659 }
2660
2661 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2662 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2663 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2664
2665 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2666 : B.buildFPTOUI(S32, FloorMul);
2667 auto Lo = B.buildFPTOUI(S32, Fma);
2668
2669 if (Signed && SrcLT == S32) {
2670 // Flip the result based on the signedness, which is either all 0s or 1s.
2671 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2672 // r := xor({lo, hi}, sign) - sign;
2673 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2674 Sign);
2675 } else
2676 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2677 MI.eraseFromParent();
2678
2679 return true;
2680 }
2681
2682 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2683 MachineInstr &MI) const {
2684 MachineFunction &MF = Helper.MIRBuilder.getMF();
2685 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2686
2687 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2688 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2689
2690 // With ieee_mode disabled, the instructions have the correct behavior
2691   // already for G_FMINNUM/G_FMAXNUM.
2692 if (!MFI->getMode().IEEE)
2693 return !IsIEEEOp;
2694
2695 if (IsIEEEOp)
2696 return true;
2697
2698 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2699 }
2700
2701 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2702 MachineInstr &MI, MachineRegisterInfo &MRI,
2703 MachineIRBuilder &B) const {
2704 // TODO: Should move some of this into LegalizerHelper.
2705
2706 // TODO: Promote dynamic indexing of s16 to s32
2707
2708 Register Dst = MI.getOperand(0).getReg();
2709 Register Vec = MI.getOperand(1).getReg();
2710
2711 LLT VecTy = MRI.getType(Vec);
2712 LLT EltTy = VecTy.getElementType();
2713 assert(EltTy == MRI.getType(Dst));
2714
2715 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2716   // but we can't go directly to that logic because you can't bitcast a vector
2717 // of pointers to a vector of integers. Therefore, introduce an intermediate
2718 // vector of integers using ptrtoint (and inttoptr on the output) in order to
2719 // drive the legalization forward.
2720 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2721 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2722 LLT IntVecTy = VecTy.changeElementType(IntTy);
2723
2724 auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
2725 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
2726 B.buildIntToPtr(Dst, IntElt);
2727
2728 MI.eraseFromParent();
2729 return true;
2730 }
2731
2732 // FIXME: Artifact combiner probably should have replaced the truncated
2733 // constant before this, so we shouldn't need
2734 // getIConstantVRegValWithLookThrough.
2735 std::optional<ValueAndVReg> MaybeIdxVal =
2736 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2737 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2738 return true;
2739 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2740
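  // With a constant in-bounds index this becomes a full unmerge of the vector
  // plus a copy of the selected piece; out-of-bounds indices become undef.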
2741 if (IdxVal < VecTy.getNumElements()) {
2742 auto Unmerge = B.buildUnmerge(EltTy, Vec);
2743 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2744 } else {
2745 B.buildUndef(Dst);
2746 }
2747
2748 MI.eraseFromParent();
2749 return true;
2750 }
2751
2752 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2753 MachineInstr &MI, MachineRegisterInfo &MRI,
2754 MachineIRBuilder &B) const {
2755 // TODO: Should move some of this into LegalizerHelper.
2756
2757 // TODO: Promote dynamic indexing of s16 to s32
2758
2759 Register Dst = MI.getOperand(0).getReg();
2760 Register Vec = MI.getOperand(1).getReg();
2761 Register Ins = MI.getOperand(2).getReg();
2762
2763 LLT VecTy = MRI.getType(Vec);
2764 LLT EltTy = VecTy.getElementType();
2765 assert(EltTy == MRI.getType(Ins));
2766
2767 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2768   // but we can't go directly to that logic because you can't bitcast a vector
2769 // of pointers to a vector of integers. Therefore, make the pointer vector
2770 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2771 // new value, and then inttoptr the result vector back. This will then allow
2772 // the rest of legalization to take over.
2773 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2774 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2775 LLT IntVecTy = VecTy.changeElementType(IntTy);
2776
2777 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
2778 auto IntIns = B.buildPtrToInt(IntTy, Ins);
2779 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2780 MI.getOperand(3));
2781 B.buildIntToPtr(Dst, IntVecDest);
2782 MI.eraseFromParent();
2783 return true;
2784 }
2785
2786 // FIXME: Artifact combiner probably should have replaced the truncated
2787 // constant before this, so we shouldn't need
2788 // getIConstantVRegValWithLookThrough.
2789 std::optional<ValueAndVReg> MaybeIdxVal =
2790 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2791 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2792 return true;
2793
2794 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2795
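  // With a constant in-bounds index, unmerge the vector into scalars, replace
  // the selected one with the new value, and re-merge; out-of-bounds indices
  // become undef.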
2796 unsigned NumElts = VecTy.getNumElements();
2797 if (IdxVal < NumElts) {
2798 SmallVector<Register, 8> SrcRegs;
2799 for (unsigned i = 0; i < NumElts; ++i)
2800 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2801 B.buildUnmerge(SrcRegs, Vec);
2802
2803 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2804 B.buildMergeLikeInstr(Dst, SrcRegs);
2805 } else {
2806 B.buildUndef(Dst);
2807 }
2808
2809 MI.eraseFromParent();
2810 return true;
2811 }
2812
2813 bool AMDGPULegalizerInfo::legalizeSinCos(
2814 MachineInstr &MI, MachineRegisterInfo &MRI,
2815 MachineIRBuilder &B) const {
2816
2817 Register DstReg = MI.getOperand(0).getReg();
2818 Register SrcReg = MI.getOperand(1).getReg();
2819 LLT Ty = MRI.getType(DstReg);
2820 unsigned Flags = MI.getFlags();
2821
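  // The hardware sin/cos expect an operand pre-scaled by 1/(2*pi); on
  // subtargets with a reduced valid input range, amdgcn.fract is used first
  // to bring the scaled value into [0, 1).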
2822 Register TrigVal;
2823 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2824 if (ST.hasTrigReducedRange()) {
2825 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2826 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
2827 .addUse(MulVal.getReg(0))
2828 .setMIFlags(Flags)
2829 .getReg(0);
2830 } else
2831 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2832
2833 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2834 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2835 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
2836 .addUse(TrigVal)
2837 .setMIFlags(Flags);
2838 MI.eraseFromParent();
2839 return true;
2840 }
2841
2842 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2843 MachineIRBuilder &B,
2844 const GlobalValue *GV,
2845 int64_t Offset,
2846 unsigned GAFlags) const {
2847 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2848 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2849 // to the following code sequence:
2850 //
2851 // For constant address space:
2852 // s_getpc_b64 s[0:1]
2853 // s_add_u32 s0, s0, $symbol
2854 // s_addc_u32 s1, s1, 0
2855 //
2856 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2857 // a fixup or relocation is emitted to replace $symbol with a literal
2858 // constant, which is a pc-relative offset from the encoding of the $symbol
2859 // operand to the global variable.
2860 //
2861 // For global address space:
2862 // s_getpc_b64 s[0:1]
2863 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2864 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2865 //
2866 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2867 // fixups or relocations are emitted to replace $symbol@*@lo and
2868 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2869 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2870 // operand to the global variable.
2871
2872 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2873
2874 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2875 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2876
2877 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2878 .addDef(PCReg);
2879
2880 MIB.addGlobalAddress(GV, Offset, GAFlags);
2881 if (GAFlags == SIInstrInfo::MO_NONE)
2882 MIB.addImm(0);
2883 else
2884 MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
2885
2886 if (!B.getMRI()->getRegClassOrNull(PCReg))
2887 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2888
2889 if (PtrTy.getSizeInBits() == 32)
2890 B.buildExtract(DstReg, PCReg, 0);
2891 return true;
2892 }
2893
2894 // Emit an ABS32_LO / ABS32_HI relocation stub.
2895 void AMDGPULegalizerInfo::buildAbsGlobalAddress(
2896 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
2897 MachineRegisterInfo &MRI) const {
2898 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
2899
2900 LLT S32 = LLT::scalar(32);
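  // The address is materialized with S_MOV_B32 of the symbol using ABS32_LO
  // (and ABS32_HI for 64-bit pointers) relocations, merged below.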
2901
2902   // Use the destination register directly if and only if we only store the
2903   // lower address part and no register class has been set on it.
2904 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
2905 ? DstReg
2906 : MRI.createGenericVirtualRegister(S32);
2907
2908 if (!MRI.getRegClassOrNull(AddrLo))
2909 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
2910
2911 // Write the lower half.
2912 B.buildInstr(AMDGPU::S_MOV_B32)
2913 .addDef(AddrLo)
2914 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
2915
2916 // If required, write the upper half as well.
2917 if (RequiresHighHalf) {
2918 assert(PtrTy.getSizeInBits() == 64 &&
2919 "Must provide a 64-bit pointer type!");
2920
2921 Register AddrHi = MRI.createGenericVirtualRegister(S32);
2922 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
2923
2924 B.buildInstr(AMDGPU::S_MOV_B32)
2925 .addDef(AddrHi)
2926 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
2927
2928 // Use the destination directly, if and only if we don't have a register
2929 // class being set.
2930 Register AddrDst = !MRI.getRegClassOrNull(DstReg)
2931 ? DstReg
2932 : MRI.createGenericVirtualRegister(LLT::scalar(64));
2933
2934 if (!MRI.getRegClassOrNull(AddrDst))
2935 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
2936
2937 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
2938
2939 // If we created a new register for the destination, cast the result into
2940 // the final output.
2941 if (AddrDst != DstReg)
2942 B.buildCast(DstReg, AddrDst);
2943 } else if (AddrLo != DstReg) {
2944 // If we created a new register for the destination, cast the result into
2945 // the final output.
2946 B.buildCast(DstReg, AddrLo);
2947 }
2948 }
2949
2950 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2951 MachineInstr &MI, MachineRegisterInfo &MRI,
2952 MachineIRBuilder &B) const {
2953 Register DstReg = MI.getOperand(0).getReg();
2954 LLT Ty = MRI.getType(DstReg);
2955 unsigned AS = Ty.getAddressSpace();
2956
2957 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2958 MachineFunction &MF = B.getMF();
2959 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2960
2961 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2962 if (!MFI->isModuleEntryFunction() &&
2963 GV->getName() != "llvm.amdgcn.module.lds") {
2964 const Function &Fn = MF.getFunction();
2965 DiagnosticInfoUnsupported BadLDSDecl(
2966 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2967 DS_Warning);
2968 Fn.getContext().diagnose(BadLDSDecl);
2969
2970 // We currently don't have a way to correctly allocate LDS objects that
2971 // aren't directly associated with a kernel. We do force inlining of
2972 // functions that use local objects. However, if these dead functions are
2973 // not eliminated, we don't want a compile time error. Just emit a warning
2974 // and a trap, since there should be no callable path here.
2975 B.buildTrap();
2976 B.buildUndef(DstReg);
2977 MI.eraseFromParent();
2978 return true;
2979 }
2980
2981 // TODO: We could emit code to handle the initialization somewhere.
2982 // We ignore the initializer for now and legalize it to allow selection.
2983     // The initializer will be rejected during assembly emission anyway.
2984 const SITargetLowering *TLI = ST.getTargetLowering();
2985 if (!TLI->shouldUseLDSConstAddress(GV)) {
2986 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2987 return true; // Leave in place;
2988 }
2989
2990 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2991 Type *Ty = GV->getValueType();
2992       // HIP uses an unsized array `extern __shared__ T s[]` or similar
2993       // zero-sized type in other languages to declare the dynamic shared
2994       // memory whose size is not known at compile time. These variables are
2995       // allocated by the runtime and placed directly after the statically
2996       // allocated ones. They all share the same offset.
2997 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2998 // Adjust alignment for that dynamic shared memory array.
2999 MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
3000 LLT S32 = LLT::scalar(32);
3001 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
3002 B.buildIntToPtr(DstReg, Sz);
3003 MI.eraseFromParent();
3004 return true;
3005 }
3006 }
3007
3008 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
3009 *cast<GlobalVariable>(GV)));
3010 MI.eraseFromParent();
3011 return true;
3012 }
3013
3014 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3015 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
3016 MI.eraseFromParent();
3017 return true;
3018 }
3019
3020 const SITargetLowering *TLI = ST.getTargetLowering();
3021
3022 if (TLI->shouldEmitFixup(GV)) {
3023 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
3024 MI.eraseFromParent();
3025 return true;
3026 }
3027
3028 if (TLI->shouldEmitPCReloc(GV)) {
3029 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
3030 MI.eraseFromParent();
3031 return true;
3032 }
3033
3034 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
3035 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
3036
3037 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3038 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
3039 MachinePointerInfo::getGOT(MF),
3040 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3041 MachineMemOperand::MOInvariant,
3042 LoadTy, Align(8));
3043
3044 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
3045
3046 if (Ty.getSizeInBits() == 32) {
3047 // Truncate if this is a 32-bit constant address.
3048 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3049 B.buildExtract(DstReg, Load, 0);
3050 } else
3051 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3052
3053 MI.eraseFromParent();
3054 return true;
3055 }
3056
3057 static LLT widenToNextPowerOf2(LLT Ty) {
3058 if (Ty.isVector())
3059 return Ty.changeElementCount(
3060 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
3061 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
3062 }
3063
3064 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
3065 MachineInstr &MI) const {
3066 MachineIRBuilder &B = Helper.MIRBuilder;
3067 MachineRegisterInfo &MRI = *B.getMRI();
3068 GISelChangeObserver &Observer = Helper.Observer;
3069
3070 Register PtrReg = MI.getOperand(1).getReg();
3071 LLT PtrTy = MRI.getType(PtrReg);
3072 unsigned AddrSpace = PtrTy.getAddressSpace();
3073
3074 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3075 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
3076 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
3077 Observer.changingInstr(MI);
3078 MI.getOperand(1).setReg(Cast.getReg(0));
3079 Observer.changedInstr(MI);
3080 return true;
3081 }
3082
3083 if (MI.getOpcode() != AMDGPU::G_LOAD)
3084 return false;
3085
3086 Register ValReg = MI.getOperand(0).getReg();
3087 LLT ValTy = MRI.getType(ValReg);
3088
3089 if (hasBufferRsrcWorkaround(ValTy)) {
3090 Observer.changingInstr(MI);
3091 castBufferRsrcFromV4I32(MI, B, MRI, 0);
3092 Observer.changedInstr(MI);
3093 return true;
3094 }
3095
3096 MachineMemOperand *MMO = *MI.memoperands_begin();
3097 const unsigned ValSize = ValTy.getSizeInBits();
3098 const LLT MemTy = MMO->getMemoryType();
3099 const Align MemAlign = MMO->getAlign();
3100 const unsigned MemSize = MemTy.getSizeInBits();
3101 const uint64_t AlignInBits = 8 * MemAlign.value();
3102
3103 // Widen non-power-of-2 loads to the alignment if needed
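  // e.g. an s96 load with 128-bit alignment can safely be widened to s128:
  // the load is known dereferenceable up to its alignment, so the extra bytes
  // in the alignment padding may be read.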
3104 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3105 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
3106
3107 // This was already the correct extending load result type, so just adjust
3108 // the memory type.
3109 if (WideMemSize == ValSize) {
3110 MachineFunction &MF = B.getMF();
3111
3112 MachineMemOperand *WideMMO =
3113 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3114 Observer.changingInstr(MI);
3115 MI.setMemRefs(MF, {WideMMO});
3116 Observer.changedInstr(MI);
3117 return true;
3118 }
3119
3120     // Don't bother handling an edge case that should probably never be produced.
3121 if (ValSize > WideMemSize)
3122 return false;
3123
3124 LLT WideTy = widenToNextPowerOf2(ValTy);
3125
3126 Register WideLoad;
3127 if (!WideTy.isVector()) {
3128 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3129       B.buildTrunc(ValReg, WideLoad);
3130 } else {
3131 // Extract the subvector.
3132
3133 if (isRegisterType(ValTy)) {
3134         // If this is a case where G_EXTRACT is legal, use it.
3135 // (e.g. <3 x s32> -> <4 x s32>)
3136 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3137 B.buildExtract(ValReg, WideLoad, 0);
3138 } else {
3139 // For cases where the widened type isn't a nice register value, unmerge
3140 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3141 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3142 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3143 }
3144 }
3145
3146 MI.eraseFromParent();
3147 return true;
3148 }
3149
3150 return false;
3151 }
3152
3153 bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
3154 MachineInstr &MI) const {
3155 MachineIRBuilder &B = Helper.MIRBuilder;
3156 MachineRegisterInfo &MRI = *B.getMRI();
3157 GISelChangeObserver &Observer = Helper.Observer;
3158
3159 Register DataReg = MI.getOperand(0).getReg();
3160 LLT DataTy = MRI.getType(DataReg);
3161
3162 if (hasBufferRsrcWorkaround(DataTy)) {
3163 Observer.changingInstr(MI);
3164 castBufferRsrcArgToV4I32(MI, B, 0);
3165 Observer.changedInstr(MI);
3166 return true;
3167 }
3168 return false;
3169 }
3170
3171 bool AMDGPULegalizerInfo::legalizeFMad(
3172 MachineInstr &MI, MachineRegisterInfo &MRI,
3173 MachineIRBuilder &B) const {
3174 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3175 assert(Ty.isScalar());
3176
3177 MachineFunction &MF = B.getMF();
3178 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3179
3180 // TODO: Always legal with future ftz flag.
3181   // FIXME: Is checking only the output denormal mode sufficient?
3182 if (Ty == LLT::float32() &&
3183 MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
3184 return true;
3185 if (Ty == LLT::float16() &&
3186 MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
3187 return true;
3188
3189 MachineIRBuilder HelperBuilder(MI);
3190 GISelObserverWrapper DummyObserver;
3191 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3192 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3193 }
3194
3195 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
3196 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3197 Register DstReg = MI.getOperand(0).getReg();
3198 Register PtrReg = MI.getOperand(1).getReg();
3199 Register CmpVal = MI.getOperand(2).getReg();
3200 Register NewVal = MI.getOperand(3).getReg();
3201
3202 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3203 "this should not have been custom lowered");
3204
3205 LLT ValTy = MRI.getType(CmpVal);
3206 LLT VecTy = LLT::fixed_vector(2, ValTy);
3207
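  // The target cmpxchg pseudo takes its data as a single packed vector
  // operand, matching the hardware cmpswap data layout: the new value in the
  // low element and the compare value in the high element.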
3208 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3209
3210 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3211 .addDef(DstReg)
3212 .addUse(PtrReg)
3213 .addUse(PackedVal)
3214 .setMemRefs(MI.memoperands());
3215
3216 MI.eraseFromParent();
3217 return true;
3218 }
3219
3220 /// Return true if it's known that \p Src can never be an f32 denormal value.
3221 static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
3222 Register Src) {
3223 const MachineInstr *DefMI = MRI.getVRegDef(Src);
3224 switch (DefMI->getOpcode()) {
3225 case TargetOpcode::G_INTRINSIC: {
3226 switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
3227 case Intrinsic::amdgcn_frexp_mant:
3228 return true;
3229 default:
3230 break;
3231 }
3232
3233 break;
3234 }
3235 case TargetOpcode::G_FFREXP: {
3236 if (DefMI->getOperand(0).getReg() == Src)
3237 return true;
3238 break;
3239 }
3240 case TargetOpcode::G_FPEXT: {
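    // An f32 produced by extending an f16 is never a denormal: the smallest
    // nonzero f16 magnitude (2^-24) is far above the largest f32 denormal
    // (just below 2^-126).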
3241 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3242 }
3243 default:
3244 return false;
3245 }
3246
3247 return false;
3248 }
3249
3250 static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3251 if (Flags & MachineInstr::FmAfn)
3252 return true;
3253 const auto &Options = MF.getTarget().Options;
3254 return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
3255 }
3256
3257 static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
3258 unsigned Flags) {
3259 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3260 MF.getDenormalMode(APFloat::IEEEsingle()).Input !=
3261 DenormalMode::PreserveSign;
3262 }
3263
3264 std::pair<Register, Register>
3265 AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
3266 unsigned Flags) const {
3267 if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3268 return {};
3269
3270 const LLT F32 = LLT::scalar(32);
3271 auto SmallestNormal = B.buildFConstant(
3272 F32, APFloat::getSmallestNormalized(APFloat::IEEEsingle()));
3273 auto IsLtSmallestNormal =
3274 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3275
3276 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3277 auto One = B.buildFConstant(F32, 1.0);
3278 auto ScaleFactor =
3279 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3280 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3281
3282 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3283 }
3284
3285 bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
3286 MachineIRBuilder &B) const {
3287 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3288 // If we have to handle denormals, scale up the input and adjust the result.
3289
3290 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3291 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
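  // e.g. for x = 0x1.0p-130 (an f32 denormal): scaled = 0x1.0p-98,
  // amdgpu_log2(scaled) = -98.0, and the final result is -98.0 - 32.0 = -130.0.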
3292
3293 Register Dst = MI.getOperand(0).getReg();
3294 Register Src = MI.getOperand(1).getReg();
3295 LLT Ty = B.getMRI()->getType(Dst);
3296 unsigned Flags = MI.getFlags();
3297
3298 if (Ty == LLT::scalar(16)) {
3299 const LLT F32 = LLT::scalar(32);
3300 // Nothing in half is a denormal when promoted to f32.
3301 auto Ext = B.buildFPExt(F32, Src, Flags);
3302 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3303 .addUse(Ext.getReg(0))
3304 .setMIFlags(Flags);
3305 B.buildFPTrunc(Dst, Log2, Flags);
3306 MI.eraseFromParent();
3307 return true;
3308 }
3309
3310 assert(Ty == LLT::scalar(32));
3311
3312 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3313 if (!ScaledInput) {
3314 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3315 .addUse(Src)
3316 .setMIFlags(Flags);
3317 MI.eraseFromParent();
3318 return true;
3319 }
3320
3321 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3322 .addUse(ScaledInput)
3323 .setMIFlags(Flags);
3324
3325 auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3326 auto Zero = B.buildFConstant(Ty, 0.0);
3327 auto ResultOffset =
3328 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3329 B.buildFSub(Dst, Log2, ResultOffset, Flags);
3330
3331 MI.eraseFromParent();
3332 return true;
3333 }
3334
3335 static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
3336 Register Z, unsigned Flags) {
3337 auto FMul = B.buildFMul(Ty, X, Y, Flags);
3338 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3339 }
3340
3341 bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
3342 MachineIRBuilder &B) const {
3343 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3344 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3345
3346 MachineRegisterInfo &MRI = *B.getMRI();
3347 Register Dst = MI.getOperand(0).getReg();
3348 Register X = MI.getOperand(1).getReg();
3349 unsigned Flags = MI.getFlags();
3350 const LLT Ty = MRI.getType(X);
3351 MachineFunction &MF = B.getMF();
3352
3353 const LLT F32 = LLT::scalar(32);
3354 const LLT F16 = LLT::scalar(16);
3355
3356 const AMDGPUTargetMachine &TM =
3357 static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3358
3359 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
3360 TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
3361 if (Ty == F16 && !ST.has16BitInsts()) {
3362 Register LogVal = MRI.createGenericVirtualRegister(F32);
3363 auto PromoteSrc = B.buildFPExt(F32, X);
3364 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3365 B.buildFPTrunc(Dst, LogVal);
3366 } else {
3367 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3368 }
3369
3370 MI.eraseFromParent();
3371 return true;
3372 }
3373
3374 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3375 if (ScaledInput)
3376 X = ScaledInput;
3377
3378 auto Y =
3379 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3380
3381 Register R;
3382 if (ST.hasFastFMAF32()) {
3383     // c + cc is log10(2) (i.e. ln(2)/ln(10)) to more than 49 bits
3384 const float c_log10 = 0x1.344134p-2f;
3385 const float cc_log10 = 0x1.09f79ep-26f;
3386
3387 // c + cc is ln(2) to more than 49 bits
3388 const float c_log = 0x1.62e42ep-1f;
3389 const float cc_log = 0x1.efa39ep-25f;
3390
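    // Splitting each constant into a high part c and a small correction cc,
    // then folding cc back in with an extra FMA, recovers precision that a
    // single rounded f32 constant would lose.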
3391 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3392 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3393
3394 R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
3395 auto NegR = B.buildFNeg(Ty, R, Flags);
3396 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
3397 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
3398 R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
3399 } else {
3400 // ch+ct is ln(2)/ln(10) to more than 36 bits
3401 const float ch_log10 = 0x1.344000p-2f;
3402 const float ct_log10 = 0x1.3509f6p-18f;
3403
3404 // ch + ct is ln(2) to more than 36 bits
3405 const float ch_log = 0x1.62e000p-1f;
3406 const float ct_log = 0x1.0bfbe8p-15f;
3407
3408 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3409 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3410
3411 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3412 auto YH = B.buildAnd(Ty, Y, MaskConst);
3413 auto YT = B.buildFSub(Ty, Y, YH, Flags);
3414 auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
3415
3416 Register Mad0 =
3417 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
3418 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
3419 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
3420 }
3421
3422 const bool IsFiniteOnly =
3423 (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3424 (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
3425
3426 if (!IsFiniteOnly) {
3427 // Expand isfinite(x) => fabs(x) < inf
3428 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3429 auto Fabs = B.buildFAbs(Ty, Y);
3430 auto IsFinite =
3431 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3432 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3433 }
3434
3435 if (ScaledInput) {
3436 auto Zero = B.buildFConstant(Ty, 0.0);
3437 auto ShiftK =
3438 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3439 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3440 B.buildFSub(Dst, R, Shift, Flags);
3441 } else {
3442 B.buildCopy(Dst, R);
3443 }
3444
3445 MI.eraseFromParent();
3446 return true;
3447 }
3448
3449 bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
3450 Register Src, bool IsLog10,
3451 unsigned Flags) const {
3452 const double Log2BaseInverted =
3453 IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
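  // Uses the identity log_b(x) = log2(x) * (ln(2)/ln(b)), i.e.
  // log(x) = log2(x) * ln(2) and log10(x) = log2(x) * log10(2).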
3454
3455 LLT Ty = B.getMRI()->getType(Dst);
3456
3457 if (Ty == LLT::scalar(32)) {
3458 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3459 if (ScaledInput) {
3460 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3461 .addUse(Src)
3462 .setMIFlags(Flags);
3463 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3464 auto Zero = B.buildFConstant(Ty, 0.0);
3465 auto ResultOffset =
3466 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3467 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3468
3469 if (ST.hasFastFMAF32())
3470 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3471 else {
3472 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3473 B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3474 }
3475
3476 return true;
3477 }
3478 }
3479
3480 auto Log2Operand = Ty == LLT::scalar(16)
3481 ? B.buildFLog2(Ty, Src, Flags)
3482 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3483 .addUse(Src)
3484 .setMIFlags(Flags);
3485 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3486 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3487 return true;
3488 }
3489
3490 bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
3491 MachineIRBuilder &B) const {
3492 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3493 // If we have to handle denormals, scale up the input and adjust the result.
3494
3495 Register Dst = MI.getOperand(0).getReg();
3496 Register Src = MI.getOperand(1).getReg();
3497 unsigned Flags = MI.getFlags();
3498 LLT Ty = B.getMRI()->getType(Dst);
3499 const LLT F16 = LLT::scalar(16);
3500 const LLT F32 = LLT::scalar(32);
3501
3502 if (Ty == F16) {
3503 // Nothing in half is a denormal when promoted to f32.
3504 auto Ext = B.buildFPExt(F32, Src, Flags);
3505     auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3506                     .addUse(Ext.getReg(0))
3507                     .setMIFlags(Flags);
3508     B.buildFPTrunc(Dst, Exp2, Flags);
3509 MI.eraseFromParent();
3510 return true;
3511 }
3512
3513 assert(Ty == F32);
3514
3515 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3516 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3517 .addUse(Src)
3518 .setMIFlags(Flags);
3519 MI.eraseFromParent();
3520 return true;
3521 }
3522
3523 // bool needs_scaling = x < -0x1.f80000p+6f;
3524 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3525
3526   // -126.0: below this, v_exp_f32 would produce a denormal result.
3527 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3528 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3529 RangeCheckConst, Flags);
3530
3531 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3532 auto Zero = B.buildFConstant(Ty, 0.0);
3533 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3534 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3535
3536 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3537 .addUse(AddInput.getReg(0))
3538 .setMIFlags(Flags);
3539
3540 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3541 auto One = B.buildFConstant(Ty, 1.0);
3542 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3543 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3544 MI.eraseFromParent();
3545 return true;
3546 }
3547
3548 bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3549 Register X, unsigned Flags) const {
3550 LLT Ty = B.getMRI()->getType(Dst);
3551 LLT F32 = LLT::scalar(32);
3552
3553 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3554 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3555 auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
3556
3557 if (Ty == F32) {
3558 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3559 .addUse(Mul.getReg(0))
3560 .setMIFlags(Flags);
3561 } else {
3562 B.buildFExp2(Dst, Mul.getReg(0), Flags);
3563 }
3564
3565 return true;
3566 }
3567
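  // The threshold -0x1.5d58a0p+6f is roughly ln(0x1.0p-126); below it, e^x
  // falls into the f32 denormal range. Compensate by evaluating e^(x + 64)
  // and rescaling the result by 0x1.969d48p-93f, which is roughly e^-64.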
3568 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3569 auto NeedsScaling =
3570 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3571 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3572 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3573 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3574
3575 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3576 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3577
3578 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3579 .addUse(ExpInput.getReg(0))
3580 .setMIFlags(Flags);
3581
3582 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3583 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3584 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3585 return true;
3586 }
3587
3588 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3589 MachineIRBuilder &B) const {
3590 Register Dst = MI.getOperand(0).getReg();
3591 Register X = MI.getOperand(1).getReg();
3592 const unsigned Flags = MI.getFlags();
3593 MachineFunction &MF = B.getMF();
3594 MachineRegisterInfo &MRI = *B.getMRI();
3595 LLT Ty = MRI.getType(Dst);
3596 const LLT F16 = LLT::scalar(16);
3597 const LLT F32 = LLT::scalar(32);
3598 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3599
3600 if (Ty == F16) {
3601 // v_exp_f16 (fmul x, log2e)
3602 if (allowApproxFunc(MF, Flags)) {
3603 // TODO: Does this really require fast?
3604 legalizeFExpUnsafe(B, Dst, X, Flags);
3605 MI.eraseFromParent();
3606 return true;
3607 }
3608
3609 // exp(f16 x) ->
3610 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3611
3612 // Nothing in half is a denormal when promoted to f32.
3613 auto Ext = B.buildFPExt(F32, X, Flags);
3614 Register Lowered = MRI.createGenericVirtualRegister(F32);
3615 legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
3616 B.buildFPTrunc(Dst, Lowered, Flags);
3617 MI.eraseFromParent();
3618 return true;
3619 }
3620
3621 assert(Ty == F32);
3622
3623 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3624 // library behavior. Also, is known-not-daz source sufficient?
3625 if (allowApproxFunc(MF, Flags)) {
3626 legalizeFExpUnsafe(B, Dst, X, Flags);
3627 MI.eraseFromParent();
3628 return true;
3629 }
3630
3631 // Algorithm:
3632 //
3633 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3634 //
3635 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3636 // n = 64*m + j, 0 <= j < 64
3637 //
3638 // e^x = 2^((64*m + j + f)/64)
3639 // = (2^m) * (2^(j/64)) * 2^(f/64)
3640 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3641 //
3642 // f = x*(64/ln(2)) - n
3643 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3644 //
3645 // e^x = (2^m) * (2^(j/64)) * e^r
3646 //
3647 // (2^(j/64)) is precomputed
3648 //
3649 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3650 // e^r = 1 + q
3651 //
3652 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3653 //
3654 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
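  //
  // In the sequence below, the 2^(j/64) table is effectively folded into
  // v_exp_f32: PH ~= x * log2(e) (or x * log2(10) for exp10), split into high
  // and low parts for accuracy; E = roundeven(PH); and the result is
  // ldexp(v_exp_f32(PH - E + PL), E).
  // e.g. for x = 10: PH ~= 14.427, E = 14, v_exp_f32(0.427) ~= 1.344, and
  // ldexp(1.344, 14) ~= 22026 ~= e^10.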
3655 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3656 Register PH, PL;
3657
3658 if (ST.hasFastFMAF32()) {
3659 const float c_exp = numbers::log2ef;
3660 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3661 const float c_exp10 = 0x1.a934f0p+1f;
3662 const float cc_exp10 = 0x1.2f346ep-24f;
3663
3664 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
3665 PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
3666 auto NegPH = B.buildFNeg(Ty, PH, Flags);
3667 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
3668
3669 auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
3670 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
3671 } else {
3672 const float ch_exp = 0x1.714000p+0f;
3673 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3674
3675 const float ch_exp10 = 0x1.a92000p+1f;
3676 const float cl_exp10 = 0x1.4f0978p-11f;
3677
3678 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3679 auto XH = B.buildAnd(Ty, X, MaskConst);
3680 auto XL = B.buildFSub(Ty, X, XH, Flags);
3681
3682 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
3683 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
3684
3685 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
3686 auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
3687
3688 Register Mad0 =
3689 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
3690 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
3691 }
3692
3693 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
3694
3695 // It is unsafe to contract this fsub into the PH multiply.
3696 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
3697 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
3698 auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
3699
3700 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3701 .addUse(A.getReg(0))
3702 .setMIFlags(Flags);
3703 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
3704
3705 auto UnderflowCheckConst =
3706 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3707 auto Zero = B.buildFConstant(Ty, 0.0);
3708 auto Underflow =
3709 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
3710
3711 R = B.buildSelect(Ty, Underflow, Zero, R);
3712
3713 const auto &Options = MF.getTarget().Options;
3714
3715 if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
3716 auto OverflowCheckConst =
3717 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3718
3719 auto Overflow =
3720 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
3721 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3722 R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
3723 }
3724
3725 B.buildCopy(Dst, R);
3726 MI.eraseFromParent();
3727 return true;
3728 }
3729
3730 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3731 MachineIRBuilder &B) const {
3732 Register Dst = MI.getOperand(0).getReg();
3733 Register Src0 = MI.getOperand(1).getReg();
3734 Register Src1 = MI.getOperand(2).getReg();
3735 unsigned Flags = MI.getFlags();
3736 LLT Ty = B.getMRI()->getType(Dst);
3737 const LLT F16 = LLT::float16();
3738 const LLT F32 = LLT::float32();
3739
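  // pow(x, y) is expanded as exp2(y * log2(x)). The legacy multiply treats
  // 0 * anything as 0, so e.g. pow(x, 0) still folds to exp2(0) == 1 even
  // when log2(x) is +/-inf.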
3740 if (Ty == F32) {
3741 auto Log = B.buildFLog2(F32, Src0, Flags);
3742 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3743 .addUse(Log.getReg(0))
3744 .addUse(Src1)
3745 .setMIFlags(Flags);
3746 B.buildFExp2(Dst, Mul, Flags);
3747 } else if (Ty == F16) {
3748 // There's no f16 fmul_legacy, so we need to convert for it.
3749 auto Log = B.buildFLog2(F16, Src0, Flags);
3750 auto Ext0 = B.buildFPExt(F32, Log, Flags);
3751 auto Ext1 = B.buildFPExt(F32, Src1, Flags);
3752 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3753 .addUse(Ext0.getReg(0))
3754 .addUse(Ext1.getReg(0))
3755 .setMIFlags(Flags);
3756 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
3757 } else
3758 return false;
3759
3760 MI.eraseFromParent();
3761 return true;
3762 }
3763
3764 // Find a source register, ignoring any possible source modifiers.
3765 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
3766 Register ModSrc = OrigSrc;
3767 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
3768 ModSrc = SrcFNeg->getOperand(1).getReg();
3769 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3770 ModSrc = SrcFAbs->getOperand(1).getReg();
3771 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3772 ModSrc = SrcFAbs->getOperand(1).getReg();
3773 return ModSrc;
3774 }
3775
3776 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
3777 MachineRegisterInfo &MRI,
3778 MachineIRBuilder &B) const {
3779
3780 const LLT S1 = LLT::scalar(1);
3781 const LLT F64 = LLT::float64();
3782 Register Dst = MI.getOperand(0).getReg();
3783 Register OrigSrc = MI.getOperand(1).getReg();
3784 unsigned Flags = MI.getFlags();
3785 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
3786 "this should not have been custom lowered");
3787
3788 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3789 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
3790 // efficient way to implement it is using V_FRACT_F64. The workaround for the
3791 // V_FRACT bug is:
3792 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3793 //
3794 // Convert floor(x) to (x - fract(x))
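  // e.g. for x = -1.25: fract(-1.25) = x - floor(x) = 0.75, and the final
  // result is -1.25 - 0.75 = -2.0.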
3795
3796 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
3797 .addUse(OrigSrc)
3798 .setMIFlags(Flags);
3799
3800 // Give source modifier matching some assistance before obscuring a foldable
3801 // pattern.
3802
3803 // TODO: We can avoid the neg on the fract? The input sign to fract
3804 // shouldn't matter?
3805 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
3806
3807 auto Const =
3808 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
3809
3810 Register Min = MRI.createGenericVirtualRegister(F64);
3811
3812 // We don't need to concern ourselves with the snan handling difference, so
3813 // use the one which will directly select.
3814 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3815 if (MFI->getMode().IEEE)
3816 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
3817 else
3818 B.buildFMinNum(Min, Fract, Const, Flags);
3819
3820 Register CorrectedFract = Min;
3821 if (!MI.getFlag(MachineInstr::FmNoNans)) {
3822 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
3823 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
3824 }
3825
3826 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
3827 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
3828
3829 MI.eraseFromParent();
3830 return true;
3831 }
3832
3833 // Turn an illegal packed v2s16 build vector into bit operations.
3834 // TODO: This should probably be a bitcast action in LegalizerHelper.
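// e.g. %v:_(<2 x s16>) = G_BUILD_VECTOR %a:_(s16), %b:_(s16) becomes a 32-bit
// merge of %a and %b followed by a bitcast of the result back to <2 x s16>.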
3835 bool AMDGPULegalizerInfo::legalizeBuildVector(
3836 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3837 Register Dst = MI.getOperand(0).getReg();
3838 const LLT S32 = LLT::scalar(32);
3839 const LLT S16 = LLT::scalar(16);
3840 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
3841
3842 Register Src0 = MI.getOperand(1).getReg();
3843 Register Src1 = MI.getOperand(2).getReg();
3844
3845 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3846 assert(MRI.getType(Src0) == S32);
3847 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
3848 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
3849 }
3850
3851 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
3852 B.buildBitcast(Dst, Merge);
3853
3854 MI.eraseFromParent();
3855 return true;
3856 }
3857
3858 // Build a big integer multiply or multiply-add using MAD_64_32 instructions.
3859 //
3860 // Source and accumulation registers must all be 32 bits wide.
3861 //
3862 // TODO: When the multiply is uniform, we should produce a code sequence
3863 // that is better suited to instruction selection on the SALU. Instead of
3864 // the outer loop going over parts of the result, the outer loop should go
3865 // over parts of one of the factors. This should result in instruction
3866 // selection that makes full use of S_ADDC_U32 instructions.
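//
// e.g. for a 64-bit G_MUL split into 32-bit parts (x1:x0) * (y1:y0), the
// result parts are roughly Accum[0] = lo(x0*y0) and
// Accum[1] = hi(x0*y0) + lo(x0*y1) + lo(x1*y0); higher partial products only
// affect bits that are discarded.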
3867 void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
3868 MutableArrayRef<Register> Accum,
3869 ArrayRef<Register> Src0,
3870 ArrayRef<Register> Src1,
3871 bool UsePartialMad64_32,
3872 bool SeparateOddAlignedProducts) const {
3873 // Use (possibly empty) vectors of S1 registers to represent the set of
3874 // carries from one pair of positions to the next.
3875 using Carry = SmallVector<Register, 2>;
3876
3877 MachineIRBuilder &B = Helper.MIRBuilder;
3878 GISelKnownBits &KB = *Helper.getKnownBits();
3879
3880 const LLT S1 = LLT::scalar(1);
3881 const LLT S32 = LLT::scalar(32);
3882 const LLT S64 = LLT::scalar(64);
3883
3884 Register Zero32;
3885 Register Zero64;
3886
3887 auto getZero32 = [&]() -> Register {
3888 if (!Zero32)
3889 Zero32 = B.buildConstant(S32, 0).getReg(0);
3890 return Zero32;
3891 };
3892 auto getZero64 = [&]() -> Register {
3893 if (!Zero64)
3894 Zero64 = B.buildConstant(S64, 0).getReg(0);
3895 return Zero64;
3896 };
3897
3898 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
3899 for (unsigned i = 0; i < Src0.size(); ++i) {
3900 Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero());
3901 Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero());
3902 }
3903
3904 // Merge the given carries into the 32-bit LocalAccum, which is modified
3905 // in-place.
3906 //
3907 // Returns the carry-out, which is a single S1 register or null.
3908 auto mergeCarry =
3909 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
3910 if (CarryIn.empty())
3911 return Register();
3912
3913 bool HaveCarryOut = true;
3914 Register CarryAccum;
3915 if (CarryIn.size() == 1) {
3916 if (!LocalAccum) {
3917 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3918 return Register();
3919 }
3920
3921 CarryAccum = getZero32();
3922 } else {
3923 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3924 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
3925 CarryAccum =
3926 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
3927 .getReg(0);
3928 }
3929
3930 if (!LocalAccum) {
3931 LocalAccum = getZero32();
3932 HaveCarryOut = false;
3933 }
3934 }
3935
3936 auto Add =
3937 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
3938 LocalAccum = Add.getReg(0);
3939 return HaveCarryOut ? Add.getReg(1) : Register();
3940 };
3941
3942 // Build a multiply-add chain to compute
3943 //
3944 // LocalAccum + (partial products at DstIndex)
3945 // + (opportunistic subset of CarryIn)
3946 //
3947 // LocalAccum is an array of one or two 32-bit registers that are updated
3948 // in-place. The incoming registers may be null.
3949 //
3950 // In some edge cases, carry-ins can be consumed "for free". In that case,
3951 // the consumed carry bits are removed from CarryIn in-place.
3952 auto buildMadChain =
3953 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
3954 -> Carry {
3955 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
3956 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
3957
3958 Carry CarryOut;
3959 unsigned j0 = 0;
3960
3961 // Use plain 32-bit multiplication for the most significant part of the
3962 // result by default.
3963 if (LocalAccum.size() == 1 &&
3964 (!UsePartialMad64_32 || !CarryIn.empty())) {
3965 do {
3966 // Skip multiplication if one of the operands is 0
3967 unsigned j1 = DstIndex - j0;
3968 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3969 ++j0;
3970 continue;
3971 }
3972 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
3973 if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) {
3974 LocalAccum[0] = Mul.getReg(0);
3975 } else {
3976 if (CarryIn.empty()) {
3977 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
3978 } else {
3979 LocalAccum[0] =
3980 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
3981 .getReg(0);
3982 CarryIn.pop_back();
3983 }
3984 }
3985 ++j0;
3986 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3987 }
3988
3989 // Build full 64-bit multiplies.
3990 if (j0 <= DstIndex) {
3991 bool HaveSmallAccum = false;
3992 Register Tmp;
3993
3994 if (LocalAccum[0]) {
3995 if (LocalAccum.size() == 1) {
3996 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
3997 HaveSmallAccum = true;
3998 } else if (LocalAccum[1]) {
3999 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
4000 HaveSmallAccum = false;
4001 } else {
4002 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
4003 HaveSmallAccum = true;
4004 }
4005 } else {
4006 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4007 Tmp = getZero64();
4008 HaveSmallAccum = true;
4009 }
4010
4011 do {
4012 unsigned j1 = DstIndex - j0;
4013 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4014 ++j0;
4015 continue;
4016 }
4017 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
4018 {Src0[j0], Src1[j1], Tmp});
4019 Tmp = Mad.getReg(0);
4020 if (!HaveSmallAccum)
4021 CarryOut.push_back(Mad.getReg(1));
4022 HaveSmallAccum = false;
4023
4024 ++j0;
4025 } while (j0 <= DstIndex);
4026
4027 auto Unmerge = B.buildUnmerge(S32, Tmp);
4028 LocalAccum[0] = Unmerge.getReg(0);
4029 if (LocalAccum.size() > 1)
4030 LocalAccum[1] = Unmerge.getReg(1);
4031 }
4032
4033 return CarryOut;
4034 };
4035
4036 // Outer multiply loop, iterating over destination parts from least
4037 // significant to most significant parts.
4038 //
4039 // The columns of the following diagram correspond to the destination parts
4040 // affected by one iteration of the outer loop (ignoring boundary
4041 // conditions).
4042 //
4043 // Dest index relative to 2 * i: 1 0 -1
4044 // ------
4045 // Carries from previous iteration: e o
4046 // Even-aligned partial product sum: E E .
4047 // Odd-aligned partial product sum: O O
4048 //
4049 // 'o' is OddCarry, 'e' is EvenCarry.
4050 // EE and OO are computed from partial products via buildMadChain and use
4051 // accumulation where possible and appropriate.
4052 //
4053 Register SeparateOddCarry;
4054 Carry EvenCarry;
4055 Carry OddCarry;
4056
4057 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4058 Carry OddCarryIn = std::move(OddCarry);
4059 Carry EvenCarryIn = std::move(EvenCarry);
4060 OddCarry.clear();
4061 EvenCarry.clear();
4062
4063 // Partial products at offset 2 * i.
4064 if (2 * i < Accum.size()) {
4065 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
4066 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4067 }
4068
4069 // Partial products at offset 2 * i - 1.
4070 if (i > 0) {
4071 if (!SeparateOddAlignedProducts) {
4072 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
4073 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4074 } else {
4075 bool IsHighest = 2 * i >= Accum.size();
4076 Register SeparateOddOut[2];
4077 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4078 .take_front(IsHighest ? 1 : 2);
4079 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4080
4081 MachineInstr *Lo;
4082
4083 if (i == 1) {
4084 if (!IsHighest)
4085 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
4086 else
4087 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
4088 } else {
4089 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
4090 SeparateOddCarry);
4091 }
4092 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
4093
4094 if (!IsHighest) {
4095 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4096 Lo->getOperand(1).getReg());
4097 Accum[2 * i] = Hi.getReg(0);
4098 SeparateOddCarry = Hi.getReg(1);
4099 }
4100 }
4101 }
4102
4103 // Add in the carries from the previous iteration
4104 if (i > 0) {
4105 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4106 EvenCarryIn.push_back(CarryOut);
4107
4108 if (2 * i < Accum.size()) {
4109 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4110 OddCarry.push_back(CarryOut);
4111 }
4112 }
4113 }
4114 }
4115
4116 // Custom narrowing of wide multiplies using wide multiply-add instructions.
4117 //
4118 // TODO: If the multiply is followed by an addition, we should attempt to
4119 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4120 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4121 MachineInstr &MI) const {
4122 assert(ST.hasMad64_32());
4123 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4124
4125 MachineIRBuilder &B = Helper.MIRBuilder;
4126 MachineRegisterInfo &MRI = *B.getMRI();
4127
4128 Register DstReg = MI.getOperand(0).getReg();
4129 Register Src0 = MI.getOperand(1).getReg();
4130 Register Src1 = MI.getOperand(2).getReg();
4131
4132 LLT Ty = MRI.getType(DstReg);
4133 assert(Ty.isScalar());
4134
4135 unsigned Size = Ty.getSizeInBits();
4136 unsigned NumParts = Size / 32;
4137 assert((Size % 32) == 0);
4138 assert(NumParts >= 2);
4139
4140 // Whether to use MAD_64_32 for partial products whose high half is
4141 // discarded. This avoids some ADD instructions but risks false dependency
4142 // stalls on some subtargets in some cases.
4143 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4144
4145 // Whether to compute odd-aligned partial products separately. This is
4146 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4147 // in an even-aligned VGPR.
4148 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4149
4150 LLT S32 = LLT::scalar(32);
4151 SmallVector<Register, 2> Src0Parts, Src1Parts;
4152 for (unsigned i = 0; i < NumParts; ++i) {
4153 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
4154 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
4155 }
4156 B.buildUnmerge(Src0Parts, Src0);
4157 B.buildUnmerge(Src1Parts, Src1);
4158
4159 SmallVector<Register, 2> AccumRegs(NumParts);
4160 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4161 SeparateOddAlignedProducts);
4162
4163 B.buildMergeLikeInstr(DstReg, AccumRegs);
4164 MI.eraseFromParent();
4165 return true;
4166 }
4167
4168 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4169 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4170 // case with a single min instruction instead of a compare+select.
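// e.g. ctlz(s32 x) becomes umin(ffbh(x), 32): ffbh returns -1 (all ones) for
// a zero input, and the unsigned min clamps that to 32, the defined ctlz
// result for zero.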
4171 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4172 MachineRegisterInfo &MRI,
4173 MachineIRBuilder &B) const {
4174 Register Dst = MI.getOperand(0).getReg();
4175 Register Src = MI.getOperand(1).getReg();
4176 LLT DstTy = MRI.getType(Dst);
4177 LLT SrcTy = MRI.getType(Src);
4178
4179 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4180 ? AMDGPU::G_AMDGPU_FFBH_U32
4181 : AMDGPU::G_AMDGPU_FFBL_B32;
4182 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4183 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4184
4185 MI.eraseFromParent();
4186 return true;
4187 }
4188
4189 bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
4190 MachineRegisterInfo &MRI,
4191 MachineIRBuilder &B) const {
4192 Register Dst = MI.getOperand(0).getReg();
4193 Register Src = MI.getOperand(1).getReg();
4194 LLT SrcTy = MRI.getType(Src);
4195 TypeSize NumBits = SrcTy.getSizeInBits();
4196
4197 assert(NumBits < 32u);
4198
4199 auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
4200 auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
4201 auto Shift = B.buildShl(S32, Extend, ShiftAmt);
4202 auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
4203 B.buildTrunc(Dst, Ctlz);
4204 MI.eraseFromParent();
4205 return true;
4206 }
4207
4208 // Check that this is a G_XOR x, -1
4209 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4210 if (MI.getOpcode() != TargetOpcode::G_XOR)
4211 return false;
4212 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4213 return ConstVal && *ConstVal == -1;
4214 }
4215
4216 // Return the use branch instruction, otherwise null if the usage is invalid.
4217 static MachineInstr *
4218 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4219 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4220 Register CondDef = MI.getOperand(0).getReg();
4221 if (!MRI.hasOneNonDBGUse(CondDef))
4222 return nullptr;
4223
4224 MachineBasicBlock *Parent = MI.getParent();
4225 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4226
4227 if (isNot(MRI, *UseMI)) {
4228 Register NegatedCond = UseMI->getOperand(0).getReg();
4229 if (!MRI.hasOneNonDBGUse(NegatedCond))
4230 return nullptr;
4231
4232 // We're deleting the def of this value, so we need to remove it.
4233 eraseInstr(*UseMI, MRI);
4234
4235 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4236 Negated = true;
4237 }
4238
4239 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4240 return nullptr;
4241
4242 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4243 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4244 if (Next == Parent->end()) {
4245 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4246 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4247 return nullptr;
4248 UncondBrTarget = &*NextMBB;
4249 } else {
4250 if (Next->getOpcode() != AMDGPU::G_BR)
4251 return nullptr;
4252 Br = &*Next;
4253 UncondBrTarget = Br->getOperand(0).getMBB();
4254 }
4255
4256 return UseMI;
4257 }
4258
4259 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
4260 const ArgDescriptor *Arg,
4261 const TargetRegisterClass *ArgRC,
4262 LLT ArgTy) const {
4263 MCRegister SrcReg = Arg->getRegister();
4264 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
4265 assert(DstReg.isVirtual() && "Virtual register expected");
4266
4267 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4268 *ArgRC, B.getDebugLoc(), ArgTy);
4269 if (Arg->isMasked()) {
4270 // TODO: Should we try to emit this once in the entry block?
4271 const LLT S32 = LLT::scalar(32);
4272 const unsigned Mask = Arg->getMask();
4273 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4274
4275 Register AndMaskSrc = LiveIn;
4276
4277 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4278 // 0.
4279 if (Shift != 0) {
4280 auto ShiftAmt = B.buildConstant(S32, Shift);
4281 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4282 }
4283
4284 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4285 } else {
4286 B.buildCopy(DstReg, LiveIn);
4287 }
4288
4289 return true;
4290 }
4291
4292 bool AMDGPULegalizerInfo::loadInputValue(
4293 Register DstReg, MachineIRBuilder &B,
4294 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4295 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4296 const ArgDescriptor *Arg = nullptr;
4297 const TargetRegisterClass *ArgRC;
4298 LLT ArgTy;
4299
4300 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4301 const ArgDescriptor WorkGroupIDX =
4302 ArgDescriptor::createRegister(AMDGPU::TTMP9);
4303 // If GridZ is not programmed in an entry function then the hardware will set
4304 // it to all zeros, so there is no need to mask the GridY value in the low
4305 // order bits.
4306 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4307 AMDGPU::TTMP7,
4308 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4309 const ArgDescriptor WorkGroupIDZ =
4310 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
4311 if (ST.hasArchitectedSGPRs() &&
4312 (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
4313 switch (ArgType) {
4314 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
4315 Arg = &WorkGroupIDX;
4316 ArgRC = &AMDGPU::SReg_32RegClass;
4317 ArgTy = LLT::scalar(32);
4318 break;
4319 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
4320 Arg = &WorkGroupIDY;
4321 ArgRC = &AMDGPU::SReg_32RegClass;
4322 ArgTy = LLT::scalar(32);
4323 break;
4324 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
4325 Arg = &WorkGroupIDZ;
4326 ArgRC = &AMDGPU::SReg_32RegClass;
4327 ArgTy = LLT::scalar(32);
4328 break;
4329 default:
4330 break;
4331 }
4332 }
4333
4334 if (!Arg)
4335 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4336
4337 if (!Arg) {
4338 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4339       // The intrinsic may appear when we have a zero-sized kernarg segment,
4340       // in which case the pointer argument may be missing and we use null.
4341 B.buildConstant(DstReg, 0);
4342 return true;
4343 }
4344
4345 // It's undefined behavior if a function marked with the amdgpu-no-*
4346 // attributes uses the corresponding intrinsic.
4347 B.buildUndef(DstReg);
4348 return true;
4349 }
4350
4351 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4352 return false; // TODO: Handle these
4353 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4354 }
4355
4356 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4357 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4358 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4359 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
4360 return false;
4361
4362 MI.eraseFromParent();
4363 return true;
4364 }
4365
4366 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4367 int64_t C) {
4368 B.buildConstant(MI.getOperand(0).getReg(), C);
4369 MI.eraseFromParent();
4370 return true;
4371 }
4372
4373 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4374 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4375 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4376 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
4377 if (MaxID == 0)
4378 return replaceWithConstant(B, MI, 0);
4379
4380 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4381 const ArgDescriptor *Arg;
4382 const TargetRegisterClass *ArgRC;
4383 LLT ArgTy;
4384 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4385
4386 Register DstReg = MI.getOperand(0).getReg();
4387 if (!Arg) {
4388 // It's undefined behavior if a function marked with the amdgpu-no-*
4389 // attributes uses the corresponding intrinsic.
4390 B.buildUndef(DstReg);
4391 MI.eraseFromParent();
4392 return true;
4393 }
4394
4395 if (Arg->isMasked()) {
4396 // Don't bother inserting AssertZext for packed IDs since we're emitting the
4397 // masking operations anyway.
4398 //
4399 // TODO: We could assert the top bit is 0 for the source copy.
4400 if (!loadInputValue(DstReg, B, ArgType))
4401 return false;
4402 } else {
4403 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
4404 if (!loadInputValue(TmpReg, B, ArgType))
4405 return false;
4406 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
4407 }
4408
4409 MI.eraseFromParent();
4410 return true;
4411 }
4412
4413 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4414 int64_t Offset) const {
4415 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
4416 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
4417
4418 // TODO: If we passed in the base kernel offset we could have a better
4419 // alignment than 4, but we don't really need it.
4420 if (!loadInputValue(KernArgReg, B,
4421 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4422 llvm_unreachable("failed to find kernarg segment ptr");
4423
4424 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
4425 // TODO: Should get nuw
4426 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
4427 }
4428
4429 /// Legalize a value that's loaded from kernel arguments. This is only used by
4430 /// legacy intrinsics.
4431 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4432 MachineIRBuilder &B,
4433 uint64_t Offset,
4434 Align Alignment) const {
4435 Register DstReg = MI.getOperand(0).getReg();
4436
4437 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4438 "unexpected kernarg parameter type");
4439
4440 Register Ptr = getKernargParameterPtr(B, Offset);
4441 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4442 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
4443 MachineMemOperand::MODereferenceable |
4444 MachineMemOperand::MOInvariant);
4445 MI.eraseFromParent();
4446 return true;
4447 }
4448
4449 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4450 MachineRegisterInfo &MRI,
4451 MachineIRBuilder &B) const {
4452 Register Dst = MI.getOperand(0).getReg();
4453 LLT DstTy = MRI.getType(Dst);
4454 LLT S16 = LLT::scalar(16);
4455 LLT S32 = LLT::scalar(32);
4456 LLT S64 = LLT::scalar(64);
4457
4458 if (DstTy == S16)
4459 return legalizeFDIV16(MI, MRI, B);
4460 if (DstTy == S32)
4461 return legalizeFDIV32(MI, MRI, B);
4462 if (DstTy == S64)
4463 return legalizeFDIV64(MI, MRI, B);
4464
4465 return false;
4466 }
4467
4468 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4469 Register DstDivReg,
4470 Register DstRemReg,
4471 Register X,
4472 Register Y) const {
4473 const LLT S1 = LLT::scalar(1);
4474 const LLT S32 = LLT::scalar(32);
4475
4476 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4477 // algorithm used here.
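  // Roughly: compute z ~= 2^32 / y with v_rcp_iflag_f32, refine z with one
  // Newton-Raphson step in integer arithmetic, take q = umulh(x, z) and
  // r = x - q * y, then apply up to two conditional +1/-y corrections to the
  // quotient and remainder.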
4478
4479 // Initial estimate of inv(y).
4480 auto FloatY = B.buildUITOFP(S32, Y);
4481 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
4482 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
4483 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
4484 auto Z = B.buildFPTOUI(S32, ScaledY);
4485
4486   // One round of UNR (unsigned Newton-Raphson) refinement.
4487 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
4488 auto NegYZ = B.buildMul(S32, NegY, Z);
4489 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
4490
4491 // Quotient/remainder estimate.
4492 auto Q = B.buildUMulH(S32, X, Z);
4493 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
4494
4495 // First quotient/remainder refinement.
4496 auto One = B.buildConstant(S32, 1);
4497 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4498 if (DstDivReg)
4499 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
4500 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
4501
4502 // Second quotient/remainder refinement.
4503 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4504 if (DstDivReg)
4505 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
4506
4507 if (DstRemReg)
4508 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
4509 }
4510
4511 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
4512 //
4513 // Return lo, hi of result
4514 //
4515 // %cvt.lo = G_UITOFP Val.lo
4516 // %cvt.hi = G_UITOFP Val.hi
4517 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4518 // %rcp = G_AMDGPU_RCP_IFLAG %mad
4519 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
4520 // %mul2 = G_FMUL %mul1, 2**(-32)
4521 // %trunc = G_INTRINSIC_TRUNC %mul2
4522 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
4523 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
4524 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
4525 Register Val) {
4526 const LLT S32 = LLT::scalar(32);
4527 auto Unmerge = B.buildUnmerge(S32, Val);
4528
4529 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
4530 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
4531
4532 auto Mad = B.buildFMAD(
4533 S32, CvtHi, // 2**32
4534 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
4535
4536 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
4537 auto Mul1 = B.buildFMul(
4538 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
4539
4540 // 2**(-32)
4541 auto Mul2 = B.buildFMul(
4542 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
4543 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
4544
4545 // -(2**32)
4546 auto Mad2 = B.buildFMAD(
4547 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
4548 Mul1);
4549
4550 auto ResultLo = B.buildFPTOUI(S32, Mad2);
4551 auto ResultHi = B.buildFPTOUI(S32, Trunc);
4552
4553 return {ResultLo.getReg(0), ResultHi.getReg(0)};
4554 }
4555
4556 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4557 Register DstDivReg,
4558 Register DstRemReg,
4559 Register Numer,
4560 Register Denom) const {
4561 const LLT S32 = LLT::scalar(32);
4562 const LLT S64 = LLT::scalar(64);
4563 const LLT S1 = LLT::scalar(1);
4564 Register RcpLo, RcpHi;
4565
4566 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
4567
4568 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
4569
4570 auto Zero64 = B.buildConstant(S64, 0);
4571 auto NegDenom = B.buildSub(S64, Zero64, Denom);
4572
4573 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
4574 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
4575
4576 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
4577 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
4578 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
4579
4580 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
4581 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
4582 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
4583
4584 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
4585 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
4586 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
4587 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
4588 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
4589
4590 auto Zero32 = B.buildConstant(S32, 0);
4591 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
4592 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
4593 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
4594
4595 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
4596 Register NumerLo = UnmergeNumer.getReg(0);
4597 Register NumerHi = UnmergeNumer.getReg(1);
4598
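// Quotient estimate Q = umulh(Numer, Add2) and remainder Numer - Denom * Q,
// followed by up to two conditional correction steps below.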
4599 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
4600 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
4601 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
4602 Register Mul3_Lo = UnmergeMul3.getReg(0);
4603 Register Mul3_Hi = UnmergeMul3.getReg(1);
4604 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
4605 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
4606 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
4607 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
4608
4609 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
4610 Register DenomLo = UnmergeDenom.getReg(0);
4611 Register DenomHi = UnmergeDenom.getReg(1);
4612
4613 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
4614 auto C1 = B.buildSExt(S32, CmpHi);
4615
4616 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
4617 auto C2 = B.buildSExt(S32, CmpLo);
4618
4619 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
4620 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
4621
4622 // TODO: Here and below portions of the code can be enclosed into if/endif.
4623 // Currently control flow is unconditional and we have 4 selects after
4624 // potential endif to substitute PHIs.
4625
4626 // if C3 != 0 ...
4627 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
4628 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
4629 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
4630 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
4631
4632 auto One64 = B.buildConstant(S64, 1);
4633 auto Add3 = B.buildAdd(S64, MulHi3, One64);
4634
4635 auto C4 =
4636 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
4637 auto C5 =
4638 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
4639 auto C6 = B.buildSelect(
4640 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
4641
4642 // if (C6 != 0)
4643 auto Add4 = B.buildAdd(S64, Add3, One64);
4644 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
4645
4646 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
4647 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
4648 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
4649
4650 // endif C6
4651 // endif C3
4652
4653 if (DstDivReg) {
4654 auto Sel1 = B.buildSelect(
4655 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
4656 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4657 Sel1, MulHi3);
4658 }
4659
4660 if (DstRemReg) {
4661 auto Sel2 = B.buildSelect(
4662 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
4663 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4664 Sel2, Sub1);
4665 }
4666 }
4667
4668 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
4669 MachineRegisterInfo &MRI,
4670 MachineIRBuilder &B) const {
4671 Register DstDivReg, DstRemReg;
4672 switch (MI.getOpcode()) {
4673 default:
4674 llvm_unreachable("Unexpected opcode!");
4675 case AMDGPU::G_UDIV: {
4676 DstDivReg = MI.getOperand(0).getReg();
4677 break;
4678 }
4679 case AMDGPU::G_UREM: {
4680 DstRemReg = MI.getOperand(0).getReg();
4681 break;
4682 }
4683 case AMDGPU::G_UDIVREM: {
4684 DstDivReg = MI.getOperand(0).getReg();
4685 DstRemReg = MI.getOperand(1).getReg();
4686 break;
4687 }
4688 }
4689
4690 const LLT S64 = LLT::scalar(64);
4691 const LLT S32 = LLT::scalar(32);
4692 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4693 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
4694 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4695 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4696
4697 if (Ty == S32)
4698 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
4699 else if (Ty == S64)
4700 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
4701 else
4702 return false;
4703
4704 MI.eraseFromParent();
4705 return true;
4706 }
4707
4708 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
4709 MachineRegisterInfo &MRI,
4710 MachineIRBuilder &B) const {
4711 const LLT S64 = LLT::scalar(64);
4712 const LLT S32 = LLT::scalar(32);
4713
4714 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4715 if (Ty != S32 && Ty != S64)
4716 return false;
4717
4718 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4719 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
4720 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4721
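// Compute absolute values with the identity abs(x) = (x + sign) ^ sign, where
// sign = x >> (bitwidth - 1) is all-ones for negative x and zero otherwise.
// The signs are re-applied to the unsigned results below.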
4722 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
4723 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
4724 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
4725
4726 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
4727 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
4728
4729 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
4730 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
4731
4732 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
4733 switch (MI.getOpcode()) {
4734 default:
4735 llvm_unreachable("Unexpected opcode!");
4736 case AMDGPU::G_SDIV: {
4737 DstDivReg = MI.getOperand(0).getReg();
4738 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4739 break;
4740 }
4741 case AMDGPU::G_SREM: {
4742 DstRemReg = MI.getOperand(0).getReg();
4743 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4744 break;
4745 }
4746 case AMDGPU::G_SDIVREM: {
4747 DstDivReg = MI.getOperand(0).getReg();
4748 DstRemReg = MI.getOperand(1).getReg();
4749 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4750 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4751 break;
4752 }
4753 }
4754
4755 if (Ty == S32)
4756 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4757 else
4758 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4759
4760 if (DstDivReg) {
4761 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
4762 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
4763 B.buildSub(DstDivReg, SignXor, Sign);
4764 }
4765
4766 if (DstRemReg) {
4767 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
4768 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
4769 B.buildSub(DstRemReg, SignXor, Sign);
4770 }
4771
4772 MI.eraseFromParent();
4773 return true;
4774 }
4775
4776 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
4777 MachineRegisterInfo &MRI,
4778 MachineIRBuilder &B) const {
4779 Register Res = MI.getOperand(0).getReg();
4780 Register LHS = MI.getOperand(1).getReg();
4781 Register RHS = MI.getOperand(2).getReg();
4782 uint16_t Flags = MI.getFlags();
4783 LLT ResTy = MRI.getType(Res);
4784
4785 const MachineFunction &MF = B.getMF();
4786 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) ||
4787 MF.getTarget().Options.UnsafeFPMath;
4788
4789 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
4790 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
4791 return false;
4792
4793 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
4794 // the CI documentation have a worst case error of 1 ulp.
4795 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
4796 // use it as long as we aren't trying to use denormals.
4797 //
4798 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
4799
4800 // 1 / x -> RCP(x)
4801 if (CLHS->isExactlyValue(1.0)) {
4802 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4803 .addUse(RHS)
4804 .setMIFlags(Flags);
4805
4806 MI.eraseFromParent();
4807 return true;
4808 }
4809
4810 // -1 / x -> RCP( FNEG(x) )
4811 if (CLHS->isExactlyValue(-1.0)) {
4812 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
4813 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4814 .addUse(FNeg.getReg(0))
4815 .setMIFlags(Flags);
4816
4817 MI.eraseFromParent();
4818 return true;
4819 }
4820 }
4821
4822 // For f16 require afn or arcp.
4823 // For f32 require afn.
4824 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
4825 !MI.getFlag(MachineInstr::FmArcp)))
4826 return false;
4827
4828 // x / y -> x * (1.0 / y)
4829 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4830 .addUse(RHS)
4831 .setMIFlags(Flags);
4832 B.buildFMul(Res, LHS, RCP, Flags);
4833
4834 MI.eraseFromParent();
4835 return true;
4836 }
4837
4838 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
4839 MachineRegisterInfo &MRI,
4840 MachineIRBuilder &B) const {
4841 Register Res = MI.getOperand(0).getReg();
4842 Register X = MI.getOperand(1).getReg();
4843 Register Y = MI.getOperand(2).getReg();
4844 uint16_t Flags = MI.getFlags();
4845 LLT ResTy = MRI.getType(Res);
4846
4847 const MachineFunction &MF = B.getMF();
4848 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
4849 MI.getFlag(MachineInstr::FmAfn);
4850
4851 if (!AllowInaccurateRcp)
4852 return false;
4853
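// Refine rcp(Y) with two Newton-Raphson steps, r' = r + r * (1 - Y * r),
// then compute X * r and apply one final residual correction.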
4854 auto NegY = B.buildFNeg(ResTy, Y);
4855 auto One = B.buildFConstant(ResTy, 1.0);
4856
4857 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4858 .addUse(Y)
4859 .setMIFlags(Flags);
4860
4861 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
4862 R = B.buildFMA(ResTy, Tmp0, R, R);
4863
4864 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
4865 R = B.buildFMA(ResTy, Tmp1, R, R);
4866
4867 auto Ret = B.buildFMul(ResTy, X, R);
4868 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
4869
4870 B.buildFMA(Res, Tmp2, R, Ret);
4871 MI.eraseFromParent();
4872 return true;
4873 }
4874
4875 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
4876 MachineRegisterInfo &MRI,
4877 MachineIRBuilder &B) const {
4878 if (legalizeFastUnsafeFDIV(MI, MRI, B))
4879 return true;
4880
4881 Register Res = MI.getOperand(0).getReg();
4882 Register LHS = MI.getOperand(1).getReg();
4883 Register RHS = MI.getOperand(2).getReg();
4884
4885 uint16_t Flags = MI.getFlags();
4886
4887 LLT S16 = LLT::scalar(16);
4888 LLT S32 = LLT::scalar(32);
4889
4890 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
4891 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
4892
4893 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4894 .addUse(RHSExt.getReg(0))
4895 .setMIFlags(Flags);
4896
4897 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
4898 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
4899
4900 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
4901 .addUse(RDst.getReg(0))
4902 .addUse(RHS)
4903 .addUse(LHS)
4904 .setMIFlags(Flags);
4905
4906 MI.eraseFromParent();
4907 return true;
4908 }
4909
4910 static constexpr unsigned SPDenormModeBitField =
4911 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 4, 2);
4912
4913 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
4914 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
4915 static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
4916 const GCNSubtarget &ST,
4917 SIModeRegisterDefaults Mode) {
4918 // Set SP denorm mode to this value.
4919 unsigned SPDenormMode =
4920 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
4921
4922 if (ST.hasDenormModeInst()) {
4923 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
4924 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
4925
4926 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
4927 B.buildInstr(AMDGPU::S_DENORM_MODE)
4928 .addImm(NewDenormModeValue);
4929
4930 } else {
4931 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
4932 .addImm(SPDenormMode)
4933 .addImm(SPDenormModeBitField);
4934 }
4935 }
4936
4937 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
4938 MachineRegisterInfo &MRI,
4939 MachineIRBuilder &B) const {
4940 if (legalizeFastUnsafeFDIV(MI, MRI, B))
4941 return true;
4942
4943 Register Res = MI.getOperand(0).getReg();
4944 Register LHS = MI.getOperand(1).getReg();
4945 Register RHS = MI.getOperand(2).getReg();
4946 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4947 SIModeRegisterDefaults Mode = MFI->getMode();
4948
4949 uint16_t Flags = MI.getFlags();
4950
4951 LLT S32 = LLT::scalar(32);
4952 LLT S1 = LLT::scalar(1);
4953
4954 auto One = B.buildFConstant(S32, 1.0f);
4955
4956 auto DenominatorScaled =
4957 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
4958 .addUse(LHS)
4959 .addUse(RHS)
4960 .addImm(0)
4961 .setMIFlags(Flags);
4962 auto NumeratorScaled =
4963 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
4964 .addUse(LHS)
4965 .addUse(RHS)
4966 .addImm(1)
4967 .setMIFlags(Flags);
4968
4969 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4970 .addUse(DenominatorScaled.getReg(0))
4971 .setMIFlags(Flags);
4972 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
4973
4974 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
4975 const bool HasDynamicDenormals =
4976 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
4977 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
4978
4979 Register SavedSPDenormMode;
4980 if (!PreservesDenormals) {
4981 if (HasDynamicDenormals) {
4982 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4983 B.buildInstr(AMDGPU::S_GETREG_B32)
4984 .addDef(SavedSPDenormMode)
4985 .addImm(SPDenormModeBitField);
4986 }
4987 toggleSPDenormMode(true, B, ST, Mode);
4988 }
4989
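// One Newton-Raphson iteration on the reciprocal of the scaled denominator,
// then the scaled quotient q = n * r with a fused residual correction; the
// final residual feeds div_fmas/div_fixup below.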
4990 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
4991 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
4992 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
4993 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
4994 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
4995 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
4996
4997 if (!PreservesDenormals) {
4998 if (HasDynamicDenormals) {
4999 assert(SavedSPDenormMode);
5000 B.buildInstr(AMDGPU::S_SETREG_B32)
5001 .addReg(SavedSPDenormMode)
5002 .addImm(SPDenormModeBitField);
5003 } else
5004 toggleSPDenormMode(false, B, ST, Mode);
5005 }
5006
5007 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
5008 .addUse(Fma4.getReg(0))
5009 .addUse(Fma1.getReg(0))
5010 .addUse(Fma3.getReg(0))
5011 .addUse(NumeratorScaled.getReg(1))
5012 .setMIFlags(Flags);
5013
5014 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5015 .addUse(Fmas.getReg(0))
5016 .addUse(RHS)
5017 .addUse(LHS)
5018 .setMIFlags(Flags);
5019
5020 MI.eraseFromParent();
5021 return true;
5022 }
5023
5024 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
5025 MachineRegisterInfo &MRI,
5026 MachineIRBuilder &B) const {
5027 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
5028 return true;
5029
5030 Register Res = MI.getOperand(0).getReg();
5031 Register LHS = MI.getOperand(1).getReg();
5032 Register RHS = MI.getOperand(2).getReg();
5033
5034 uint16_t Flags = MI.getFlags();
5035
5036 LLT S64 = LLT::scalar(64);
5037 LLT S1 = LLT::scalar(1);
5038
5039 auto One = B.buildFConstant(S64, 1.0);
5040
5041 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5042 .addUse(LHS)
5043 .addUse(RHS)
5044 .addImm(0)
5045 .setMIFlags(Flags);
5046
5047 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
5048
5049 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
5050 .addUse(DivScale0.getReg(0))
5051 .setMIFlags(Flags);
5052
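// Refine the reciprocal of the scaled denominator with two fused
// Newton-Raphson style steps, then form the scaled quotient and its residual
// for div_fmas.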
5053 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
5054 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
5055 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
5056
5057 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5058 .addUse(LHS)
5059 .addUse(RHS)
5060 .addImm(1)
5061 .setMIFlags(Flags);
5062
5063 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
5064 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
5065 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
5066
5067 Register Scale;
5068 if (!ST.hasUsableDivScaleConditionOutput()) {
5069 // Workaround a hardware bug on SI where the condition output from div_scale
5070 // is not usable.
5071
5072 LLT S32 = LLT::scalar(32);
5073
5074 auto NumUnmerge = B.buildUnmerge(S32, LHS);
5075 auto DenUnmerge = B.buildUnmerge(S32, RHS);
5076 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
5077 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
5078
5079 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
5080 Scale1Unmerge.getReg(1));
5081 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
5082 Scale0Unmerge.getReg(1));
5083 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
5084 } else {
5085 Scale = DivScale1.getReg(1);
5086 }
5087
5088 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
5089 .addUse(Fma4.getReg(0))
5090 .addUse(Fma3.getReg(0))
5091 .addUse(Mul.getReg(0))
5092 .addUse(Scale)
5093 .setMIFlags(Flags);
5094
5095 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
5096 .addUse(Fmas.getReg(0))
5097 .addUse(RHS)
5098 .addUse(LHS)
5099 .setMIFlags(Flags);
5100
5101 MI.eraseFromParent();
5102 return true;
5103 }
5104
5105 bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
5106 MachineRegisterInfo &MRI,
5107 MachineIRBuilder &B) const {
5108 Register Res0 = MI.getOperand(0).getReg();
5109 Register Res1 = MI.getOperand(1).getReg();
5110 Register Val = MI.getOperand(2).getReg();
5111 uint16_t Flags = MI.getFlags();
5112
5113 LLT Ty = MRI.getType(Res0);
5114 LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
5115
5116 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5117 .addUse(Val)
5118 .setMIFlags(Flags);
5119 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5120 .addUse(Val)
5121 .setMIFlags(Flags);
5122
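// On subtargets with the fract bug, the frexp instructions do not give the
// expected results for infinities and NaNs, so select the original value and
// a zero exponent for non-finite inputs.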
5123 if (ST.hasFractBug()) {
5124 auto Fabs = B.buildFAbs(Ty, Val);
5125 auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
5126 auto IsFinite =
5127 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
5128 auto Zero = B.buildConstant(InstrExpTy, 0);
5129 Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5130 Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5131 }
5132
5133 B.buildCopy(Res0, Mant);
5134 B.buildSExtOrTrunc(Res1, Exp);
5135
5136 MI.eraseFromParent();
5137 return true;
5138 }
5139
5140 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
5141 MachineRegisterInfo &MRI,
5142 MachineIRBuilder &B) const {
5143 Register Res = MI.getOperand(0).getReg();
5144 Register LHS = MI.getOperand(2).getReg();
5145 Register RHS = MI.getOperand(3).getReg();
5146 uint16_t Flags = MI.getFlags();
5147
5148 LLT S32 = LLT::scalar(32);
5149 LLT S1 = LLT::scalar(1);
5150
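// If |RHS| is very large (> 2**96), pre-scale the denominator by 2**-32
// before taking the reciprocal and re-scale the quotient by the same factor
// afterwards, so the rcp operand stays in range.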
5151 auto Abs = B.buildFAbs(S32, RHS, Flags);
5152 const APFloat C0Val(1.0f);
5153
5154 auto C0 = B.buildFConstant(S32, 0x1p+96f);
5155 auto C1 = B.buildFConstant(S32, 0x1p-32f);
5156 auto C2 = B.buildFConstant(S32, 1.0f);
5157
5158 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5159 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5160
5161 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5162
5163 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5164 .addUse(Mul0.getReg(0))
5165 .setMIFlags(Flags);
5166
5167 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5168
5169 B.buildFMul(Res, Sel, Mul1, Flags);
5170
5171 MI.eraseFromParent();
5172 return true;
5173 }
5174
5175 bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5176 MachineRegisterInfo &MRI,
5177 MachineIRBuilder &B) const {
5178 // Bypass the correct expansion a standard promotion through G_FSQRT would
5179 // get. The f32 op is accurate enough for the f16 case.
5180 unsigned Flags = MI.getFlags();
5181 assert(!ST.has16BitInsts());
5182 const LLT F32 = LLT::scalar(32);
5183 auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
5184 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5185 .addUse(Ext.getReg(0))
5186 .setMIFlags(Flags);
5187 B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
5188 MI.eraseFromParent();
5189 return true;
5190 }
5191
5192 bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
5193 MachineRegisterInfo &MRI,
5194 MachineIRBuilder &B) const {
5195 MachineFunction &MF = B.getMF();
5196 Register Dst = MI.getOperand(0).getReg();
5197 Register X = MI.getOperand(1).getReg();
5198 const unsigned Flags = MI.getFlags();
5199 const LLT S1 = LLT::scalar(1);
5200 const LLT F32 = LLT::scalar(32);
5201 const LLT I32 = LLT::scalar(32);
5202
5203 if (allowApproxFunc(MF, Flags)) {
5204 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5205 .addUse(X)
5206 .setMIFlags(Flags);
5207 MI.eraseFromParent();
5208 return true;
5209 }
5210
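// Inputs below 2**-96 are scaled up by 2**32 before the sqrt; the result is
// scaled back down by 2**-16 (= 1 / sqrt(2**32)) at the end.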
5211 auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5212 auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
5213 auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5214 auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5215 auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
5216
5217 Register SqrtS = MRI.createGenericVirtualRegister(F32);
5218 if (needsDenormHandlingF32(MF, X, Flags)) {
5219 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5220 .addUse(SqrtX.getReg(0))
5221 .setMIFlags(Flags);
5222
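// Refine the hardware sqrt result to within 1 ulp: step it down or up by one
// ulp (an integer add on the bit pattern) depending on the signs of the
// residuals computed against the neighbouring values.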
5223 auto NegOne = B.buildConstant(I32, -1);
5224 auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5225
5226 auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5227 auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5228
5229 auto PosOne = B.buildConstant(I32, 1);
5230 auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5231
5232 auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5233 auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5234
5235 auto Zero = B.buildFConstant(F32, 0.0f);
5236 auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
5237
5238 SqrtS =
5239 B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5240
5241 auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
5242 SqrtS =
5243 B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5244 } else {
5245 auto SqrtR =
5246 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5247 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5248
5249 auto Half = B.buildFConstant(F32, 0.5f);
5250 auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
5251 auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
5252 auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
5253 SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
5254 SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5255 auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
5256 auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
5257 SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5258 }
5259
5260 auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5261
5262 auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
5263
5264 SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5265
5266 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5267 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5268
5269 MI.eraseFromParent();
5270 return true;
5271 }
5272
5273 bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
5274 MachineRegisterInfo &MRI,
5275 MachineIRBuilder &B) const {
5276 // For double type, the SQRT and RSQ instructions don't have the required
5277 // precision, so we apply Goldschmidt's algorithm to improve the result:
5278 //
5279 // y0 = rsq(x)
5280 // g0 = x * y0
5281 // h0 = 0.5 * y0
5282 //
5283 // r0 = 0.5 - h0 * g0
5284 // g1 = g0 * r0 + g0
5285 // h1 = h0 * r0 + h0
5286 //
5287 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5288 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
5289 // h2 = h1 * r1 + h1
5290 //
5291 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
5292 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
5293 //
5294 // sqrt(x) = g3
5295
5296 const LLT S1 = LLT::scalar(1);
5297 const LLT S32 = LLT::scalar(32);
5298 const LLT F64 = LLT::scalar(64);
5299
5300 Register Dst = MI.getOperand(0).getReg();
5301 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
5302
5303 Register X = MI.getOperand(1).getReg();
5304 unsigned Flags = MI.getFlags();
5305
5306 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
5307
5308 auto ZeroInt = B.buildConstant(S32, 0);
5309 auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
5310
5311 // Scale up input if it is too small.
5312 auto ScaleUpFactor = B.buildConstant(S32, 256);
5313 auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
5314 auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
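// Since sqrt(x * 2**256) == sqrt(x) * 2**128, the result is scaled back down
// by ldexp(..., -128) below whenever the input was scaled up.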
5315
5316 auto SqrtY =
5317 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));
5318
5319 auto Half = B.buildFConstant(F64, 0.5);
5320 auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
5321 auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
5322
5323 auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
5324 auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
5325
5326 auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
5327 auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
5328
5329 auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
5330 auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
5331
5332 auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
5333
5334 auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
5335 auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
5336
5337 auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
5338
5339 // Scale down the result.
5340 auto ScaleDownFactor = B.buildConstant(S32, -128);
5341 auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
5342 SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
5343
5344 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
5345 // with finite only or nsz because rsq(+/-0) = +/-inf
5346
5347 // TODO: Check for DAZ and expand to subnormals
5348 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5349
5350 // If x is +INF, +0, or -0, use its original value
5351 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
5352
5353 MI.eraseFromParent();
5354 return true;
5355 }
5356
5357 bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
5358 MachineRegisterInfo &MRI,
5359 MachineIRBuilder &B) const {
5360 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5361 if (Ty == LLT::scalar(32))
5362 return legalizeFSQRTF32(MI, MRI, B);
5363 if (Ty == LLT::scalar(64))
5364 return legalizeFSQRTF64(MI, MRI, B);
5365 if (Ty == LLT::scalar(16))
5366 return legalizeFSQRTF16(MI, MRI, B);
5367 return false;
5368 }
5369
5370 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5371 // FIXME: Why do we handle this one but not other removed instructions?
5372 //
5373 // Reciprocal square root. The clamp prevents infinite results, clamping
5374 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
5375 // +-max_float.
5376 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
5377 MachineRegisterInfo &MRI,
5378 MachineIRBuilder &B) const {
5379 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5380 return true;
5381
5382 Register Dst = MI.getOperand(0).getReg();
5383 Register Src = MI.getOperand(2).getReg();
5384 auto Flags = MI.getFlags();
5385
5386 LLT Ty = MRI.getType(Dst);
5387
5388 const fltSemantics *FltSemantics;
5389 if (Ty == LLT::scalar(32))
5390 FltSemantics = &APFloat::IEEEsingle();
5391 else if (Ty == LLT::scalar(64))
5392 FltSemantics = &APFloat::IEEEdouble();
5393 else
5394 return false;
5395
5396 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
5397 .addUse(Src)
5398 .setMIFlags(Flags);
5399
5400 // The rsq result is already quieted, so the snan handling difference does
5401 // not matter here; use the min/max variant that will directly select.
5402 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5403 const bool UseIEEE = MFI->getMode().IEEE;
5404
5405 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
5406 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
5407 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
5408
5409 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
5410
5411 if (UseIEEE)
5412 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
5413 else
5414 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
5415 MI.eraseFromParent();
5416 return true;
5417 }
5418
5419 // TODO: Fix pointer type handling
5420 bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
5421 MachineInstr &MI,
5422 Intrinsic::ID IID) const {
5423
5424 MachineIRBuilder &B = Helper.MIRBuilder;
5425 MachineRegisterInfo &MRI = *B.getMRI();
5426
5427 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
5428 IID == Intrinsic::amdgcn_permlanex16;
5429
5430 auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
5431 Register Src2, LLT VT) -> Register {
5432 auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
5433 switch (IID) {
5434 case Intrinsic::amdgcn_readfirstlane:
5435 case Intrinsic::amdgcn_permlane64:
5436 return LaneOp.getReg(0);
5437 case Intrinsic::amdgcn_readlane:
5438 return LaneOp.addUse(Src1).getReg(0);
5439 case Intrinsic::amdgcn_writelane:
5440 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
5441 case Intrinsic::amdgcn_permlane16:
5442 case Intrinsic::amdgcn_permlanex16: {
5443 Register Src3 = MI.getOperand(5).getReg();
5444 int64_t Src4 = MI.getOperand(6).getImm();
5445 int64_t Src5 = MI.getOperand(7).getImm();
5446 return LaneOp.addUse(Src1)
5447 .addUse(Src2)
5448 .addUse(Src3)
5449 .addImm(Src4)
5450 .addImm(Src5)
5451 .getReg(0);
5452 }
5453 default:
5454 llvm_unreachable("unhandled lane op");
5455 }
5456 };
5457
5458 Register DstReg = MI.getOperand(0).getReg();
5459 Register Src0 = MI.getOperand(2).getReg();
5460 Register Src1, Src2;
5461 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
5462 IsPermLane16) {
5463 Src1 = MI.getOperand(3).getReg();
5464 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
5465 Src2 = MI.getOperand(4).getReg();
5466 }
5467 }
5468
5469 LLT Ty = MRI.getType(DstReg);
5470 unsigned Size = Ty.getSizeInBits();
5471
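// 32-bit operands are already legal. Narrower operands are any-extended to
// 32 bits; wider ones are split into 32-bit (or <2 x s16>) pieces, one lane
// op per piece, and the results re-merged.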
5472 if (Size == 32) {
5473 // Already legal
5474 return true;
5475 }
5476
5477 if (Size < 32) {
5478 Src0 = B.buildAnyExt(S32, Src0).getReg(0);
5479
5480 if (IsPermLane16)
5481 Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
5482
5483 if (IID == Intrinsic::amdgcn_writelane)
5484 Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
5485
5486 Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
5487 B.buildTrunc(DstReg, LaneOpDst);
5488 MI.eraseFromParent();
5489 return true;
5490 }
5491
5492 if (Size % 32 != 0)
5493 return false;
5494
5495 LLT PartialResTy = S32;
5496 if (Ty.isVector()) {
5497 LLT EltTy = Ty.getElementType();
5498 switch (EltTy.getSizeInBits()) {
5499 case 16:
5500 PartialResTy = Ty.changeElementCount(ElementCount::getFixed(2));
5501 break;
5502 case 32:
5503 PartialResTy = EltTy;
5504 break;
5505 default:
5506 // Handle all other cases via S32 pieces;
5507 break;
5508 }
5509 }
5510
5511 SmallVector<Register, 2> PartialRes;
5512 unsigned NumParts = Size / 32;
5513 MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
5514 MachineInstrBuilder Src1Parts, Src2Parts;
5515
5516 if (IsPermLane16)
5517 Src1Parts = B.buildUnmerge(PartialResTy, Src1);
5518
5519 if (IID == Intrinsic::amdgcn_writelane)
5520 Src2Parts = B.buildUnmerge(PartialResTy, Src2);
5521
5522 for (unsigned i = 0; i < NumParts; ++i) {
5523 Src0 = Src0Parts.getReg(i);
5524
5525 if (IsPermLane16)
5526 Src1 = Src1Parts.getReg(i);
5527
5528 if (IID == Intrinsic::amdgcn_writelane)
5529 Src2 = Src2Parts.getReg(i);
5530
5531 PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
5532 }
5533
5534 B.buildMergeLikeInstr(DstReg, PartialRes);
5535 MI.eraseFromParent();
5536 return true;
5537 }
5538
5539 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
5540 MachineRegisterInfo &MRI,
5541 MachineIRBuilder &B) const {
5542 uint64_t Offset =
5543 ST.getTargetLowering()->getImplicitParameterOffset(
5544 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
5545 LLT DstTy = MRI.getType(DstReg);
5546 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
5547
5548 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
5549 if (!loadInputValue(KernargPtrReg, B,
5550 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5551 return false;
5552
5553 // FIXME: This should be nuw
5554 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
5555 return true;
5556 }
5557
5558 /// To create a buffer resource from a 64-bit pointer, mask off the upper 32
5559 /// bits of the pointer and replace them with the stride argument, then
5560 /// merge_values everything together. In the common case of a raw buffer (the
5561 /// stride component is 0), we can just AND off the upper half.
5562 bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
5563 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5564 Register Result = MI.getOperand(0).getReg();
5565 Register Pointer = MI.getOperand(2).getReg();
5566 Register Stride = MI.getOperand(3).getReg();
5567 Register NumRecords = MI.getOperand(4).getReg();
5568 Register Flags = MI.getOperand(5).getReg();
5569
5570 LLT S32 = LLT::scalar(32);
5571
5572 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5573 auto Unmerge = B.buildUnmerge(S32, Pointer);
5574 Register LowHalf = Unmerge.getReg(0);
5575 Register HighHalf = Unmerge.getReg(1);
5576
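// Keep pointer bits [47:32] in the low 16 bits of the high half; the stride
// (shifted left by 16) replaces bits [63:48].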
5577 auto AndMask = B.buildConstant(S32, 0x0000ffff);
5578 auto Masked = B.buildAnd(S32, HighHalf, AndMask);
5579
5580 MachineInstrBuilder NewHighHalf = Masked;
5581 std::optional<ValueAndVReg> StrideConst =
5582 getIConstantVRegValWithLookThrough(Stride, MRI);
5583 if (!StrideConst || !StrideConst->Value.isZero()) {
5584 MachineInstrBuilder ShiftedStride;
5585 if (StrideConst) {
5586 uint32_t StrideVal = StrideConst->Value.getZExtValue();
5587 uint32_t ShiftedStrideVal = StrideVal << 16;
5588 ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
5589 } else {
5590 auto ExtStride = B.buildAnyExt(S32, Stride);
5591 auto ShiftConst = B.buildConstant(S32, 16);
5592 ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
5593 }
5594 NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
5595 }
5596 Register NewHighHalfReg = NewHighHalf.getReg(0);
5597 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
5598 MI.eraseFromParent();
5599 return true;
5600 }
5601
5602 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
5603 MachineRegisterInfo &MRI,
5604 MachineIRBuilder &B) const {
5605 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5606 if (!MFI->isEntryFunction()) {
5607 return legalizePreloadedArgIntrin(MI, MRI, B,
5608 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
5609 }
5610
5611 Register DstReg = MI.getOperand(0).getReg();
5612 if (!getImplicitArgPtr(DstReg, MRI, B))
5613 return false;
5614
5615 MI.eraseFromParent();
5616 return true;
5617 }
5618
5619 bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
5620 MachineRegisterInfo &MRI,
5621 MachineIRBuilder &B) const {
5622 Function &F = B.getMF().getFunction();
5623 std::optional<uint32_t> KnownSize =
5624 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
5625 if (KnownSize.has_value())
5626 B.buildConstant(DstReg, *KnownSize);
5627 return false;
5628 }
5629
5630 bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
5631 MachineRegisterInfo &MRI,
5632 MachineIRBuilder &B) const {
5633
5634 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5635 if (!MFI->isEntryFunction()) {
5636 return legalizePreloadedArgIntrin(MI, MRI, B,
5637 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
5638 }
5639
5640 Register DstReg = MI.getOperand(0).getReg();
5641 if (!getLDSKernelId(DstReg, MRI, B))
5642 return false;
5643
5644 MI.eraseFromParent();
5645 return true;
5646 }
5647
5648 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
5649 MachineRegisterInfo &MRI,
5650 MachineIRBuilder &B,
5651 unsigned AddrSpace) const {
5652 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
5653 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
5654 Register Hi32 = Unmerge.getReg(1);
5655
5656 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
5657 MI.eraseFromParent();
5658 return true;
5659 }
5660
5661 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
5662 // offset (the offset that is included in bounds checking and swizzling, to be
5663 // split between the instruction's voffset and immoffset fields) and soffset
5664 // (the offset that is excluded from bounds checking and swizzling, to go in
5665 // the instruction's soffset field). This function takes the first kind of
5666 // offset and figures out how to split it between voffset and immoffset.
5667 std::pair<Register, unsigned>
5668 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
5669 Register OrigOffset) const {
5670 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
5671 Register BaseReg;
5672 unsigned ImmOffset;
5673 const LLT S32 = LLT::scalar(32);
5674 MachineRegisterInfo &MRI = *B.getMRI();
5675
5676 std::tie(BaseReg, ImmOffset) =
5677 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
5678
5679 // If BaseReg is a pointer, convert it to int.
5680 if (MRI.getType(BaseReg).isPointer())
5681 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
5682
5683 // If the immediate value is too big for the immoffset field, put only bits
5684 // that would normally fit in the immoffset field. The remaining value that
5685 // is copied/added for the voffset field is a large power of 2, and it
5686 // stands more chance of being CSEd with the copy/add for another similar
5687 // load/store.
5688 // However, do not do that rounding down if that is a negative
5689 // number, as it appears to be illegal to have a negative offset in the
5690 // vgpr, even if adding the immediate offset makes it positive.
5691 unsigned Overflow = ImmOffset & ~MaxImm;
5692 ImmOffset -= Overflow;
5693 if ((int32_t)Overflow < 0) {
5694 Overflow += ImmOffset;
5695 ImmOffset = 0;
5696 }
5697
5698 if (Overflow != 0) {
5699 if (!BaseReg) {
5700 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
5701 } else {
5702 auto OverflowVal = B.buildConstant(S32, Overflow);
5703 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
5704 }
5705 }
5706
5707 if (!BaseReg)
5708 BaseReg = B.buildConstant(S32, 0).getReg(0);
5709
5710 return std::pair(BaseReg, ImmOffset);
5711 }
5712
5713 /// Handle register layout difference for f16 images for some subtargets.
5714 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
5715 MachineRegisterInfo &MRI,
5716 Register Reg,
5717 bool ImageStore) const {
5718 const LLT S16 = LLT::scalar(16);
5719 const LLT S32 = LLT::scalar(32);
5720 LLT StoreVT = MRI.getType(Reg);
5721 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
5722
5723 if (ST.hasUnpackedD16VMem()) {
5724 auto Unmerge = B.buildUnmerge(S16, Reg);
5725
5726 SmallVector<Register, 4> WideRegs;
5727 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5728 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
5729
5730 int NumElts = StoreVT.getNumElements();
5731
5732 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
5733 .getReg(0);
5734 }
5735
5736 if (ImageStore && ST.hasImageStoreD16Bug()) {
5737 if (StoreVT.getNumElements() == 2) {
5738 SmallVector<Register, 4> PackedRegs;
5739 Reg = B.buildBitcast(S32, Reg).getReg(0);
5740 PackedRegs.push_back(Reg);
5741 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
5742 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
5743 .getReg(0);
5744 }
5745
5746 if (StoreVT.getNumElements() == 3) {
5747 SmallVector<Register, 4> PackedRegs;
5748 auto Unmerge = B.buildUnmerge(S16, Reg);
5749 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5750 PackedRegs.push_back(Unmerge.getReg(I));
5751 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
5752 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
5753 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
5754 }
5755
5756 if (StoreVT.getNumElements() == 4) {
5757 SmallVector<Register, 4> PackedRegs;
5758 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
5759 auto Unmerge = B.buildUnmerge(S32, Reg);
5760 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5761 PackedRegs.push_back(Unmerge.getReg(I));
5762 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
5763 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
5764 .getReg(0);
5765 }
5766
5767 llvm_unreachable("invalid data type");
5768 }
5769
5770 if (StoreVT == LLT::fixed_vector(3, S16)) {
5771 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
5772 .getReg(0);
5773 }
5774 return Reg;
5775 }
5776
5777 Register AMDGPULegalizerInfo::fixStoreSourceType(
5778 MachineIRBuilder &B, Register VData, bool IsFormat) const {
5779 MachineRegisterInfo *MRI = B.getMRI();
5780 LLT Ty = MRI->getType(VData);
5781
5782 const LLT S16 = LLT::scalar(16);
5783
5784 // Fixup buffer resources themselves needing to be v4i128.
5785 if (hasBufferRsrcWorkaround(Ty))
5786 return castBufferRsrcToV4I32(VData, B);
5787
5788 // Fixup illegal register types for i8 stores.
5789 if (Ty == LLT::scalar(8) || Ty == S16) {
5790 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
5791 return AnyExt;
5792 }
5793
5794 if (Ty.isVector()) {
5795 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
5796 if (IsFormat)
5797 return handleD16VData(B, *MRI, VData);
5798 }
5799 }
5800
5801 return VData;
5802 }
5803
5804 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
5805 MachineRegisterInfo &MRI,
5806 MachineIRBuilder &B,
5807 bool IsTyped,
5808 bool IsFormat) const {
5809 Register VData = MI.getOperand(1).getReg();
5810 LLT Ty = MRI.getType(VData);
5811 LLT EltTy = Ty.getScalarType();
5812 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5813 const LLT S32 = LLT::scalar(32);
5814
5815 VData = fixStoreSourceType(B, VData, IsFormat);
5816 castBufferRsrcArgToV4I32(MI, B, 2);
5817 Register RSrc = MI.getOperand(2).getReg();
5818
5819 MachineMemOperand *MMO = *MI.memoperands_begin();
5820 const int MemSize = MMO->getSize().getValue();
5821
5822 unsigned ImmOffset;
5823
5824 // The typed intrinsics add an immediate after the registers.
5825 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5826
5827 // The struct intrinsic variants add one additional operand over raw.
5828 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
5829 Register VIndex;
5830 int OpOffset = 0;
5831 if (HasVIndex) {
5832 VIndex = MI.getOperand(3).getReg();
5833 OpOffset = 1;
5834 } else {
5835 VIndex = B.buildConstant(S32, 0).getReg(0);
5836 }
5837
5838 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
5839 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
5840
5841 unsigned Format = 0;
5842 if (IsTyped) {
5843 Format = MI.getOperand(5 + OpOffset).getImm();
5844 ++OpOffset;
5845 }
5846
5847 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
5848
5849 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5850
5851 unsigned Opc;
5852 if (IsTyped) {
5853 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
5854 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
5855 } else if (IsFormat) {
5856 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
5857 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
5858 } else {
5859 switch (MemSize) {
5860 case 1:
5861 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
5862 break;
5863 case 2:
5864 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
5865 break;
5866 default:
5867 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
5868 break;
5869 }
5870 }
5871
5872 auto MIB = B.buildInstr(Opc)
5873 .addUse(VData) // vdata
5874 .addUse(RSrc) // rsrc
5875 .addUse(VIndex) // vindex
5876 .addUse(VOffset) // voffset
5877 .addUse(SOffset) // soffset
5878 .addImm(ImmOffset); // offset(imm)
5879
5880 if (IsTyped)
5881 MIB.addImm(Format);
5882
5883 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
5884 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5885 .addMemOperand(MMO);
5886
5887 MI.eraseFromParent();
5888 return true;
5889 }
5890
5891 static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
5892 Register VIndex, Register VOffset, Register SOffset,
5893 unsigned ImmOffset, unsigned Format,
5894 unsigned AuxiliaryData, MachineMemOperand *MMO,
5895 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
5896 auto MIB = B.buildInstr(Opc)
5897 .addDef(LoadDstReg) // vdata
5898 .addUse(RSrc) // rsrc
5899 .addUse(VIndex) // vindex
5900 .addUse(VOffset) // voffset
5901 .addUse(SOffset) // soffset
5902 .addImm(ImmOffset); // offset(imm)
5903
5904 if (IsTyped)
5905 MIB.addImm(Format);
5906
5907 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
5908 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5909 .addMemOperand(MMO);
5910 }
5911
5912 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
5913 MachineRegisterInfo &MRI,
5914 MachineIRBuilder &B,
5915 bool IsFormat,
5916 bool IsTyped) const {
5917 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
5918 MachineMemOperand *MMO = *MI.memoperands_begin();
5919 const LLT MemTy = MMO->getMemoryType();
5920 const LLT S32 = LLT::scalar(32);
5921
5922 Register Dst = MI.getOperand(0).getReg();
5923
5924 Register StatusDst;
5925 int OpOffset = 0;
5926 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
5927 bool IsTFE = MI.getNumExplicitDefs() == 2;
5928 if (IsTFE) {
5929 StatusDst = MI.getOperand(1).getReg();
5930 ++OpOffset;
5931 }
5932
5933 castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
5934 Register RSrc = MI.getOperand(2 + OpOffset).getReg();
5935
5936 // The typed intrinsics add an immediate after the registers.
5937 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5938
5939 // The struct intrinsic variants add one additional operand over raw.
5940 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
5941 Register VIndex;
5942 if (HasVIndex) {
5943 VIndex = MI.getOperand(3 + OpOffset).getReg();
5944 ++OpOffset;
5945 } else {
5946 VIndex = B.buildConstant(S32, 0).getReg(0);
5947 }
5948
5949 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
5950 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
5951
5952 unsigned Format = 0;
5953 if (IsTyped) {
5954 Format = MI.getOperand(5 + OpOffset).getImm();
5955 ++OpOffset;
5956 }
5957
5958 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
5959 unsigned ImmOffset;
5960
5961 LLT Ty = MRI.getType(Dst);
5962 // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the
5963 // logic doesn't have to handle that case.
5964 if (hasBufferRsrcWorkaround(Ty)) {
5965 Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
5966 Dst = MI.getOperand(0).getReg();
5967 }
5968 LLT EltTy = Ty.getScalarType();
5969 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5970 const bool Unpacked = ST.hasUnpackedD16VMem();
5971
5972 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5973
5974 unsigned Opc;
5975
5976 // TODO: Support TFE for typed and narrow loads.
5977 if (IsTyped) {
5978 if (IsTFE)
5979 return false;
5980 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
5981 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
5982 } else if (IsFormat) {
5983 if (IsD16) {
5984 if (IsTFE)
5985 return false;
5986 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
5987 } else {
5988 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
5989 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
5990 }
5991 } else {
5992 switch (MemTy.getSizeInBits()) {
5993 case 8:
5994 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
5995 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
5996 break;
5997 case 16:
5998 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
5999 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6000 break;
6001 default:
6002 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6003 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6004 break;
6005 }
6006 }
6007
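// TFE loads return an extra status dword, so load into a wider temporary and
// unmerge the value dwords from the status.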
6008 if (IsTFE) {
6009 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
6010 unsigned NumLoadDWords = NumValueDWords + 1;
6011 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
6012 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
6013 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6014 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6015 if (MemTy.getSizeInBits() < 32) {
6016 Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
6017 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6018 B.buildTrunc(Dst, ExtDst);
6019 } else if (NumValueDWords == 1) {
6020 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6021 } else {
6022 SmallVector<Register, 5> LoadElts;
6023 for (unsigned I = 0; I != NumValueDWords; ++I)
6024 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
6025 LoadElts.push_back(StatusDst);
6026 B.buildUnmerge(LoadElts, LoadDstReg);
6027 LoadElts.truncate(NumValueDWords);
6028 B.buildMergeLikeInstr(Dst, LoadElts);
6029 }
6030 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
6031 (IsD16 && !Ty.isVector())) {
6032 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
6033 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6034 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6035 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6036 B.buildTrunc(Dst, LoadDstReg);
6037 } else if (Unpacked && IsD16 && Ty.isVector()) {
6038 LLT UnpackedTy = Ty.changeElementSize(32);
6039 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6040 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6041 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6042 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6043 // FIXME: G_TRUNC should work, but legalization currently fails
6044 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
6045 SmallVector<Register, 4> Repack;
6046 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
6047 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
6048 B.buildMergeLikeInstr(Dst, Repack);
6049 } else {
6050 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
6051 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6052 }
6053
6054 MI.eraseFromParent();
6055 return true;
6056 }
6057
6058 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
6059 switch (IntrID) {
6060 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6061 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6062 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6063 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6064 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6065 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6066 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6067 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6068 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6069 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6070 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6071 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6072 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6073 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6074 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6075 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6076 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6077 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6078 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6079 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6080 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6081 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6082 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6083 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6084 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6085 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6086 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6087 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6088 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6089 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6090 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6091 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6092 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6093 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6094 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6095 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6096 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6097 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6098 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6099 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6100 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6101 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6102 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6103 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6104 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6105 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6106 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6107 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6108 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6109 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6110 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6111 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6112 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6113 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6114 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6115 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6116 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6117 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6118 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6119 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6120 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6121 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6122 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6123 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6124 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6125 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6126 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6127 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6128 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6129 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6130 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6131 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6132 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6133 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6134 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6135 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6136 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6137 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6138 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6139 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6140 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6141 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6142 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6143 default:
6144 llvm_unreachable("unhandled atomic opcode");
6145 }
6146 }
6147
6148 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
6149 MachineIRBuilder &B,
6150 Intrinsic::ID IID) const {
6151 const bool IsCmpSwap =
6152 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6153 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6154 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6155 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6156
6157 Register Dst = MI.getOperand(0).getReg();
6158 // Since we don't have 128-bit atomics, we don't need to handle the case of
6159 // p8 arguments to the atomic itself.
6160 Register VData = MI.getOperand(2).getReg();
6161
6162 Register CmpVal;
6163 int OpOffset = 0;
6164
6165 if (IsCmpSwap) {
6166 CmpVal = MI.getOperand(3).getReg();
6167 ++OpOffset;
6168 }
6169
6170 castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
6171 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
6172 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6173
6174 // The struct intrinsic variants add one additional operand over raw.
6175 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6176 Register VIndex;
6177 if (HasVIndex) {
6178 VIndex = MI.getOperand(4 + OpOffset).getReg();
6179 ++OpOffset;
6180 } else {
6181 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
6182 }
6183
6184 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
6185 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
6186 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
6187
6188 MachineMemOperand *MMO = *MI.memoperands_begin();
6189
6190 unsigned ImmOffset;
6191 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6192
6193 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
6194 .addDef(Dst)
6195 .addUse(VData); // vdata
6196
6197 if (IsCmpSwap)
6198 MIB.addReg(CmpVal);
6199
6200 MIB.addUse(RSrc) // rsrc
6201 .addUse(VIndex) // vindex
6202 .addUse(VOffset) // voffset
6203 .addUse(SOffset) // soffset
6204 .addImm(ImmOffset) // offset(imm)
6205 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6206 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6207 .addMemOperand(MMO);
6208
6209 MI.eraseFromParent();
6210 return true;
6211 }
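
// Example for legalizeBufferAtomic() above (illustrative sketch, not exact
// MIR): a raw (no vindex) buffer atomic add ends up roughly as
//   %zero:_(s32) = G_CONSTANT i32 0
//   %dst:_(s32)  = G_AMDGPU_BUFFER_ATOMIC_ADD %val, %rsrc, %zero(vindex),
//                    %voffset, %soffset, <imm offset>, <aux>, 0(idxen)
// where the immediate offset has been split out of %voffset by
// splitBufferOffsets().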
6212
6213 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
6214 /// vector with s16 typed elements.
6215 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
6216 SmallVectorImpl<Register> &PackedAddrs,
6217 unsigned ArgOffset,
6218 const AMDGPU::ImageDimIntrinsicInfo *Intr,
6219 bool IsA16, bool IsG16) {
6220 const LLT S16 = LLT::scalar(16);
6221 const LLT V2S16 = LLT::fixed_vector(2, 16);
6222 auto EndIdx = Intr->VAddrEnd;
6223
6224 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6225 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6226 if (!SrcOp.isReg())
6227 continue; // _L to _LZ may have eliminated this.
6228
6229 Register AddrReg = SrcOp.getReg();
6230
6231 if ((I < Intr->GradientStart) ||
6232 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6233 (I >= Intr->CoordStart && !IsA16)) {
6234 if ((I < Intr->GradientStart) && IsA16 &&
6235 (B.getMRI()->getType(AddrReg) == S16)) {
6236 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6237 // Special handling of bias when A16 is on. Bias is of type half but
6238 // occupies a full 32-bit dword.
6239 PackedAddrs.push_back(
6240 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6241 .getReg(0));
6242 } else {
6243 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6244 "Bias needs to be converted to 16 bit in A16 mode");
6245 // Handle any gradient or coordinate operands that should not be packed
6246 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
6247 PackedAddrs.push_back(AddrReg);
6248 }
6249 } else {
6250 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
6251 // derivatives dx/dh and dx/dv are packed with undef.
6252 if (((I + 1) >= EndIdx) ||
6253 ((Intr->NumGradients / 2) % 2 == 1 &&
6254 (I == static_cast<unsigned>(Intr->GradientStart +
6255 (Intr->NumGradients / 2) - 1) ||
6256 I == static_cast<unsigned>(Intr->GradientStart +
6257 Intr->NumGradients - 1))) ||
6258 // Check for _L to _LZ optimization
6259 !MI.getOperand(ArgOffset + I + 1).isReg()) {
6260 PackedAddrs.push_back(
6261 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6262 .getReg(0));
6263 } else {
6264 PackedAddrs.push_back(
6265 B.buildBuildVector(
6266 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
6267 .getReg(0));
6268 ++I;
6269 }
6270 }
6271 }
6272 }
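
// Example for packImage16bitOpsToDwords() above (illustrative sketch): with
// A16, s16 coordinates are packed pairwise into dword-sized <2 x s16>
// operands, and a trailing odd coordinate is padded with undef:
//   %uv:_(<2 x s16>)   = G_BUILD_VECTOR %u:_(s16), %v:_(s16)
//   %last:_(<2 x s16>) = G_BUILD_VECTOR %w:_(s16), %undef:_(s16)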
6273
6274 /// Convert from separate vaddr components to a single vector address register,
6275 /// and replace the remaining operands with $noreg.
6276 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
6277 int DimIdx, int NumVAddrs) {
6278 const LLT S32 = LLT::scalar(32);
6279 (void)S32;
6280 SmallVector<Register, 8> AddrRegs;
6281 for (int I = 0; I != NumVAddrs; ++I) {
6282 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6283 if (SrcOp.isReg()) {
6284 AddrRegs.push_back(SrcOp.getReg());
6285 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6286 }
6287 }
6288
6289 int NumAddrRegs = AddrRegs.size();
6290 if (NumAddrRegs != 1) {
6291 auto VAddr =
6292 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
6293 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
6294 }
6295
6296 for (int I = 1; I != NumVAddrs; ++I) {
6297 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6298 if (SrcOp.isReg())
6299 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
6300 }
6301 }
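
// Example for convertImageAddrToPacked() above (illustrative sketch): three
// s32 address operands %x, %y, %z collapse into a single vector operand and
// the now-unused operands are cleared to $noreg:
//   %vaddr:_(<3 x s32>) = G_BUILD_VECTOR %x:_(s32), %y:_(s32), %z:_(s32)
//   ... = image intrinsic ..., %vaddr, $noreg, $noreg, ...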
6302
6303 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
6304 ///
6305 /// Depending on the subtarget, load/store with 16-bit element data need to be
6306 /// rewritten to use the low half of 32-bit registers, or directly use a packed
6307 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
6308 /// registers.
6309 ///
6310 /// We don't want to directly select image instructions just yet, but also want
6311 /// to expose all register repacking to the legalizer/combiners. We also don't
6312 /// want a selected instruction entering RegBankSelect. In order to avoid
6313 /// defining a multitude of intermediate image instructions, directly hack on
6314 /// the intrinsic's arguments. In cases like a16 addresses, this requires
6315 /// padding now unnecessary arguments with $noreg.
6316 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
6317 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
6318 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
6319
6320 const MachineFunction &MF = *MI.getMF();
6321 const unsigned NumDefs = MI.getNumExplicitDefs();
6322 const unsigned ArgOffset = NumDefs + 1;
6323 bool IsTFE = NumDefs == 2;
6324 // We are only processing the operands of d16 image operations on subtargets
6325 // that use the unpacked register layout, or need to repack the TFE result.
6326
6327 // TODO: Do we need to guard against already legalized intrinsics?
6328 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6329 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
6330
6331 MachineRegisterInfo *MRI = B.getMRI();
6332 const LLT S32 = LLT::scalar(32);
6333 const LLT S16 = LLT::scalar(16);
6334 const LLT V2S16 = LLT::fixed_vector(2, 16);
6335
6336 unsigned DMask = 0;
6337 Register VData;
6338 LLT Ty;
6339
6340 if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
6341 VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
6342 Ty = MRI->getType(VData);
6343 }
6344
6345 const bool IsAtomicPacked16Bit =
6346 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6347 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6348
6349 // Check for 16 bit addresses and pack if true.
6350 LLT GradTy =
6351 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
6352 LLT AddrTy =
6353 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
6354 const bool IsG16 =
6355 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
6356 const bool IsA16 = AddrTy == S16;
6357 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
6358
6359 int DMaskLanes = 0;
6360 if (!BaseOpcode->Atomic) {
6361 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
6362 if (BaseOpcode->Gather4) {
6363 DMaskLanes = 4;
6364 } else if (DMask != 0) {
6365 DMaskLanes = llvm::popcount(DMask);
6366 } else if (!IsTFE && !BaseOpcode->Store) {
6367 // If dmask is 0, this is a no-op load. This can be eliminated.
6368 B.buildUndef(MI.getOperand(0));
6369 MI.eraseFromParent();
6370 return true;
6371 }
6372 }
6373
6374 Observer.changingInstr(MI);
6375 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
6376
6377 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
6378 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
6379 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
6380 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6381 unsigned NewOpcode = LoadOpcode;
6382 if (BaseOpcode->Store)
6383 NewOpcode = StoreOpcode;
6384 else if (BaseOpcode->NoReturn)
6385 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
6386
6387 // Track that we legalized this
6388 MI.setDesc(B.getTII().get(NewOpcode));
6389
6390 // Expecting to get an error flag since TFE is on and dmask is 0. Force
6391 // dmask to be at least 1, otherwise the instruction will fail.
6392 if (IsTFE && DMask == 0) {
6393 DMask = 0x1;
6394 DMaskLanes = 1;
6395 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
6396 }
6397
6398 if (BaseOpcode->Atomic) {
6399 Register VData0 = MI.getOperand(2).getReg();
6400 LLT Ty = MRI->getType(VData0);
6401
6402 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
6403 if (Ty.isVector() && !IsAtomicPacked16Bit)
6404 return false;
6405
6406 if (BaseOpcode->AtomicX2) {
6407 Register VData1 = MI.getOperand(3).getReg();
6408 // The two values are packed in one register.
6409 LLT PackedTy = LLT::fixed_vector(2, Ty);
6410 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
6411 MI.getOperand(2).setReg(Concat.getReg(0));
6412 MI.getOperand(3).setReg(AMDGPU::NoRegister);
6413 }
6414 }
6415
6416 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
6417
6418 // Rewrite the addressing register layout before doing anything else.
6419 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6420 // 16 bit gradients are supported, but are tied to the A16 control
6421 // so both gradients and addresses must be 16 bit
6422 return false;
6423 }
6424
6425 if (IsA16 && !ST.hasA16()) {
6426 // A16 not supported
6427 return false;
6428 }
6429
6430 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
6431 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
6432
6433 if (IsA16 || IsG16) {
6434 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
6435 // instructions expect VGPR_32
6436 SmallVector<Register, 4> PackedRegs;
6437
6438 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
6439
6440 // See also below in the non-a16 branch
6441 const bool UseNSA = ST.hasNSAEncoding() &&
6442 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
6443 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
6444 const bool UsePartialNSA =
6445 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
6446
6447 if (UsePartialNSA) {
6448 // Pack registers that would go over NSAMaxSize into last VAddr register
6449 LLT PackedAddrTy =
6450 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
6451 auto Concat = B.buildConcatVectors(
6452 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
6453 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
6454 PackedRegs.resize(NSAMaxSize);
6455 } else if (!UseNSA && PackedRegs.size() > 1) {
6456 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
6457 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
6458 PackedRegs[0] = Concat.getReg(0);
6459 PackedRegs.resize(1);
6460 }
6461
6462 const unsigned NumPacked = PackedRegs.size();
6463 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6464 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6465 if (!SrcOp.isReg()) {
6466 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
6467 continue;
6468 }
6469
6470 assert(SrcOp.getReg() != AMDGPU::NoRegister);
6471
6472 if (I - Intr->VAddrStart < NumPacked)
6473 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
6474 else
6475 SrcOp.setReg(AMDGPU::NoRegister);
6476 }
6477 } else {
6478 // If the register allocator cannot place the address registers contiguously
6479 // without introducing moves, then using the non-sequential address encoding
6480 // is always preferable, since it saves VALU instructions and is usually a
6481 // wash in terms of code size or even better.
6482 //
6483 // However, we currently have no way of hinting to the register allocator
6484 // that MIMG addresses should be placed contiguously when it is possible to
6485 // do so, so force non-NSA for the common 2-address case as a heuristic.
6486 //
6487 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
6488 // allocation when possible.
6489 //
6490 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
6491 // set of the remaining addresses.
6492 const bool UseNSA = ST.hasNSAEncoding() &&
6493 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
6494 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
6495 const bool UsePartialNSA =
6496 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
6497
6498 if (UsePartialNSA) {
6499 convertImageAddrToPacked(B, MI,
6500 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
6501 Intr->NumVAddrs - NSAMaxSize + 1);
6502 } else if (!UseNSA && Intr->NumVAddrs > 1) {
6503 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
6504 Intr->NumVAddrs);
6505 }
6506 }
6507
6508 int Flags = 0;
6509 if (IsA16)
6510 Flags |= 1;
6511 if (IsG16)
6512 Flags |= 2;
6513 MI.addOperand(MachineOperand::CreateImm(Flags));
6514
6515 if (BaseOpcode->NoReturn) { // No TFE for stores?
6516 // TODO: Handle dmask trim
6517 if (!Ty.isVector() || !IsD16)
6518 return true;
6519
6520 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
6521 if (RepackedReg != VData) {
6522 MI.getOperand(1).setReg(RepackedReg);
6523 }
6524
6525 return true;
6526 }
6527
6528 Register DstReg = MI.getOperand(0).getReg();
6529 const LLT EltTy = Ty.getScalarType();
6530 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
6531
6532 // Confirm that the return type is large enough for the dmask specified
6533 if (NumElts < DMaskLanes)
6534 return false;
6535
6536 if (NumElts > 4 || DMaskLanes > 4)
6537 return false;
6538
6539 // Image atomic instructions use DMask to specify how many bits the
6540 // input/output data will have: 32 bits (s32, v2s16) or 64 bits (s64, v4s16).
6541 // DMaskLanes for image atomics has a default value of 0.
6542 // We must be sure that atomic variants (especially packed) will not be
6543 // truncated from v2s16 or v4s16 to s16 type.
6544 //
6545 // ChangeElementCount will be needed for image load where Ty is always scalar.
6546 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
6547 const LLT AdjustedTy =
6548 DMaskLanes == 0
6549 ? Ty
6550 : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
6551
6552 // The raw dword aligned data component of the load. The only legal cases
6553 // where this matters should be when using the packed D16 format, for
6554 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
6555 LLT RoundedTy;
6556
6557 // S32 vector to cover all data, plus TFE result element.
6558 LLT TFETy;
6559
6560 // Register type to use for each loaded component. Will be S32 or V2S16.
6561 LLT RegTy;
6562
6563 if (IsD16 && ST.hasUnpackedD16VMem()) {
6564 RoundedTy =
6565 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
6566 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
6567 RegTy = S32;
6568 } else {
6569 unsigned EltSize = EltTy.getSizeInBits();
6570 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
6571 unsigned RoundedSize = 32 * RoundedElts;
6572 RoundedTy = LLT::scalarOrVector(
6573 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
6574 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
6575 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
6576 }
6577
6578 // The return type does not need adjustment.
6579 // TODO: Should we change s16 case to s32 or <2 x s16>?
6580 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
6581 return true;
6582
6583 Register Dst1Reg;
6584
6585 // Insert after the instruction.
6586 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
6587
6588 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
6589 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
6590 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
6591 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
6592
6593 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
6594
6595 MI.getOperand(0).setReg(NewResultReg);
6596
6597 // In the IR, TFE is supposed to be used with a 2 element struct return
6598 // type. The instruction really returns these two values in one contiguous
6599 // register, with one additional dword beyond the loaded data. Rewrite the
6600 // return type to use a single register result.
6601
6602 if (IsTFE) {
6603 Dst1Reg = MI.getOperand(1).getReg();
6604 if (MRI->getType(Dst1Reg) != S32)
6605 return false;
6606
6607 // TODO: Make sure the TFE operand bit is set.
6608 MI.removeOperand(1);
6609
6610 // Handle the easy case that requires no repack instructions.
6611 if (Ty == S32) {
6612 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
6613 return true;
6614 }
6615 }
6616
6617 // Now figure out how to copy the new result register back into the old
6618 // result.
6619 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
6620
6621 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
6622
6623 if (ResultNumRegs == 1) {
6624 assert(!IsTFE);
6625 ResultRegs[0] = NewResultReg;
6626 } else {
6627 // We have to repack into a new vector of some kind.
6628 for (int I = 0; I != NumDataRegs; ++I)
6629 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
6630 B.buildUnmerge(ResultRegs, NewResultReg);
6631
6632 // Drop the final TFE element to get the data part. The TFE result is
6633 // directly written to the right place already.
6634 if (IsTFE)
6635 ResultRegs.resize(NumDataRegs);
6636 }
6637
6638 // For an s16 scalar result, we form an s32 result with a truncate regardless
6639 // of packed vs. unpacked.
6640 if (IsD16 && !Ty.isVector()) {
6641 B.buildTrunc(DstReg, ResultRegs[0]);
6642 return true;
6643 }
6644
6645 // Avoid a build/concat_vector of 1 entry.
6646 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
6647 B.buildBitcast(DstReg, ResultRegs[0]);
6648 return true;
6649 }
6650
6651 assert(Ty.isVector());
6652
6653 if (IsD16) {
6654 // For packed D16 results with TFE enabled, all the data components are
6655 // S32. Cast back to the expected type.
6656 //
6657 // TODO: We don't really need to load s32 elements. We would only need one
6658 // cast for the TFE result if a multiple of v2s16 was used.
6659 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
6660 for (Register &Reg : ResultRegs)
6661 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
6662 } else if (ST.hasUnpackedD16VMem()) {
6663 for (Register &Reg : ResultRegs)
6664 Reg = B.buildTrunc(S16, Reg).getReg(0);
6665 }
6666 }
6667
6668 auto padWithUndef = [&](LLT Ty, int NumElts) {
6669 if (NumElts == 0)
6670 return;
6671 Register Undef = B.buildUndef(Ty).getReg(0);
6672 for (int I = 0; I != NumElts; ++I)
6673 ResultRegs.push_back(Undef);
6674 };
6675
6676 // Pad out any elements eliminated due to the dmask.
6677 LLT ResTy = MRI->getType(ResultRegs[0]);
6678 if (!ResTy.isVector()) {
6679 padWithUndef(ResTy, NumElts - ResultRegs.size());
6680 B.buildBuildVector(DstReg, ResultRegs);
6681 return true;
6682 }
6683
6684 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
6685 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
6686
6687 // Deal with the one annoying legal case.
6688 const LLT V3S16 = LLT::fixed_vector(3, 16);
6689 if (Ty == V3S16) {
6690 if (IsTFE) {
6691 if (ResultRegs.size() == 1) {
6692 NewResultReg = ResultRegs[0];
6693 } else if (ResultRegs.size() == 2) {
6694 LLT V4S16 = LLT::fixed_vector(4, 16);
6695 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
6696 } else {
6697 return false;
6698 }
6699 }
6700
6701 if (MRI->getType(DstReg).getNumElements() <
6702 MRI->getType(NewResultReg).getNumElements()) {
6703 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
6704 } else {
6705 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
6706 }
6707 return true;
6708 }
6709
6710 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
6711 B.buildConcatVectors(DstReg, ResultRegs);
6712 return true;
6713 }
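
// Example of the TFE handling in legalizeImageIntrinsic() above (illustrative
// sketch, not exact MIR): a two-result load such as
//   %data:_(<2 x s32>), %err:_(s32) = image load intrinsic ...
// is rewritten to produce one contiguous result register, which is then split
// back apart:
//   %wide:_(<3 x s32>) = G_AMDGPU_INTRIN_IMAGE_LOAD ...
//   %d0:_(s32), %d1:_(s32), %err:_(s32) = G_UNMERGE_VALUES %wide
//   %data:_(<2 x s32>) = G_BUILD_VECTOR %d0:_(s32), %d1:_(s32)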
6714
6715 bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
6716 MachineInstr &MI) const {
6717 MachineIRBuilder &B = Helper.MIRBuilder;
6718 GISelChangeObserver &Observer = Helper.Observer;
6719
6720 Register OrigDst = MI.getOperand(0).getReg();
6721 Register Dst;
6722 LLT Ty = B.getMRI()->getType(OrigDst);
6723 unsigned Size = Ty.getSizeInBits();
6724 MachineFunction &MF = B.getMF();
6725 unsigned Opc = 0;
6726 if (Size < 32 && ST.hasScalarSubwordLoads()) {
6727 assert(Size == 8 || Size == 16);
6728 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
6729 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
6730 // The 8-bit and 16-bit scalar buffer load instructions have a 32-bit
6731 // destination register.
6732 Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
6733 } else {
6734 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
6735 Dst = OrigDst;
6736 }
6737
6738 Observer.changingInstr(MI);
6739
6740 // Handle needing to s.buffer.load() a p8 value.
6741 if (hasBufferRsrcWorkaround(Ty)) {
6742 Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
6743 B.setInsertPt(B.getMBB(), MI);
6744 }
6745 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
6746 Ty = getBitcastRegisterType(Ty);
6747 Helper.bitcastDst(MI, Ty, 0);
6748 B.setInsertPt(B.getMBB(), MI);
6749 }
6750
6751 // FIXME: We don't really need this intermediate instruction. The intrinsic
6752 // should be fixed to have a memory operand. Since it's readnone, we're not
6753 // allowed to add one.
6754 MI.setDesc(B.getTII().get(Opc));
6755 MI.removeOperand(1); // Remove intrinsic ID
6756
6757 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
6758 const unsigned MemSize = (Size + 7) / 8;
6759 const Align MemAlign = B.getDataLayout().getABITypeAlign(
6760 getTypeForLLT(Ty, MF.getFunction().getContext()));
6761 MachineMemOperand *MMO = MF.getMachineMemOperand(
6762 MachinePointerInfo(),
6763 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6764 MachineMemOperand::MOInvariant,
6765 MemSize, MemAlign);
6766 MI.addMemOperand(MF, MMO);
6767 if (Dst != OrigDst) {
6768 MI.getOperand(0).setReg(Dst);
6769 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6770 B.buildTrunc(OrigDst, Dst);
6771 }
6772
6773 // If we don't have 96-bit result scalar loads, widening to 128-bit should
6774 // always be legal. We may need to restore this to a 96-bit result if it turns
6775 // out this needs to be converted to a vector load during RegBankSelect.
6776 if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
6777 if (Ty.isVector())
6778 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
6779 else
6780 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
6781 }
6782
6783 Observer.changedInstr(MI);
6784 return true;
6785 }
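
// Example for legalizeSBufferLoad() above (illustrative sketch): on
// subtargets with scalar sub-dword loads, an s8 result is loaded into a
// 32-bit destination and truncated back to the original width:
//   %wide:_(s32) = G_AMDGPU_S_BUFFER_LOAD_UBYTE %rsrc, %offset, <cachepolicy>
//   %res:_(s8)   = G_TRUNC %wide:_(s32)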
6786
6787 // TODO: Move to selection
6788 bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
6789 MachineRegisterInfo &MRI,
6790 MachineIRBuilder &B) const {
6791 if (!ST.isTrapHandlerEnabled() ||
6792 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
6793 return legalizeTrapEndpgm(MI, MRI, B);
6794
6795 return ST.supportsGetDoorbellID() ?
6796 legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
6797 }
6798
6799 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
6800 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6801 const DebugLoc &DL = MI.getDebugLoc();
6802 MachineBasicBlock &BB = B.getMBB();
6803 MachineFunction *MF = BB.getParent();
6804
6805 if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
6806 BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6807 .addImm(0);
6808 MI.eraseFromParent();
6809 return true;
6810 }
6811
6812 // We need a block split to make the real endpgm a terminator. We also don't
6813 // want to break phis in successor blocks, so we can't just delete to the
6814 // end of the block.
6815 BB.splitAt(MI, false /*UpdateLiveIns*/);
6816 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6817 MF->push_back(TrapBB);
6818 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6819 .addImm(0);
6820 BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
6821 .addMBB(TrapBB);
6822
6823 BB.addSuccessor(TrapBB);
6824 MI.eraseFromParent();
6825 return true;
6826 }
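
// Sketch of the CFG produced by legalizeTrapEndpgm() above when the trap is
// not already at the end of its block (illustrative):
//   bb.current:
//     ...
//     S_CBRANCH_EXECNZ %bb.trap
//     (fall through to the block split off after the trap)
//   bb.trap:
//     S_ENDPGM 0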
6827
6828 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
6829 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6830 MachineFunction &MF = B.getMF();
6831 const LLT S64 = LLT::scalar(64);
6832
6833 Register SGPR01(AMDGPU::SGPR0_SGPR1);
6834 // For code object version 5, queue_ptr is passed through implicit kernarg.
6835 if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
6836 AMDGPU::AMDHSA_COV5) {
6837 AMDGPUTargetLowering::ImplicitParameter Param =
6838 AMDGPUTargetLowering::QUEUE_PTR;
6839 uint64_t Offset =
6840 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
6841
6842 Register KernargPtrReg = MRI.createGenericVirtualRegister(
6843 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6844
6845 if (!loadInputValue(KernargPtrReg, B,
6846 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
6847 return false;
6848
6849 // TODO: can we be smarter about machine pointer info?
6850 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
6851 MachineMemOperand *MMO = MF.getMachineMemOperand(
6852 PtrInfo,
6853 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6854 MachineMemOperand::MOInvariant,
6855 LLT::scalar(64), commonAlignment(Align(64), Offset));
6856
6857 // Pointer address
6858 Register LoadAddr = MRI.createGenericVirtualRegister(
6859 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6860 B.buildPtrAdd(LoadAddr, KernargPtrReg,
6861 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
6862 // Load address
6863 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
6864 B.buildCopy(SGPR01, Temp);
6865 B.buildInstr(AMDGPU::S_TRAP)
6866 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6867 .addReg(SGPR01, RegState::Implicit);
6868 MI.eraseFromParent();
6869 return true;
6870 }
6871
6872 // Pass queue pointer to trap handler as input, and insert trap instruction
6873 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
6874 Register LiveIn =
6875 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6876 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
6877 return false;
6878
6879 B.buildCopy(SGPR01, LiveIn);
6880 B.buildInstr(AMDGPU::S_TRAP)
6881 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6882 .addReg(SGPR01, RegState::Implicit);
6883
6884 MI.eraseFromParent();
6885 return true;
6886 }
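
// Sketch of the code object v5 path in legalizeTrapHsaQueuePtr() above
// (illustrative, not exact MIR): the queue pointer is loaded from the
// implicit kernarg block and passed to the trap handler in SGPR0_SGPR1:
//   %kernarg:_(p4) = <KERNARG_SEGMENT_PTR input>
//   %addr:_(p4)    = G_PTR_ADD %kernarg, <QUEUE_PTR offset>
//   %qptr:_(s64)   = G_LOAD %addr :: (invariant load (s64))
//   $sgpr0_sgpr1   = COPY %qptr:_(s64)
//   S_TRAP <TrapID::LLVMAMDHSATrap>, implicit $sgpr0_sgpr1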
6887
6888 bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
6889 MachineRegisterInfo &MRI,
6890 MachineIRBuilder &B) const {
6891 // We need to simulate the 's_trap 2' instruction on targets that run in
6892 // PRIV=1 (where it is treated as a nop).
6893 if (ST.hasPrivEnabledTrap2NopBug()) {
6894 ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
6895 MI.getDebugLoc());
6896 MI.eraseFromParent();
6897 return true;
6898 }
6899
6900 B.buildInstr(AMDGPU::S_TRAP)
6901 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
6902 MI.eraseFromParent();
6903 return true;
6904 }
6905
6906 bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
6907 MachineRegisterInfo &MRI,
6908 MachineIRBuilder &B) const {
6909 // If this is a non-HSA path or the trap handler is disabled, report a
6910 // warning accordingly.
6911 if (!ST.isTrapHandlerEnabled() ||
6912 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
6913 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
6914 "debugtrap handler not supported",
6915 MI.getDebugLoc(), DS_Warning);
6916 LLVMContext &Ctx = B.getMF().getFunction().getContext();
6917 Ctx.diagnose(NoTrap);
6918 } else {
6919 // Insert debug-trap instruction
6920 B.buildInstr(AMDGPU::S_TRAP)
6921 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
6922 }
6923
6924 MI.eraseFromParent();
6925 return true;
6926 }
6927
6928 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
6929 MachineIRBuilder &B) const {
6930 MachineRegisterInfo &MRI = *B.getMRI();
6931 const LLT S16 = LLT::scalar(16);
6932 const LLT S32 = LLT::scalar(32);
6933 const LLT V2S16 = LLT::fixed_vector(2, 16);
6934 const LLT V3S32 = LLT::fixed_vector(3, 32);
6935
6936 Register DstReg = MI.getOperand(0).getReg();
6937 Register NodePtr = MI.getOperand(2).getReg();
6938 Register RayExtent = MI.getOperand(3).getReg();
6939 Register RayOrigin = MI.getOperand(4).getReg();
6940 Register RayDir = MI.getOperand(5).getReg();
6941 Register RayInvDir = MI.getOperand(6).getReg();
6942 Register TDescr = MI.getOperand(7).getReg();
6943
6944 if (!ST.hasGFX10_AEncoding()) {
6945 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
6946 "intrinsic not supported on subtarget",
6947 MI.getDebugLoc());
6948 B.getMF().getFunction().getContext().diagnose(BadIntrin);
6949 return false;
6950 }
6951
6952 const bool IsGFX11 = AMDGPU::isGFX11(ST);
6953 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
6954 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
6955 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
6956 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
6957 const unsigned NumVDataDwords = 4;
6958 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
6959 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
6960 const bool UseNSA =
6961 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
6962
6963 const unsigned BaseOpcodes[2][2] = {
6964 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
6965 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
6966 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
6967 int Opcode;
6968 if (UseNSA) {
6969 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
6970 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
6971 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
6972 : AMDGPU::MIMGEncGfx10NSA,
6973 NumVDataDwords, NumVAddrDwords);
6974 } else {
6975 assert(!IsGFX12Plus);
6976 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
6977 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
6978 : AMDGPU::MIMGEncGfx10Default,
6979 NumVDataDwords, NumVAddrDwords);
6980 }
6981 assert(Opcode != -1);
6982
6983 SmallVector<Register, 12> Ops;
6984 if (UseNSA && IsGFX11Plus) {
6985 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
6986 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
6987 auto Merged = B.buildMergeLikeInstr(
6988 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
6989 Ops.push_back(Merged.getReg(0));
6990 };
6991
6992 Ops.push_back(NodePtr);
6993 Ops.push_back(RayExtent);
6994 packLanes(RayOrigin);
6995
6996 if (IsA16) {
6997 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
6998 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
6999 auto MergedDir = B.buildMergeLikeInstr(
7000 V3S32,
7001 {B.buildBitcast(
7002 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
7003 UnmergeRayDir.getReg(0)}))
7004 .getReg(0),
7005 B.buildBitcast(
7006 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
7007 UnmergeRayDir.getReg(1)}))
7008 .getReg(0),
7009 B.buildBitcast(
7010 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
7011 UnmergeRayDir.getReg(2)}))
7012 .getReg(0)});
7013 Ops.push_back(MergedDir.getReg(0));
7014 } else {
7015 packLanes(RayDir);
7016 packLanes(RayInvDir);
7017 }
7018 } else {
7019 if (Is64) {
7020 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
7021 Ops.push_back(Unmerge.getReg(0));
7022 Ops.push_back(Unmerge.getReg(1));
7023 } else {
7024 Ops.push_back(NodePtr);
7025 }
7026 Ops.push_back(RayExtent);
7027
7028 auto packLanes = [&Ops, &S32, &B](Register Src) {
7029 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7030 Ops.push_back(Unmerge.getReg(0));
7031 Ops.push_back(Unmerge.getReg(1));
7032 Ops.push_back(Unmerge.getReg(2));
7033 };
7034
7035 packLanes(RayOrigin);
7036 if (IsA16) {
7037 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7038 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7039 Register R1 = MRI.createGenericVirtualRegister(S32);
7040 Register R2 = MRI.createGenericVirtualRegister(S32);
7041 Register R3 = MRI.createGenericVirtualRegister(S32);
7042 B.buildMergeLikeInstr(R1,
7043 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7044 B.buildMergeLikeInstr(
7045 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7046 B.buildMergeLikeInstr(
7047 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7048 Ops.push_back(R1);
7049 Ops.push_back(R2);
7050 Ops.push_back(R3);
7051 } else {
7052 packLanes(RayDir);
7053 packLanes(RayInvDir);
7054 }
7055 }
7056
7057 if (!UseNSA) {
7058 // Build a single vector containing all the operands so far prepared.
7059 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
7060 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
7061 Ops.clear();
7062 Ops.push_back(MergedOps);
7063 }
7064
7065 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
7066 .addDef(DstReg)
7067 .addImm(Opcode);
7068
7069 for (Register R : Ops) {
7070 MIB.addUse(R);
7071 }
7072
7073 MIB.addUse(TDescr)
7074 .addImm(IsA16 ? 1 : 0)
7075 .cloneMemRefs(MI);
7076
7077 MI.eraseFromParent();
7078 return true;
7079 }
7080
7081 bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
7082 MachineIRBuilder &B) const {
7083 unsigned Opc;
7084 int RoundMode = MI.getOperand(2).getImm();
7085
7086 if (RoundMode == (int)RoundingMode::TowardPositive)
7087 Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
7088 else if (RoundMode == (int)RoundingMode::TowardNegative)
7089 Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
7090 else
7091 return false;
7092
7093 B.buildInstr(Opc)
7094 .addDef(MI.getOperand(0).getReg())
7095 .addUse(MI.getOperand(1).getReg());
7096
7097 MI.eraseFromParent();
7098
7099 return true;
7100 }
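
// Example for legalizeFPTruncRound() above (illustrative sketch): a
// truncating conversion with static round-to-positive-infinity becomes the
// dedicated pseudo, e.g.
//   %dst:_(s16) = G_FPTRUNC_ROUND_UPWARD %src:_(s32)
// Any other rounding mode is rejected here and reported as unable to
// legalize.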
7101
7102 bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
7103 MachineIRBuilder &B) const {
7104 const SITargetLowering *TLI = ST.getTargetLowering();
7105 Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
7106 Register DstReg = MI.getOperand(0).getReg();
7107 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7108 MI.eraseFromParent();
7109 return true;
7110 }
7111
7112 bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
7113 MachineIRBuilder &B) const {
7114 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
7115 if (!ST.hasArchitectedSGPRs())
7116 return false;
7117 LLT S32 = LLT::scalar(32);
7118 Register DstReg = MI.getOperand(0).getReg();
7119 auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
7120 auto LSB = B.buildConstant(S32, 25);
7121 auto Width = B.buildConstant(S32, 5);
7122 B.buildUbfx(DstReg, TTMP8, LSB, Width);
7123 MI.eraseFromParent();
7124 return true;
7125 }
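
// Example for legalizeWaveID() above (illustrative, not exact MIR): with
// architected SGPRs the wave ID is an unsigned bitfield extract of TTMP8,
// i.e. roughly (ttmp8 >> 25) & 0x1f:
//   %ttmp8:_(s32)  = COPY $ttmp8
//   %waveid:_(s32) = G_UBFX %ttmp8:_(s32), 25, 5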
7126
7127 static constexpr unsigned FPEnvModeBitField =
7128 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);
7129
7130 static constexpr unsigned FPEnvTrapBitField =
7131 AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);
7132
7133 bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
7134 MachineRegisterInfo &MRI,
7135 MachineIRBuilder &B) const {
7136 Register Src = MI.getOperand(0).getReg();
7137 if (MRI.getType(Src) != S64)
7138 return false;
7139
7140 auto ModeReg =
7141 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7142 /*HasSideEffects=*/true, /*isConvergent=*/false)
7143 .addImm(FPEnvModeBitField);
7144 auto TrapReg =
7145 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7146 /*HasSideEffects=*/true, /*isConvergent=*/false)
7147 .addImm(FPEnvTrapBitField);
7148 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
7149 MI.eraseFromParent();
7150 return true;
7151 }
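
// Example for legalizeGetFPEnv() above (illustrative sketch, not exact MIR):
// the s64 FP environment value is assembled from two s_getreg reads, the low
// 23 bits of the MODE register and the low 5 bits of TRAPSTS:
//   %mode:_(s32)  = s_getreg(<MODE bitfield>)
//   %traps:_(s32) = s_getreg(<TRAPSTS bitfield>)
//   %env:_(s64)   = G_MERGE_VALUES %mode:_(s32), %traps:_(s32)
// legalizeSetFPEnv() below performs the inverse: unmerge the s64 value and
// write the two fields back with s_setreg.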
7152
7153 bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
7154 MachineRegisterInfo &MRI,
7155 MachineIRBuilder &B) const {
7156 Register Src = MI.getOperand(0).getReg();
7157 if (MRI.getType(Src) != S64)
7158 return false;
7159
7160 auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
7161 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7162 /*HasSideEffects=*/true, /*isConvergent=*/false)
7163 .addImm(static_cast<int16_t>(FPEnvModeBitField))
7164 .addReg(Unmerge.getReg(0));
7165 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7166 /*HasSideEffects=*/true, /*isConvergent=*/false)
7167 .addImm(static_cast<int16_t>(FPEnvTrapBitField))
7168 .addReg(Unmerge.getReg(1));
7169 MI.eraseFromParent();
7170 return true;
7171 }
7172
7173 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
7174 MachineInstr &MI) const {
7175 MachineIRBuilder &B = Helper.MIRBuilder;
7176 MachineRegisterInfo &MRI = *B.getMRI();
7177
7178 // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
7179 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
7180 switch (IntrID) {
7181 case Intrinsic::amdgcn_if:
7182 case Intrinsic::amdgcn_else: {
7183 MachineInstr *Br = nullptr;
7184 MachineBasicBlock *UncondBrTarget = nullptr;
7185 bool Negated = false;
7186 if (MachineInstr *BrCond =
7187 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7188 const SIRegisterInfo *TRI
7189 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7190
7191 Register Def = MI.getOperand(1).getReg();
7192 Register Use = MI.getOperand(3).getReg();
7193
7194 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7195
7196 if (Negated)
7197 std::swap(CondBrTarget, UncondBrTarget);
7198
7199 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7200 if (IntrID == Intrinsic::amdgcn_if) {
7201 B.buildInstr(AMDGPU::SI_IF)
7202 .addDef(Def)
7203 .addUse(Use)
7204 .addMBB(UncondBrTarget);
7205 } else {
7206 B.buildInstr(AMDGPU::SI_ELSE)
7207 .addDef(Def)
7208 .addUse(Use)
7209 .addMBB(UncondBrTarget);
7210 }
7211
7212 if (Br) {
7213 Br->getOperand(0).setMBB(CondBrTarget);
7214 } else {
7215 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
7216 // since we're swapping branch targets it needs to be reinserted.
7217 // FIXME: IRTranslator should probably not do this
7218 B.buildBr(*CondBrTarget);
7219 }
7220
7221 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
7222 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
7223 MI.eraseFromParent();
7224 BrCond->eraseFromParent();
7225 return true;
7226 }
7227
7228 return false;
7229 }
7230 case Intrinsic::amdgcn_loop: {
7231 MachineInstr *Br = nullptr;
7232 MachineBasicBlock *UncondBrTarget = nullptr;
7233 bool Negated = false;
7234 if (MachineInstr *BrCond =
7235 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7236 const SIRegisterInfo *TRI
7237 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7238
7239 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7240 Register Reg = MI.getOperand(2).getReg();
7241
7242 if (Negated)
7243 std::swap(CondBrTarget, UncondBrTarget);
7244
7245 B.setInsertPt(B.getMBB(), BrCond->getIterator());
7246 B.buildInstr(AMDGPU::SI_LOOP)
7247 .addUse(Reg)
7248 .addMBB(UncondBrTarget);
7249
7250 if (Br)
7251 Br->getOperand(0).setMBB(CondBrTarget);
7252 else
7253 B.buildBr(*CondBrTarget);
7254
7255 MI.eraseFromParent();
7256 BrCond->eraseFromParent();
7257 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
7258 return true;
7259 }
7260
7261 return false;
7262 }
7263 case Intrinsic::amdgcn_addrspacecast_nonnull:
7264 return legalizeAddrSpaceCast(MI, MRI, B);
7265 case Intrinsic::amdgcn_make_buffer_rsrc:
7266 return legalizePointerAsRsrcIntrin(MI, MRI, B);
7267 case Intrinsic::amdgcn_kernarg_segment_ptr:
7268 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
7269 // This only makes sense to call in a kernel, so just lower to null.
7270 B.buildConstant(MI.getOperand(0).getReg(), 0);
7271 MI.eraseFromParent();
7272 return true;
7273 }
7274
7275 return legalizePreloadedArgIntrin(
7276 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
7277 case Intrinsic::amdgcn_implicitarg_ptr:
7278 return legalizeImplicitArgPtr(MI, MRI, B);
7279 case Intrinsic::amdgcn_workitem_id_x:
7280 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
7281 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
7282 case Intrinsic::amdgcn_workitem_id_y:
7283 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
7284 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
7285 case Intrinsic::amdgcn_workitem_id_z:
7286 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
7287 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
7288 case Intrinsic::amdgcn_workgroup_id_x:
7289 return legalizePreloadedArgIntrin(MI, MRI, B,
7290 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
7291 case Intrinsic::amdgcn_workgroup_id_y:
7292 return legalizePreloadedArgIntrin(MI, MRI, B,
7293 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
7294 case Intrinsic::amdgcn_workgroup_id_z:
7295 return legalizePreloadedArgIntrin(MI, MRI, B,
7296 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
7297 case Intrinsic::amdgcn_wave_id:
7298 return legalizeWaveID(MI, B);
7299 case Intrinsic::amdgcn_lds_kernel_id:
7300 return legalizePreloadedArgIntrin(MI, MRI, B,
7301 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
7302 case Intrinsic::amdgcn_dispatch_ptr:
7303 return legalizePreloadedArgIntrin(MI, MRI, B,
7304 AMDGPUFunctionArgInfo::DISPATCH_PTR);
7305 case Intrinsic::amdgcn_queue_ptr:
7306 return legalizePreloadedArgIntrin(MI, MRI, B,
7307 AMDGPUFunctionArgInfo::QUEUE_PTR);
7308 case Intrinsic::amdgcn_implicit_buffer_ptr:
7309 return legalizePreloadedArgIntrin(
7310 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
7311 case Intrinsic::amdgcn_dispatch_id:
7312 return legalizePreloadedArgIntrin(MI, MRI, B,
7313 AMDGPUFunctionArgInfo::DISPATCH_ID);
7314 case Intrinsic::r600_read_ngroups_x:
7315 // TODO: Emit error for hsa
7316 return legalizeKernargMemParameter(MI, B,
7317 SI::KernelInputOffsets::NGROUPS_X);
7318 case Intrinsic::r600_read_ngroups_y:
7319 return legalizeKernargMemParameter(MI, B,
7320 SI::KernelInputOffsets::NGROUPS_Y);
7321 case Intrinsic::r600_read_ngroups_z:
7322 return legalizeKernargMemParameter(MI, B,
7323 SI::KernelInputOffsets::NGROUPS_Z);
7324 case Intrinsic::r600_read_local_size_x:
7325 // TODO: Could insert G_ASSERT_ZEXT from s16
7326 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
7327 case Intrinsic::r600_read_local_size_y:
7328 // TODO: Could insert G_ASSERT_ZEXT from s16
7329 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
7330 // TODO: Could insert G_ASSERT_ZEXT from s16
7331 case Intrinsic::r600_read_local_size_z:
7332 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
7333 case Intrinsic::r600_read_global_size_x:
7334 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
7335 case Intrinsic::r600_read_global_size_y:
7336 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
7337 case Intrinsic::r600_read_global_size_z:
7338 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
7339 case Intrinsic::amdgcn_fdiv_fast:
7340 return legalizeFDIVFastIntrin(MI, MRI, B);
7341 case Intrinsic::amdgcn_is_shared:
7342 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
7343 case Intrinsic::amdgcn_is_private:
7344 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
7345 case Intrinsic::amdgcn_wavefrontsize: {
7346 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
7347 MI.eraseFromParent();
7348 return true;
7349 }
7350 case Intrinsic::amdgcn_s_buffer_load:
7351 return legalizeSBufferLoad(Helper, MI);
7352 case Intrinsic::amdgcn_raw_buffer_store:
7353 case Intrinsic::amdgcn_raw_ptr_buffer_store:
7354 case Intrinsic::amdgcn_struct_buffer_store:
7355 case Intrinsic::amdgcn_struct_ptr_buffer_store:
7356 return legalizeBufferStore(MI, MRI, B, false, false);
7357 case Intrinsic::amdgcn_raw_buffer_store_format:
7358 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
7359 case Intrinsic::amdgcn_struct_buffer_store_format:
7360 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
7361 return legalizeBufferStore(MI, MRI, B, false, true);
7362 case Intrinsic::amdgcn_raw_tbuffer_store:
7363 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
7364 case Intrinsic::amdgcn_struct_tbuffer_store:
7365 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
7366 return legalizeBufferStore(MI, MRI, B, true, true);
7367 case Intrinsic::amdgcn_raw_buffer_load:
7368 case Intrinsic::amdgcn_raw_ptr_buffer_load:
7369 case Intrinsic::amdgcn_raw_atomic_buffer_load:
7370 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
7371 case Intrinsic::amdgcn_struct_buffer_load:
7372 case Intrinsic::amdgcn_struct_ptr_buffer_load:
7373 return legalizeBufferLoad(MI, MRI, B, false, false);
7374 case Intrinsic::amdgcn_raw_buffer_load_format:
7375 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
7376 case Intrinsic::amdgcn_struct_buffer_load_format:
7377 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
7378 return legalizeBufferLoad(MI, MRI, B, true, false);
7379 case Intrinsic::amdgcn_raw_tbuffer_load:
7380 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
7381 case Intrinsic::amdgcn_struct_tbuffer_load:
7382 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
7383 return legalizeBufferLoad(MI, MRI, B, true, true);
7384 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
7385 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
7386 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
7387 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
7388 case Intrinsic::amdgcn_raw_buffer_atomic_add:
7389 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
7390 case Intrinsic::amdgcn_struct_buffer_atomic_add:
7391 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
7392 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
7393 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
7394 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
7395 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
7396 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
7397 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
7398 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
7399 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
7400 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
7401 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
7402 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
7403 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
7404 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
7405 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
7406 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
7407 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
7408 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
7409 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
7410 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
7411 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
7412 case Intrinsic::amdgcn_raw_buffer_atomic_and:
7413 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
7414 case Intrinsic::amdgcn_struct_buffer_atomic_and:
7415 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
7416 case Intrinsic::amdgcn_raw_buffer_atomic_or:
7417 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
7418 case Intrinsic::amdgcn_struct_buffer_atomic_or:
7419 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
7420 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
7421 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
7422 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
7423 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
7424 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
7425 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
7426 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
7427 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
7428 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
7429 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
7430 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
7431 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
7432 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
7433 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
7434 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
7435 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
7436 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
7437 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
7438 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
7439 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
7440 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
7441 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
7442 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
7443 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
7444 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
7445 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
7446 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
7447 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
7448 return legalizeBufferAtomic(MI, B, IntrID);
7449 case Intrinsic::amdgcn_rsq_clamp:
7450 return legalizeRsqClampIntrinsic(MI, MRI, B);
7451 case Intrinsic::amdgcn_image_bvh_intersect_ray:
7452 return legalizeBVHIntrinsic(MI, B);
7453 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
7454 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
7455 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
7456 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
7457 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
7458 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
7459 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
7460 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
7461 Register Index = MI.getOperand(5).getReg();
7462 LLT S32 = LLT::scalar(32);
7463 if (MRI.getType(Index) != S32)
7464 MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
7465 return true;
7466 }
7467 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
7468 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
7469 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
7470 Register Index = MI.getOperand(7).getReg();
7471 LLT S32 = LLT::scalar(32);
7472 if (MRI.getType(Index) != S32)
7473 MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
7474 return true;
7475 }
7476 case Intrinsic::amdgcn_fmed3: {
7477 GISelChangeObserver &Observer = Helper.Observer;
7478
7479 // FIXME: This is to work around the inability of tablegen match combiners to
7480 // match intrinsics in patterns.
7481 Observer.changingInstr(MI);
7482 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
7483 MI.removeOperand(1);
7484 Observer.changedInstr(MI);
7485 return true;
7486 }
7487 case Intrinsic::amdgcn_readlane:
7488 case Intrinsic::amdgcn_writelane:
7489 case Intrinsic::amdgcn_readfirstlane:
7490 case Intrinsic::amdgcn_permlane16:
7491 case Intrinsic::amdgcn_permlanex16:
7492 case Intrinsic::amdgcn_permlane64:
7493 return legalizeLaneOp(Helper, MI, IntrID);
7494 default: {
7495 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
7496 AMDGPU::getImageDimIntrinsicInfo(IntrID))
7497 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
7498 return true;
7499 }
7500 }
7501
7502 return true;
7503 }
7504