//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements the AMDGPU-specific InstCombine hooks. It uses the
// target's detailed information to fold and simplify AMDGPU intrinsic calls,
// while letting the target-independent InstCombine handle everything else.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNSubtarget.h"
#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <optional>

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

namespace {

struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace

// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
                           const APFloat &Src2) {
  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);

  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp0 == APFloat::cmpEqual)
    return maxnum(Src1, Src2);

  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp1 == APFloat::cmpEqual)
    return maxnum(Src0, Src2);

  return maxnum(Src0, Src1);
}

// Check if a value can be converted to a 16-bit value without losing
// precision.
// The value is expected to be either a float (IsFloat = true) or an unsigned
// integer (IsFloat = false).
static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
  Type *VTy = V.getType();
  if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
    // The value is already 16-bit, so we don't want to convert to 16-bit again!
    return false;
  }
  if (IsFloat) {
    if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
      // We need to check that casting the value down to half does not lose
      // precision.
      APFloat FloatValue(ConstFloat->getValueAPF());
      bool LosesInfo = true;
      FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
                         &LosesInfo);
      return !LosesInfo;
    }
  } else {
    if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
      // We need to check that casting the value down to an i16 does not lose
      // precision.
      APInt IntValue(ConstInt->getValue());
      return IntValue.getActiveBits() <= 16;
    }
  }

  Value *CastSrc;
  bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
                       : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
  if (IsExt) {
    Type *CastSrcTy = CastSrc->getType();
    if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
      return true;
  }

  return false;
}

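// Illustrative use of the two helpers above and below (hypothetical IR, not
// taken from a test): a coordinate widened as
//   %c32 = zext i16 %c to i32
// is accepted by canSafelyConvertTo16Bit(), and convertTo16Bit() then simply
// returns %c, so the A16/G16 rewrite can consume the original 16-bit value.
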
// Convert a value to 16-bit.
static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
  Type *VTy = V.getType();
  if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
    return cast<Instruction>(&V)->getOperand(0);
  if (VTy->isIntegerTy())
    return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
  if (VTy->isFloatingPointTy())
    return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));

  llvm_unreachable("Should never be called!");
}

/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
/// modified arguments (based on OldIntr) and replaces InstToReplace with
/// this newly created intrinsic call.
static std::optional<Instruction *> modifyIntrinsicCall(
    IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
    InstCombiner &IC,
    std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
        Func) {
  SmallVector<Type *, 4> ArgTys;
  if (!Intrinsic::getIntrinsicSignature(OldIntr.getCalledFunction(), ArgTys))
    return std::nullopt;

  SmallVector<Value *, 8> Args(OldIntr.args());

  // Modify arguments and types
  Func(Args, ArgTys);

  Function *I = Intrinsic::getDeclaration(OldIntr.getModule(), NewIntr, ArgTys);

  CallInst *NewCall = IC.Builder.CreateCall(I, Args);
  NewCall->takeName(&OldIntr);
  NewCall->copyMetadata(OldIntr);
  if (isa<FPMathOperator>(NewCall))
    NewCall->copyFastMathFlags(&OldIntr);

  // Erase and replace uses
  if (!InstToReplace.getType()->isVoidTy())
    IC.replaceInstUsesWith(InstToReplace, NewCall);

  bool RemoveOldIntr = &OldIntr != &InstToReplace;

  auto RetValue = IC.eraseInstFromFunction(InstToReplace);
  if (RemoveOldIntr)
    IC.eraseInstFromFunction(OldIntr);

  return RetValue;
}

static std::optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                             IntrinsicInst &II, InstCombiner &IC) {
  // Optimize _L to _LZ when _L is zero
  if (const auto *LZMappingInfo =
          AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantLod =
            dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->LodIndex);
            });
      }
    }
  }

  // Optimize _mip away, when 'lod' is zero
  if (const auto *MIPMappingInfo =
          AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantMip =
            dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
      if (ConstantMip->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->MipIndex);
            });
      }
    }
  }

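  // Illustrative example of the _L -> _LZ rewrite above (hypothetical IR):
  // a sample with a known-zero LOD such as
  //   @llvm.amdgcn.image.sample.l.2d.v4f32.f32(..., float 0.0, ...)
  // is replaced by the cheaper
  //   @llvm.amdgcn.image.sample.lz.2d.v4f32.f32(...)
  // with the LOD operand removed from the argument list.
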
  // Optimize _bias away when 'bias' is zero
  if (const auto *BiasMappingInfo =
          AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantBias =
            dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
      if (ConstantBias->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
              ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
            });
      }
    }
  }

  // Optimize _offset away when 'offset' is zero
  if (const auto *OffsetMappingInfo =
          AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantOffset =
            dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
      if (ConstantOffset->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(
                OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
            });
      }
    }
  }

  // Try to use D16
  if (ST->hasD16Images()) {
    const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
        AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

    if (BaseOpcode->HasD16) {
      // If the only use of the image intrinsic is an fptrunc to half, replace
      // both the fptrunc and the image intrinsic with a D16 variant of the
      // intrinsic that returns the truncated type directly.
      if (II.hasOneUse()) {
        Instruction *User = II.user_back();

        if (User->getOpcode() == Instruction::FPTrunc &&
            User->getType()->getScalarType()->isHalfTy()) {
          return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
                                     [&](auto &Args, auto &ArgTys) {
                                       // Change return type of image intrinsic.
                                       // Set it to return type of fptrunc.
                                       ArgTys[0] = User->getType();
                                     });
        }
      }
    }
  }

  // Try to use A16 or G16
  if (!ST->hasA16() && !ST->hasG16())
    return std::nullopt;

  // Address is interpreted as float if the instruction has a sampler or as
  // unsigned int if there is no sampler.
  bool HasSampler =
      AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
  bool FloatCoord = false;
  // true means derivatives can be converted to 16 bit, coordinates not
  bool OnlyDerivatives = false;

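  // Illustrative A16 case (hypothetical IR): if every coordinate scanned by
  // the loop below is an fpext from half, e.g.
  //   %u32 = fpext half %u to float
  // the extensions are stripped by convertTo16Bit() and the call is rebuilt
  // with a half coordinate overload (roughly ...v4f32.f32 -> ...v4f32.f16).
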
  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
    Value *Coord = II.getOperand(OperandIndex);
    // If the values are not derived from 16-bit values, we cannot optimize.
    if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
      if (OperandIndex < ImageDimIntr->CoordStart ||
          ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
        return std::nullopt;
      }
      // All gradients can be converted, so convert only them
      OnlyDerivatives = true;
      break;
    }

    assert(OperandIndex == ImageDimIntr->GradientStart ||
           FloatCoord == Coord->getType()->isFloatingPointTy());
    FloatCoord = Coord->getType()->isFloatingPointTy();
  }

  if (!OnlyDerivatives && !ST->hasA16())
    OnlyDerivatives = true; // Only supports G16

  // Check if there is a bias parameter and if it can be converted to f16
  if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
    Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
    assert(HasSampler &&
           "Only image instructions with a sampler can have a bias");
    if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
      OnlyDerivatives = true;
  }

  if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
                                               ImageDimIntr->CoordStart))
    return std::nullopt;

  Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
                               : Type::getInt16Ty(II.getContext());

  return modifyIntrinsicCall(
      II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
        ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
        if (!OnlyDerivatives) {
          ArgTys[ImageDimIntr->CoordTyArg] = CoordType;

          // Change the bias type
          if (ImageDimIntr->NumBiasArgs != 0)
            ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
        }

        unsigned EndIndex =
            OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
        for (unsigned OperandIndex = ImageDimIntr->GradientStart;
             OperandIndex < EndIndex; OperandIndex++) {
          Args[OperandIndex] =
              convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
        }

        // Convert the bias
        if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
          Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
          Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
        }
      });
}

bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
                                           InstCombiner &IC) const {
  // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
  // infinity, gives +0.0. If we can prove we don't have one of the special
  // cases then we can use a normal multiply instead.
  // TODO: Create and use isKnownFiniteNonZero instead of just matching
  // constants here.
  if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
      match(Op1, PatternMatch::m_FiniteNonZero())) {
    // One operand is not zero or infinity or NaN.
    return true;
  }
  auto *TLI = &IC.getTargetLibraryInfo();
  if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) &&
      isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) {
    // Neither operand is infinity or NaN.
    return true;
  }
  return false;
}

std::optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::amdgcn_rcp: {
    Value *Src = II.getArgOperand(0);
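
    // Illustrative folds performed below (hypothetical IR): an undef operand
    // becomes a quiet NaN, and a constant operand is folded directly, e.g.
    //   call float @llvm.amdgcn.rcp.f32(float 2.0)  -->  float 0.5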
    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    if (II.isStrictFP())
      break;

    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      const APFloat &ArgVal = C->getValueAPF();
      APFloat Val(ArgVal.getSemantics(), 1);
      Val.divide(ArgVal, APFloat::rmNearestTiesToEven);

      // This is more precise than the instruction may give.
      //
      // TODO: The instruction always flushes denormal results (except for f16),
      // should this also?
      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
    }

    break;
  }
  case Intrinsic::amdgcn_sqrt:
  case Intrinsic::amdgcn_rsq: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    break;
  }
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_frexp_exp: {
    Value *Src = II.getArgOperand(0);
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      int Exp;
      APFloat Significand =
          frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);

      if (IID == Intrinsic::amdgcn_frexp_mant) {
        return IC.replaceInstUsesWith(
            II, ConstantFP::get(II.getContext(), Significand));
      }

      // Match instruction special case behavior.
      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
        Exp = 0;

      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
    }

    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_class: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
    if (!CMask) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      if (isa<UndefValue>(Src1)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), false));
      }
      break;
    }

    uint32_t Mask = CMask->getZExtValue();

    // If all tests are made, it doesn't matter what the value is.
    if ((Mask & fcAllFlags) == fcAllFlags) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
    }

    if ((Mask & fcAllFlags) == 0) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
    }

    if (Mask == fcNan && !II.isStrictFP()) {
      // Equivalent of isnan. Replace with standard fcmp.
      Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }
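
    // Illustrative example of the fold above (hypothetical IR): a pure NaN
    // test such as
    //   %r = call i1 @llvm.amdgcn.class.f32(float %x, i32 3)   ; snan|qnan
    // becomes an ordinary unordered compare:
    //   %r = fcmp uno float %x, %x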

    if (Mask == fcZero && !II.isStrictFP()) {
      // Equivalent of == 0.
      Value *FCmp =
          IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));

      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
    if ((Mask & fcNan) && isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
      return IC.replaceOperand(
          II, 1, ConstantInt::get(Src1->getType(), Mask & ~fcNan));
    }

    const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
    if (!CVal) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      // Clamp mask to used bits
      if ((Mask & fcAllFlags) != Mask) {
        CallInst *NewCall = IC.Builder.CreateCall(
            II.getCalledFunction(),
            {Src0, ConstantInt::get(Src1->getType(), Mask & fcAllFlags)});

        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      break;
    }

    const APFloat &Val = CVal->getValueAPF();

    bool Result =
        ((Mask & fcSNan) && Val.isNaN() && Val.isSignaling()) ||
        ((Mask & fcQNan) && Val.isNaN() && !Val.isSignaling()) ||
        ((Mask & fcNegInf) && Val.isInfinity() && Val.isNegative()) ||
        ((Mask & fcNegNormal) && Val.isNormal() && Val.isNegative()) ||
        ((Mask & fcNegSubnormal) && Val.isDenormal() && Val.isNegative()) ||
        ((Mask & fcNegZero) && Val.isZero() && Val.isNegative()) ||
        ((Mask & fcPosZero) && Val.isZero() && !Val.isNegative()) ||
        ((Mask & fcPosSubnormal) && Val.isDenormal() && !Val.isNegative()) ||
        ((Mask & fcPosNormal) && Val.isNormal() && !Val.isNegative()) ||
        ((Mask & fcPosInf) && Val.isInfinity() && !Val.isNegative());

    return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
  }
  case Intrinsic::amdgcn_cvt_pkrtz: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        const fltSemantics &HalfSem =
            II.getType()->getScalarType()->getFltSemantics();
        bool LosesInfo;
        APFloat Val0 = C0->getValueAPF();
        APFloat Val1 = C1->getValueAPF();
        Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
        Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);

        Constant *Folded =
            ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
                                 ConstantFP::get(II.getContext(), Val1)});
        return IC.replaceInstUsesWith(II, Folded);
      }
    }

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
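  // Illustrative bitfield-extract folds performed in the next case
  // (hypothetical IR, i32 operands, constant offset/width):
  //   llvm.amdgcn.ubfe(i32 %x, i32 8, i32 8)  -->  lshr (shl %x, 16), 24
  //   llvm.amdgcn.sbfe(i32 %x, i32 0, i32 8)  -->  ashr (shl %x, 24), 24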
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
    Value *Src = II.getArgOperand(0);
    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, Src);
    }

    unsigned Width;
    Type *Ty = II.getType();
    unsigned IntSize = Ty->getIntegerBitWidth();

    ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      if ((Width & (IntSize - 1)) == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
      }

      // Hardware ignores high bits, so remove those.
      if (Width >= IntSize) {
        return IC.replaceOperand(
            II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
      }
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      if (Offset >= IntSize) {
        return IC.replaceOperand(
            II, 1,
            ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
      }
    }

    bool Signed = IID == Intrinsic::amdgcn_sbfe;

    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, the ashr and lshr created below would be poison,
    // since the shift amount would equal the bit width.
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
    if (Offset + Width < IntSize) {
      Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
      Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
                                 : IC.Builder.CreateLShr(Shl, IntSize - Width);
      RightShift->takeName(&II);
      return IC.replaceInstUsesWith(II, RightShift);
    }

    Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
                               : IC.Builder.CreateLShr(Src, Offset);

    RightShift->takeName(&II);
    return IC.replaceInstUsesWith(II, RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_row:
  case Intrinsic::amdgcn_exp_compr: {
    ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
    unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
        Value *Src = II.getArgOperand(I + 2);
        if (!isa<UndefValue>(Src)) {
          IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
          Changed = true;
        }
      }
    }

    if (Changed) {
      return &II;
    }

    break;
  }
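  // Illustrative fmed3 folds performed in the next case (hypothetical IR):
  //   fmed3(nan, %x, %y)    -->  minnum(%x, %y)
  //   fmed3(1.0, 4.0, 2.0)  -->  2.0   (full constant fold)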
  case Intrinsic::amdgcn_fmed3: {
    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
    // for the shader.

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    Value *Src2 = II.getArgOperand(2);

    // Checking for NaN before canonicalization provides better fidelity when
    // mapping other operations onto fmed3 since the order of operands is
    // unchanged.
    CallInst *NewCall = nullptr;
    if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
      NewCall = IC.Builder.CreateMinNum(Src1, Src2);
    } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
      NewCall = IC.Builder.CreateMinNum(Src0, Src2);
    } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
      NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
    }

    if (NewCall) {
      NewCall->copyFastMathFlags(&II);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    bool Swap = false;
    // Canonicalize constants to RHS operands.
    //
    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
      std::swap(Src1, Src2);
      Swap = true;
    }

    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (Swap) {
      II.setArgOperand(0, Src0);
      II.setArgOperand(1, Src1);
      II.setArgOperand(2, Src2);
      return &II;
    }

    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
          APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
                                       C2->getValueAPF());
          return IC.replaceInstUsesWith(
              II, ConstantFP::get(IC.Builder.getContext(), Result));
        }
      }
    }

    break;
  }
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp: {
    const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
    // Guard against invalid arguments.
    int64_t CCVal = CC->getZExtValue();
    bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
      break;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
        Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
        if (CCmp->isNullValue()) {
          return IC.replaceInstUsesWith(
              II, ConstantExpr::getSExt(CCmp, II.getType()));
        }

        // The result of V_ICMP/V_FCMP assembly instructions (which this
        // intrinsic exposes) is one bit per thread, masked with the EXEC
        // register (which contains the bitmask of live threads). So a
        // comparison that always returns true is the same as a read of the
        // EXEC register.
        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addFnAttr(Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }
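
      // Illustrative example of the fold above (hypothetical IR): a compare
      // that is trivially true, such as
      //   call i64 @llvm.amdgcn.icmp.i64.i32(i32 1, i32 0, i32 33)   ; ne
      // is replaced by a read of the EXEC mask:
      //   call i64 @llvm.read_register.i64(metadata !"exec")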

      // Canonicalize constants to RHS.
      CmpInst::Predicate SwapPred =
          CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
      II.setArgOperand(0, Src1);
      II.setArgOperand(1, Src0);
      II.setArgOperand(
          2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
      return &II;
    }

    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
      break;

    // Canonicalize compare eq with true value to compare != 0
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
    Value *ExtSrc;
    if (CCVal == CmpInst::ICMP_EQ &&
        ((match(Src1, PatternMatch::m_One()) &&
          match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
         (match(Src1, PatternMatch::m_AllOnes()) &&
          match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(1)) {
      IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
      IC.replaceOperand(II, 2,
                        ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
      return &II;
    }

    CmpInst::Predicate SrcPred;
    Value *SrcLHS;
    Value *SrcRHS;

    // Fold compare eq/ne with 0 from a compare result as the predicate to the
    // intrinsic. The typical use is a wave vote function in the library, which
    // will be fed from a user code condition compared with 0. Fold in the
    // redundant compare.

    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(Src1, PatternMatch::m_Zero()) &&
        match(Src0, PatternMatch::m_ZExtOrSExt(
                        m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
                              PatternMatch::m_Value(SrcRHS))))) {
      if (CCVal == CmpInst::ICMP_EQ)
        SrcPred = CmpInst::getInversePredicate(SrcPred);

      Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
                                 ? Intrinsic::amdgcn_fcmp
                                 : Intrinsic::amdgcn_icmp;

      Type *Ty = SrcLHS->getType();
      if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
        // Promote to next legal integer type.
        unsigned Width = CmpType->getBitWidth();
        unsigned NewWidth = Width;

        // Don't do anything for i1 comparisons.
        if (Width == 1)
          break;

        if (Width <= 16)
          NewWidth = 16;
        else if (Width <= 32)
          NewWidth = 32;
        else if (Width <= 64)
          NewWidth = 64;
        else if (Width > 64)
          break; // Can't handle this.

        if (Width != NewWidth) {
          IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
          if (CmpInst::isSigned(SrcPred)) {
            SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
          } else {
            SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
          }
        }
      } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
        break;

      Function *NewF = Intrinsic::getDeclaration(
          II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
      Value *Args[] = {SrcLHS, SrcRHS,
                       ConstantInt::get(CC->getType(), SrcPred)};
      CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    break;
  }
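  // Illustrative ballot folds performed in the next case (hypothetical IR):
  //   call i64 @llvm.amdgcn.ballot.i64(i1 false)  -->  i64 0
  //   call i64 @llvm.amdgcn.ballot.i64(i1 true)   -->  read of "exec"
  //   call i32 @llvm.amdgcn.ballot.i32(i1 true)   -->  read of "exec_lo"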
  case Intrinsic::amdgcn_ballot: {
    if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
      if (Src->isZero()) {
        // amdgcn.ballot(i1 0) is zero.
        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
      }

      if (Src->isOne()) {
        // amdgcn.ballot(i1 1) is exec.
        const char *RegName = "exec";
        if (II.getType()->isIntegerTy(32))
          RegName = "exec_lo";
        else if (!II.getType()->isIntegerTy(64))
          break;

        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addFnAttr(Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }
    }
    break;
  }
  case Intrinsic::amdgcn_wqm_vote: {
    // wqm_vote is identity when the argument is constant.
    if (!isa<Constant>(II.getArgOperand(0)))
      break;

    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  }
  case Intrinsic::amdgcn_kill: {
    const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
    if (!C || !C->getZExtValue())
      break;

    // amdgcn.kill(i1 1) is a no-op
    return IC.eraseInstFromFunction(II);
  }
  case Intrinsic::amdgcn_update_dpp: {
    Value *Old = II.getArgOperand(0);

    auto *BC = cast<ConstantInt>(II.getArgOperand(5));
    auto *RM = cast<ConstantInt>(II.getArgOperand(3));
    auto *BM = cast<ConstantInt>(II.getArgOperand(4));
    if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
        BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
      break;

    // If bound_ctrl = 1 and row mask = bank mask = 0xf, the old value is never
    // read and can be omitted.
    return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
  }
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16: {
    // Discard vdst_in if it's not going to be read.
    Value *VDstIn = II.getArgOperand(0);
    if (isa<UndefValue>(VDstIn))
      break;

    ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
    ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
    if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
      break;

    return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
  }
  case Intrinsic::amdgcn_permlane64:
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }
    break;
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_readlane: {
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }
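
    // Illustrative folds performed below (hypothetical IR, same basic block):
    //   readfirstlane (readfirstlane %x)  -->  readfirstlane %x
    //   readlane (readfirstlane %x), %y   -->  readfirstlane %x
    //   readlane (readlane %x, %y), %y    -->  readlane %x, %y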

    // The remaining folds may not be safe if EXEC could differ between the
    // def and the use.
    Value *Src = II.getArgOperand(0);
    Instruction *SrcInst = dyn_cast<Instruction>(Src);
    if (SrcInst && SrcInst->getParent() != II.getParent())
      break;

    // readfirstlane (readfirstlane x) -> readfirstlane x
    // readlane (readfirstlane x), y -> readfirstlane x
    if (match(Src,
              PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
      return IC.replaceInstUsesWith(II, Src);
    }

    if (IID == Intrinsic::amdgcn_readfirstlane) {
      // readfirstlane (readlane x, y) -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
        return IC.replaceInstUsesWith(II, Src);
      }
    } else {
      // readlane (readlane x, y), y -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
                         PatternMatch::m_Value(),
                         PatternMatch::m_Specific(II.getArgOperand(1))))) {
        return IC.replaceInstUsesWith(II, Src);
      }
    }

    break;
  }
  case Intrinsic::amdgcn_ldexp: {
    // FIXME: This doesn't introduce new instructions and belongs in
    // InstructionSimplify.
    Type *Ty = II.getType();
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // Folding undef to qnan is safe regardless of the FP mode.
    if (isa<UndefValue>(Op0)) {
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    const APFloat *C = nullptr;
    match(Op0, PatternMatch::m_APFloat(C));

    // FIXME: Should flush denorms depending on FP mode, but that's ignored
    // everywhere else.
    //
    // These cases should be safe, even with strictfp.
    // ldexp(0.0, x) -> 0.0
    // ldexp(-0.0, x) -> -0.0
    // ldexp(inf, x) -> inf
    // ldexp(-inf, x) -> -inf
    if (C && (C->isZero() || C->isInfinity())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // With strictfp, be more careful about possibly needing to flush denormals
    // or not, and snan behavior depends on ieee_mode.
    if (II.isStrictFP())
      break;

    if (C && C->isNaN()) {
      // FIXME: We just need to make the nan quiet here, but that's unavailable
      // on APFloat, only IEEEfloat
      auto *Quieted =
          ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
      return IC.replaceInstUsesWith(II, Quieted);
    }

    // ldexp(x, 0) -> x
    // ldexp(x, undef) -> x
    if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    break;
  }
  case Intrinsic::amdgcn_fmul_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP()))
      return IC.replaceInstUsesWith(II, ConstantFP::getNullValue(II.getType()));
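
    // Illustrative fmul_legacy folds (hypothetical IR):
    //   fmul_legacy(+0.0, %x)  -->  +0.0          (even if %x is NaN or inf)
    //   fmul_legacy(%x, 2.0)   -->  fmul %x, 2.0  (2.0 is finite and nonzero)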

    // If we can prove we don't have one of the special cases then we can use a
    // normal fmul instruction instead.
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
      FMul->takeName(&II);
      return IC.replaceInstUsesWith(II, FMul);
    }
    break;
  }
  case Intrinsic::amdgcn_fma_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    Value *Op2 = II.getArgOperand(2);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP())) {
      // It's tempting to just return Op2 here, but that would give the wrong
      // result if Op2 was -0.0.
      auto *Zero = ConstantFP::getNullValue(II.getType());
      auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
      FAdd->takeName(&II);
      return IC.replaceInstUsesWith(II, FAdd);
    }

    // If we can prove we don't have one of the special cases then we can use a
    // normal fma instead.
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      II.setCalledOperand(Intrinsic::getDeclaration(
          II.getModule(), Intrinsic::fma, II.getType()));
      return &II;
    }
    break;
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    if (isa<UndefValue>(II.getArgOperand(0)))
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));

    if (isa<ConstantPointerNull>(II.getArgOperand(0)))
      return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
    break;
  }
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
      return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
    }
  }
  }
  return std::nullopt;
}

/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
/// struct returns.
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx = -1) {

  auto *IIVTy = cast<FixedVectorType>(II.getType());
  unsigned VWidth = IIVTy->getNumElements();
  if (VWidth == 1)
    return nullptr;
  Type *EltTy = IIVTy->getElementType();

  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
  IC.Builder.SetInsertPoint(&II);

  // Assume the arguments are unchanged and later override them, if needed.
  SmallVector<Value *, 16> Args(II.args());

  if (DMaskIdx < 0) {
    // Buffer case.

    const unsigned ActiveBits = DemandedElts.getActiveBits();
    const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();

    // Start assuming the prefix of elements is demanded, but possibly clear
    // some other bits if there are trailing zeros (unused components at front)
    // and update offset.
    DemandedElts = (1 << ActiveBits) - 1;

    if (UnusedComponentsAtFront > 0) {
      static const unsigned InvalidOffsetIdx = 0xf;

      unsigned OffsetIdx;
      switch (II.getIntrinsicID()) {
      case Intrinsic::amdgcn_raw_buffer_load:
        OffsetIdx = 1;
        break;
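
      // Illustrative example (hypothetical IR): if only element 1 of a
      // <4 x float> buffer load is demanded, the load is later narrowed to a
      // single float and the byte offset operand is advanced by 4 below.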
      case Intrinsic::amdgcn_s_buffer_load:
        // If resulting type is vec3, there is no point in trimming the
        // load with updated offset, as the vec3 would most likely be widened
        // to vec4 anyway during lowering.
        if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
          OffsetIdx = InvalidOffsetIdx;
        else
          OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_struct_buffer_load:
        OffsetIdx = 2;
        break;
      default:
        // TODO: handle tbuffer* intrinsics.
        OffsetIdx = InvalidOffsetIdx;
        break;
      }

      if (OffsetIdx != InvalidOffsetIdx) {
        // Clear demanded bits and update the offset.
        DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
        auto *Offset = Args[OffsetIdx];
        unsigned SingleComponentSizeInBits =
            IC.getDataLayout().getTypeSizeInBits(EltTy);
        unsigned OffsetAdd =
            UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
        auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
        Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
      }
    }
  } else {
    // Image case.

    ConstantInt *DMask = cast<ConstantInt>(Args[DMaskIdx]);
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;

    // Mask off values that are undefined because the dmask doesn't cover them
    DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;

    unsigned NewDMaskVal = 0;
    unsigned OrigLoadIdx = 0;
    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
      const unsigned Bit = 1 << SrcIdx;
      if (!!(DMaskVal & Bit)) {
        if (!!DemandedElts[OrigLoadIdx])
          NewDMaskVal |= Bit;
        OrigLoadIdx++;
      }
    }

    if (DMaskVal != NewDMaskVal)
      Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
  }

  unsigned NewNumElts = DemandedElts.countPopulation();
  if (!NewNumElts)
    return UndefValue::get(IIVTy);

  if (NewNumElts >= VWidth && DemandedElts.isMask()) {
    if (DMaskIdx >= 0)
      II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
    return nullptr;
  }

  // Validate function argument and return types, extracting overloaded types
  // along the way.
  SmallVector<Type *, 6> OverloadTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
    return nullptr;
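
  // Illustrative image case (hypothetical IR): for a <4 x float> load with
  // dmask = 0xf where only elements 0 and 2 are demanded, the dmask becomes
  // 0x5, the rebuilt call below returns <2 x float>, and a shufflevector
  // restores the original vector shape for the remaining users.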

  Type *NewTy =
      (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
  OverloadTys[0] = NewTy;

  Function *NewIntrin = Intrinsic::getDeclaration(
      II.getModule(), II.getIntrinsicID(), OverloadTys);
  CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);

  if (NewNumElts == 1) {
    return IC.Builder.CreateInsertElement(UndefValue::get(IIVTy), NewCall,
                                          DemandedElts.countTrailingZeros());
  }

  SmallVector<int, 8> EltMask;
  unsigned NewLoadIdx = 0;
  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
    if (!!DemandedElts[OrigLoadIdx])
      EltMask.push_back(NewLoadIdx++);
    else
      EltMask.push_back(NewNumElts);
  }

  Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);

  return Shuffle;
}

std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {
  switch (II.getIntrinsicID()) {
  case Intrinsic::amdgcn_buffer_load:
  case Intrinsic::amdgcn_buffer_load_format:
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_s_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_tbuffer_load:
    return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
  default: {
    if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
      return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
    }
    break;
  }
  }
  return std::nullopt;
}