//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements the AMDGPU-specific InstCombine hooks exposed through
// TargetTransformInfo. It uses the target's detailed information to fold and
// simplify AMDGCN intrinsic calls, while letting the target-independent
// InstCombine handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNSubtarget.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

namespace {

struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace

// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
                           const APFloat &Src2) {
  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);

  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp0 == APFloat::cmpEqual)
    return maxnum(Src1, Src2);

  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp1 == APFloat::cmpEqual)
    return maxnum(Src0, Src2);

  return maxnum(Src0, Src1);
}

// Check if a value can be converted to a 16-bit value without losing
// precision.
// The value is expected to be either a float (IsFloat = true) or an unsigned
// integer (IsFloat = false).
static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
  Type *VTy = V.getType();
  if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
    // The value is already 16-bit, so we don't want to convert to 16-bit again!
    return false;
  }
  if (IsFloat) {
    if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
      // We need to check that if we cast the value down to a half, we do not
      // lose precision.
      APFloat FloatValue(ConstFloat->getValueAPF());
      bool LosesInfo = true;
      FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
                         &LosesInfo);
      return !LosesInfo;
    }
  } else {
    if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
      // We need to check that if we cast the value down to an i16, we do not
      // lose precision.
      APInt IntValue(ConstInt->getValue());
      return IntValue.getActiveBits() <= 16;
    }
  }

  Value *CastSrc;
  bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
                       : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
  if (IsExt) {
    Type *CastSrcTy = CastSrc->getType();
    if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
      return true;
  }

  return false;
}

// Convert a value to 16-bit.
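// Callers are expected to have verified the conversion is lossless via
// canSafelyConvertTo16Bit first; values that are neither integer nor
// floating point hit the llvm_unreachable below.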
static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
  Type *VTy = V.getType();
  if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
    return cast<Instruction>(&V)->getOperand(0);
  if (VTy->isIntegerTy())
    return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
  if (VTy->isFloatingPointTy())
    return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));

  llvm_unreachable("Should never be called!");
}

/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates an intrinsic call with
/// the modified arguments (based on OldIntr) and replaces InstToReplace with
/// this newly created intrinsic call.
static Optional<Instruction *> modifyIntrinsicCall(
    IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
    InstCombiner &IC,
    std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
        Func) {
  SmallVector<Type *, 4> ArgTys;
  if (!Intrinsic::getIntrinsicSignature(OldIntr.getCalledFunction(), ArgTys))
    return None;

  SmallVector<Value *, 8> Args(OldIntr.args());

  // Modify arguments and types
  Func(Args, ArgTys);

  Function *I = Intrinsic::getDeclaration(OldIntr.getModule(), NewIntr, ArgTys);

  CallInst *NewCall = IC.Builder.CreateCall(I, Args);
  NewCall->takeName(&OldIntr);
  NewCall->copyMetadata(OldIntr);
  if (isa<FPMathOperator>(NewCall))
    NewCall->copyFastMathFlags(&OldIntr);

  // Erase and replace uses
  if (!InstToReplace.getType()->isVoidTy())
    IC.replaceInstUsesWith(InstToReplace, NewCall);

  bool RemoveOldIntr = &OldIntr != &InstToReplace;

  auto RetValue = IC.eraseInstFromFunction(InstToReplace);
  if (RemoveOldIntr)
    IC.eraseInstFromFunction(OldIntr);

  return RetValue;
}

static Optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                             IntrinsicInst &II, InstCombiner &IC) {
  // Optimize _L to _LZ when _L is zero
  if (const auto *LZMappingInfo =
          AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantLod =
            dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->LodIndex);
            });
      }
    }
  }

  // Optimize _mip away when 'lod' is zero
  if (const auto *MIPMappingInfo =
          AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantMip =
            dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
      if (ConstantMip->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->MipIndex);
            });
      }
    }
  }

  // Optimize _bias away when 'bias' is zero
  if (const auto *BiasMappingInfo =
          AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantBias =
            dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
      if (ConstantBias->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
              ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
            });
      }
    }
  }

  // Optimize _offset away when 'offset' is zero
  if (const auto *OffsetMappingInfo =
          AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantOffset =
            dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
      if (ConstantOffset->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(
                OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
            });
      }
    }
  }

  // Try to use D16
  if (ST->hasD16Images()) {

    const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
        AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

    if (BaseOpcode->HasD16) {

      // If the only use of the image intrinsic is a fptrunc to half, then both
      // the fptrunc and the image intrinsic are replaced with an image
      // intrinsic carrying the D16 flag.
      if (II.hasOneUse()) {
        Instruction *User = II.user_back();

        if (User->getOpcode() == Instruction::FPTrunc &&
            User->getType()->getScalarType()->isHalfTy()) {

          return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
                                     [&](auto &Args, auto &ArgTys) {
                                       // Change the return type of the image
                                       // intrinsic to the return type of the
                                       // fptrunc.
                                       ArgTys[0] = User->getType();
                                     });
        }
      }
    }
  }

  // Try to use A16 or G16
  if (!ST->hasA16() && !ST->hasG16())
    return None;

  // Address is interpreted as float if the instruction has a sampler or as
  // unsigned int if there is no sampler.
  bool HasSampler =
      AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
  bool FloatCoord = false;
  // True means only the derivatives can be converted to 16 bit, not the
  // coordinates.
  bool OnlyDerivatives = false;

  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
    Value *Coord = II.getOperand(OperandIndex);
    // If the values are not derived from 16-bit values, we cannot optimize.
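    // Operands in [GradientStart, CoordStart) are the derivatives; if one of
    // them (or anything when there are no derivatives) cannot be narrowed,
    // give up entirely, otherwise fall back to converting only the
    // derivatives (G16).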
    if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
      if (OperandIndex < ImageDimIntr->CoordStart ||
          ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
        return None;
      }
      // All gradients can be converted, so convert only them
      OnlyDerivatives = true;
      break;
    }

    assert(OperandIndex == ImageDimIntr->GradientStart ||
           FloatCoord == Coord->getType()->isFloatingPointTy());
    FloatCoord = Coord->getType()->isFloatingPointTy();
  }

  if (!OnlyDerivatives && !ST->hasA16())
    OnlyDerivatives = true; // Only supports G16

  // Check if there is a bias parameter and if it can be converted to f16
  if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
    Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
    assert(HasSampler &&
           "Only image instructions with a sampler can have a bias");
    if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
      OnlyDerivatives = true;
  }

  if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
                                               ImageDimIntr->CoordStart))
    return None;

  Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
                               : Type::getInt16Ty(II.getContext());

  return modifyIntrinsicCall(
      II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
        ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
        if (!OnlyDerivatives) {
          ArgTys[ImageDimIntr->CoordTyArg] = CoordType;

          // Change the bias type
          if (ImageDimIntr->NumBiasArgs != 0)
            ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
        }

        unsigned EndIndex =
            OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
        for (unsigned OperandIndex = ImageDimIntr->GradientStart;
             OperandIndex < EndIndex; OperandIndex++) {
          Args[OperandIndex] =
              convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
        }

        // Convert the bias
        if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
          Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
          Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
        }
      });
}

bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
                                           InstCombiner &IC) const {
  // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
  // infinity, gives +0.0. If we can prove we don't have one of the special
  // cases then we can use a normal multiply instead.
  // TODO: Create and use isKnownFiniteNonZero instead of just matching
  // constants here.
  if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
      match(Op1, PatternMatch::m_FiniteNonZero())) {
    // One operand is not zero or infinity or NaN.
    return true;
  }
  auto *TLI = &IC.getTargetLibraryInfo();
  if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) &&
      isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) {
    // Neither operand is infinity or NaN.
    return true;
  }
  return false;
}

Optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::amdgcn_rcp: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
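    // rcp(undef) folds to a quiet NaN; a constant operand folds to its exact
    // reciprocal below.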
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    if (II.isStrictFP())
      break;

    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      const APFloat &ArgVal = C->getValueAPF();
      APFloat Val(ArgVal.getSemantics(), 1);
      Val.divide(ArgVal, APFloat::rmNearestTiesToEven);

      // This is more precise than the instruction may give.
      //
      // TODO: The instruction always flushes denormal results (except for f16),
      // should this also?
      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
    }

    break;
  }
  case Intrinsic::amdgcn_rsq: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    break;
  }
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_frexp_exp: {
    Value *Src = II.getArgOperand(0);
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      int Exp;
      APFloat Significand =
          frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);

      if (IID == Intrinsic::amdgcn_frexp_mant) {
        return IC.replaceInstUsesWith(
            II, ConstantFP::get(II.getContext(), Significand));
      }

      // Match instruction special case behavior.
      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
        Exp = 0;

      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
    }

    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_class: {
    enum {
      S_NAN = 1 << 0,       // Signaling NaN
      Q_NAN = 1 << 1,       // Quiet NaN
      N_INFINITY = 1 << 2,  // Negative infinity
      N_NORMAL = 1 << 3,    // Negative normal
      N_SUBNORMAL = 1 << 4, // Negative subnormal
      N_ZERO = 1 << 5,      // Negative zero
      P_ZERO = 1 << 6,      // Positive zero
      P_SUBNORMAL = 1 << 7, // Positive subnormal
      P_NORMAL = 1 << 8,    // Positive normal
      P_INFINITY = 1 << 9   // Positive infinity
    };

    const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
                              N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL |
                              P_NORMAL | P_INFINITY;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
    if (!CMask) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      if (isa<UndefValue>(Src1)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), false));
      }
      break;
    }

    uint32_t Mask = CMask->getZExtValue();

    // If all tests are made, it doesn't matter what the value is.
    if ((Mask & FullMask) == FullMask) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
    }

    if ((Mask & FullMask) == 0) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
    }

    if (Mask == (S_NAN | Q_NAN)) {
      // Equivalent of isnan. Replace with standard fcmp.
      Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    if (Mask == (N_ZERO | P_ZERO)) {
      // Equivalent of == 0.
      Value *FCmp =
          IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));

      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
    if (((Mask & S_NAN) || (Mask & Q_NAN)) &&
        isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
      return IC.replaceOperand(
          II, 1, ConstantInt::get(Src1->getType(), Mask & ~(S_NAN | Q_NAN)));
    }

    const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
    if (!CVal) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      // Clamp mask to used bits
      if ((Mask & FullMask) != Mask) {
        CallInst *NewCall = IC.Builder.CreateCall(
            II.getCalledFunction(),
            {Src0, ConstantInt::get(Src1->getType(), Mask & FullMask)});

        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      break;
    }

    const APFloat &Val = CVal->getValueAPF();

    bool Result =
        ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
        ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
        ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
        ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
        ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
        ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
        ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
        ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
        ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
        ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());

    return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
  }
  case Intrinsic::amdgcn_cvt_pkrtz: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        const fltSemantics &HalfSem =
            II.getType()->getScalarType()->getFltSemantics();
        bool LosesInfo;
        APFloat Val0 = C0->getValueAPF();
        APFloat Val1 = C1->getValueAPF();
        Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
        Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);

        Constant *Folded =
            ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
                                 ConstantFP::get(II.getContext(), Val1)});
        return IC.replaceInstUsesWith(II, Folded);
      }
    }

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
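    // (u|s)bfe src, offset, width extracts 'width' bits of 'src' starting at
    // bit 'offset' and zero- or sign-extends the result, so with constant
    // operands it can be rewritten as a shl followed by a lshr/ashr.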
    Value *Src = II.getArgOperand(0);
    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, Src);
    }

    unsigned Width;
    Type *Ty = II.getType();
    unsigned IntSize = Ty->getIntegerBitWidth();

    ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      if ((Width & (IntSize - 1)) == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
      }

      // Hardware ignores high bits, so remove those.
      if (Width >= IntSize) {
        return IC.replaceOperand(
            II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
      }
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      if (Offset >= IntSize) {
        return IC.replaceOperand(
            II, 1,
            ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
      }
    }

    bool Signed = IID == Intrinsic::amdgcn_sbfe;

    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, the ashr and lshr instructions would be poison
    // because the shift amount would equal the bit size.
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
    if (Offset + Width < IntSize) {
      Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
      Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
                                 : IC.Builder.CreateLShr(Shl, IntSize - Width);
      RightShift->takeName(&II);
      return IC.replaceInstUsesWith(II, RightShift);
    }

    Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
                               : IC.Builder.CreateLShr(Src, Offset);

    RightShift->takeName(&II);
    return IC.replaceInstUsesWith(II, RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_row:
  case Intrinsic::amdgcn_exp_compr: {
    ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
    unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
        Value *Src = II.getArgOperand(I + 2);
        if (!isa<UndefValue>(Src)) {
          IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
          Changed = true;
        }
      }
    }

    if (Changed) {
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_fmed3: {
    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
    // for the shader.

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    Value *Src2 = II.getArgOperand(2);

    // Checking for NaN before canonicalization provides better fidelity when
    // mapping other operations onto fmed3 since the order of operands is
    // unchanged.
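    // If one operand is a NaN or undef, the result depends only on the other
    // two operands, so fold to a minnum/maxnum of those, as done below.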
    CallInst *NewCall = nullptr;
    if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
      NewCall = IC.Builder.CreateMinNum(Src1, Src2);
    } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
      NewCall = IC.Builder.CreateMinNum(Src0, Src2);
    } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
      NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
    }

    if (NewCall) {
      NewCall->copyFastMathFlags(&II);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    bool Swap = false;
    // Canonicalize constants to RHS operands.
    //
    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
      std::swap(Src1, Src2);
      Swap = true;
    }

    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (Swap) {
      II.setArgOperand(0, Src0);
      II.setArgOperand(1, Src1);
      II.setArgOperand(2, Src2);
      return &II;
    }

    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
          APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
                                       C2->getValueAPF());
          return IC.replaceInstUsesWith(
              II, ConstantFP::get(IC.Builder.getContext(), Result));
        }
      }
    }

    break;
  }
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp: {
    const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
    // Guard against invalid arguments.
    int64_t CCVal = CC->getZExtValue();
    bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
      break;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
        Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
        if (CCmp->isNullValue()) {
          return IC.replaceInstUsesWith(
              II, ConstantExpr::getSExt(CCmp, II.getType()));
        }

        // The result of V_ICMP/V_FCMP assembly instructions (which this
        // intrinsic exposes) is one bit per thread, masked with the EXEC
        // register (which contains the bitmask of live threads). So a
        // comparison that always returns true is the same as a read of the
        // EXEC register.
        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addFnAttr(Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      // Canonicalize constants to RHS.
      CmpInst::Predicate SwapPred =
          CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
      II.setArgOperand(0, Src1);
      II.setArgOperand(1, Src0);
      II.setArgOperand(
          2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
      return &II;
    }

    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
      break;

    // Canonicalize compare eq with true value to compare != 0
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
    Value *ExtSrc;
    if (CCVal == CmpInst::ICMP_EQ &&
        ((match(Src1, PatternMatch::m_One()) &&
          match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
         (match(Src1, PatternMatch::m_AllOnes()) &&
          match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(1)) {
      IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
      IC.replaceOperand(II, 2,
                        ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
      return &II;
    }

    CmpInst::Predicate SrcPred;
    Value *SrcLHS;
    Value *SrcRHS;

    // Fold compare eq/ne with 0 from a compare result as the predicate to the
    // intrinsic. The typical use is a wave vote function in the library, which
    // will be fed from a user code condition compared with 0. Fold in the
    // redundant compare.

    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(Src1, PatternMatch::m_Zero()) &&
        match(Src0, PatternMatch::m_ZExtOrSExt(
                        m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
                              PatternMatch::m_Value(SrcRHS))))) {
      if (CCVal == CmpInst::ICMP_EQ)
        SrcPred = CmpInst::getInversePredicate(SrcPred);

      Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
                                 ? Intrinsic::amdgcn_fcmp
                                 : Intrinsic::amdgcn_icmp;

      Type *Ty = SrcLHS->getType();
      if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
        // Promote to next legal integer type.
        unsigned Width = CmpType->getBitWidth();
        unsigned NewWidth = Width;

        // Don't do anything for i1 comparisons.
        if (Width == 1)
          break;

        if (Width <= 16)
          NewWidth = 16;
        else if (Width <= 32)
          NewWidth = 32;
        else if (Width <= 64)
          NewWidth = 64;
        else if (Width > 64)
          break; // Can't handle this.

        if (Width != NewWidth) {
          IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
          if (CmpInst::isSigned(SrcPred)) {
            SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
          } else {
            SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
          }
        }
      } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
        break;

      Function *NewF = Intrinsic::getDeclaration(
          II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
      Value *Args[] = {SrcLHS, SrcRHS,
                       ConstantInt::get(CC->getType(), SrcPred)};
      CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    break;
  }
  case Intrinsic::amdgcn_ballot: {
    if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
      if (Src->isZero()) {
        // amdgcn.ballot(i1 0) is zero.
        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
      }

      if (Src->isOne()) {
        // amdgcn.ballot(i1 1) is exec.
        const char *RegName = "exec";
        if (II.getType()->isIntegerTy(32))
          RegName = "exec_lo";
        else if (!II.getType()->isIntegerTy(64))
          break;

        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addFnAttr(Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }
    }
    break;
  }
  case Intrinsic::amdgcn_wqm_vote: {
    // wqm_vote is identity when the argument is constant.
    if (!isa<Constant>(II.getArgOperand(0)))
      break;

    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  }
  case Intrinsic::amdgcn_kill: {
    const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
    if (!C || !C->getZExtValue())
      break;

    // amdgcn.kill(i1 1) is a no-op
    return IC.eraseInstFromFunction(II);
  }
  case Intrinsic::amdgcn_update_dpp: {
    Value *Old = II.getArgOperand(0);

    auto *BC = cast<ConstantInt>(II.getArgOperand(5));
    auto *RM = cast<ConstantInt>(II.getArgOperand(3));
    auto *BM = cast<ConstantInt>(II.getArgOperand(4));
    if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
        BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
      break;

    // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
    return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
  }
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16: {
    // Discard vdst_in if it's not going to be read.
    Value *VDstIn = II.getArgOperand(0);
    if (isa<UndefValue>(VDstIn))
      break;

    ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
    ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
    if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
      break;

    return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
  }
  case Intrinsic::amdgcn_permlane64:
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }
    break;
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_readlane: {
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }

    // The rest of these folds may not be safe if EXEC is not the same between
    // the def and the use.
    Value *Src = II.getArgOperand(0);
    Instruction *SrcInst = dyn_cast<Instruction>(Src);
    if (SrcInst && SrcInst->getParent() != II.getParent())
      break;

    // readfirstlane (readfirstlane x) -> readfirstlane x
    // readlane (readfirstlane x), y -> readfirstlane x
    if (match(Src,
              PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
      return IC.replaceInstUsesWith(II, Src);
    }

    if (IID == Intrinsic::amdgcn_readfirstlane) {
      // readfirstlane (readlane x, y) -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
        return IC.replaceInstUsesWith(II, Src);
      }
    } else {
      // readlane (readlane x, y), y -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
                         PatternMatch::m_Value(),
                         PatternMatch::m_Specific(II.getArgOperand(1))))) {
        return IC.replaceInstUsesWith(II, Src);
      }
    }

    break;
  }
  case Intrinsic::amdgcn_ldexp: {
    // FIXME: This doesn't introduce new instructions and belongs in
    // InstructionSimplify.
    Type *Ty = II.getType();
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // Folding undef to qnan is safe regardless of the FP mode.
    if (isa<UndefValue>(Op0)) {
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    const APFloat *C = nullptr;
    match(Op0, PatternMatch::m_APFloat(C));

    // FIXME: Should flush denorms depending on FP mode, but that's ignored
    // everywhere else.
    //
    // These cases should be safe, even with strictfp.
    // ldexp(0.0, x) -> 0.0
    // ldexp(-0.0, x) -> -0.0
    // ldexp(inf, x) -> inf
    // ldexp(-inf, x) -> -inf
    if (C && (C->isZero() || C->isInfinity())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // With strictfp, be more careful about possibly needing to flush denormals
    // or not, and snan behavior depends on ieee_mode.
    if (II.isStrictFP())
      break;

    if (C && C->isNaN()) {
      // FIXME: We just need to make the nan quiet here, but that's unavailable
      // on APFloat, only IEEEfloat
      auto *Quieted =
          ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
      return IC.replaceInstUsesWith(II, Quieted);
    }

    // ldexp(x, 0) -> x
    // ldexp(x, undef) -> x
    if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    break;
  }
  case Intrinsic::amdgcn_fmul_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP()))
      return IC.replaceInstUsesWith(II, ConstantFP::getNullValue(II.getType()));

    // If we can prove we don't have one of the special cases then we can use a
    // normal fmul instruction instead.
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
      FMul->takeName(&II);
      return IC.replaceInstUsesWith(II, FMul);
    }
    break;
  }
  case Intrinsic::amdgcn_fma_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    Value *Op2 = II.getArgOperand(2);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP())) {
      // It's tempting to just return Op2 here, but that would give the wrong
      // result if Op2 was -0.0.
      auto *Zero = ConstantFP::getNullValue(II.getType());
      auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
      FAdd->takeName(&II);
      return IC.replaceInstUsesWith(II, FAdd);
    }

    // If we can prove we don't have one of the special cases then we can use a
    // normal fma instead.
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      II.setCalledOperand(Intrinsic::getDeclaration(
          II.getModule(), Intrinsic::fma, II.getType()));
      return &II;
    }
    break;
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    if (isa<UndefValue>(II.getArgOperand(0)))
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));

    if (isa<ConstantPointerNull>(II.getArgOperand(0)))
      return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
    break;
  }
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
      return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
    }
  }
  }
  return None;
}

/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
/// struct returns.
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx = -1) {

  auto *IIVTy = cast<FixedVectorType>(II.getType());
  unsigned VWidth = IIVTy->getNumElements();
  if (VWidth == 1)
    return nullptr;

  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
  IC.Builder.SetInsertPoint(&II);

  // Assume the arguments are unchanged and later override them, if needed.
  SmallVector<Value *, 16> Args(II.args());

  if (DMaskIdx < 0) {
    // Buffer case.

    const unsigned ActiveBits = DemandedElts.getActiveBits();
    const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();

    // Start assuming the prefix of elements is demanded, but possibly clear
    // some other bits if there are trailing zeros (unused components at front)
    // and update offset.
    DemandedElts = (1 << ActiveBits) - 1;

    if (UnusedComponentsAtFront > 0) {
      static const unsigned InvalidOffsetIdx = 0xf;

      unsigned OffsetIdx;
      switch (II.getIntrinsicID()) {
      case Intrinsic::amdgcn_raw_buffer_load:
        OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_s_buffer_load:
        // If resulting type is vec3, there is no point in trimming the
        // load with updated offset, as the vec3 would most likely be widened to
        // vec4 anyway during lowering.
        if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
          OffsetIdx = InvalidOffsetIdx;
        else
          OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_struct_buffer_load:
        OffsetIdx = 2;
        break;
      default:
        // TODO: handle tbuffer* intrinsics.
        OffsetIdx = InvalidOffsetIdx;
        break;
      }

      if (OffsetIdx != InvalidOffsetIdx) {
        // Clear demanded bits and update the offset.
        DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
        auto *Offset = II.getArgOperand(OffsetIdx);
        unsigned SingleComponentSizeInBits =
            IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType());
        unsigned OffsetAdd =
            UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
        auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
        Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
      }
    }
  } else {
    // Image case.

    ConstantInt *DMask = cast<ConstantInt>(II.getArgOperand(DMaskIdx));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;

    // Mask off values that are undefined because the dmask doesn't cover them
    DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;

    unsigned NewDMaskVal = 0;
    unsigned OrigLoadIdx = 0;
    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
      const unsigned Bit = 1 << SrcIdx;
      if (!!(DMaskVal & Bit)) {
        if (!!DemandedElts[OrigLoadIdx])
          NewDMaskVal |= Bit;
        OrigLoadIdx++;
      }
    }

    if (DMaskVal != NewDMaskVal)
      Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
  }

  unsigned NewNumElts = DemandedElts.countPopulation();
  if (!NewNumElts)
    return UndefValue::get(II.getType());

  if (NewNumElts >= VWidth && DemandedElts.isMask()) {
    if (DMaskIdx >= 0)
      II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
    return nullptr;
  }

  // Validate function argument and return types, extracting overloaded types
  // along the way.
  SmallVector<Type *, 6> OverloadTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
    return nullptr;

  Module *M = II.getParent()->getParent()->getParent();
  Type *EltTy = IIVTy->getElementType();
  Type *NewTy =
      (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);

  OverloadTys[0] = NewTy;
  Function *NewIntrin =
      Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys);

  CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);

  if (NewNumElts == 1) {
    return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()),
                                          NewCall,
                                          DemandedElts.countTrailingZeros());
  }

  SmallVector<int, 8> EltMask;
  unsigned NewLoadIdx = 0;
  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
    if (!!DemandedElts[OrigLoadIdx])
      EltMask.push_back(NewLoadIdx++);
    else
      EltMask.push_back(NewNumElts);
  }

  Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);

  return Shuffle;
}

Optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {
  switch (II.getIntrinsicID()) {
  case Intrinsic::amdgcn_buffer_load:
  case Intrinsic::amdgcn_buffer_load_format:
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_s_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_tbuffer_load:
    return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
  default: {
    if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
      return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
    }
    break;
  }
  }
  return None;
}