//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements the AMDGPU-specific intrinsic simplifications for
// InstCombine. It is driven through the target's TargetTransformInfo hooks
// and uses detailed target information to fold and canonicalize AMDGPU
// intrinsic calls, while letting the target-independent InstCombine handle
// everything else.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNSubtarget.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

namespace {

struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace

// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
                           const APFloat &Src2) {
  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);

  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp0 == APFloat::cmpEqual)
    return maxnum(Src1, Src2);

  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp1 == APFloat::cmpEqual)
    return maxnum(Src0, Src2);

  return maxnum(Src0, Src1);
}

// Check if a value can be converted to a 16-bit value without losing
// precision.
// The value is expected to be either a float (IsFloat = true) or an unsigned
// integer (IsFloat = false).
static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
  Type *VTy = V.getType();
  if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
    // The value is already 16-bit, so we don't want to convert to 16-bit again!
    return false;
  }
  if (IsFloat) {
    if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
      // We need to check that if we cast the value down to a half, we do not
      // lose precision.
      APFloat FloatValue(ConstFloat->getValueAPF());
      bool LosesInfo = true;
      FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
                         &LosesInfo);
      return !LosesInfo;
    }
  } else {
    if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
      // We need to check that if we cast the value down to an i16, we do not
      // lose precision.
      APInt IntValue(ConstInt->getValue());
      return IntValue.getActiveBits() <= 16;
    }
  }

  Value *CastSrc;
  bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
                       : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
  if (IsExt) {
    Type *CastSrcTy = CastSrc->getType();
    if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
      return true;
  }

  return false;
}

// Convert a value to 16-bit.
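// If the value is an extension from a 16-bit type, simply return the source
// of the extension. Otherwise emit an explicit integer or floating-point cast
// down to 16 bits; canSafelyConvertTo16Bit() must already have verified that
// the conversion is lossless.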
static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
  Type *VTy = V.getType();
  if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
    return cast<Instruction>(&V)->getOperand(0);
  if (VTy->isIntegerTy())
    return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
  if (VTy->isFloatingPointTy())
    return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));

  llvm_unreachable("Should never be called!");
}

/// Applies Func(Args, ArgTys) to the call's arguments and overloaded types,
/// then replaces the intrinsic call with an equivalent call to NewIntr built
/// from the modified arguments.
static Optional<Instruction *> modifyIntrinsicCall(
    IntrinsicInst &II, unsigned NewIntr, InstCombiner &IC,
    std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
        Func) {
  SmallVector<Type *, 4> ArgTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys))
    return None;

  SmallVector<Value *, 8> Args(II.args());

  // Modify arguments and types
  Func(Args, ArgTys);

  Function *I = Intrinsic::getDeclaration(II.getModule(), NewIntr, ArgTys);

  CallInst *NewCall = IC.Builder.CreateCall(I, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);
  if (isa<FPMathOperator>(NewCall))
    NewCall->copyFastMathFlags(&II);

  // Erase and replace uses
  if (!II.getType()->isVoidTy())
    IC.replaceInstUsesWith(II, NewCall);
  return IC.eraseInstFromFunction(II);
}

static Optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                             IntrinsicInst &II, InstCombiner &IC) {
  // Optimize _L to _LZ when _L is zero
  if (const auto *LZMappingInfo =
          AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantLod =
            dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->LodIndex);
            });
      }
    }
  }

  // Optimize _mip away when 'lod' is zero
  if (const auto *MIPMappingInfo =
          AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantMip =
            dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
      if (ConstantMip->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->MipIndex);
            });
      }
    }
  }

  // Optimize _bias away when 'bias' is zero
  if (const auto *BiasMappingInfo =
          AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantBias =
            dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
      if (ConstantBias->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
              ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
            });
      }
    }
  }

  // Optimize _offset away when 'offset' is zero
  if (const auto *OffsetMappingInfo =
          AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantOffset =
            dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
      if (ConstantOffset->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(
                OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
            });
      }
    }
  }

  // Try to use A16 or G16
  if (!ST->hasA16() && !ST->hasG16())
    return None;

  // Address is interpreted as float if the instruction has a sampler or as
  // unsigned int if there is no sampler.
  bool HasSampler =
      AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
  bool FloatCoord = false;
  // If true, only the derivatives can be converted to 16-bit; the coordinates
  // cannot.
  bool OnlyDerivatives = false;

  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
    Value *Coord = II.getOperand(OperandIndex);
    // If the values are not derived from 16-bit values, we cannot optimize.
    if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
      if (OperandIndex < ImageDimIntr->CoordStart ||
          ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
        return None;
      }
      // All gradients can be converted, so convert only them
      OnlyDerivatives = true;
      break;
    }

    assert(OperandIndex == ImageDimIntr->GradientStart ||
           FloatCoord == Coord->getType()->isFloatingPointTy());
    FloatCoord = Coord->getType()->isFloatingPointTy();
  }

  if (!OnlyDerivatives && !ST->hasA16())
    OnlyDerivatives = true; // Only supports G16

  // Check if there is a bias parameter and if it can be converted to f16
  if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
    Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
    assert(HasSampler &&
           "Only image instructions with a sampler can have a bias");
    if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
      OnlyDerivatives = true;
  }

  if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
                                               ImageDimIntr->CoordStart))
    return None;

  Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
                               : Type::getInt16Ty(II.getContext());

  return modifyIntrinsicCall(
      II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
        ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
        if (!OnlyDerivatives) {
          ArgTys[ImageDimIntr->CoordTyArg] = CoordType;

          // Change the bias type
          if (ImageDimIntr->NumBiasArgs != 0)
            ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
        }

        unsigned EndIndex =
            OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
        for (unsigned OperandIndex = ImageDimIntr->GradientStart;
             OperandIndex < EndIndex; OperandIndex++) {
          Args[OperandIndex] =
              convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
        }

        // Convert the bias
        if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
          Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
          Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
        }
      });
}

bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
                                           InstCombiner &IC) const {
  // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
  // infinity, gives +0.0. If we can prove we don't have one of the special
  // cases then we can use a normal multiply instead.
  // TODO: Create and use isKnownFiniteNonZero instead of just matching
  // constants here.
  if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
      match(Op1, PatternMatch::m_FiniteNonZero())) {
    // One operand is not zero or infinity or NaN.
    return true;
  }
  auto *TLI = &IC.getTargetLibraryInfo();
  if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) &&
      isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) {
    // Neither operand is infinity or NaN.
    return true;
  }
  return false;
}

Optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::amdgcn_rcp: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    if (II.isStrictFP())
      break;

    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      const APFloat &ArgVal = C->getValueAPF();
      APFloat Val(ArgVal.getSemantics(), 1);
      Val.divide(ArgVal, APFloat::rmNearestTiesToEven);

      // This is more precise than the instruction may give.
      //
      // TODO: The instruction always flushes denormal results (except for
      // f16), should this also?
      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
    }

    break;
  }
  case Intrinsic::amdgcn_rsq: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    break;
  }
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_frexp_exp: {
    Value *Src = II.getArgOperand(0);
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      int Exp;
      APFloat Significand =
          frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);

      if (IID == Intrinsic::amdgcn_frexp_mant) {
        return IC.replaceInstUsesWith(
            II, ConstantFP::get(II.getContext(), Significand));
      }

      // Match instruction special case behavior.
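      // The instruction returns an exponent of 0 for nan and infinity inputs,
      // so fold those to 0 here as well.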
      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
        Exp = 0;

      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
    }

    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_class: {
    enum {
      S_NAN = 1 << 0,       // Signaling NaN
      Q_NAN = 1 << 1,       // Quiet NaN
      N_INFINITY = 1 << 2,  // Negative infinity
      N_NORMAL = 1 << 3,    // Negative normal
      N_SUBNORMAL = 1 << 4, // Negative subnormal
      N_ZERO = 1 << 5,      // Negative zero
      P_ZERO = 1 << 6,      // Positive zero
      P_SUBNORMAL = 1 << 7, // Positive subnormal
      P_NORMAL = 1 << 8,    // Positive normal
      P_INFINITY = 1 << 9   // Positive infinity
    };

    const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
                              N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL |
                              P_NORMAL | P_INFINITY;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
    if (!CMask) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      if (isa<UndefValue>(Src1)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), false));
      }
      break;
    }

    uint32_t Mask = CMask->getZExtValue();

    // If all tests are made, it doesn't matter what the value is.
    if ((Mask & FullMask) == FullMask) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
    }

    if ((Mask & FullMask) == 0) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
    }

    if (Mask == (S_NAN | Q_NAN)) {
      // Equivalent of isnan. Replace with standard fcmp.
      Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    if (Mask == (N_ZERO | P_ZERO)) {
      // Equivalent of == 0.
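      // An ordered equality compare against +0.0 also matches -0.0, so this
      // covers both the N_ZERO and P_ZERO bits.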
      Value *FCmp =
          IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));

      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
    if (((Mask & S_NAN) || (Mask & Q_NAN)) &&
        isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
      return IC.replaceOperand(
          II, 1, ConstantInt::get(Src1->getType(), Mask & ~(S_NAN | Q_NAN)));
    }

    const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
    if (!CVal) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      // Clamp mask to used bits
      if ((Mask & FullMask) != Mask) {
        CallInst *NewCall = IC.Builder.CreateCall(
            II.getCalledFunction(),
            {Src0, ConstantInt::get(Src1->getType(), Mask & FullMask)});

        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      break;
    }

    const APFloat &Val = CVal->getValueAPF();

    bool Result =
        ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
        ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
        ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
        ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
        ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
        ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
        ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
        ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
        ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
        ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());

    return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
  }
  case Intrinsic::amdgcn_cvt_pkrtz: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        const fltSemantics &HalfSem =
            II.getType()->getScalarType()->getFltSemantics();
        bool LosesInfo;
        APFloat Val0 = C0->getValueAPF();
        APFloat Val1 = C1->getValueAPF();
        Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
        Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);

        Constant *Folded =
            ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
                                 ConstantFP::get(II.getContext(), Val1)});
        return IC.replaceInstUsesWith(II, Folded);
      }
    }

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
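    // bfe(x, offset, width) extracts 'width' bits of 'x' starting at bit
    // 'offset'. With constant operands this becomes a shl to drop the high
    // bits followed by a logical (ubfe) or arithmetic (sbfe) shift right, or
    // just a single right shift when the field reaches the top bit.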
    Value *Src = II.getArgOperand(0);
    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, Src);
    }

    unsigned Width;
    Type *Ty = II.getType();
    unsigned IntSize = Ty->getIntegerBitWidth();

    ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      if ((Width & (IntSize - 1)) == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
      }

      // Hardware ignores high bits, so remove those.
      if (Width >= IntSize) {
        return IC.replaceOperand(
            II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
      }
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      if (Offset >= IntSize) {
        return IC.replaceOperand(
            II, 1,
            ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
      }
    }

    bool Signed = IID == Intrinsic::amdgcn_sbfe;

    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, the ashr and lshr instructions would be poison,
    // since the shift amount would equal the bit width.
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
    if (Offset + Width < IntSize) {
      Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
      Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
                                 : IC.Builder.CreateLShr(Shl, IntSize - Width);
      RightShift->takeName(&II);
      return IC.replaceInstUsesWith(II, RightShift);
    }

    Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
                               : IC.Builder.CreateLShr(Src, Offset);

    RightShift->takeName(&II);
    return IC.replaceInstUsesWith(II, RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_compr: {
    ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
    unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
        Value *Src = II.getArgOperand(I + 2);
        if (!isa<UndefValue>(Src)) {
          IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
          Changed = true;
        }
      }
    }

    if (Changed) {
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_fmed3: {
    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
    // for the shader.

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    Value *Src2 = II.getArgOperand(2);

    // Checking for NaN before canonicalization provides better fidelity when
    // mapping other operations onto fmed3 since the order of operands is
    // unchanged.
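    // With a single NaN or undef operand, fmed3 folds to a two-operand
    // min/max of the remaining operands.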
    CallInst *NewCall = nullptr;
    if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
      NewCall = IC.Builder.CreateMinNum(Src1, Src2);
    } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
      NewCall = IC.Builder.CreateMinNum(Src0, Src2);
    } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
      NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
    }

    if (NewCall) {
      NewCall->copyFastMathFlags(&II);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    bool Swap = false;
    // Canonicalize constants to RHS operands.
    //
    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
      std::swap(Src1, Src2);
      Swap = true;
    }

    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (Swap) {
      II.setArgOperand(0, Src0);
      II.setArgOperand(1, Src1);
      II.setArgOperand(2, Src2);
      return &II;
    }

    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
          APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
                                       C2->getValueAPF());
          return IC.replaceInstUsesWith(
              II, ConstantFP::get(IC.Builder.getContext(), Result));
        }
      }
    }

    break;
  }
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp: {
    const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
    // Guard against invalid arguments.
    int64_t CCVal = CC->getZExtValue();
    bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
      break;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
        Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
        if (CCmp->isNullValue()) {
          return IC.replaceInstUsesWith(
              II, ConstantExpr::getSExt(CCmp, II.getType()));
        }

        // The result of V_ICMP/V_FCMP assembly instructions (which this
        // intrinsic exposes) is one bit per thread, masked with the EXEC
        // register (which contains the bitmask of live threads). So a
        // comparison that always returns true is the same as a read of the
        // EXEC register.
        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addFnAttr(Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      // Canonicalize constants to RHS.
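      // Swapping the operands also requires swapping the predicate so the
      // comparison keeps its meaning.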
      CmpInst::Predicate SwapPred =
          CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
      II.setArgOperand(0, Src1);
      II.setArgOperand(1, Src0);
      II.setArgOperand(
          2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
      return &II;
    }

    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
      break;

    // Canonicalize compare eq with true value to compare != 0
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
    Value *ExtSrc;
    if (CCVal == CmpInst::ICMP_EQ &&
        ((match(Src1, PatternMatch::m_One()) &&
          match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
         (match(Src1, PatternMatch::m_AllOnes()) &&
          match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(1)) {
      IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
      IC.replaceOperand(II, 2,
                        ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
      return &II;
    }

    CmpInst::Predicate SrcPred;
    Value *SrcLHS;
    Value *SrcRHS;

    // Fold compare eq/ne with 0 from a compare result as the predicate to the
    // intrinsic. The typical use is a wave vote function in the library, which
    // will be fed from a user code condition compared with 0. Fold in the
    // redundant compare.

    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(Src1, PatternMatch::m_Zero()) &&
        match(Src0, PatternMatch::m_ZExtOrSExt(
                        m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
                              PatternMatch::m_Value(SrcRHS))))) {
      if (CCVal == CmpInst::ICMP_EQ)
        SrcPred = CmpInst::getInversePredicate(SrcPred);

      Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
                                 ? Intrinsic::amdgcn_fcmp
                                 : Intrinsic::amdgcn_icmp;

      Type *Ty = SrcLHS->getType();
      if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
        // Promote to next legal integer type.
        unsigned Width = CmpType->getBitWidth();
        unsigned NewWidth = Width;

        // Don't do anything for i1 comparisons.
        if (Width == 1)
          break;

        if (Width <= 16)
          NewWidth = 16;
        else if (Width <= 32)
          NewWidth = 32;
        else if (Width <= 64)
          NewWidth = 64;
        else if (Width > 64)
          break; // Can't handle this.

        if (Width != NewWidth) {
          IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
          if (CmpInst::isSigned(SrcPred)) {
            SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
          } else {
            SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
          }
        }
      } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
        break;

      Function *NewF = Intrinsic::getDeclaration(
          II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
      Value *Args[] = {SrcLHS, SrcRHS,
                       ConstantInt::get(CC->getType(), SrcPred)};
      CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    break;
  }
  case Intrinsic::amdgcn_ballot: {
    if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
      if (Src->isZero()) {
        // amdgcn.ballot(i1 0) is zero.
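        // The condition is false in every lane, so no bits of the result mask
        // can be set.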
        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
      }

      if (Src->isOne()) {
        // amdgcn.ballot(i1 1) is exec.
        const char *RegName = "exec";
        if (II.getType()->isIntegerTy(32))
          RegName = "exec_lo";
        else if (!II.getType()->isIntegerTy(64))
          break;

        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addFnAttr(Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }
    }
    break;
  }
  case Intrinsic::amdgcn_wqm_vote: {
    // wqm_vote is identity when the argument is constant.
    if (!isa<Constant>(II.getArgOperand(0)))
      break;

    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  }
  case Intrinsic::amdgcn_kill: {
    const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
    if (!C || !C->getZExtValue())
      break;

    // amdgcn.kill(i1 1) is a no-op
    return IC.eraseInstFromFunction(II);
  }
  case Intrinsic::amdgcn_update_dpp: {
    Value *Old = II.getArgOperand(0);

    auto *BC = cast<ConstantInt>(II.getArgOperand(5));
    auto *RM = cast<ConstantInt>(II.getArgOperand(3));
    auto *BM = cast<ConstantInt>(II.getArgOperand(4));
    if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
        BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
      break;

    // If bound_ctrl = 1 and row mask = bank mask = 0xf, we can omit the old
    // value.
    return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
  }
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16: {
    // Discard vdst_in if it's not going to be read.
    Value *VDstIn = II.getArgOperand(0);
    if (isa<UndefValue>(VDstIn))
      break;

    ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
    ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
    if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
      break;

    return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
  }
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_readlane: {
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }

    // The rest of these may not be safe if the exec may not be the same
    // between the def and use.
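    // As a conservative approximation, only handle sources defined in the
    // same block as the use.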
    Value *Src = II.getArgOperand(0);
    Instruction *SrcInst = dyn_cast<Instruction>(Src);
    if (SrcInst && SrcInst->getParent() != II.getParent())
      break;

    // readfirstlane (readfirstlane x) -> readfirstlane x
    // readlane (readfirstlane x), y -> readfirstlane x
    if (match(Src,
              PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
      return IC.replaceInstUsesWith(II, Src);
    }

    if (IID == Intrinsic::amdgcn_readfirstlane) {
      // readfirstlane (readlane x, y) -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
        return IC.replaceInstUsesWith(II, Src);
      }
    } else {
      // readlane (readlane x, y), y -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
                         PatternMatch::m_Value(),
                         PatternMatch::m_Specific(II.getArgOperand(1))))) {
        return IC.replaceInstUsesWith(II, Src);
      }
    }

    break;
  }
  case Intrinsic::amdgcn_ldexp: {
    // FIXME: This doesn't introduce new instructions and belongs in
    // InstructionSimplify.
    Type *Ty = II.getType();
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // Folding undef to qnan is safe regardless of the FP mode.
    if (isa<UndefValue>(Op0)) {
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    const APFloat *C = nullptr;
    match(Op0, PatternMatch::m_APFloat(C));

    // FIXME: Should flush denorms depending on FP mode, but that's ignored
    // everywhere else.
    //
    // These cases should be safe, even with strictfp.
    // ldexp(0.0, x) -> 0.0
    // ldexp(-0.0, x) -> -0.0
    // ldexp(inf, x) -> inf
    // ldexp(-inf, x) -> -inf
    if (C && (C->isZero() || C->isInfinity())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // With strictfp, be more careful about possibly needing to flush denormals
    // or not, and snan behavior depends on ieee_mode.
    if (II.isStrictFP())
      break;

    if (C && C->isNaN()) {
      // FIXME: We just need to make the nan quiet here, but that's unavailable
      // on APFloat, only IEEEfloat
      auto *Quieted =
          ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
      return IC.replaceInstUsesWith(II, Quieted);
    }

    // ldexp(x, 0) -> x
    // ldexp(x, undef) -> x
    if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    break;
  }
  case Intrinsic::amdgcn_fmul_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP()))
      return IC.replaceInstUsesWith(II, ConstantFP::getNullValue(II.getType()));

    // If we can prove we don't have one of the special cases then we can use a
    // normal fmul instruction instead.
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
      FMul->takeName(&II);
      return IC.replaceInstUsesWith(II, FMul);
    }
    break;
  }
  case Intrinsic::amdgcn_fma_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    Value *Op2 = II.getArgOperand(2);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP())) {
      // It's tempting to just return Op2 here, but that would give the wrong
      // result if Op2 was -0.0.
      auto *Zero = ConstantFP::getNullValue(II.getType());
      auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
      FAdd->takeName(&II);
      return IC.replaceInstUsesWith(II, FAdd);
    }

    // If we can prove we don't have one of the special cases then we can use a
    // normal fma instead.
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      II.setCalledOperand(Intrinsic::getDeclaration(
          II.getModule(), Intrinsic::fma, II.getType()));
      return &II;
    }
    break;
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    if (isa<UndefValue>(II.getArgOperand(0)))
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));

    if (isa<ConstantPointerNull>(II.getArgOperand(0)))
      return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
    break;
  }
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
      return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
    }
  }
  }
  return None;
}

/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image
/// intrinsics.
///
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
/// struct returns.
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx = -1) {

  auto *IIVTy = cast<FixedVectorType>(II.getType());
  unsigned VWidth = IIVTy->getNumElements();
  if (VWidth == 1)
    return nullptr;

  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
  IC.Builder.SetInsertPoint(&II);

  // Assume the arguments are unchanged and later override them, if needed.
  SmallVector<Value *, 16> Args(II.args());

  if (DMaskIdx < 0) {
    // Buffer case.

    const unsigned ActiveBits = DemandedElts.getActiveBits();
    const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();

    // Start assuming the prefix of elements is demanded, but possibly clear
    // some other bits if there are trailing zeros (unused components at front)
    // and update offset.
    DemandedElts = (1 << ActiveBits) - 1;

    if (UnusedComponentsAtFront > 0) {
      static const unsigned InvalidOffsetIdx = 0xf;

      unsigned OffsetIdx;
      switch (II.getIntrinsicID()) {
      case Intrinsic::amdgcn_raw_buffer_load:
        OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_s_buffer_load:
        // If resulting type is vec3, there is no point in trimming the
        // load with updated offset, as the vec3 would most likely be widened
        // to vec4 anyway during lowering.
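        // ActiveBits == 4 with a single unused component at the front is that
        // vec3 case, so leave the offset alone.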
        if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
          OffsetIdx = InvalidOffsetIdx;
        else
          OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_struct_buffer_load:
        OffsetIdx = 2;
        break;
      default:
        // TODO: handle tbuffer* intrinsics.
        OffsetIdx = InvalidOffsetIdx;
        break;
      }

      if (OffsetIdx != InvalidOffsetIdx) {
        // Clear demanded bits and update the offset.
        DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
        auto *Offset = II.getArgOperand(OffsetIdx);
        unsigned SingleComponentSizeInBits =
            IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType());
        unsigned OffsetAdd =
            UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
        auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
        Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
      }
    }
  } else {
    // Image case.

    ConstantInt *DMask = cast<ConstantInt>(II.getArgOperand(DMaskIdx));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;

    // Mask off values that are undefined because the dmask doesn't cover them
    DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;

    unsigned NewDMaskVal = 0;
    unsigned OrigLoadIdx = 0;
    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
      const unsigned Bit = 1 << SrcIdx;
      if (!!(DMaskVal & Bit)) {
        if (!!DemandedElts[OrigLoadIdx])
          NewDMaskVal |= Bit;
        OrigLoadIdx++;
      }
    }

    if (DMaskVal != NewDMaskVal)
      Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
  }

  unsigned NewNumElts = DemandedElts.countPopulation();
  if (!NewNumElts)
    return UndefValue::get(II.getType());

  if (NewNumElts >= VWidth && DemandedElts.isMask()) {
    if (DMaskIdx >= 0)
      II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
    return nullptr;
  }

  // Validate function argument and return types, extracting overloaded types
  // along the way.
  SmallVector<Type *, 6> OverloadTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
    return nullptr;

  Module *M = II.getParent()->getParent()->getParent();
  Type *EltTy = IIVTy->getElementType();
  Type *NewTy =
      (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);

  OverloadTys[0] = NewTy;
  Function *NewIntrin =
      Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys);

  CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);

  if (NewNumElts == 1) {
    return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()),
                                          NewCall,
                                          DemandedElts.countTrailingZeros());
  }

  SmallVector<int, 8> EltMask;
  unsigned NewLoadIdx = 0;
  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
    if (!!DemandedElts[OrigLoadIdx])
      EltMask.push_back(NewLoadIdx++);
    else
      EltMask.push_back(NewNumElts);
  }

  Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);

  return Shuffle;
}

Optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {
  switch (II.getIntrinsicID()) {
  case Intrinsic::amdgcn_buffer_load:
  case Intrinsic::amdgcn_buffer_load_format:
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_s_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_tbuffer_load:
    return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
  default: {
    if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
      return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
    }
    break;
  }
  }
  return None;
}