//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements the AMDGPU specific hooks of the InstCombine pass.
// It folds and simplifies calls to AMDGPU intrinsics using target-specific
// knowledge, and is invoked from the generic InstCombiner through
// TargetTransformInfo.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNSubtarget.h"
#include "R600Subtarget.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

namespace {

struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace

// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
                           const APFloat &Src2) {
  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);

  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp0 == APFloat::cmpEqual)
    return maxnum(Src1, Src2);

  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp1 == APFloat::cmpEqual)
    return maxnum(Src0, Src2);

  return maxnum(Src0, Src1);
}

// Check if a value can be converted to a 16-bit value without losing
// precision.
static bool canSafelyConvertTo16Bit(Value &V) {
  Type *VTy = V.getType();
  if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
    // The value is already 16-bit, so we don't want to convert to 16-bit again!
    return false;
  }
  if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
    // We need to check that if we cast the constant down to a half, we do not
    // lose precision.
    APFloat FloatValue(ConstFloat->getValueAPF());
    bool LosesInfo = true;
    FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
    return !LosesInfo;
  }
  Value *CastSrc;
  if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) ||
      match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) ||
      match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) {
    Type *CastSrcTy = CastSrc->getType();
    if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
      return true;
  }

  return false;
}

// Convert a value to 16-bit.
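//
// In this file it is only called on values for which canSafelyConvertTo16Bit()
// returned true: extensions of 16-bit sources are simply stripped back to the
// source, while other integer or floating-point values are narrowed with an
// integer or FP cast.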
static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
  Type *VTy = V.getType();
  if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
    return cast<Instruction>(&V)->getOperand(0);
  if (VTy->isIntegerTy())
    return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
  if (VTy->isFloatingPointTy())
    return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));

  llvm_unreachable("Should never be called!");
}

static Optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                             IntrinsicInst &II, InstCombiner &IC) {
  if (!ST->hasA16() && !ST->hasG16())
    return None;

  bool FloatCoord = false;
  // true means derivatives can be converted to 16 bit, coordinates not
  bool OnlyDerivatives = false;

  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
    Value *Coord = II.getOperand(OperandIndex);
    // If the values are not derived from 16-bit values, we cannot optimize.
    if (!canSafelyConvertTo16Bit(*Coord)) {
      if (OperandIndex < ImageDimIntr->CoordStart ||
          ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
        return None;
      }
      // All gradients can be converted, so convert only them
      OnlyDerivatives = true;
      break;
    }

    assert(OperandIndex == ImageDimIntr->GradientStart ||
           FloatCoord == Coord->getType()->isFloatingPointTy());
    FloatCoord = Coord->getType()->isFloatingPointTy();
  }

  if (OnlyDerivatives) {
    if (!ST->hasG16())
      return None;
  } else {
    if (!ST->hasA16())
      OnlyDerivatives = true; // Only supports G16
  }

  Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
                               : Type::getInt16Ty(II.getContext());

  SmallVector<Type *, 4> ArgTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys))
    return None;

  ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
  if (!OnlyDerivatives)
    ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
  Function *I =
      Intrinsic::getDeclaration(II.getModule(), II.getIntrinsicID(), ArgTys);

  SmallVector<Value *, 8> Args(II.arg_operands());

  unsigned EndIndex =
      OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < EndIndex; OperandIndex++) {
    Args[OperandIndex] =
        convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
  }

  CallInst *NewCall = IC.Builder.CreateCall(I, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);
  if (isa<FPMathOperator>(NewCall))
    NewCall->copyFastMathFlags(&II);
  return IC.replaceInstUsesWith(II, NewCall);
}

bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
                                           InstCombiner &IC) const {
  // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
  // infinity, gives +0.0. If we can prove we don't have one of the special
  // cases then we can use a normal multiply instead.
  // TODO: Create and use isKnownFiniteNonZero instead of just matching
  // constants here.
  if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
      match(Op1, PatternMatch::m_FiniteNonZero())) {
    // One operand is not zero or infinity or NaN.
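    // For example (illustrative), a literal operand such as 2.0 already rules
    // out the +/-0.0 * inf/NaN special case, so a plain fmul is equivalent.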
    return true;
  }
  auto *TLI = &IC.getTargetLibraryInfo();
  if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) &&
      isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) {
    // Neither operand is infinity or NaN.
    return true;
  }
  return false;
}

Optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::amdgcn_rcp: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    if (II.isStrictFP())
      break;

    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      const APFloat &ArgVal = C->getValueAPF();
      APFloat Val(ArgVal.getSemantics(), 1);
      Val.divide(ArgVal, APFloat::rmNearestTiesToEven);

      // This is more precise than the instruction may give.
      //
      // TODO: The instruction always flushes denormal results (except for f16),
      // should this also?
      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
    }

    break;
  }
  case Intrinsic::amdgcn_rsq: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    break;
  }
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_frexp_exp: {
    Value *Src = II.getArgOperand(0);
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      int Exp;
      APFloat Significand =
          frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);

      if (IID == Intrinsic::amdgcn_frexp_mant) {
        return IC.replaceInstUsesWith(
            II, ConstantFP::get(II.getContext(), Significand));
      }

      // Match instruction special case behavior.
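      // frexp_exp returns 0 for NaN and infinity inputs rather than the
      // IEK_NaN/IEK_Inf sentinels reported by APFloat's frexp.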
      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
        Exp = 0;

      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
    }

    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_class: {
    enum {
      S_NAN = 1 << 0,       // Signaling NaN
      Q_NAN = 1 << 1,       // Quiet NaN
      N_INFINITY = 1 << 2,  // Negative infinity
      N_NORMAL = 1 << 3,    // Negative normal
      N_SUBNORMAL = 1 << 4, // Negative subnormal
      N_ZERO = 1 << 5,      // Negative zero
      P_ZERO = 1 << 6,      // Positive zero
      P_SUBNORMAL = 1 << 7, // Positive subnormal
      P_NORMAL = 1 << 8,    // Positive normal
      P_INFINITY = 1 << 9   // Positive infinity
    };

    const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
                              N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL |
                              P_NORMAL | P_INFINITY;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
    if (!CMask) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      if (isa<UndefValue>(Src1)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), false));
      }
      break;
    }

    uint32_t Mask = CMask->getZExtValue();

    // If every class bit is tested, the result is true regardless of the value.
    if ((Mask & FullMask) == FullMask) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
    }

    if ((Mask & FullMask) == 0) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
    }

    if (Mask == (S_NAN | Q_NAN)) {
      // Equivalent of isnan. Replace with standard fcmp.
      Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    if (Mask == (N_ZERO | P_ZERO)) {
      // Equivalent of == 0.
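      // For example (illustrative), with a mask of N_ZERO | P_ZERO (0x60):
      //   %r = call i1 @llvm.amdgcn.class.f32(float %x, i32 96)
      // becomes
      //   %r = fcmp oeq float %x, 0.0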
      Value *FCmp =
          IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));

      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
    if (((Mask & S_NAN) || (Mask & Q_NAN)) &&
        isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
      return IC.replaceOperand(
          II, 1, ConstantInt::get(Src1->getType(), Mask & ~(S_NAN | Q_NAN)));
    }

    const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
    if (!CVal) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      // Clamp mask to used bits
      if ((Mask & FullMask) != Mask) {
        CallInst *NewCall = IC.Builder.CreateCall(
            II.getCalledFunction(),
            {Src0, ConstantInt::get(Src1->getType(), Mask & FullMask)});

        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      break;
    }

    const APFloat &Val = CVal->getValueAPF();

    bool Result =
        ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
        ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
        ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
        ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
        ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
        ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
        ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
        ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
        ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
        ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());

    return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
  }
  case Intrinsic::amdgcn_cvt_pkrtz: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        const fltSemantics &HalfSem =
            II.getType()->getScalarType()->getFltSemantics();
        bool LosesInfo;
        APFloat Val0 = C0->getValueAPF();
        APFloat Val1 = C1->getValueAPF();
        Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
        Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);

        Constant *Folded =
            ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
                                 ConstantFP::get(II.getContext(), Val1)});
        return IC.replaceInstUsesWith(II, Folded);
      }
    }

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
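    // For example (illustrative), extracting bits [8..15] of a 32-bit value:
    //   %r = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 8, i32 8)
    // becomes
    //   %t = shl i32 %x, 16
    //   %r = lshr i32 %t, 24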
    Value *Src = II.getArgOperand(0);
    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, Src);
    }

    unsigned Width;
    Type *Ty = II.getType();
    unsigned IntSize = Ty->getIntegerBitWidth();

    ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      if ((Width & (IntSize - 1)) == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
      }

      // Hardware ignores high bits, so remove those.
      if (Width >= IntSize) {
        return IC.replaceOperand(
            II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
      }
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      if (Offset >= IntSize) {
        return IC.replaceOperand(
            II, 1,
            ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
      }
    }

    bool Signed = IID == Intrinsic::amdgcn_sbfe;

    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, the ashr and lshr below would produce poison, since
    // the shift amount would equal the bit width.
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
    if (Offset + Width < IntSize) {
      Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
      Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
                                 : IC.Builder.CreateLShr(Shl, IntSize - Width);
      RightShift->takeName(&II);
      return IC.replaceInstUsesWith(II, RightShift);
    }

    Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
                               : IC.Builder.CreateLShr(Src, Offset);

    RightShift->takeName(&II);
    return IC.replaceInstUsesWith(II, RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_compr: {
    ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
    unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
        Value *Src = II.getArgOperand(I + 2);
        if (!isa<UndefValue>(Src)) {
          IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
          Changed = true;
        }
      }
    }

    if (Changed) {
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_fmed3: {
    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
    // for the shader.

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    Value *Src2 = II.getArgOperand(2);

    // Checking for NaN before canonicalization provides better fidelity when
    // mapping other operations onto fmed3 since the order of operands is
    // unchanged.
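    // A single NaN (or undef) operand reduces the median to a minnum/maxnum of
    // the other two operands, which is what the folds below produce.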
    CallInst *NewCall = nullptr;
    if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
      NewCall = IC.Builder.CreateMinNum(Src1, Src2);
    } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
      NewCall = IC.Builder.CreateMinNum(Src0, Src2);
    } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
      NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
    }

    if (NewCall) {
      NewCall->copyFastMathFlags(&II);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    bool Swap = false;
    // Canonicalize constants to RHS operands.
    //
    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
      std::swap(Src1, Src2);
      Swap = true;
    }

    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (Swap) {
      II.setArgOperand(0, Src0);
      II.setArgOperand(1, Src1);
      II.setArgOperand(2, Src2);
      return &II;
    }

    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
          APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
                                       C2->getValueAPF());
          return IC.replaceInstUsesWith(
              II, ConstantFP::get(IC.Builder.getContext(), Result));
        }
      }
    }

    break;
  }
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp: {
    const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
    // Guard against invalid arguments.
    int64_t CCVal = CC->getZExtValue();
    bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
      break;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
        Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
        if (CCmp->isNullValue()) {
          return IC.replaceInstUsesWith(
              II, ConstantExpr::getSExt(CCmp, II.getType()));
        }

        // The result of V_ICMP/V_FCMP assembly instructions (which this
        // intrinsic exposes) is one bit per thread, masked with the EXEC
        // register (which contains the bitmask of live threads). So a
        // comparison that always returns true is the same as a read of the
        // EXEC register.
        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addAttribute(AttributeList::FunctionIndex,
                              Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      // Canonicalize constants to RHS.
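      // For example (illustrative):
      //   llvm.amdgcn.icmp(i32 7, i32 %x, slt)
      //     -> llvm.amdgcn.icmp(i32 %x, i32 7, sgt)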
      CmpInst::Predicate SwapPred =
          CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
      II.setArgOperand(0, Src1);
      II.setArgOperand(1, Src0);
      II.setArgOperand(
          2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
      return &II;
    }

    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
      break;

    // Canonicalize compare eq with true value to compare != 0
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
    Value *ExtSrc;
    if (CCVal == CmpInst::ICMP_EQ &&
        ((match(Src1, PatternMatch::m_One()) &&
          match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
         (match(Src1, PatternMatch::m_AllOnes()) &&
          match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(1)) {
      IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
      IC.replaceOperand(II, 2,
                        ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
      return &II;
    }

    CmpInst::Predicate SrcPred;
    Value *SrcLHS;
    Value *SrcRHS;

    // Fold compare eq/ne with 0 from a compare result as the predicate to the
    // intrinsic. The typical use is a wave vote function in the library, which
    // will be fed from a user code condition compared with 0. Fold in the
    // redundant compare.

    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(Src1, PatternMatch::m_Zero()) &&
        match(Src0, PatternMatch::m_ZExtOrSExt(
                        m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
                              PatternMatch::m_Value(SrcRHS))))) {
      if (CCVal == CmpInst::ICMP_EQ)
        SrcPred = CmpInst::getInversePredicate(SrcPred);

      Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
                                 ? Intrinsic::amdgcn_fcmp
                                 : Intrinsic::amdgcn_icmp;

      Type *Ty = SrcLHS->getType();
      if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
        // Promote to next legal integer type.
        unsigned Width = CmpType->getBitWidth();
        unsigned NewWidth = Width;

        // Don't do anything for i1 comparisons.
        if (Width == 1)
          break;

        if (Width <= 16)
          NewWidth = 16;
        else if (Width <= 32)
          NewWidth = 32;
        else if (Width <= 64)
          NewWidth = 64;
        else if (Width > 64)
          break; // Can't handle this.

        if (Width != NewWidth) {
          IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
          if (CmpInst::isSigned(SrcPred)) {
            SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
          } else {
            SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
          }
        }
      } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
        break;

      Function *NewF = Intrinsic::getDeclaration(
          II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
      Value *Args[] = {SrcLHS, SrcRHS,
                       ConstantInt::get(CC->getType(), SrcPred)};
      CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    break;
  }
  case Intrinsic::amdgcn_ballot: {
    if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
      if (Src->isZero()) {
        // amdgcn.ballot(i1 0) is zero.
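        // No lane can contribute a set bit when the condition is constant
        // false.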
        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
      }

      if (Src->isOne()) {
        // amdgcn.ballot(i1 1) is exec.
        const char *RegName = "exec";
        if (II.getType()->isIntegerTy(32))
          RegName = "exec_lo";
        else if (!II.getType()->isIntegerTy(64))
          break;

        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addAttribute(AttributeList::FunctionIndex,
                              Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }
    }
    break;
  }
  case Intrinsic::amdgcn_wqm_vote: {
    // wqm_vote is identity when the argument is constant.
    if (!isa<Constant>(II.getArgOperand(0)))
      break;

    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  }
  case Intrinsic::amdgcn_kill: {
    const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
    if (!C || !C->getZExtValue())
      break;

    // amdgcn.kill(i1 1) is a no-op
    return IC.eraseInstFromFunction(II);
  }
  case Intrinsic::amdgcn_update_dpp: {
    Value *Old = II.getArgOperand(0);

    auto *BC = cast<ConstantInt>(II.getArgOperand(5));
    auto *RM = cast<ConstantInt>(II.getArgOperand(3));
    auto *BM = cast<ConstantInt>(II.getArgOperand(4));
    if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
        BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
      break;

    // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
    return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
  }
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16: {
    // Discard vdst_in if it's not going to be read.
    Value *VDstIn = II.getArgOperand(0);
    if (isa<UndefValue>(VDstIn))
      break;

    ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
    ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
    if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
      break;

    return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
  }
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_readlane: {
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }

    // The remaining folds may not be safe if the exec mask can differ between
    // the def and the use.
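    // Restrict them to sources defined in the same basic block as the
    // intrinsic; a value defined elsewhere may have been produced under a
    // different exec mask.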
    Value *Src = II.getArgOperand(0);
    Instruction *SrcInst = dyn_cast<Instruction>(Src);
    if (SrcInst && SrcInst->getParent() != II.getParent())
      break;

    // readfirstlane (readfirstlane x) -> readfirstlane x
    // readlane (readfirstlane x), y -> readfirstlane x
    if (match(Src,
              PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
      return IC.replaceInstUsesWith(II, Src);
    }

    if (IID == Intrinsic::amdgcn_readfirstlane) {
      // readfirstlane (readlane x, y) -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
        return IC.replaceInstUsesWith(II, Src);
      }
    } else {
      // readlane (readlane x, y), y -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
                         PatternMatch::m_Value(),
                         PatternMatch::m_Specific(II.getArgOperand(1))))) {
        return IC.replaceInstUsesWith(II, Src);
      }
    }

    break;
  }
  case Intrinsic::amdgcn_ldexp: {
    // FIXME: This doesn't introduce new instructions and belongs in
    // InstructionSimplify.
    Type *Ty = II.getType();
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // Folding undef to qnan is safe regardless of the FP mode.
    if (isa<UndefValue>(Op0)) {
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    const APFloat *C = nullptr;
    match(Op0, PatternMatch::m_APFloat(C));

    // FIXME: Should flush denorms depending on FP mode, but that's ignored
    // everywhere else.
    //
    // These cases should be safe, even with strictfp.
    // ldexp(0.0, x) -> 0.0
    // ldexp(-0.0, x) -> -0.0
    // ldexp(inf, x) -> inf
    // ldexp(-inf, x) -> -inf
    if (C && (C->isZero() || C->isInfinity())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // With strictfp, be more careful about possibly needing to flush denormals
    // or not, and snan behavior depends on ieee_mode.
    if (II.isStrictFP())
      break;

    if (C && C->isNaN()) {
      // FIXME: We just need to make the nan quiet here, but that's unavailable
      // on APFloat, only IEEEfloat
      auto *Quieted =
          ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
      return IC.replaceInstUsesWith(II, Quieted);
    }

    // ldexp(x, 0) -> x
    // ldexp(x, undef) -> x
    if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    break;
  }
  case Intrinsic::amdgcn_fmul_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP()))
      return IC.replaceInstUsesWith(II, ConstantFP::getNullValue(II.getType()));

    // If we can prove we don't have one of the special cases then we can use a
    // normal fmul instruction instead.
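    // For example (illustrative):
    //   llvm.amdgcn.fmul.legacy(float %x, float 2.0) -> fmul float %x, 2.0
    // since the constant 2.0 is finite and non-zero.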
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
      FMul->takeName(&II);
      return IC.replaceInstUsesWith(II, FMul);
    }
    break;
  }
  case Intrinsic::amdgcn_fma_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    Value *Op2 = II.getArgOperand(2);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP())) {
      // It's tempting to just return Op2 here, but that would give the wrong
      // result if Op2 was -0.0.
      auto *Zero = ConstantFP::getNullValue(II.getType());
      auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
      FAdd->takeName(&II);
      return IC.replaceInstUsesWith(II, FAdd);
    }

    // If we can prove we don't have one of the special cases then we can use a
    // normal fma instead.
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      II.setCalledOperand(Intrinsic::getDeclaration(
          II.getModule(), Intrinsic::fma, II.getType()));
      return &II;
    }
    break;
  }
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
      return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
    }
  }
  }
  return None;
}

/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
/// struct returns.
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx = -1) {

  auto *IIVTy = cast<FixedVectorType>(II.getType());
  unsigned VWidth = IIVTy->getNumElements();
  if (VWidth == 1)
    return nullptr;

  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
  IC.Builder.SetInsertPoint(&II);

  // Assume the arguments are unchanged and later override them, if needed.
  SmallVector<Value *, 16> Args(II.args());

  if (DMaskIdx < 0) {
    // Buffer case.

    const unsigned ActiveBits = DemandedElts.getActiveBits();
    const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();

    // Start assuming the prefix of elements is demanded, but possibly clear
    // some other bits if there are trailing zeros (unused components at front)
    // and update offset.
    DemandedElts = (1 << ActiveBits) - 1;

    if (UnusedComponentsAtFront > 0) {
      static const unsigned InvalidOffsetIdx = 0xf;

      unsigned OffsetIdx;
      switch (II.getIntrinsicID()) {
      case Intrinsic::amdgcn_raw_buffer_load:
        OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_s_buffer_load:
        // If resulting type is vec3, there is no point in trimming the
        // load with updated offset, as the vec3 would most likely be widened to
        // vec4 anyway during lowering.
        if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
          OffsetIdx = InvalidOffsetIdx;
        else
          OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_struct_buffer_load:
        OffsetIdx = 2;
        break;
      default:
        // TODO: handle tbuffer* intrinsics.
        OffsetIdx = InvalidOffsetIdx;
        break;
      }

      if (OffsetIdx != InvalidOffsetIdx) {
        // Clear demanded bits and update the offset.
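        // For example (illustrative), a raw.buffer.load of <4 x float> where
        // only elements 2 and 3 are used becomes a <2 x float> load with the
        // byte offset advanced by 8.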
        DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
        auto *Offset = II.getArgOperand(OffsetIdx);
        unsigned SingleComponentSizeInBits =
            IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType());
        unsigned OffsetAdd =
            UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
        auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
        Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
      }
    }
  } else {
    // Image case.

    ConstantInt *DMask = cast<ConstantInt>(II.getArgOperand(DMaskIdx));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;

    // Mask off values that are undefined because the dmask doesn't cover them
    DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;

    unsigned NewDMaskVal = 0;
    unsigned OrigLoadIdx = 0;
    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
      const unsigned Bit = 1 << SrcIdx;
      if (!!(DMaskVal & Bit)) {
        if (!!DemandedElts[OrigLoadIdx])
          NewDMaskVal |= Bit;
        OrigLoadIdx++;
      }
    }

    if (DMaskVal != NewDMaskVal)
      Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
  }

  unsigned NewNumElts = DemandedElts.countPopulation();
  if (!NewNumElts)
    return UndefValue::get(II.getType());

  if (NewNumElts >= VWidth && DemandedElts.isMask()) {
    if (DMaskIdx >= 0)
      II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
    return nullptr;
  }

  // Validate function argument and return types, extracting overloaded types
  // along the way.
  SmallVector<Type *, 6> OverloadTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
    return nullptr;

  Module *M = II.getParent()->getParent()->getParent();
  Type *EltTy = IIVTy->getElementType();
  Type *NewTy =
      (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);

  OverloadTys[0] = NewTy;
  Function *NewIntrin =
      Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys);

  CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);

  if (NewNumElts == 1) {
    return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()),
                                          NewCall,
                                          DemandedElts.countTrailingZeros());
  }

  SmallVector<int, 8> EltMask;
  unsigned NewLoadIdx = 0;
  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
    if (!!DemandedElts[OrigLoadIdx])
      EltMask.push_back(NewLoadIdx++);
    else
      EltMask.push_back(NewNumElts);
  }

  Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);

  return Shuffle;
}

Optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {
  switch (II.getIntrinsicID()) {
  case Intrinsic::amdgcn_buffer_load:
  case Intrinsic::amdgcn_buffer_load_format:
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_s_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_tbuffer_load:
    return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
  default: {
    if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
      return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
    }
    break;
  }
  }
  return None;
}