//=== lib/CodeGen/GlobalISel/AMDGPUCombinerHelper.cpp ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AMDGPUCombinerHelper.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace MIPatternMatch;

LLVM_READNONE
static bool fnegFoldsIntoMI(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    return true;
  case AMDGPU::G_INTRINSIC: {
    unsigned IntrinsicID = MI.getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_fma_legacy:
      return true;
    default:
      return false;
    }
  }
  default:
    return false;
  }
}

/// Returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
LLVM_READONLY
static bool opMustUseVOP3Encoding(const MachineInstr &MI,
                                  const MachineRegisterInfo &MRI) {
  return MI.getNumOperands() >
             (MI.getOpcode() == AMDGPU::G_INTRINSIC ? 4u : 3u) ||
         MRI.getType(MI.getOperand(0).getReg()).getScalarSizeInBits() == 64;
}

// Most FP instructions support source modifiers.
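// Roughly speaking, source modifiers are the per-operand neg/abs bits exposed
// by the 64-bit VOP3 VALU encodings, so folding an fneg into a user is only
// free when that user can carry such a modifier without growing its encoding;
// opMustUseVOP3Encoding() above and hasSourceMods() below approximate that.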
LLVM_READONLY
static bool hasSourceMods(const MachineInstr &MI) {
  if (!MI.memoperands().empty())
    return false;

  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::G_SELECT:
  case AMDGPU::G_FDIV:
  case AMDGPU::G_FREM:
  case TargetOpcode::INLINEASM:
  case TargetOpcode::INLINEASM_BR:
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
  case AMDGPU::G_BITCAST:
  case AMDGPU::G_ANYEXT:
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_BUILD_VECTOR_TRUNC:
  case AMDGPU::G_PHI:
    return false;
  case AMDGPU::G_INTRINSIC: {
    unsigned IntrinsicID = MI.getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
    case Intrinsic::amdgcn_div_scale:
      return false;
    default:
      return true;
    }
  }
  default:
    return true;
  }
}

static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI,
                                  unsigned CostThreshold = 4) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
  // it is truly free to use a source modifier in all cases. If there are
  // multiple users and folding a modifier into each one would force it into a
  // VOP3 encoding, there will be a code size increase. Try to avoid increasing
  // code size unless we know it will save on the instruction count.
  unsigned NumMayIncreaseSize = 0;
  Register Dst = MI.getOperand(0).getReg();
  for (const MachineInstr &Use : MRI.use_nodbg_instructions(Dst)) {
    if (!hasSourceMods(Use))
      return false;

    if (!opMustUseVOP3Encoding(Use, MRI)) {
      if (++NumMayIncreaseSize > CostThreshold)
        return false;
    }
  }
  return true;
}

static bool mayIgnoreSignedZero(MachineInstr &MI) {
  const TargetOptions &Options = MI.getMF()->getTarget().Options;
  return Options.NoSignedZerosFPMath ||
         MI.getFlag(MachineInstr::MIFlag::FmNsz);
}

static bool isInv2Pi(const APFloat &APF) {
  static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
  static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
  static const APFloat KF64(APFloat::IEEEdouble(),
                            APInt(64, 0x3fc45f306dc9c882));

  return APF.bitwiseIsEqual(KF16) || APF.bitwiseIsEqual(KF32) ||
         APF.bitwiseIsEqual(KF64);
}

// +0.0 and 1.0 / (2.0 * pi) do not have negated inline immediates, so there is
// an additional cost to negate them.
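// For reference, the constants matched by isInv2Pi() above (0x3118,
// 0x3e22f983 and 0x3fc45f306dc9c882) are the f16/f32/f64 bit patterns of
// 1/(2*pi) ~= 0.15915494. On subtargets with hasInv2PiInlineImm() that value
// is itself an inline immediate, but its negation is not, and -0.0 is
// likewise not an inline constant, so negating either generally requires
// materializing a literal.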
static bool isConstantCostlierToNegate(MachineInstr &MI, Register Reg,
                                       MachineRegisterInfo &MRI) {
  std::optional<FPValueAndVReg> FPValReg;
  if (mi_match(Reg, MRI, m_GFCstOrSplat(FPValReg))) {
    if (FPValReg->Value.isZero() && !FPValReg->Value.isNegative())
      return true;

    const GCNSubtarget &ST = MI.getMF()->getSubtarget<GCNSubtarget>();
    if (ST.hasInv2PiInlineImm() && isInv2Pi(FPValReg->Value))
      return true;
  }
  return false;
}

static unsigned inverseMinMax(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::G_FMAXNUM:
    return AMDGPU::G_FMINNUM;
  case AMDGPU::G_FMINNUM:
    return AMDGPU::G_FMAXNUM;
  case AMDGPU::G_FMAXNUM_IEEE:
    return AMDGPU::G_FMINNUM_IEEE;
  case AMDGPU::G_FMINNUM_IEEE:
    return AMDGPU::G_FMAXNUM_IEEE;
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    return AMDGPU::G_AMDGPU_FMIN_LEGACY;
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
    return AMDGPU::G_AMDGPU_FMAX_LEGACY;
  default:
    llvm_unreachable("invalid min/max opcode");
  }
}

bool AMDGPUCombinerHelper::matchFoldableFneg(MachineInstr &MI,
                                             MachineInstr *&MatchInfo) {
  Register Src = MI.getOperand(1).getReg();
  MatchInfo = MRI.getVRegDef(Src);

  // If the input has multiple uses and we can either fold the negate down
  // into the negate's own users, or the input's other users cannot all absorb
  // a source modifier, give up. This both prevents unprofitable
  // transformations and infinite loops: we won't repeatedly try to fold around
  // a negate that has no 'good' form.
  if (MRI.hasOneNonDBGUse(Src)) {
    if (allUsesHaveSourceMods(MI, MRI, 0))
      return false;
  } else {
    if (fnegFoldsIntoMI(*MatchInfo) &&
        (allUsesHaveSourceMods(MI, MRI) ||
         !allUsesHaveSourceMods(*MatchInfo, MRI)))
      return false;
  }

  switch (MatchInfo->getOpcode()) {
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    // 0 doesn't have a negated inline immediate.
    return !isConstantCostlierToNegate(*MatchInfo,
                                       MatchInfo->getOperand(2).getReg(), MRI);
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
    return mayIgnoreSignedZero(*MatchInfo);
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
    return true;
  case AMDGPU::G_INTRINSIC: {
    unsigned IntrinsicID = MatchInfo->getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmed3:
      return true;
    case Intrinsic::amdgcn_fma_legacy:
      return mayIgnoreSignedZero(*MatchInfo);
    default:
      return false;
    }
  }
  default:
    return false;
  }
}

void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI,
                                             MachineInstr *&MatchInfo) {
  // Transform:
  // %A = inst %Op1, ...
  // %B = fneg %A
  //
  // into:
  //
  // (if %A has one use, specifically fneg above)
  // %B = inst (maybe fneg %Op1), ...
  //
  // (if %A has multiple uses)
  // %B = inst (maybe fneg %Op1), ...
  // %A = fneg %B
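  //
  // For example (illustrative MIR, not taken verbatim from a test): with a
  // single use,
  //   %a:_(s32) = G_FADD %x, %y
  //   %b:_(s32) = G_FNEG %a
  // becomes
  //   %b:_(s32) = G_FADD %nx, %ny
  // where %nx/%ny are G_FNEG of %x/%y (or the original values if %x/%y were
  // themselves negates), and the new G_FNEGs can later fold into source
  // modifiers.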

  // Replace the register in the operand with a register holding the negated
  // value.
  auto NegateOperand = [&](MachineOperand &Op) {
    Register Reg = Op.getReg();
    if (!mi_match(Reg, MRI, m_GFNeg(m_Reg(Reg))))
      Reg = Builder.buildFNeg(MRI.getType(Reg), Reg).getReg(0);
    replaceRegOpWith(MRI, Op, Reg);
  };

  // Replace either register in the operands with a register holding the
  // negated value.
  auto NegateEitherOperand = [&](MachineOperand &X, MachineOperand &Y) {
    Register XReg = X.getReg();
    Register YReg = Y.getReg();
    if (mi_match(XReg, MRI, m_GFNeg(m_Reg(XReg))))
      replaceRegOpWith(MRI, X, XReg);
    else if (mi_match(YReg, MRI, m_GFNeg(m_Reg(YReg))))
      replaceRegOpWith(MRI, Y, YReg);
    else {
      YReg = Builder.buildFNeg(MRI.getType(YReg), YReg).getReg(0);
      replaceRegOpWith(MRI, Y, YReg);
    }
  };

  Builder.setInstrAndDebugLoc(*MatchInfo);

  // Negate the appropriate operands so that the resulting value of MatchInfo
  // is negated.
  switch (MatchInfo->getOpcode()) {
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
    NegateOperand(MatchInfo->getOperand(1));
    NegateOperand(MatchInfo->getOperand(2));
    break;
  case AMDGPU::G_FMUL:
    NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
    break;
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY: {
    NegateOperand(MatchInfo->getOperand(1));
    NegateOperand(MatchInfo->getOperand(2));
    unsigned Opposite = inverseMinMax(MatchInfo->getOpcode());
    replaceOpcodeWith(*MatchInfo, Opposite);
    break;
  }
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
    NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
    NegateOperand(MatchInfo->getOperand(3));
    break;
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_FPTRUNC:
    NegateOperand(MatchInfo->getOperand(1));
    break;
  case AMDGPU::G_INTRINSIC: {
    unsigned IntrinsicID = MatchInfo->getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
      NegateOperand(MatchInfo->getOperand(2));
      break;
    case Intrinsic::amdgcn_fmul_legacy:
      NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
      break;
    case Intrinsic::amdgcn_fmed3:
      NegateOperand(MatchInfo->getOperand(2));
      NegateOperand(MatchInfo->getOperand(3));
      NegateOperand(MatchInfo->getOperand(4));
      break;
    case Intrinsic::amdgcn_fma_legacy:
      NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
      NegateOperand(MatchInfo->getOperand(4));
      break;
    default:
      llvm_unreachable("folding fneg not supported for this intrinsic");
    }
    break;
  }
  default:
    llvm_unreachable("folding fneg not supported for this instruction");
  }

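  // Rewire the results. In the multi-use case this amounts to (illustrative,
  // not verbatim MIR):
  //   %a = inst ...          ; has users other than the fneg
  //   %b = G_FNEG %a
  // becoming
  //   %neg = inst ...        ; the def above, now producing the negated value
  //   %a = G_FNEG %neg       ; rebuilt for the remaining users of %a
  // with all users of %b rewired to %neg.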
  Register Dst = MI.getOperand(0).getReg();
  Register MatchInfoDst = MatchInfo->getOperand(0).getReg();

  if (MRI.hasOneNonDBGUse(MatchInfoDst)) {
    // MatchInfo now produces the negated value, so use it instead of the old
    // Dst.
    replaceRegWith(MRI, Dst, MatchInfoDst);
  } else {
    // We want to swap all uses of Dst with uses of MatchInfoDst and vice
    // versa, but replaceRegWith would replace defs as well. It is easier to
    // replace one def with a new register.
    LLT Type = MRI.getType(Dst);
    Register NegatedMatchInfo = MRI.createGenericVirtualRegister(Type);
    replaceRegOpWith(MRI, MatchInfo->getOperand(0), NegatedMatchInfo);

    // MatchInfo now produces the negated value, so use it instead of the old
    // Dst.
    replaceRegWith(MRI, Dst, NegatedMatchInfo);

    // Recreate the non-negated value for the other uses of the old
    // MatchInfoDst.
    auto NextInst = ++MatchInfo->getIterator();
    Builder.setInstrAndDebugLoc(*NextInst);
    Builder.buildFNeg(MatchInfoDst, NegatedMatchInfo, MI.getFlags());
  }

  MI.eraseFromParent();
}

// TODO: Should return converted value / extension source and avoid introducing
// intermediate fptruncs in the apply function.
static bool isFPExtFromF16OrConst(const MachineRegisterInfo &MRI,
                                  Register Reg) {
  const MachineInstr *Def = MRI.getVRegDef(Reg);
  if (Def->getOpcode() == TargetOpcode::G_FPEXT) {
    Register SrcReg = Def->getOperand(1).getReg();
    return MRI.getType(SrcReg) == LLT::scalar(16);
  }

  if (Def->getOpcode() == TargetOpcode::G_FCONSTANT) {
    APFloat Val = Def->getOperand(1).getFPImm()->getValueAPF();
    bool LosesInfo = true;
    Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
    return !LosesInfo;
  }

  return false;
}

bool AMDGPUCombinerHelper::matchExpandPromotedF16FMed3(MachineInstr &MI,
                                                       Register Src0,
                                                       Register Src1,
                                                       Register Src2) {
  assert(MI.getOpcode() == TargetOpcode::G_FPTRUNC);
  Register SrcReg = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(SrcReg) || MRI.getType(SrcReg) != LLT::scalar(32))
    return false;

  return isFPExtFromF16OrConst(MRI, Src0) &&
         isFPExtFromF16OrConst(MRI, Src1) && isFPExtFromF16OrConst(MRI, Src2);
}

void AMDGPUCombinerHelper::applyExpandPromotedF16FMed3(MachineInstr &MI,
                                                       Register Src0,
                                                       Register Src1,
                                                       Register Src2) {
  Builder.setInstrAndDebugLoc(MI);

  // We expect the fptrunc (fpext x) pairs to fold away, and constant sources
  // to be constant folded.
  Src0 = Builder.buildFPTrunc(LLT::scalar(16), Src0).getReg(0);
  Src1 = Builder.buildFPTrunc(LLT::scalar(16), Src1).getReg(0);
  Src2 = Builder.buildFPTrunc(LLT::scalar(16), Src2).getReg(0);

  // Expand the median of three as min/max operations:
  //   med3(x, y, z) = min(max(x, y), max(min(x, y), z)).
  LLT Ty = MRI.getType(Src0);
  auto A1 = Builder.buildFMinNumIEEE(Ty, Src0, Src1);
  auto B1 = Builder.buildFMaxNumIEEE(Ty, Src0, Src1);
  auto C1 = Builder.buildFMaxNumIEEE(Ty, A1, Src2);
  Builder.buildFMinNumIEEE(MI.getOperand(0), B1, C1);
  MI.eraseFromParent();
}