1 //===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
12 //===----------------------------------------------------------------------===//
69 #define DEBUG_TYPE "nvptx-lower"
76 "nvptx-sched4reg",
80 "nvptx-fma-level", cl::Hidden,
86 "nvptx-prec-divf32", cl::Hidden,
92 "nvptx-prec-sqrtf32", cl::Hidden,
97 "nvptx-force-min-byval-param-align", cl::Hidden,
98 cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
104 // If nvptx-prec-divf32=N is used on the command-line, always honor it in getDivF32Level()
117 // If nvptx-prec-sqrtf32 is used on the command-line, always honor it in usePrecSqrtF32()
162 /// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
165 /// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
175 // Special case for i128 - decompose to (i64, i64) in ComputePTXValueVTs()
176 if (Ty->isIntegerTy(128)) { in ComputePTXValueVTs()
181 Offsets->push_back(StartingOffset + 0); in ComputePTXValueVTs()
182 Offsets->push_back(StartingOffset + 8); in ComputePTXValueVTs()
192 for (auto *EI : STy->elements()) { in ComputePTXValueVTs()
194 StartingOffset + SL->getElementOffset(ElementNum)); in ComputePTXValueVTs()
240 Offsets->push_back(Off + j * EltVT.getStoreSize()); in ComputePTXValueVTs()
245 Offsets->push_back(Off); in ComputePTXValueVTs()
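ComputePTXValueVTs, as the note above says, flattens an IR type into a list of primitive EVTs plus byte offsets, with i128 decomposed into two i64 halves at +0 and +8. A minimal standalone C++ sketch of just that i128 case (hypothetical names, not the LLVM API):

    #include <cstdint>
    #include <utility>
    #include <vector>
    // Illustrative only: an i128 value at `startingOffset` flattens to two
    // i64 pieces at byte offsets +0 and +8, as in the special case above.
    std::vector<std::pair<const char *, uint64_t>> flattenI128(uint64_t startingOffset) {
      return {{"i64", startingOffset + 0}, {"i64", startingOffset + 8}};
    }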
260 "Promotion is not suitable for scalars of size larger than 64-bits"); in PromoteScalarIntegerPTX()
302 if (Offsets[Idx] & (AccessSize - 1)) in CanMergeParamLoadStoresStartingAt()
321 // PTX ISA can only deal with 2- and 4-element vector ops. in CanMergeParamLoadStoresStartingAt()
331 if (Offsets[j] - Offsets[j - 1] != EltSize) in CanMergeParamLoadStoresStartingAt()
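The two checks above gate merging of parameter accesses: the starting offset must be a multiple of the (power-of-two) access size, and consecutive element offsets must be contiguous. A self-contained illustration of those predicates (hypothetical helper, not the LLVM code):

    #include <cstdint>
    #include <vector>
    // True when offsets[idx..idx+n) can be merged into one `accessSize`-byte
    // access of n elements of `eltSize` bytes each (accessSize a power of two).
    bool canMergeParamAccesses(const std::vector<uint64_t> &offsets, unsigned idx,
                               unsigned n, uint64_t accessSize, uint64_t eltSize) {
      if (offsets[idx] & (accessSize - 1))           // start must be aligned
        return false;
      for (unsigned j = idx + 1; j < idx + n; ++j)
        if (offsets[j] - offsets[j - 1] != eltSize)  // elements must be contiguous
          return false;
      return true;
    }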
338 // Flags for tracking per-element vectorization state of loads/stores
344 // Scalar is effectively a 1-element vector.
368 // Check what we can vectorize using 128/64/32-bit accesses. in VectorizePTXValueVTs()
574 // that don't have h/w rotation we lower them to multi-instruction assembly. in NVPTXTargetLowering()
741 // user passed --nvptx-no-fp16-math. The flag is useful because, in NVPTXTargetLowering()
867 // Custom lowering for inline asm with 128-bit operands in NVPTXTargetLowering()
1376 auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace()); in LowerGlobalAddress()
1377 Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT); in LowerGlobalAddress()
1382 return Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128) || in IsTypePassedAsArray()
1383 Ty->isHalfTy() || Ty->isBFloatTy(); in IsTypePassedAsArray()
1394 assert(isABI && "Non-ABI compilation is not supported"); in getPrototype()
1402 if (retTy->getTypeID() == Type::VoidTyID) { in getPrototype()
1406 if ((retTy->isFloatingPointTy() || retTy->isIntegerTy()) && in getPrototype()
1410 size = ITy->getBitWidth(); in getPrototype()
1412 assert(retTy->isFloatingPointTy() && in getPrototype()
1414 size = retTy->getPrimitiveSizeInBits(); in getPrototype()
1425 O << ".param .align " << (retAlignment ? retAlignment->value() : 0) in getPrototype()
1436 unsigned NumArgs = VAInfo ? VAInfo->first : Args.size(); in getPrototype()
1455 OIdx += len - 1; in getPrototype()
1465 sz = cast<IntegerType>(Ty)->getBitWidth(); in getPrototype()
1470 sz = Ty->getPrimitiveSizeInBits(); in getPrototype()
1490 O << (first ? "" : ",") << " .param .align " << VAInfo->second in getPrototype()
1513 const Function *DirectCallee = CB->getCalledFunction(); in getArgumentAlignment()
1556 // Use byte-store when the param address of the argument value is unaligned.
1588 // Use byte-load when the param address of the returned value is unaligned.
1658 assert(isABI && "Non-ABI compilation is not supported"); in LowerCall()
1672 // initially set to 0, so it can be used for non-variadic arguments (which use in LowerCall()
1692 // * if there is a vector argument with more than typical vector-length in LowerCall()
1720 ArgAlign = getFunctionByValParamAlign(CB->getCalledFunction(), ETy, in LowerCall()
1772 // than 32-bits are sign extended or zero extended, depending on in LowerCall()
1776 Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32; in LowerCall()
1814 // Use 16-bit registers for small stores as it's the in LowerCall()
1852 unsigned NumElts = StoreOperands.size() - 3; in LowerCall()
1897 --OIdx; in LowerCall()
1912 // .param .align N .b8 retval0[<size-in-bytes>], or in LowerCall()
1913 // .param .b<size-in-bits> retval0 in LowerCall()
1929 Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32), in LowerCall()
1947 VADeclareParam->getVTList(), DeclareParamOps); in LowerCall()
1964 CalleeFunc->addFnAttr("nvptx-libcall-callee", "true"); in LowerCall()
1980 CLI.NumFixedArgs, VADeclareParam->getConstantOperandAPInt(1))) in LowerCall()
1983 const char *ProtoStr = nvTM->getStrPool().save(Proto).data(); in LowerCall()
2021 if (i == (e - 1)) in LowerCall()
2068 int VecIdx = -1; // Index of the first element of the vector. in LowerCall()
2071 // 32-bits are sign extended or zero extended, depending on whether in LowerCall()
2074 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; in LowerCall()
2101 if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType() && in LowerCall()
2104 assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list."); in LowerCall()
2117 assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list."); in LowerCall()
2163 VecIdx = -1; in LowerCall()
2227 DAG.getContext()->diagnose(NoDynamicAlloca); in LowerDYNAMIC_STACKALLOC()
2235 uint64_t Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); in LowerDYNAMIC_STACKALLOC()
2238 // The size for ptx alloca instruction is 64-bit for m64 and 32-bit for m32. in LowerDYNAMIC_STACKALLOC()
2239 if (nvTM->is64Bit()) in LowerDYNAMIC_STACKALLOC()
2247 nvTM->is64Bit() ? MVT::i64 : MVT::i32, AllocOps); in LowerDYNAMIC_STACKALLOC()
2261 unsigned NumOperands = Node->getNumOperands(); in LowerCONCAT_VECTORS()
2263 SDValue SubOp = Node->getOperand(i); in LowerCONCAT_VECTORS()
2264 EVT VVT = SubOp.getNode()->getValueType(0); in LowerCONCAT_VECTORS()
2272 return DAG.getBuildVector(Node->getValueType(0), dl, Ops); in LowerCONCAT_VECTORS()
2276 // would get lowered as two constant loads and vector-packing move.
2281 EVT VT = Op->getValueType(0); in LowerBUILD_VECTOR()
2287 if (!llvm::all_of(Op->ops(), [](SDValue Operand) { in LowerBUILD_VECTOR()
2288 return Operand->isUndef() || isa<ConstantSDNode>(Operand) || in LowerBUILD_VECTOR()
2291 // Lower non-const v4i8 vector as byte-wise constructed i32, which allows us in LowerBUILD_VECTOR()
2297 DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32), in LowerBUILD_VECTOR()
2298 DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8); in LowerBUILD_VECTOR()
2301 DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32), in LowerBUILD_VECTOR()
2305 DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32), in LowerBUILD_VECTOR()
2313 auto GetOperand = [](SDValue Op, int N) -> APInt { in LowerBUILD_VECTOR()
2314 const SDValue &Operand = Op->getOperand(N); in LowerBUILD_VECTOR()
2315 EVT VT = Op->getValueType(0); in LowerBUILD_VECTOR()
2316 if (Operand->isUndef()) in LowerBUILD_VECTOR()
2320 Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt(); in LowerBUILD_VECTOR()
2322 Value = Operand->getAsAPIntVal(); in LowerBUILD_VECTOR()
2341 return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op->getValueType(0), Const); in LowerBUILD_VECTOR()
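For the all-constant path above, the element values are collected and packed into one integer, then bit-cast back to the vector type. A rough standalone illustration of packing four i8 lanes into an i32 bit pattern (lane 0 in the low byte is assumed here; names are invented):

    #include <cstdint>
    // Pack four 8-bit lanes into one 32-bit constant, lane 0 in the lowest byte.
    uint32_t packV4I8(uint8_t e0, uint8_t e1, uint8_t e2, uint8_t e3) {
      return uint32_t(e0) | (uint32_t(e1) << 8) | (uint32_t(e2) << 16) |
             (uint32_t(e3) << 24);
    }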
2346 SDValue Index = Op->getOperand(1); in LowerEXTRACT_VECTOR_ELT()
2347 SDValue Vector = Op->getOperand(0); in LowerEXTRACT_VECTOR_ELT()
2359 return DAG.getAnyExtOrTrunc(BFE, DL, Op->getValueType(0)); in LowerEXTRACT_VECTOR_ELT()
2381 SDValue Vector = Op->getOperand(0); in LowerINSERT_VECTOR_ELT()
2387 SDValue Value = Op->getOperand(1); in LowerINSERT_VECTOR_ELT()
2388 if (Value->isUndef()) in LowerINSERT_VECTOR_ELT()
2391 SDValue Index = Op->getOperand(2); in LowerINSERT_VECTOR_ELT()
2400 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), BFI); in LowerINSERT_VECTOR_ELT()
2414 for (auto I : llvm::enumerate(SVN->getMask())) { in LowerVECTOR_SHUFFLE()
2415 if (I.value() != -1) // -1 is a placeholder for undef. in LowerVECTOR_SHUFFLE()
2424 /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
2431 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); in LowerShiftRightParts()
2457 // - if (Amt>=size) then in LowerShiftRightParts()
2458 // dLo = aHi >> (Amt-size) in LowerShiftRightParts()
2461 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt)) in LowerShiftRightParts()
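The comment block above spells out the case split for the two-part right shift: once the amount reaches the word size the low result comes entirely from the high half, otherwise the halves are combined. A compilable C++ sketch of a 128-bit logical right shift built from two 64-bit halves along the same lines (illustrative, not the DAG lowering):

    #include <cstdint>
    // Logical right shift of a 128-bit value given as (hi, lo), for 0 < amt < 128.
    void srlParts(uint64_t hi, uint64_t lo, unsigned amt, uint64_t &dHi, uint64_t &dLo) {
      const unsigned size = 64;
      if (amt >= size) {                        // result comes only from the high half
        dLo = hi >> (amt - size);
        dHi = 0;
      } else {                                  // combine both halves
        dLo = (lo >> amt) | (hi << (size - amt));
        dHi = hi >> amt;
      }
    }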
2485 /// LowerShiftLeftParts - Lower SHL_PARTS, which
2492 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); in LowerShiftLeftParts()
2517 // - if (Amt>=size) then in LowerShiftLeftParts()
2519 // dLo = aLo << (Amt-size) in LowerShiftLeftParts()
2522 // dHi = (aHi << Amt) | (aLo >> (size-Amt)) in LowerShiftLeftParts()
2560 // float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2572 // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f)) in LowerFROUND32()
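The formula quoted above is the core of LowerFROUND32: add or subtract 0.5 depending on sign, then truncate toward zero, which rounds halfway cases away from zero. A tiny standalone check of that formula (it ignores the large-magnitude and NaN handling the real lowering also has to do):

    #include <cstdio>
    static float roundHalfAwayFromZero(float a) {
      return (float)(int)(a > 0 ? (a + 0.5f) : (a - 0.5f));
    }
    int main() {
      std::printf("%g %g\n", roundHalfAwayFromZero(2.5f),    // prints 3
                  roundHalfAwayFromZero(-2.5f));             // prints -3
      return 0;
    }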
2676 return TLI->expandFP_ROUND(Op.getNode(), DAG); in LowerFP_ROUND()
2679 // This combination was the first to support f32 -> bf16. in LowerFP_ROUND()
2686 // Round-inexact-to-odd f64 to f32, then do the final rounding using in LowerFP_ROUND()
2687 // the hardware f32 -> bf16 instruction. in LowerFP_ROUND()
2688 SDValue rod = TLI->expandRoundInexactToOdd( in LowerFP_ROUND()
2695 return TLI->expandFP_ROUND(Op.getNode(), DAG); in LowerFP_ROUND()
2740 llvm::transform(Op->ops(), std::back_inserter(ScalarArgs), in LowerVectorArith()
2830 const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue(); in LowerVAARG()
2831 EVT VT = Node->getValueType(0); in LowerVAARG()
2833 SDValue Tmp1 = Node->getOperand(0); in LowerVAARG()
2834 SDValue Tmp2 = Node->getOperand(1); in LowerVAARG()
2835 const MaybeAlign MA(Node->getConstantOperandVal(3)); in LowerVAARG()
2837 SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL, in LowerVAARG()
2841 if (MA && *MA > TLI->getMinStackArgumentAlignment()) { in LowerVAARG()
2844 DAG.getConstant(MA->value() - 1, DL, VAList.getValueType())); in LowerVAARG()
2848 DAG.getConstant(-(int64_t)MA->value(), DL, VAList.getValueType())); in LowerVAARG()
2870 EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout()); in LowerVASTART()
2873 SDValue Arg = getParamSymbol(DAG, /* vararg */ -1, PtrVT); in LowerVASTART()
2876 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); in LowerVASTART()
2882 SDValue Op0 = Op->getOperand(0); in LowerSelect()
2883 SDValue Op1 = Op->getOperand(1); in LowerSelect()
2884 SDValue Op2 = Op->getOperand(2); in LowerSelect()
2906 EVT MemVT = Load->getMemoryVT(); in LowerLOAD()
2908 MemVT, *Load->getMemOperand())) { in LowerLOAD()
2920 // v1 = ld i8* addr (-> i16)
2926 assert(LD->getExtensionType() == ISD::NON_EXTLOAD); in LowerLOADi1()
2927 assert(Node->getValueType(0) == MVT::i1 && in LowerLOADi1()
2929 SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16, LD->getChain(), in LowerLOADi1()
2930 LD->getBasePtr(), LD->getPointerInfo(), in LowerLOADi1()
2931 MVT::i8, LD->getAlign(), in LowerLOADi1()
2932 LD->getMemOperand()->getFlags()); in LowerLOADi1()
2937 SDValue Ops[] = { result, LD->getChain() }; in LowerLOADi1()
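The i1 load above is widened because PTX has no 1-bit loads: the byte at the address is zero-extended (to i16) and then truncated back to i1. A plain C++ analogue of that widening (illustrative only):

    #include <cstdint>
    // Load an i1 stored as one byte: widen the byte, then truncate to the low bit.
    bool loadI1(const uint8_t *addr) {
      uint16_t widened = *addr;     // zextload i8 -> i16
      return (widened & 1) != 0;    // truncate i16 -> i1
    }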
2943 EVT VT = Store->getMemoryVT(); in LowerSTORE()
2952 VT, *Store->getMemOperand())) in LowerSTORE()
2968 SDValue Val = N->getOperand(1); in LowerSTOREVector()
3005 Align Alignment = MemSD->getAlign(); in LowerSTOREVector()
3051 Ops.push_back(N->getOperand(0)); in LowerSTOREVector()
3054 // Combine f16,f16 -> v2f16 in LowerSTOREVector()
3077 Ops.append(N->op_begin() + 2, N->op_end()); in LowerSTOREVector()
3081 MemSD->getMemoryVT(), MemSD->getMemOperand()); in LowerSTOREVector()
3098 SDValue Tmp1 = ST->getChain(); in LowerSTOREi1()
3099 SDValue Tmp2 = ST->getBasePtr(); in LowerSTOREi1()
3100 SDValue Tmp3 = ST->getValue(); in LowerSTOREi1()
3104 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8, in LowerSTOREi1()
3105 ST->getAlign(), ST->getMemOperand()->getFlags()); in LowerSTOREi1()
3111 // Change the CopyToReg to take in two 64-bit operands instead of a 128-bit in LowerCopyToReg_128()
3115 "Custom lowering for 128-bit CopyToReg only"); in LowerCopyToReg_128()
3120 SDValue Cast = DAG.getBitcast(MVT::v2i64, Op->getOperand(2)); in LowerCopyToReg_128()
3126 SmallVector<SDValue, 5> NewOps(Op->getNumOperands() + 1); in LowerCopyToReg_128()
3127 SmallVector<EVT, 3> ResultsType(Node->values()); in LowerCopyToReg_128()
3129 NewOps[0] = Op->getOperand(0); // Chain in LowerCopyToReg_128()
3130 NewOps[1] = Op->getOperand(1); // Dst Reg in LowerCopyToReg_128()
3131 NewOps[2] = Lo; // Lower 64-bit in LowerCopyToReg_128()
3132 NewOps[3] = Hi; // Higher 64-bit in LowerCopyToReg_128()
3134 NewOps[4] = Op->getOperand(3); // Glue if exists in LowerCopyToReg_128()
3163 StringRef SavedStr = nvTM->getStrPool().save( in getParamSymbol()
3177 const AttributeList &PAL = F->getAttributes(); in LowerFormalArguments()
3184 assert(isABI && "Non-ABI compilation is not supported"); in LowerFormalArguments()
3190 for (const Argument &I : F->args()) { in LowerFormalArguments()
3198 // * if there is a vector argument with more than typical vector-length in LowerFormalArguments()
3208 if (theArgs[i]->use_empty()) { in LowerFormalArguments()
3210 if (IsTypePassedAsArray(Ty) && !Ty->isVectorTy()) { in LowerFormalArguments()
3223 --InsIdx; in LowerFormalArguments()
3226 if (Ty->isVectorTy()) { in LowerFormalArguments()
3228 unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT); in LowerFormalArguments()
3234 --InsIdx; in LowerFormalArguments()
3248 aggregateIsPacked = STy->isPacked(); in LowerFormalArguments()
3261 int VecIdx = -1; // Index of the first element of the current vector. in LowerFormalArguments()
3264 assert(VecIdx == -1 && "Orphaned vector."); in LowerFormalArguments()
3270 unsigned NumElts = parti - VecIdx + 1; in LowerFormalArguments()
3282 EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts); in LowerFormalArguments()
3287 EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM)); in LowerFormalArguments()
3289 const MaybeAlign PartAlign = [&]() -> MaybeAlign { in LowerFormalArguments()
3295 DL.getABITypeAlign(EltVT.getTypeForEVT(F->getContext())); in LowerFormalArguments()
3303 P.getNode()->setIROrder(i + 1); in LowerFormalArguments()
3333 VecIdx = -1; in LowerFormalArguments()
3338 --InsIdx; in LowerFormalArguments()
3355 p.getNode()->setIROrder(i + 1); in LowerFormalArguments()
3365 // Use byte-store when the param address of the return value is unaligned.
3403 assert(isABI && "Non-ABI compilation is not supported"); in LowerReturn()
3430 RetTy->isSized() ? getFunctionParamOptimizedAlign(&F, RetTy, DL) in LowerReturn()
3434 // 32-bits are sign extended or zero extended, depending on whether in LowerReturn()
3437 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32; in LowerReturn()
3449 // Use 16-bit registers for small load-stores as it's the in LowerReturn()
3456 if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType()) { in LowerReturn()
3459 DL.getABITypeAlign(ElementType.getTypeForEVT(RetTy->getContext())); in LowerReturn()
3486 unsigned NumElts = StoreOperands.size() - 2; in LowerReturn()
4676 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue(); in getTgtMemIntrinsic()
4695 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue(); in getTgtMemIntrinsic()
5090 /// getFunctionParamOptimizedAlign - since function arguments are passed via
5095 /// alignment. To allow using 128-bit vectorized loads/stores, this function
5106 if (!F || !F->hasLocalLinkage() || in getFunctionParamOptimizedAlign()
5107 F->hasAddressTaken(/*Users=*/nullptr, in getFunctionParamOptimizedAlign()
5113 assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage"); in getFunctionParamOptimizedAlign()
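Per the doc comment above, parameters of local functions whose address is never taken may be over-aligned so that 128-bit vectorized .param loads/stores become legal. A hedged sketch of the alignment policy this describes (simplified assumption, not the real function):

    #include <algorithm>
    #include <cstdint>
    // Raise a parameter's alignment to at least 16 bytes so that 128-bit
    // (16-byte) vectorized accesses are allowed; otherwise keep the ABI value.
    uint64_t optimizedParamAlign(uint64_t abiAlign) {
      return std::max<uint64_t>(abiAlign, 16);
    }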
5134 // on non-deprecated ptxas versions. in getFunctionByValParamAlign()
5149 ParamStr << getTargetMachine().getSymbol(F)->getName(); in getParamName()
5158 /// isLegalAddressingMode - Return true if the addressing mode represented
5166 // AddrMode - This represents an addressing mode of: in isLegalAddressingMode()
5170 // - [avar] in isLegalAddressingMode()
5171 // - [areg] in isLegalAddressingMode()
5172 // - [areg+immoff] in isLegalAddressingMode()
5173 // - [immAddr] in isLegalAddressingMode()
5175 // immoff must fit in a signed 32-bit int in isLegalAddressingMode()
5197 //===----------------------------------------------------------------------===//
5199 //===----------------------------------------------------------------------===//
5201 /// getConstraintType - Given a constraint letter, return the type of
5226 NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, in getRegForInlineAsmConstraint() argument
5254 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); in getRegForInlineAsmConstraint()
5257 //===----------------------------------------------------------------------===//
5259 //===----------------------------------------------------------------------===//
5263 // Always honor command-line argument in allowFMA()
5283 // Allow unsafe math if unsafe-fp-math attribute explicitly says so. in allowUnsafeFPMath()
5285 return F.getFnAttribute("unsafe-fp-math").getValueAsBool(); in allowUnsafeFPMath()
5290 return Const && Const->getZExtValue() == 0; in isConstZero()
5293 /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
5302 // Since integer multiply-add costs the same as integer multiply in PerformADDCombineWithOperands()
5306 if (!N0.getNode()->hasOneUse()) in PerformADDCombineWithOperands()
5309 // fold (add (mul a, b), c) -> (mad a, b, c) in PerformADDCombineWithOperands()
5316 // -> (select cond, c, (mad a, b, c)) in PerformADDCombineWithOperands()
5320 if (isConstZero(N0->getOperand(1))) in PerformADDCombineWithOperands()
5322 else if (isConstZero(N0->getOperand(2))) in PerformADDCombineWithOperands()
5327 SDValue M = N0->getOperand((ZeroOpNum == 1) ? 2 : 1); in PerformADDCombineWithOperands()
5328 if (M->getOpcode() != ISD::MUL || !M.getNode()->hasOneUse()) in PerformADDCombineWithOperands()
5332 M->getOperand(0), M->getOperand(1), N1); in PerformADDCombineWithOperands()
5333 return DCI.DAG.getSelect(SDLoc(N), VT, N0->getOperand(0), in PerformADDCombineWithOperands()
5349 if (!TLI->allowFMA(DCI.DAG.getMachineFunction(), OptLevel)) in PerformFADDCombineWithOperands()
5362 for (const SDNode *User : N0.getNode()->uses()) { in PerformFADDCombineWithOperands()
5364 if (User->getOpcode() != ISD::FADD) in PerformFADDCombineWithOperands()
5370 int orderNo = N->getIROrder(); in PerformFADDCombineWithOperands()
5371 int orderNo2 = N0.getNode()->getIROrder(); in PerformFADDCombineWithOperands()
5376 if (orderNo - orderNo2 < 500) in PerformFADDCombineWithOperands()
5390 for (const SDNode *User : left->uses()) { in PerformFADDCombineWithOperands()
5391 int orderNo3 = User->getIROrder(); in PerformFADDCombineWithOperands()
5399 for (const SDNode *User : right->uses()) { in PerformFADDCombineWithOperands()
5400 int orderNo3 = User->getIROrder(); in PerformFADDCombineWithOperands()
5420 if (all_of(N->ops().drop_front(Front).drop_back(Back), in PerformStoreCombineHelper()
5421 [](const SDUse &U) { return U.get()->isUndef(); })) in PerformStoreCombineHelper()
5424 return N->getOperand(0); in PerformStoreCombineHelper()
5440 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
5448 SDValue N0 = N->getOperand(0); in PerformADDCombine()
5449 SDValue N1 = N->getOperand(1); in PerformADDCombine()
5451 // Skip non-integer, non-scalar case in PerformADDCombine()
5464 /// PerformFADDCombine - Target-specific dag combine xforms for ISD::FADD.
5469 SDValue N0 = N->getOperand(0); in PerformFADDCombine()
5470 SDValue N1 = N->getOperand(1); in PerformFADDCombine()
5489 // target-specific DAG node, the DAG combiner fails to eliminate these AND in PerformANDCombine()
5491 SDValue Val = N->getOperand(0); in PerformANDCombine()
5492 SDValue Mask = N->getOperand(1); in PerformANDCombine()
5500 // Convert BFE -> truncate i16 -> and 255 in PerformANDCombine()
5501 // To just BFE -> truncate i16, as the value already has all the bits in the in PerformANDCombine()
5511 uint64_t BFEBitsVal = BFEBits->getZExtValue(); in PerformANDCombine()
5518 uint64_t MaskVal = MaskCnst->getZExtValue(); in PerformANDCombine()
5520 if (MaskVal != (uint64_t(1) << BFEBitsVal) - 1) in PerformANDCombine()
5525 // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and in PerformANDCombine()
5528 Val = Val->getOperand(0); in PerformANDCombine()
5531 if (Val->isMachineOpcode() && Val->getMachineOpcode() == NVPTX::IMOV16rr) { in PerformANDCombine()
5532 Val = Val->getOperand(0); in PerformANDCombine()
5535 if (Val->getOpcode() == NVPTXISD::LoadV2 || in PerformANDCombine()
5536 Val->getOpcode() == NVPTXISD::LoadV4) { in PerformANDCombine()
5543 uint64_t MaskVal = MaskCnst->getZExtValue(); in PerformANDCombine()
5555 EVT MemVT = Mem->getMemoryVT(); in PerformANDCombine()
5561 unsigned ExtType = Val->getConstantOperandVal(Val->getNumOperands() - 1); in PerformANDCombine()
5570 // Re-insert the ext as a zext. in PerformANDCombine()
5586 assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM); in PerformREMCombine()
5588 // Don't do anything at less than -O2. in PerformREMCombine()
5594 EVT VT = N->getValueType(0); in PerformREMCombine()
5595 bool IsSigned = N->getOpcode() == ISD::SREM; in PerformREMCombine()
5598 const SDValue &Num = N->getOperand(0); in PerformREMCombine()
5599 const SDValue &Den = N->getOperand(1); in PerformREMCombine()
5601 for (const SDNode *U : Num->uses()) { in PerformREMCombine()
5602 if (U->getOpcode() == DivOpc && U->getOperand(0) == Num && in PerformREMCombine()
5603 U->getOperand(1) == Den) { in PerformREMCombine()
5604 // Num % Den -> Num - (Num / Den) * Den in PerformREMCombine()
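The rewrite noted above fires only when the matching division already exists, so the remainder can reuse the quotient instead of emitting a second divide. The identity is easy to verify in plain C++ (illustration, not the combiner itself):

    #include <cstdint>
    // Given an existing quotient = num / den (den != 0), the remainder is
    // num % den == num - quotient * den.
    uint32_t remFromDiv(uint32_t num, uint32_t den, uint32_t quotient) {
      return num - quotient * den;
    }
    // e.g. remFromDiv(17, 5, 17 / 5) == 17 % 5 == 2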
5620 /// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
5646 /// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
5667 const APInt &Val = CI->getAPIntValue(); in AreMulWideOperandsDemotable()
5682 /// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
5683 /// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
5688 EVT MulType = N->getValueType(0); in TryMULWIDECombine()
5695 SDValue LHS = N->getOperand(0); in TryMULWIDECombine()
5696 SDValue RHS = N->getOperand(1); in TryMULWIDECombine()
5699 if (N->getOpcode() == ISD::MUL) { in TryMULWIDECombine()
5706 if (N->getOpcode() == ISD::SHL) { in TryMULWIDECombine()
5712 APInt ShiftAmt = ShlRHS->getAPIntValue(); in TryMULWIDECombine()
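TryMULWIDECombine above looks for an M-bit multiply (or shift-left) whose operands really carry only M/2 significant bits, and replaces it with a half-width multiply producing the full-width product, i.e. PTX mul.wide. In C++ terms the transformation amounts to the following (illustrative, names invented):

    #include <cstdint>
    // A 64-bit multiply whose operands are known to fit in 32 bits can be done
    // as a widening 32x32 -> 64 multiply (what PTX exposes as mul.wide.s32).
    int64_t mulWideS32(int32_t a, int32_t b) {
      return (int64_t)a * (int64_t)b;
    }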
5754 return Const && Const->getZExtValue() == 1; in isConstOne()
5758 if (Add->getOpcode() != ISD::ADD) in matchMADConstOnePattern()
5761 if (isConstOne(Add->getOperand(0))) in matchMADConstOnePattern()
5762 return Add->getOperand(1); in matchMADConstOnePattern()
5764 if (isConstOne(Add->getOperand(1))) in matchMADConstOnePattern()
5765 return Add->getOperand(0); in matchMADConstOnePattern()
5782 if (Select->getOpcode() != ISD::SELECT) in combineMulSelectConstOne()
5785 SDValue Cond = Select->getOperand(0); in combineMulSelectConstOne()
5788 if (isConstOne(Select->getOperand(1))) in combineMulSelectConstOne()
5790 else if (isConstOne(Select->getOperand(2))) in combineMulSelectConstOne()
5795 SDValue Y = Select->getOperand((ConstOpNo == 1) ? 2 : 1); in combineMulSelectConstOne()
5821 // (mul x, (add y, 1)) -> (mad x, y, x) in PerformMULCombineWithOperands()
5827 // (mul x, (select y, 1)) -> (select (mul x, y), x) in PerformMULCombineWithOperands()
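Both rewrites above are simple algebra: x * (y + 1) == x * y + x, which maps to a single mad, and when one select arm is the constant 1 the multiply folds into the other arm. A one-line check of the first identity (illustrative):

    #include <cassert>
    #include <cstdint>
    int main() {
      uint32_t x = 7, y = 11;
      assert(x * (y + 1) == x * y + x);  // (mul x, (add y, 1)) == (mad x, y, x)
      return 0;
    }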
5836 /// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
5846 SDValue N0 = N->getOperand(0); in PerformMULCombine()
5847 SDValue N1 = N->getOperand(1); in PerformMULCombine()
5851 /// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
5867 EVT CCType = N->getValueType(0); in PerformSETCCCombine()
5868 SDValue A = N->getOperand(0); in PerformSETCCCombine()
5869 SDValue B = N->getOperand(1); in PerformSETCCCombine()
5886 DL, DCI.DAG.getVTList(MVT::i1, MVT::i1), {A, B, N->getOperand(2)}); in PerformSETCCCombine()
5893 SDValue Vector = N->getOperand(0); in PerformEXTRACTCombine()
5896 if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() && in PerformEXTRACTCombine()
5907 if (Vector->isUndef() || ISD::allOperandsUndef(Vector.getNode())) in PerformEXTRACTCombine()
5911 // We only handle the types we can extract in-register. in PerformEXTRACTCombine()
5915 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(N->getOperand(1)); in PerformEXTRACTCombine()
5917 if (!Index || Index->getZExtValue() == 0) in PerformEXTRACTCombine()
5929 DCI.DAG.getConstant(Index->getZExtValue() * EltBits, DL, IVT))); in PerformEXTRACTCombine()
5931 // If element has non-integer type, bitcast it back to the expected type. in PerformEXTRACTCombine()
5934 // Past legalizer, we may need to extend i8 -> i16 to match the register type. in PerformEXTRACTCombine()
5935 if (EltVT != N->getValueType(0)) in PerformEXTRACTCombine()
5936 Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result); in PerformEXTRACTCombine()
5943 SDValue VA = N->getOperand(1); in PerformVSELECTCombine()
5948 // We need to split vselect into individual per-element operations because we in PerformVSELECTCombine()
5950 // 32-bit values, so we may as well do comparison as i32 to avoid conversions in PerformVSELECTCombine()
5954 SDValue VCond = N->getOperand(0); in PerformVSELECTCombine()
5955 SDValue VB = N->getOperand(2); in PerformVSELECTCombine()
5980 // This is done at dag-combine1 time, so that vector operations with i8 in PerformLOADCombine()
5983 EVT VT = N->getValueType(0); in PerformLOADCombine()
5996 SmallVector<SDValue, 8> Ops(N->ops()); in PerformLOADCombine()
5997 Ops.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL)); in PerformLOADCombine()
5999 LD->getMemOperand()); in PerformLOADCombine()
6015 switch (N->getOpcode()) { in PerformDAGCombine()
6050 /// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
6053 EVT ResVT = N->getValueType(0); in ReplaceLoadVector()
6086 Align Alignment = LD->getAlign(); in ReplaceLoadVector()
6089 TD.getPrefTypeAlign(LD->getMemoryVT().getTypeForEVT(*DAG.getContext())); in ReplaceLoadVector()
6156 SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end()); in ReplaceLoadVector()
6160 OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL)); in ReplaceLoadVector()
6163 LD->getMemoryVT(), in ReplaceLoadVector()
6164 LD->getMemOperand()); in ReplaceLoadVector()
6198 SDValue Chain = N->getOperand(0); in ReplaceINTRINSIC_W_CHAIN()
6199 SDValue Intrin = N->getOperand(1); in ReplaceINTRINSIC_W_CHAIN()
6203 unsigned IntrinNo = Intrin.getNode()->getAsZExtVal(); in ReplaceINTRINSIC_W_CHAIN()
6213 EVT ResVT = N->getValueType(0); in ReplaceINTRINSIC_W_CHAIN()
6282 OtherOps.append(N->op_begin() + 2, N->op_end()); in ReplaceINTRINSIC_W_CHAIN()
6287 MemSD->getMemoryVT(), in ReplaceINTRINSIC_W_CHAIN()
6288 MemSD->getMemOperand()); in ReplaceINTRINSIC_W_CHAIN()
6310 "Custom handling of non-i8 ldu/ldg?"); in ReplaceINTRINSIC_W_CHAIN()
6312 // Just copy all operands as-is in ReplaceINTRINSIC_W_CHAIN()
6313 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end()); in ReplaceINTRINSIC_W_CHAIN()
6324 MVT::i8, MemSD->getMemOperand()); in ReplaceINTRINSIC_W_CHAIN()
6336 // Change the CopyFromReg to output 2 64-bit results instead of a 128-bit in ReplaceCopyFromReg_128()
6339 SDValue Chain = N->getOperand(0); in ReplaceCopyFromReg_128()
6340 SDValue Reg = N->getOperand(1); in ReplaceCopyFromReg_128()
6341 SDValue Glue = N->getOperand(2); in ReplaceCopyFromReg_128()
6344 "Custom lowering for CopyFromReg with 128-bit reg only"); in ReplaceCopyFromReg_128()
6345 SmallVector<EVT, 4> ResultsType = {MVT::i64, MVT::i64, N->getValueType(1), in ReplaceCopyFromReg_128()
6346 N->getValueType(2)}; in ReplaceCopyFromReg_128()
6360 switch (N->getOpcode()) { in ReplaceNodeResults()
6377 Type *Ty = AI->getValOperand()->getType(); in shouldExpandAtomicRMWInIR()
6379 if (AI->isFloatingPointOperation()) { in shouldExpandAtomicRMWInIR()
6380 if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) { in shouldExpandAtomicRMWInIR()
6381 if (Ty->isHalfTy() && STI.getSmVersion() >= 70 && in shouldExpandAtomicRMWInIR()
6384 if (Ty->isBFloatTy() && STI.getSmVersion() >= 90 && in shouldExpandAtomicRMWInIR()
6387 if (Ty->isFloatTy()) in shouldExpandAtomicRMWInIR()
6389 if (Ty->isDoubleTy() && STI.hasAtomAddF64()) in shouldExpandAtomicRMWInIR()
6395 assert(Ty->isIntegerTy() && "Ty should be integer at this point"); in shouldExpandAtomicRMWInIR()
6398 switch (AI->getOperation()) { in shouldExpandAtomicRMWInIR()
6405 switch (ITy->getBitWidth()) { in shouldExpandAtomicRMWInIR()
6424 switch (ITy->getBitWidth()) { in shouldExpandAtomicRMWInIR()