//===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the ARMSelectionDAGInfo class.
//
//===----------------------------------------------------------------------===//

#include "ARMTargetMachine.h"
#include "ARMTargetTransformInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;

#define DEBUG_TYPE "arm-selectiondag-info"

// Three-state control for lowering memcpy/memset to MVE tail-predicated loops
// (WLSTP): force-disabled, force-enabled, or "allow" (subject to the
// heuristics in shouldGenerateInlineTPLoop below). Deliberately not static —
// presumably referenced from other ARM backend files via an extern
// declaration; confirm before changing linkage.
cl::opt<TPLoop::MemTransfer> EnableMemtransferTPLoop(
    "arm-memtransfer-tploop", cl::Hidden,
    cl::desc("Control conversion of memcpy to "
             "Tail predicated loops (WLSTP)"),
    cl::init(TPLoop::ForceDisabled),
    cl::values(clEnumValN(TPLoop::ForceDisabled, "force-disabled",
                          "Don't convert memcpy to TP loop."),
               clEnumValN(TPLoop::ForceEnabled, "force-enabled",
                          "Always convert memcpy to TP loop."),
               clEnumValN(TPLoop::Allow, "allow",
                          "Allow (may be subject to certain conditions) "
                          "conversion of memcpy to TP loop.")));

// Emit, if possible, a specialized version of the given Libcall. Typically this
// means selecting the appropriately aligned version, but we also convert memset
// of 0 into memclr.
38 SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall( 39 SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, 40 SDValue Size, unsigned Align, RTLIB::Libcall LC) const { 41 const ARMSubtarget &Subtarget = 42 DAG.getMachineFunction().getSubtarget<ARMSubtarget>(); 43 const ARMTargetLowering *TLI = Subtarget.getTargetLowering(); 44 45 // Only use a specialized AEABI function if the default version of this 46 // Libcall is an AEABI function. 47 if (std::strncmp(TLI->getLibcallName(LC), "__aeabi", 7) != 0) 48 return SDValue(); 49 50 // Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be 51 // able to translate memset to memclr and use the value to index the function 52 // name array. 53 enum { 54 AEABI_MEMCPY = 0, 55 AEABI_MEMMOVE, 56 AEABI_MEMSET, 57 AEABI_MEMCLR 58 } AEABILibcall; 59 switch (LC) { 60 case RTLIB::MEMCPY: 61 AEABILibcall = AEABI_MEMCPY; 62 break; 63 case RTLIB::MEMMOVE: 64 AEABILibcall = AEABI_MEMMOVE; 65 break; 66 case RTLIB::MEMSET: 67 AEABILibcall = AEABI_MEMSET; 68 if (isNullConstant(Src)) 69 AEABILibcall = AEABI_MEMCLR; 70 break; 71 default: 72 return SDValue(); 73 } 74 75 // Choose the most-aligned libcall variant that we can 76 enum { 77 ALIGN1 = 0, 78 ALIGN4, 79 ALIGN8 80 } AlignVariant; 81 if ((Align & 7) == 0) 82 AlignVariant = ALIGN8; 83 else if ((Align & 3) == 0) 84 AlignVariant = ALIGN4; 85 else 86 AlignVariant = ALIGN1; 87 88 TargetLowering::ArgListTy Args; 89 TargetLowering::ArgListEntry Entry; 90 Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); 91 Entry.Node = Dst; 92 Args.push_back(Entry); 93 if (AEABILibcall == AEABI_MEMCLR) { 94 Entry.Node = Size; 95 Args.push_back(Entry); 96 } else if (AEABILibcall == AEABI_MEMSET) { 97 // Adjust parameters for memset, EABI uses format (ptr, size, value), 98 // GNU library uses (ptr, value, size) 99 // See RTABI section 4.3.4 100 Entry.Node = Size; 101 Args.push_back(Entry); 102 103 // Extend or truncate the argument to be an i32 value 
for the call. 104 if (Src.getValueType().bitsGT(MVT::i32)) 105 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src); 106 else if (Src.getValueType().bitsLT(MVT::i32)) 107 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src); 108 109 Entry.Node = Src; 110 Entry.Ty = Type::getInt32Ty(*DAG.getContext()); 111 Entry.IsSExt = false; 112 Args.push_back(Entry); 113 } else { 114 Entry.Node = Src; 115 Args.push_back(Entry); 116 117 Entry.Node = Size; 118 Args.push_back(Entry); 119 } 120 121 char const *FunctionNames[4][3] = { 122 { "__aeabi_memcpy", "__aeabi_memcpy4", "__aeabi_memcpy8" }, 123 { "__aeabi_memmove", "__aeabi_memmove4", "__aeabi_memmove8" }, 124 { "__aeabi_memset", "__aeabi_memset4", "__aeabi_memset8" }, 125 { "__aeabi_memclr", "__aeabi_memclr4", "__aeabi_memclr8" } 126 }; 127 TargetLowering::CallLoweringInfo CLI(DAG); 128 CLI.setDebugLoc(dl) 129 .setChain(Chain) 130 .setLibCallee( 131 TLI->getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()), 132 DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant], 133 TLI->getPointerTy(DAG.getDataLayout())), 134 std::move(Args)) 135 .setDiscardResult(); 136 std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI); 137 138 return CallResult.second; 139 } 140 141 static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget, 142 const SelectionDAG &DAG, 143 ConstantSDNode *ConstantSize, 144 Align Alignment, bool IsMemcpy) { 145 auto &F = DAG.getMachineFunction().getFunction(); 146 if (!EnableMemtransferTPLoop) 147 return false; 148 if (EnableMemtransferTPLoop == TPLoop::ForceEnabled) 149 return true; 150 // Do not generate inline TP loop if optimizations is disabled, 151 // or if optimization for size (-Os or -Oz) is on. 152 if (F.hasOptNone() || F.hasOptSize()) 153 return false; 154 // If cli option is unset, for memset always generate inline TP. 
155 // For memcpy, check some conditions 156 if (!IsMemcpy) 157 return true; 158 if (!ConstantSize && Alignment >= Align(4)) 159 return true; 160 if (ConstantSize && 161 ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() && 162 ConstantSize->getZExtValue() < 163 Subtarget.getMaxMemcpyTPInlineSizeThreshold()) 164 return true; 165 return false; 166 } 167 168 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy( 169 SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, 170 SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline, 171 MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { 172 const ARMSubtarget &Subtarget = 173 DAG.getMachineFunction().getSubtarget<ARMSubtarget>(); 174 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); 175 176 if (Subtarget.hasMVEIntegerOps() && 177 shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment, true)) 178 return DAG.getNode(ARMISD::MEMCPYLOOP, dl, MVT::Other, Chain, Dst, Src, 179 DAG.getZExtOrTrunc(Size, dl, MVT::i32)); 180 181 // Do repeated 4-byte loads and stores. To be improved. 182 // This requires 4-byte alignment. 183 if (Alignment < Align(4)) 184 return SDValue(); 185 // This requires the copy size to be a constant, preferably 186 // within a subtarget-specific limit. 
187 if (!ConstantSize) 188 return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, 189 Alignment.value(), RTLIB::MEMCPY); 190 uint64_t SizeVal = ConstantSize->getZExtValue(); 191 if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold()) 192 return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, 193 Alignment.value(), RTLIB::MEMCPY); 194 195 unsigned BytesLeft = SizeVal & 3; 196 unsigned NumMemOps = SizeVal >> 2; 197 unsigned EmittedNumMemOps = 0; 198 EVT VT = MVT::i32; 199 unsigned VTSize = 4; 200 unsigned i = 0; 201 // Emit a maximum of 4 loads in Thumb1 since we have fewer registers 202 const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6; 203 SDValue TFOps[6]; 204 SDValue Loads[6]; 205 uint64_t SrcOff = 0, DstOff = 0; 206 207 // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to 208 // VLDM/VSTM and make this code emit it when appropriate. This would reduce 209 // pressure on the general purpose registers. However this seems harder to map 210 // onto the register allocator's view of the world. 211 212 // The number of MEMCPY pseudo-instructions to emit. We use up to 213 // MaxLoadsInLDM registers per mcopy, which will get lowered into ldm/stm 214 // later on. This is a lower bound on the number of MEMCPY operations we must 215 // emit. 216 unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM; 217 218 // Code size optimisation: do not inline memcpy if expansion results in 219 // more instructions than the libary call. 220 if (NumMEMCPYs > 1 && Subtarget.hasMinSize()) { 221 return SDValue(); 222 } 223 224 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue); 225 226 for (unsigned I = 0; I != NumMEMCPYs; ++I) { 227 // Evenly distribute registers among MEMCPY operations to reduce register 228 // pressure. 
229 unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs; 230 unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps; 231 232 Dst = DAG.getNode(ARMISD::MEMCPY, dl, VTs, Chain, Dst, Src, 233 DAG.getConstant(NumRegs, dl, MVT::i32)); 234 Src = Dst.getValue(1); 235 Chain = Dst.getValue(2); 236 237 DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * VTSize); 238 SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * VTSize); 239 240 EmittedNumMemOps = NextEmittedNumMemOps; 241 } 242 243 if (BytesLeft == 0) 244 return Chain; 245 246 // Issue loads / stores for the trailing (1 - 3) bytes. 247 auto getRemainingValueType = [](unsigned BytesLeft) { 248 return (BytesLeft >= 2) ? MVT::i16 : MVT::i8; 249 }; 250 auto getRemainingSize = [](unsigned BytesLeft) { 251 return (BytesLeft >= 2) ? 2 : 1; 252 }; 253 254 unsigned BytesLeftSave = BytesLeft; 255 i = 0; 256 while (BytesLeft) { 257 VT = getRemainingValueType(BytesLeft); 258 VTSize = getRemainingSize(BytesLeft); 259 Loads[i] = DAG.getLoad(VT, dl, Chain, 260 DAG.getNode(ISD::ADD, dl, MVT::i32, Src, 261 DAG.getConstant(SrcOff, dl, MVT::i32)), 262 SrcPtrInfo.getWithOffset(SrcOff)); 263 TFOps[i] = Loads[i].getValue(1); 264 ++i; 265 SrcOff += VTSize; 266 BytesLeft -= VTSize; 267 } 268 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, i)); 269 270 i = 0; 271 BytesLeft = BytesLeftSave; 272 while (BytesLeft) { 273 VT = getRemainingValueType(BytesLeft); 274 VTSize = getRemainingSize(BytesLeft); 275 TFOps[i] = DAG.getStore(Chain, dl, Loads[i], 276 DAG.getNode(ISD::ADD, dl, MVT::i32, Dst, 277 DAG.getConstant(DstOff, dl, MVT::i32)), 278 DstPtrInfo.getWithOffset(DstOff)); 279 ++i; 280 DstOff += VTSize; 281 BytesLeft -= VTSize; 282 } 283 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, i)); 284 } 285 286 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove( 287 SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, 288 SDValue Size, Align Alignment, bool isVolatile, 289 
MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { 290 return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, 291 Alignment.value(), RTLIB::MEMMOVE); 292 } 293 294 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset( 295 SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, 296 SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline, 297 MachinePointerInfo DstPtrInfo) const { 298 299 const ARMSubtarget &Subtarget = 300 DAG.getMachineFunction().getSubtarget<ARMSubtarget>(); 301 302 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); 303 304 // Generate TP loop for llvm.memset 305 if (Subtarget.hasMVEIntegerOps() && 306 shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment, 307 false)) { 308 Src = DAG.getSplatBuildVector(MVT::v16i8, dl, 309 DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src)); 310 return DAG.getNode(ARMISD::MEMSETLOOP, dl, MVT::Other, Chain, Dst, Src, 311 DAG.getZExtOrTrunc(Size, dl, MVT::i32)); 312 } 313 314 if (!AlwaysInline) 315 return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, 316 Alignment.value(), RTLIB::MEMSET); 317 318 return SDValue(); 319 } 320