1 //===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the ARMSelectionDAGInfo class. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "ARMTargetTransformInfo.h" 14 #include "llvm/CodeGen/SelectionDAG.h" 15 #include "llvm/Support/CommandLine.h" 16 using namespace llvm; 17 18 #define DEBUG_TYPE "arm-selectiondag-info" 19 20 static cl::opt<TPLoop::MemTransfer> EnableMemtransferTPLoop( 21 "arm-memtransfer-tploop", cl::Hidden, 22 cl::desc("Control conversion of memcpy to " 23 "Tail predicated loops (WLSTP)"), 24 cl::init(TPLoop::ForceDisabled), 25 cl::values(clEnumValN(TPLoop::ForceDisabled, "force-disabled", 26 "Don't convert memcpy to TP loop."), 27 clEnumValN(TPLoop::ForceEnabled, "force-enabled", 28 "Always convert memcpy to TP loop."), 29 clEnumValN(TPLoop::Allow, "allow", 30 "Allow (may be subject to certain conditions) " 31 "conversion of memcpy to TP loop."))); 32 33 bool ARMSelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode) const { 34 return Opcode >= ARMISD::FIRST_MEMORY_OPCODE && 35 Opcode <= ARMISD::LAST_MEMORY_OPCODE; 36 } 37 38 // Emit, if possible, a specialized version of the given Libcall. Typically this 39 // means selecting the appropriately aligned version, but we also convert memset 40 // of 0 into memclr. 41 SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall( 42 SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, 43 SDValue Size, unsigned Align, RTLIB::Libcall LC) const { 44 const ARMSubtarget &Subtarget = 45 DAG.getMachineFunction().getSubtarget<ARMSubtarget>(); 46 const ARMTargetLowering *TLI = Subtarget.getTargetLowering(); 47 48 // Only use a specialized AEABI function if the default version of this 49 // Libcall is an AEABI function. 50 if (std::strncmp(TLI->getLibcallName(LC), "__aeabi", 7) != 0) 51 return SDValue(); 52 53 // Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be 54 // able to translate memset to memclr and use the value to index the function 55 // name array. 56 enum { 57 AEABI_MEMCPY = 0, 58 AEABI_MEMMOVE, 59 AEABI_MEMSET, 60 AEABI_MEMCLR 61 } AEABILibcall; 62 switch (LC) { 63 case RTLIB::MEMCPY: 64 AEABILibcall = AEABI_MEMCPY; 65 break; 66 case RTLIB::MEMMOVE: 67 AEABILibcall = AEABI_MEMMOVE; 68 break; 69 case RTLIB::MEMSET: 70 AEABILibcall = AEABI_MEMSET; 71 if (isNullConstant(Src)) 72 AEABILibcall = AEABI_MEMCLR; 73 break; 74 default: 75 return SDValue(); 76 } 77 78 // Choose the most-aligned libcall variant that we can 79 enum { 80 ALIGN1 = 0, 81 ALIGN4, 82 ALIGN8 83 } AlignVariant; 84 if ((Align & 7) == 0) 85 AlignVariant = ALIGN8; 86 else if ((Align & 3) == 0) 87 AlignVariant = ALIGN4; 88 else 89 AlignVariant = ALIGN1; 90 91 TargetLowering::ArgListTy Args; 92 TargetLowering::ArgListEntry Entry; 93 Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); 94 Entry.Node = Dst; 95 Args.push_back(Entry); 96 if (AEABILibcall == AEABI_MEMCLR) { 97 Entry.Node = Size; 98 Args.push_back(Entry); 99 } else if (AEABILibcall == AEABI_MEMSET) { 100 // Adjust parameters for memset, EABI uses format (ptr, size, value), 101 // GNU library uses (ptr, value, size) 102 // See RTABI section 4.3.4 103 Entry.Node = Size; 104 Args.push_back(Entry); 105 106 // Extend or truncate the argument to be an i32 value for the call. 107 if (Src.getValueType().bitsGT(MVT::i32)) 108 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src); 109 else if (Src.getValueType().bitsLT(MVT::i32)) 110 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src); 111 112 Entry.Node = Src; 113 Entry.Ty = Type::getInt32Ty(*DAG.getContext()); 114 Entry.IsSExt = false; 115 Args.push_back(Entry); 116 } else { 117 Entry.Node = Src; 118 Args.push_back(Entry); 119 120 Entry.Node = Size; 121 Args.push_back(Entry); 122 } 123 124 static const RTLIB::Libcall FunctionImpls[4][3] = { 125 {RTLIB::MEMCPY, RTLIB::AEABI_MEMCPY4, RTLIB::AEABI_MEMCPY8}, 126 {RTLIB::MEMMOVE, RTLIB::AEABI_MEMMOVE4, RTLIB::AEABI_MEMMOVE8}, 127 {RTLIB::MEMSET, RTLIB::AEABI_MEMSET4, RTLIB::AEABI_MEMSET8}, 128 {RTLIB::AEABI_MEMCLR, RTLIB::AEABI_MEMCLR4, RTLIB::AEABI_MEMCLR8}}; 129 130 RTLIB::Libcall NewLC = FunctionImpls[AEABILibcall][AlignVariant]; 131 132 TargetLowering::CallLoweringInfo CLI(DAG); 133 CLI.setDebugLoc(dl) 134 .setChain(Chain) 135 .setLibCallee( 136 TLI->getLibcallCallingConv(NewLC), Type::getVoidTy(*DAG.getContext()), 137 DAG.getExternalSymbol(TLI->getLibcallName(NewLC), 138 TLI->getPointerTy(DAG.getDataLayout())), 139 std::move(Args)) 140 .setDiscardResult(); 141 std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI); 142 143 return CallResult.second; 144 } 145 146 static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget, 147 const SelectionDAG &DAG, 148 ConstantSDNode *ConstantSize, 149 Align Alignment, bool IsMemcpy) { 150 auto &F = DAG.getMachineFunction().getFunction(); 151 if (!EnableMemtransferTPLoop) 152 return false; 153 if (EnableMemtransferTPLoop == TPLoop::ForceEnabled) 154 return true; 155 // Do not generate inline TP loop if optimizations is disabled, 156 // or if optimization for size (-Os or -Oz) is on. 157 if (F.hasOptNone() || F.hasOptSize()) 158 return false; 159 // If cli option is unset, for memset always generate inline TP. 160 // For memcpy, check some conditions 161 if (!IsMemcpy) 162 return true; 163 if (!ConstantSize && Alignment >= Align(4)) 164 return true; 165 if (ConstantSize && 166 ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() && 167 ConstantSize->getZExtValue() < 168 Subtarget.getMaxMemcpyTPInlineSizeThreshold()) 169 return true; 170 return false; 171 } 172 173 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy( 174 SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, 175 SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline, 176 MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { 177 const ARMSubtarget &Subtarget = 178 DAG.getMachineFunction().getSubtarget<ARMSubtarget>(); 179 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); 180 181 if (Subtarget.hasMVEIntegerOps() && 182 shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment, true)) 183 return DAG.getNode(ARMISD::MEMCPYLOOP, dl, MVT::Other, Chain, Dst, Src, 184 DAG.getZExtOrTrunc(Size, dl, MVT::i32)); 185 186 // Do repeated 4-byte loads and stores. To be improved. 187 // This requires 4-byte alignment. 188 if (Alignment < Align(4)) 189 return SDValue(); 190 // This requires the copy size to be a constant, preferably 191 // within a subtarget-specific limit. 192 if (!ConstantSize) 193 return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, 194 Alignment.value(), RTLIB::MEMCPY); 195 uint64_t SizeVal = ConstantSize->getZExtValue(); 196 if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold()) 197 return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, 198 Alignment.value(), RTLIB::MEMCPY); 199 200 unsigned BytesLeft = SizeVal & 3; 201 unsigned NumMemOps = SizeVal >> 2; 202 unsigned EmittedNumMemOps = 0; 203 EVT VT = MVT::i32; 204 unsigned VTSize = 4; 205 unsigned i = 0; 206 // Emit a maximum of 4 loads in Thumb1 since we have fewer registers 207 const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6; 208 SDValue TFOps[6]; 209 SDValue Loads[6]; 210 uint64_t SrcOff = 0, DstOff = 0; 211 212 // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to 213 // VLDM/VSTM and make this code emit it when appropriate. This would reduce 214 // pressure on the general purpose registers. However this seems harder to map 215 // onto the register allocator's view of the world. 216 217 // The number of MEMCPY pseudo-instructions to emit. We use up to 218 // MaxLoadsInLDM registers per mcopy, which will get lowered into ldm/stm 219 // later on. This is a lower bound on the number of MEMCPY operations we must 220 // emit. 221 unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM; 222 223 // Code size optimisation: do not inline memcpy if expansion results in 224 // more instructions than the libary call. 225 if (NumMEMCPYs > 1 && Subtarget.hasMinSize()) { 226 return SDValue(); 227 } 228 229 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue); 230 231 for (unsigned I = 0; I != NumMEMCPYs; ++I) { 232 // Evenly distribute registers among MEMCPY operations to reduce register 233 // pressure. 234 unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs; 235 unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps; 236 237 Dst = DAG.getNode(ARMISD::MEMCPY, dl, VTs, Chain, Dst, Src, 238 DAG.getConstant(NumRegs, dl, MVT::i32)); 239 Src = Dst.getValue(1); 240 Chain = Dst.getValue(2); 241 242 DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * VTSize); 243 SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * VTSize); 244 245 EmittedNumMemOps = NextEmittedNumMemOps; 246 } 247 248 if (BytesLeft == 0) 249 return Chain; 250 251 // Issue loads / stores for the trailing (1 - 3) bytes. 252 auto getRemainingValueType = [](unsigned BytesLeft) { 253 return (BytesLeft >= 2) ? MVT::i16 : MVT::i8; 254 }; 255 auto getRemainingSize = [](unsigned BytesLeft) { 256 return (BytesLeft >= 2) ? 2 : 1; 257 }; 258 259 unsigned BytesLeftSave = BytesLeft; 260 i = 0; 261 while (BytesLeft) { 262 VT = getRemainingValueType(BytesLeft); 263 VTSize = getRemainingSize(BytesLeft); 264 Loads[i] = DAG.getLoad(VT, dl, Chain, 265 DAG.getNode(ISD::ADD, dl, MVT::i32, Src, 266 DAG.getConstant(SrcOff, dl, MVT::i32)), 267 SrcPtrInfo.getWithOffset(SrcOff)); 268 TFOps[i] = Loads[i].getValue(1); 269 ++i; 270 SrcOff += VTSize; 271 BytesLeft -= VTSize; 272 } 273 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, i)); 274 275 i = 0; 276 BytesLeft = BytesLeftSave; 277 while (BytesLeft) { 278 VT = getRemainingValueType(BytesLeft); 279 VTSize = getRemainingSize(BytesLeft); 280 TFOps[i] = DAG.getStore(Chain, dl, Loads[i], 281 DAG.getNode(ISD::ADD, dl, MVT::i32, Dst, 282 DAG.getConstant(DstOff, dl, MVT::i32)), 283 DstPtrInfo.getWithOffset(DstOff)); 284 ++i; 285 DstOff += VTSize; 286 BytesLeft -= VTSize; 287 } 288 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, i)); 289 } 290 291 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove( 292 SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, 293 SDValue Size, Align Alignment, bool isVolatile, 294 MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { 295 return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, 296 Alignment.value(), RTLIB::MEMMOVE); 297 } 298 299 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset( 300 SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, 301 SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline, 302 MachinePointerInfo DstPtrInfo) const { 303 304 const ARMSubtarget &Subtarget = 305 DAG.getMachineFunction().getSubtarget<ARMSubtarget>(); 306 307 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); 308 309 // Generate TP loop for llvm.memset 310 if (Subtarget.hasMVEIntegerOps() && 311 shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment, 312 false)) { 313 Src = DAG.getSplatBuildVector(MVT::v16i8, dl, 314 DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src)); 315 return DAG.getNode(ARMISD::MEMSETLOOP, dl, MVT::Other, Chain, Dst, Src, 316 DAG.getZExtOrTrunc(Size, dl, MVT::i32)); 317 } 318 319 if (!AlwaysInline) 320 return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, 321 Alignment.value(), RTLIB::MEMSET); 322 323 return SDValue(); 324 } 325