xref: /freebsd/contrib/llvm-project/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp (revision 1342eb5a832fa10e689a29faab3acb6054e4778c)
1 //===-- ARMSelectionDAGInfo.cpp - ARM SelectionDAG Info -------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the ARMSelectionDAGInfo class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "ARMTargetTransformInfo.h"
14 #include "llvm/CodeGen/SelectionDAG.h"
15 #include "llvm/Support/CommandLine.h"
16 using namespace llvm;
17 
18 #define DEBUG_TYPE "arm-selectiondag-info"
19 
20 static cl::opt<TPLoop::MemTransfer> EnableMemtransferTPLoop(
21     "arm-memtransfer-tploop", cl::Hidden,
22     cl::desc("Control conversion of memcpy to "
23              "Tail predicated loops (WLSTP)"),
24     cl::init(TPLoop::ForceDisabled),
25     cl::values(clEnumValN(TPLoop::ForceDisabled, "force-disabled",
26                           "Don't convert memcpy to TP loop."),
27                clEnumValN(TPLoop::ForceEnabled, "force-enabled",
28                           "Always convert memcpy to TP loop."),
29                clEnumValN(TPLoop::Allow, "allow",
30                           "Allow (may be subject to certain conditions) "
31                           "conversion of memcpy to TP loop.")));
32 
33 bool ARMSelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode) const {
34   return Opcode >= ARMISD::FIRST_MEMORY_OPCODE &&
35          Opcode <= ARMISD::LAST_MEMORY_OPCODE;
36 }
37 
38 // Emit, if possible, a specialized version of the given Libcall. Typically this
39 // means selecting the appropriately aligned version, but we also convert memset
40 // of 0 into memclr.
41 SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
42     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
43     SDValue Size, unsigned Align, RTLIB::Libcall LC) const {
44   const ARMSubtarget &Subtarget =
45       DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
46   const ARMTargetLowering *TLI = Subtarget.getTargetLowering();
47 
48   // Only use a specialized AEABI function if the default version of this
49   // Libcall is an AEABI function.
50   if (std::strncmp(TLI->getLibcallName(LC), "__aeabi", 7) != 0)
51     return SDValue();
52 
53   // Translate RTLIB::Libcall to AEABILibcall. We only do this in order to be
54   // able to translate memset to memclr and use the value to index the function
55   // name array.
56   enum {
57     AEABI_MEMCPY = 0,
58     AEABI_MEMMOVE,
59     AEABI_MEMSET,
60     AEABI_MEMCLR
61   } AEABILibcall;
62   switch (LC) {
63   case RTLIB::MEMCPY:
64     AEABILibcall = AEABI_MEMCPY;
65     break;
66   case RTLIB::MEMMOVE:
67     AEABILibcall = AEABI_MEMMOVE;
68     break;
69   case RTLIB::MEMSET:
70     AEABILibcall = AEABI_MEMSET;
71     if (isNullConstant(Src))
72       AEABILibcall = AEABI_MEMCLR;
73     break;
74   default:
75     return SDValue();
76   }
77 
78   // Choose the most-aligned libcall variant that we can
79   enum {
80     ALIGN1 = 0,
81     ALIGN4,
82     ALIGN8
83   } AlignVariant;
84   if ((Align & 7) == 0)
85     AlignVariant = ALIGN8;
86   else if ((Align & 3) == 0)
87     AlignVariant = ALIGN4;
88   else
89     AlignVariant = ALIGN1;
90 
91   TargetLowering::ArgListTy Args;
92   TargetLowering::ArgListEntry Entry;
93   Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
94   Entry.Node = Dst;
95   Args.push_back(Entry);
96   if (AEABILibcall == AEABI_MEMCLR) {
97     Entry.Node = Size;
98     Args.push_back(Entry);
99   } else if (AEABILibcall == AEABI_MEMSET) {
100     // Adjust parameters for memset, EABI uses format (ptr, size, value),
101     // GNU library uses (ptr, value, size)
102     // See RTABI section 4.3.4
103     Entry.Node = Size;
104     Args.push_back(Entry);
105 
106     // Extend or truncate the argument to be an i32 value for the call.
107     if (Src.getValueType().bitsGT(MVT::i32))
108       Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
109     else if (Src.getValueType().bitsLT(MVT::i32))
110       Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Src);
111 
112     Entry.Node = Src;
113     Entry.Ty = Type::getInt32Ty(*DAG.getContext());
114     Entry.IsSExt = false;
115     Args.push_back(Entry);
116   } else {
117     Entry.Node = Src;
118     Args.push_back(Entry);
119 
120     Entry.Node = Size;
121     Args.push_back(Entry);
122   }
123 
124   static const RTLIB::Libcall FunctionImpls[4][3] = {
125       {RTLIB::MEMCPY, RTLIB::AEABI_MEMCPY4, RTLIB::AEABI_MEMCPY8},
126       {RTLIB::MEMMOVE, RTLIB::AEABI_MEMMOVE4, RTLIB::AEABI_MEMMOVE8},
127       {RTLIB::MEMSET, RTLIB::AEABI_MEMSET4, RTLIB::AEABI_MEMSET8},
128       {RTLIB::AEABI_MEMCLR, RTLIB::AEABI_MEMCLR4, RTLIB::AEABI_MEMCLR8}};
129 
130   RTLIB::Libcall NewLC = FunctionImpls[AEABILibcall][AlignVariant];
131 
132   TargetLowering::CallLoweringInfo CLI(DAG);
133   CLI.setDebugLoc(dl)
134       .setChain(Chain)
135       .setLibCallee(
136           TLI->getLibcallCallingConv(NewLC), Type::getVoidTy(*DAG.getContext()),
137           DAG.getExternalSymbol(TLI->getLibcallName(NewLC),
138                                 TLI->getPointerTy(DAG.getDataLayout())),
139           std::move(Args))
140       .setDiscardResult();
141   std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
142 
143   return CallResult.second;
144 }
145 
146 static bool shouldGenerateInlineTPLoop(const ARMSubtarget &Subtarget,
147                                        const SelectionDAG &DAG,
148                                        ConstantSDNode *ConstantSize,
149                                        Align Alignment, bool IsMemcpy) {
150   auto &F = DAG.getMachineFunction().getFunction();
151   if (!EnableMemtransferTPLoop)
152     return false;
153   if (EnableMemtransferTPLoop == TPLoop::ForceEnabled)
154     return true;
155   // Do not generate inline TP loop if optimizations is disabled,
156   // or if optimization for size (-Os or -Oz) is on.
157   if (F.hasOptNone() || F.hasOptSize())
158     return false;
159   // If cli option is unset, for memset always generate inline TP.
160   // For memcpy, check some conditions
161   if (!IsMemcpy)
162     return true;
163   if (!ConstantSize && Alignment >= Align(4))
164     return true;
165   if (ConstantSize &&
166       ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold() &&
167       ConstantSize->getZExtValue() <
168           Subtarget.getMaxMemcpyTPInlineSizeThreshold())
169     return true;
170   return false;
171 }
172 
173 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
174     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
175     SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
176     MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
177   const ARMSubtarget &Subtarget =
178       DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
179   ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
180 
181   if (Subtarget.hasMVEIntegerOps() &&
182       shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment, true))
183     return DAG.getNode(ARMISD::MEMCPYLOOP, dl, MVT::Other, Chain, Dst, Src,
184                        DAG.getZExtOrTrunc(Size, dl, MVT::i32));
185 
186   // Do repeated 4-byte loads and stores. To be improved.
187   // This requires 4-byte alignment.
188   if (Alignment < Align(4))
189     return SDValue();
190   // This requires the copy size to be a constant, preferably
191   // within a subtarget-specific limit.
192   if (!ConstantSize)
193     return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
194                                   Alignment.value(), RTLIB::MEMCPY);
195   uint64_t SizeVal = ConstantSize->getZExtValue();
196   if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
197     return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
198                                   Alignment.value(), RTLIB::MEMCPY);
199 
200   unsigned BytesLeft = SizeVal & 3;
201   unsigned NumMemOps = SizeVal >> 2;
202   unsigned EmittedNumMemOps = 0;
203   EVT VT = MVT::i32;
204   unsigned VTSize = 4;
205   unsigned i = 0;
206   // Emit a maximum of 4 loads in Thumb1 since we have fewer registers
207   const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6;
208   SDValue TFOps[6];
209   SDValue Loads[6];
210   uint64_t SrcOff = 0, DstOff = 0;
211 
212   // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to
213   // VLDM/VSTM and make this code emit it when appropriate. This would reduce
214   // pressure on the general purpose registers. However this seems harder to map
215   // onto the register allocator's view of the world.
216 
217   // The number of MEMCPY pseudo-instructions to emit. We use up to
218   // MaxLoadsInLDM registers per mcopy, which will get lowered into ldm/stm
219   // later on. This is a lower bound on the number of MEMCPY operations we must
220   // emit.
221   unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;
222 
223   // Code size optimisation: do not inline memcpy if expansion results in
224   // more instructions than the libary call.
225   if (NumMEMCPYs > 1 && Subtarget.hasMinSize()) {
226     return SDValue();
227   }
228 
229   SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue);
230 
231   for (unsigned I = 0; I != NumMEMCPYs; ++I) {
232     // Evenly distribute registers among MEMCPY operations to reduce register
233     // pressure.
234     unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs;
235     unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps;
236 
237     Dst = DAG.getNode(ARMISD::MEMCPY, dl, VTs, Chain, Dst, Src,
238                       DAG.getConstant(NumRegs, dl, MVT::i32));
239     Src = Dst.getValue(1);
240     Chain = Dst.getValue(2);
241 
242     DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * VTSize);
243     SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * VTSize);
244 
245     EmittedNumMemOps = NextEmittedNumMemOps;
246   }
247 
248   if (BytesLeft == 0)
249     return Chain;
250 
251   // Issue loads / stores for the trailing (1 - 3) bytes.
252   auto getRemainingValueType = [](unsigned BytesLeft) {
253     return (BytesLeft >= 2) ? MVT::i16 : MVT::i8;
254   };
255   auto getRemainingSize = [](unsigned BytesLeft) {
256     return (BytesLeft >= 2) ? 2 : 1;
257   };
258 
259   unsigned BytesLeftSave = BytesLeft;
260   i = 0;
261   while (BytesLeft) {
262     VT = getRemainingValueType(BytesLeft);
263     VTSize = getRemainingSize(BytesLeft);
264     Loads[i] = DAG.getLoad(VT, dl, Chain,
265                            DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
266                                        DAG.getConstant(SrcOff, dl, MVT::i32)),
267                            SrcPtrInfo.getWithOffset(SrcOff));
268     TFOps[i] = Loads[i].getValue(1);
269     ++i;
270     SrcOff += VTSize;
271     BytesLeft -= VTSize;
272   }
273   Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, i));
274 
275   i = 0;
276   BytesLeft = BytesLeftSave;
277   while (BytesLeft) {
278     VT = getRemainingValueType(BytesLeft);
279     VTSize = getRemainingSize(BytesLeft);
280     TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
281                             DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
282                                         DAG.getConstant(DstOff, dl, MVT::i32)),
283                             DstPtrInfo.getWithOffset(DstOff));
284     ++i;
285     DstOff += VTSize;
286     BytesLeft -= VTSize;
287   }
288   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ArrayRef(TFOps, i));
289 }
290 
291 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove(
292     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
293     SDValue Size, Align Alignment, bool isVolatile,
294     MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
295   return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
296                                 Alignment.value(), RTLIB::MEMMOVE);
297 }
298 
299 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
300     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
301     SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
302     MachinePointerInfo DstPtrInfo) const {
303 
304   const ARMSubtarget &Subtarget =
305       DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
306 
307   ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
308 
309   // Generate TP loop for llvm.memset
310   if (Subtarget.hasMVEIntegerOps() &&
311       shouldGenerateInlineTPLoop(Subtarget, DAG, ConstantSize, Alignment,
312                                  false)) {
313     Src = DAG.getSplatBuildVector(MVT::v16i8, dl,
314                                   DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src));
315     return DAG.getNode(ARMISD::MEMSETLOOP, dl, MVT::Other, Chain, Dst, Src,
316                        DAG.getZExtOrTrunc(Size, dl, MVT::i32));
317   }
318 
319   if (!AlwaysInline)
320     return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
321                                   Alignment.value(), RTLIB::MEMSET);
322 
323   return SDValue();
324 }
325