xref: /freebsd/contrib/llvm-project/llvm/lib/Target/VE/VEISelLowering.cpp (revision 3ceba58a7509418b47b8fca2d2b6bbf088714e26)
1 //===-- VEISelLowering.cpp - VE DAG Lowering Implementation ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the interfaces that VE uses to lower LLVM code into a
10 // selection DAG.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "VEISelLowering.h"
15 #include "MCTargetDesc/VEMCExpr.h"
16 #include "VECustomDAG.h"
17 #include "VEInstrBuilder.h"
18 #include "VEMachineFunctionInfo.h"
19 #include "VERegisterInfo.h"
20 #include "VETargetMachine.h"
21 #include "llvm/ADT/StringSwitch.h"
22 #include "llvm/CodeGen/CallingConvLower.h"
23 #include "llvm/CodeGen/MachineFrameInfo.h"
24 #include "llvm/CodeGen/MachineFunction.h"
25 #include "llvm/CodeGen/MachineInstrBuilder.h"
26 #include "llvm/CodeGen/MachineJumpTableInfo.h"
27 #include "llvm/CodeGen/MachineModuleInfo.h"
28 #include "llvm/CodeGen/MachineRegisterInfo.h"
29 #include "llvm/CodeGen/SelectionDAG.h"
30 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
31 #include "llvm/IR/DerivedTypes.h"
32 #include "llvm/IR/Function.h"
33 #include "llvm/IR/IRBuilder.h"
34 #include "llvm/IR/Module.h"
35 #include "llvm/Support/ErrorHandling.h"
36 #include "llvm/Support/KnownBits.h"
37 using namespace llvm;
38 
39 #define DEBUG_TYPE "ve-lower"
40 
41 //===----------------------------------------------------------------------===//
42 // Calling Convention Implementation
43 //===----------------------------------------------------------------------===//
44 
45 #include "VEGenCallingConv.inc"
46 
47 CCAssignFn *getReturnCC(CallingConv::ID CallConv) {
48   switch (CallConv) {
49   default:
50     return RetCC_VE_C;
51   case CallingConv::Fast:
52     return RetCC_VE_Fast;
53   }
54 }
55 
56 CCAssignFn *getParamCC(CallingConv::ID CallConv, bool IsVarArg) {
57   if (IsVarArg)
58     return CC_VE2;
59   switch (CallConv) {
60   default:
61     return CC_VE_C;
62   case CallingConv::Fast:
63     return CC_VE_Fast;
64   }
65 }
66 
67 bool VETargetLowering::CanLowerReturn(
68     CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
69     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
70   CCAssignFn *RetCC = getReturnCC(CallConv);
71   SmallVector<CCValAssign, 16> RVLocs;
72   CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
73   return CCInfo.CheckReturn(Outs, RetCC);
74 }
75 
76 static const MVT AllVectorVTs[] = {MVT::v256i32, MVT::v512i32, MVT::v256i64,
77                                    MVT::v256f32, MVT::v512f32, MVT::v256f64};
78 
79 static const MVT AllMaskVTs[] = {MVT::v256i1, MVT::v512i1};
80 
81 static const MVT AllPackedVTs[] = {MVT::v512i32, MVT::v512f32};
82 
83 void VETargetLowering::initRegisterClasses() {
84   // Set up the register classes.
85   addRegisterClass(MVT::i32, &VE::I32RegClass);
86   addRegisterClass(MVT::i64, &VE::I64RegClass);
87   addRegisterClass(MVT::f32, &VE::F32RegClass);
88   addRegisterClass(MVT::f64, &VE::I64RegClass);
89   addRegisterClass(MVT::f128, &VE::F128RegClass);
90 
91   if (Subtarget->enableVPU()) {
92     for (MVT VecVT : AllVectorVTs)
93       addRegisterClass(VecVT, &VE::V64RegClass);
94     addRegisterClass(MVT::v256i1, &VE::VMRegClass);
95     addRegisterClass(MVT::v512i1, &VE::VM512RegClass);
96   }
97 }
98 
99 void VETargetLowering::initSPUActions() {
100   const auto &TM = getTargetMachine();
101   /// Load & Store {
102 
103   // VE doesn't have i1 sign extending load.
104   for (MVT VT : MVT::integer_valuetypes()) {
105     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
106     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
107     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
108     setTruncStoreAction(VT, MVT::i1, Expand);
109   }
110 
111   // VE doesn't have floating point extload/truncstore, so expand them.
112   for (MVT FPVT : MVT::fp_valuetypes()) {
113     for (MVT OtherFPVT : MVT::fp_valuetypes()) {
114       setLoadExtAction(ISD::EXTLOAD, FPVT, OtherFPVT, Expand);
115       setTruncStoreAction(FPVT, OtherFPVT, Expand);
116     }
117   }
118 
119   // VE doesn't have fp128 load/store, so expand them in custom lower.
120   setOperationAction(ISD::LOAD, MVT::f128, Custom);
121   setOperationAction(ISD::STORE, MVT::f128, Custom);
122 
123   /// } Load & Store
124 
125   // Custom legalize address nodes into LO/HI parts.
126   MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
127   setOperationAction(ISD::BlockAddress, PtrVT, Custom);
128   setOperationAction(ISD::GlobalAddress, PtrVT, Custom);
129   setOperationAction(ISD::GlobalTLSAddress, PtrVT, Custom);
130   setOperationAction(ISD::ConstantPool, PtrVT, Custom);
131   setOperationAction(ISD::JumpTable, PtrVT, Custom);
132 
133   /// VAARG handling {
134   setOperationAction(ISD::VASTART, MVT::Other, Custom);
135   // VAARG needs to be lowered to access with 8 bytes alignment.
136   setOperationAction(ISD::VAARG, MVT::Other, Custom);
137   // Use the default implementation.
138   setOperationAction(ISD::VACOPY, MVT::Other, Expand);
139   setOperationAction(ISD::VAEND, MVT::Other, Expand);
140   /// } VAARG handling
141 
142   /// Stack {
143   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
144   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
145 
146   // Use the default implementation.
147   setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
148   setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
149   /// } Stack
150 
151   /// Branch {
152 
153   // VE doesn't have BRCOND
154   setOperationAction(ISD::BRCOND, MVT::Other, Expand);
155 
156   // BR_JT is not implemented yet.
157   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
158 
159   /// } Branch
160 
161   /// Int Ops {
162   for (MVT IntVT : {MVT::i32, MVT::i64}) {
163     // VE has no REM or DIVREM operations.
164     setOperationAction(ISD::UREM, IntVT, Expand);
165     setOperationAction(ISD::SREM, IntVT, Expand);
166     setOperationAction(ISD::SDIVREM, IntVT, Expand);
167     setOperationAction(ISD::UDIVREM, IntVT, Expand);
168 
169     // VE has no SHL_PARTS/SRA_PARTS/SRL_PARTS operations.
170     setOperationAction(ISD::SHL_PARTS, IntVT, Expand);
171     setOperationAction(ISD::SRA_PARTS, IntVT, Expand);
172     setOperationAction(ISD::SRL_PARTS, IntVT, Expand);
173 
174     // VE has no MULHU/S or U/SMUL_LOHI operations.
175     // TODO: Use MPD instruction to implement SMUL_LOHI for i32 type.
176     setOperationAction(ISD::MULHU, IntVT, Expand);
177     setOperationAction(ISD::MULHS, IntVT, Expand);
178     setOperationAction(ISD::UMUL_LOHI, IntVT, Expand);
179     setOperationAction(ISD::SMUL_LOHI, IntVT, Expand);
180 
181     // VE has no CTTZ, ROTL, ROTR operations.
182     setOperationAction(ISD::CTTZ, IntVT, Expand);
183     setOperationAction(ISD::ROTL, IntVT, Expand);
184     setOperationAction(ISD::ROTR, IntVT, Expand);
185 
186     // VE has 64 bits instruction which works as i64 BSWAP operation.  This
187     // instruction works fine as i32 BSWAP operation with an additional
188     // parameter.  Use isel patterns to lower BSWAP.
189     setOperationAction(ISD::BSWAP, IntVT, Legal);
190 
191     // VE has only 64 bits instructions which work as i64 BITREVERSE/CTLZ/CTPOP
192     // operations.  Use isel patterns for i64, promote for i32.
193     LegalizeAction Act = (IntVT == MVT::i32) ? Promote : Legal;
194     setOperationAction(ISD::BITREVERSE, IntVT, Act);
195     setOperationAction(ISD::CTLZ, IntVT, Act);
196     setOperationAction(ISD::CTLZ_ZERO_UNDEF, IntVT, Act);
197     setOperationAction(ISD::CTPOP, IntVT, Act);
198 
199     // VE has only 64 bits instructions which work as i64 AND/OR/XOR operations.
200     // Use isel patterns for i64, promote for i32.
201     setOperationAction(ISD::AND, IntVT, Act);
202     setOperationAction(ISD::OR, IntVT, Act);
203     setOperationAction(ISD::XOR, IntVT, Act);
204 
205     // Legal smax and smin
206     setOperationAction(ISD::SMAX, IntVT, Legal);
207     setOperationAction(ISD::SMIN, IntVT, Legal);
208   }
209   /// } Int Ops
210 
211   /// Conversion {
212   // VE doesn't have instructions for fp<->uint, so expand them by llvm
213   setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote); // use i64
214   setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote); // use i64
215   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
216   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
217 
218   // fp16 not supported
219   for (MVT FPVT : MVT::fp_valuetypes()) {
220     setOperationAction(ISD::FP16_TO_FP, FPVT, Expand);
221     setOperationAction(ISD::FP_TO_FP16, FPVT, Expand);
222   }
223   /// } Conversion
224 
225   /// Floating-point Ops {
226   /// Note: Floating-point operations are fneg, fadd, fsub, fmul, fdiv, frem,
227   ///       and fcmp.
228 
229   // VE doesn't have following floating point operations.
230   for (MVT VT : MVT::fp_valuetypes()) {
231     setOperationAction(ISD::FNEG, VT, Expand);
232     setOperationAction(ISD::FREM, VT, Expand);
233   }
234 
235   // VE doesn't have fdiv of f128.
236   setOperationAction(ISD::FDIV, MVT::f128, Expand);
237 
238   for (MVT FPVT : {MVT::f32, MVT::f64}) {
239     // f32 and f64 uses ConstantFP.  f128 uses ConstantPool.
240     setOperationAction(ISD::ConstantFP, FPVT, Legal);
241   }
242   /// } Floating-point Ops
243 
244   /// Floating-point math functions {
245 
246   // VE doesn't have following floating point math functions.
247   for (MVT VT : MVT::fp_valuetypes()) {
248     setOperationAction(ISD::FABS, VT, Expand);
249     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
250     setOperationAction(ISD::FCOS, VT, Expand);
251     setOperationAction(ISD::FMA, VT, Expand);
252     setOperationAction(ISD::FPOW, VT, Expand);
253     setOperationAction(ISD::FSIN, VT, Expand);
254     setOperationAction(ISD::FSQRT, VT, Expand);
255   }
256 
257   // VE has single and double FMINNUM and FMAXNUM
258   for (MVT VT : {MVT::f32, MVT::f64}) {
259     setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, VT, Legal);
260   }
261 
262   /// } Floating-point math functions
263 
264   /// Atomic instructions {
265 
266   setMaxAtomicSizeInBitsSupported(64);
267   setMinCmpXchgSizeInBits(32);
268   setSupportsUnalignedAtomics(false);
269 
270   // Use custom inserter for ATOMIC_FENCE.
271   setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
272 
273   // Other atomic instructions.
274   for (MVT VT : MVT::integer_valuetypes()) {
275     // Support i8/i16 atomic swap.
276     setOperationAction(ISD::ATOMIC_SWAP, VT, Custom);
277 
278     // FIXME: Support "atmam" instructions.
279     setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Expand);
280     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Expand);
281     setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Expand);
282     setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Expand);
283 
284     // VE doesn't have follwing instructions.
285     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Expand);
286     setOperationAction(ISD::ATOMIC_LOAD_CLR, VT, Expand);
287     setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Expand);
288     setOperationAction(ISD::ATOMIC_LOAD_NAND, VT, Expand);
289     setOperationAction(ISD::ATOMIC_LOAD_MIN, VT, Expand);
290     setOperationAction(ISD::ATOMIC_LOAD_MAX, VT, Expand);
291     setOperationAction(ISD::ATOMIC_LOAD_UMIN, VT, Expand);
292     setOperationAction(ISD::ATOMIC_LOAD_UMAX, VT, Expand);
293   }
294 
295   /// } Atomic instructions
296 
297   /// SJLJ instructions {
298   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
299   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
300   setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
301   if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
302     setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
303   /// } SJLJ instructions
304 
305   // Intrinsic instructions
306   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
307 }
308 
309 void VETargetLowering::initVPUActions() {
310   for (MVT LegalMaskVT : AllMaskVTs)
311     setOperationAction(ISD::BUILD_VECTOR, LegalMaskVT, Custom);
312 
313   for (unsigned Opc : {ISD::AND, ISD::OR, ISD::XOR})
314     setOperationAction(Opc, MVT::v512i1, Custom);
315 
316   for (MVT LegalVecVT : AllVectorVTs) {
317     setOperationAction(ISD::BUILD_VECTOR, LegalVecVT, Custom);
318     setOperationAction(ISD::INSERT_VECTOR_ELT, LegalVecVT, Legal);
319     setOperationAction(ISD::EXTRACT_VECTOR_ELT, LegalVecVT, Legal);
320     // Translate all vector instructions with legal element types to VVP_*
321     // nodes.
322     // TODO We will custom-widen into VVP_* nodes in the future. While we are
323     // buildling the infrastructure for this, we only do this for legal vector
324     // VTs.
325 #define HANDLE_VP_TO_VVP(VP_OPC, VVP_NAME)                                     \
326   setOperationAction(ISD::VP_OPC, LegalVecVT, Custom);
327 #define ADD_VVP_OP(VVP_NAME, ISD_NAME)                                         \
328   setOperationAction(ISD::ISD_NAME, LegalVecVT, Custom);
329     setOperationAction(ISD::EXPERIMENTAL_VP_STRIDED_LOAD, LegalVecVT, Custom);
330     setOperationAction(ISD::EXPERIMENTAL_VP_STRIDED_STORE, LegalVecVT, Custom);
331 #include "VVPNodes.def"
332   }
333 
334   for (MVT LegalPackedVT : AllPackedVTs) {
335     setOperationAction(ISD::INSERT_VECTOR_ELT, LegalPackedVT, Custom);
336     setOperationAction(ISD::EXTRACT_VECTOR_ELT, LegalPackedVT, Custom);
337   }
338 
339   // vNt32, vNt64 ops (legal element types)
340   for (MVT VT : MVT::vector_valuetypes()) {
341     MVT ElemVT = VT.getVectorElementType();
342     unsigned ElemBits = ElemVT.getScalarSizeInBits();
343     if (ElemBits != 32 && ElemBits != 64)
344       continue;
345 
346     for (unsigned MemOpc : {ISD::MLOAD, ISD::MSTORE, ISD::LOAD, ISD::STORE})
347       setOperationAction(MemOpc, VT, Custom);
348 
349     const ISD::NodeType IntReductionOCs[] = {
350         ISD::VECREDUCE_ADD,  ISD::VECREDUCE_MUL,  ISD::VECREDUCE_AND,
351         ISD::VECREDUCE_OR,   ISD::VECREDUCE_XOR,  ISD::VECREDUCE_SMIN,
352         ISD::VECREDUCE_SMAX, ISD::VECREDUCE_UMIN, ISD::VECREDUCE_UMAX};
353 
354     for (unsigned IntRedOpc : IntReductionOCs)
355       setOperationAction(IntRedOpc, VT, Custom);
356   }
357 
358   // v256i1 and v512i1 ops
359   for (MVT MaskVT : AllMaskVTs) {
360     // Custom lower mask ops
361     setOperationAction(ISD::STORE, MaskVT, Custom);
362     setOperationAction(ISD::LOAD, MaskVT, Custom);
363   }
364 }
365 
366 SDValue
367 VETargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
368                               bool IsVarArg,
369                               const SmallVectorImpl<ISD::OutputArg> &Outs,
370                               const SmallVectorImpl<SDValue> &OutVals,
371                               const SDLoc &DL, SelectionDAG &DAG) const {
372   // CCValAssign - represent the assignment of the return value to locations.
373   SmallVector<CCValAssign, 16> RVLocs;
374 
375   // CCState - Info about the registers and stack slot.
376   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
377                  *DAG.getContext());
378 
379   // Analyze return values.
380   CCInfo.AnalyzeReturn(Outs, getReturnCC(CallConv));
381 
382   SDValue Glue;
383   SmallVector<SDValue, 4> RetOps(1, Chain);
384 
385   // Copy the result values into the output registers.
386   for (unsigned i = 0; i != RVLocs.size(); ++i) {
387     CCValAssign &VA = RVLocs[i];
388     assert(VA.isRegLoc() && "Can only return in registers!");
389     assert(!VA.needsCustom() && "Unexpected custom lowering");
390     SDValue OutVal = OutVals[i];
391 
392     // Integer return values must be sign or zero extended by the callee.
393     switch (VA.getLocInfo()) {
394     case CCValAssign::Full:
395       break;
396     case CCValAssign::SExt:
397       OutVal = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), OutVal);
398       break;
399     case CCValAssign::ZExt:
400       OutVal = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), OutVal);
401       break;
402     case CCValAssign::AExt:
403       OutVal = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), OutVal);
404       break;
405     case CCValAssign::BCvt: {
406       // Convert a float return value to i64 with padding.
407       //     63     31   0
408       //    +------+------+
409       //    | float|   0  |
410       //    +------+------+
411       assert(VA.getLocVT() == MVT::i64);
412       assert(VA.getValVT() == MVT::f32);
413       SDValue Undef = SDValue(
414           DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i64), 0);
415       SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
416       OutVal = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
417                                           MVT::i64, Undef, OutVal, Sub_f32),
418                        0);
419       break;
420     }
421     default:
422       llvm_unreachable("Unknown loc info!");
423     }
424 
425     Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), OutVal, Glue);
426 
427     // Guarantee that all emitted copies are stuck together with flags.
428     Glue = Chain.getValue(1);
429     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
430   }
431 
432   RetOps[0] = Chain; // Update chain.
433 
434   // Add the glue if we have it.
435   if (Glue.getNode())
436     RetOps.push_back(Glue);
437 
438   return DAG.getNode(VEISD::RET_GLUE, DL, MVT::Other, RetOps);
439 }
440 
441 SDValue VETargetLowering::LowerFormalArguments(
442     SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
443     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
444     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
445   MachineFunction &MF = DAG.getMachineFunction();
446 
447   // Get the base offset of the incoming arguments stack space.
448   unsigned ArgsBaseOffset = Subtarget->getRsaSize();
449   // Get the size of the preserved arguments area
450   unsigned ArgsPreserved = 64;
451 
452   // Analyze arguments according to CC_VE.
453   SmallVector<CCValAssign, 16> ArgLocs;
454   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
455                  *DAG.getContext());
456   // Allocate the preserved area first.
457   CCInfo.AllocateStack(ArgsPreserved, Align(8));
458   // We already allocated the preserved area, so the stack offset computed
459   // by CC_VE would be correct now.
460   CCInfo.AnalyzeFormalArguments(Ins, getParamCC(CallConv, false));
461 
462   for (const CCValAssign &VA : ArgLocs) {
463     assert(!VA.needsCustom() && "Unexpected custom lowering");
464     if (VA.isRegLoc()) {
465       // This argument is passed in a register.
466       // All integer register arguments are promoted by the caller to i64.
467 
468       // Create a virtual register for the promoted live-in value.
469       Register VReg =
470           MF.addLiveIn(VA.getLocReg(), getRegClassFor(VA.getLocVT()));
471       SDValue Arg = DAG.getCopyFromReg(Chain, DL, VReg, VA.getLocVT());
472 
473       // The caller promoted the argument, so insert an Assert?ext SDNode so we
474       // won't promote the value again in this function.
475       switch (VA.getLocInfo()) {
476       case CCValAssign::SExt:
477         Arg = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Arg,
478                           DAG.getValueType(VA.getValVT()));
479         break;
480       case CCValAssign::ZExt:
481         Arg = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Arg,
482                           DAG.getValueType(VA.getValVT()));
483         break;
484       case CCValAssign::BCvt: {
485         // Extract a float argument from i64 with padding.
486         //     63     31   0
487         //    +------+------+
488         //    | float|   0  |
489         //    +------+------+
490         assert(VA.getLocVT() == MVT::i64);
491         assert(VA.getValVT() == MVT::f32);
492         SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
493         Arg = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
494                                          MVT::f32, Arg, Sub_f32),
495                       0);
496         break;
497       }
498       default:
499         break;
500       }
501 
502       // Truncate the register down to the argument type.
503       if (VA.isExtInLoc())
504         Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
505 
506       InVals.push_back(Arg);
507       continue;
508     }
509 
510     // The registers are exhausted. This argument was passed on the stack.
511     assert(VA.isMemLoc());
512     // The CC_VE_Full/Half functions compute stack offsets relative to the
513     // beginning of the arguments area at %fp + the size of reserved area.
514     unsigned Offset = VA.getLocMemOffset() + ArgsBaseOffset;
515     unsigned ValSize = VA.getValVT().getSizeInBits() / 8;
516 
517     // Adjust offset for a float argument by adding 4 since the argument is
518     // stored in 8 bytes buffer with offset like below.  LLVM generates
519     // 4 bytes load instruction, so need to adjust offset here.  This
520     // adjustment is required in only LowerFormalArguments.  In LowerCall,
521     // a float argument is converted to i64 first, and stored as 8 bytes
522     // data, which is required by ABI, so no need for adjustment.
523     //    0      4
524     //    +------+------+
525     //    | empty| float|
526     //    +------+------+
527     if (VA.getValVT() == MVT::f32)
528       Offset += 4;
529 
530     int FI = MF.getFrameInfo().CreateFixedObject(ValSize, Offset, true);
531     InVals.push_back(
532         DAG.getLoad(VA.getValVT(), DL, Chain,
533                     DAG.getFrameIndex(FI, getPointerTy(MF.getDataLayout())),
534                     MachinePointerInfo::getFixedStack(MF, FI)));
535   }
536 
537   if (!IsVarArg)
538     return Chain;
539 
540   // This function takes variable arguments, some of which may have been passed
541   // in registers %s0-%s8.
542   //
543   // The va_start intrinsic needs to know the offset to the first variable
544   // argument.
545   // TODO: need to calculate offset correctly once we support f128.
546   unsigned ArgOffset = ArgLocs.size() * 8;
547   VEMachineFunctionInfo *FuncInfo = MF.getInfo<VEMachineFunctionInfo>();
548   // Skip the reserved area at the top of stack.
549   FuncInfo->setVarArgsFrameOffset(ArgOffset + ArgsBaseOffset);
550 
551   return Chain;
552 }
553 
554 // FIXME? Maybe this could be a TableGen attribute on some registers and
555 // this table could be generated automatically from RegInfo.
556 Register VETargetLowering::getRegisterByName(const char *RegName, LLT VT,
557                                              const MachineFunction &MF) const {
558   Register Reg = StringSwitch<Register>(RegName)
559                      .Case("sp", VE::SX11)    // Stack pointer
560                      .Case("fp", VE::SX9)     // Frame pointer
561                      .Case("sl", VE::SX8)     // Stack limit
562                      .Case("lr", VE::SX10)    // Link register
563                      .Case("tp", VE::SX14)    // Thread pointer
564                      .Case("outer", VE::SX12) // Outer regiser
565                      .Case("info", VE::SX17)  // Info area register
566                      .Case("got", VE::SX15)   // Global offset table register
567                      .Case("plt", VE::SX16) // Procedure linkage table register
568                      .Default(0);
569 
570   if (Reg)
571     return Reg;
572 
573   report_fatal_error("Invalid register name global variable");
574 }
575 
576 //===----------------------------------------------------------------------===//
577 // TargetLowering Implementation
578 //===----------------------------------------------------------------------===//
579 
580 SDValue VETargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
581                                     SmallVectorImpl<SDValue> &InVals) const {
582   SelectionDAG &DAG = CLI.DAG;
583   SDLoc DL = CLI.DL;
584   SDValue Chain = CLI.Chain;
585   auto PtrVT = getPointerTy(DAG.getDataLayout());
586 
587   // VE target does not yet support tail call optimization.
588   CLI.IsTailCall = false;
589 
590   // Get the base offset of the outgoing arguments stack space.
591   unsigned ArgsBaseOffset = Subtarget->getRsaSize();
592   // Get the size of the preserved arguments area
593   unsigned ArgsPreserved = 8 * 8u;
594 
595   // Analyze operands of the call, assigning locations to each operand.
596   SmallVector<CCValAssign, 16> ArgLocs;
597   CCState CCInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(), ArgLocs,
598                  *DAG.getContext());
599   // Allocate the preserved area first.
600   CCInfo.AllocateStack(ArgsPreserved, Align(8));
601   // We already allocated the preserved area, so the stack offset computed
602   // by CC_VE would be correct now.
603   CCInfo.AnalyzeCallOperands(CLI.Outs, getParamCC(CLI.CallConv, false));
604 
605   // VE requires to use both register and stack for varargs or no-prototyped
606   // functions.
607   bool UseBoth = CLI.IsVarArg;
608 
609   // Analyze operands again if it is required to store BOTH.
610   SmallVector<CCValAssign, 16> ArgLocs2;
611   CCState CCInfo2(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(),
612                   ArgLocs2, *DAG.getContext());
613   if (UseBoth)
614     CCInfo2.AnalyzeCallOperands(CLI.Outs, getParamCC(CLI.CallConv, true));
615 
616   // Get the size of the outgoing arguments stack space requirement.
617   unsigned ArgsSize = CCInfo.getStackSize();
618 
619   // Keep stack frames 16-byte aligned.
620   ArgsSize = alignTo(ArgsSize, 16);
621 
622   // Adjust the stack pointer to make room for the arguments.
623   // FIXME: Use hasReservedCallFrame to avoid %sp adjustments around all calls
624   // with more than 6 arguments.
625   Chain = DAG.getCALLSEQ_START(Chain, ArgsSize, 0, DL);
626 
627   // Collect the set of registers to pass to the function and their values.
628   // This will be emitted as a sequence of CopyToReg nodes glued to the call
629   // instruction.
630   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
631 
632   // Collect chains from all the memory operations that copy arguments to the
633   // stack. They must follow the stack pointer adjustment above and precede the
634   // call instruction itself.
635   SmallVector<SDValue, 8> MemOpChains;
636 
637   // VE needs to get address of callee function in a register
638   // So, prepare to copy it to SX12 here.
639 
640   // If the callee is a GlobalAddress node (quite common, every direct call is)
641   // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
642   // Likewise ExternalSymbol -> TargetExternalSymbol.
643   SDValue Callee = CLI.Callee;
644 
645   bool IsPICCall = isPositionIndependent();
646 
647   // PC-relative references to external symbols should go through $stub.
648   // If so, we need to prepare GlobalBaseReg first.
649   const TargetMachine &TM = DAG.getTarget();
650   const GlobalValue *GV = nullptr;
651   auto *CalleeG = dyn_cast<GlobalAddressSDNode>(Callee);
652   if (CalleeG)
653     GV = CalleeG->getGlobal();
654   bool Local = TM.shouldAssumeDSOLocal(GV);
655   bool UsePlt = !Local;
656   MachineFunction &MF = DAG.getMachineFunction();
657 
658   // Turn GlobalAddress/ExternalSymbol node into a value node
659   // containing the address of them here.
660   if (CalleeG) {
661     if (IsPICCall) {
662       if (UsePlt)
663         Subtarget->getInstrInfo()->getGlobalBaseReg(&MF);
664       Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
665       Callee = DAG.getNode(VEISD::GETFUNPLT, DL, PtrVT, Callee);
666     } else {
667       Callee =
668           makeHiLoPair(Callee, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
669     }
670   } else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee)) {
671     if (IsPICCall) {
672       if (UsePlt)
673         Subtarget->getInstrInfo()->getGlobalBaseReg(&MF);
674       Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT, 0);
675       Callee = DAG.getNode(VEISD::GETFUNPLT, DL, PtrVT, Callee);
676     } else {
677       Callee =
678           makeHiLoPair(Callee, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
679     }
680   }
681 
682   RegsToPass.push_back(std::make_pair(VE::SX12, Callee));
683 
684   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
685     CCValAssign &VA = ArgLocs[i];
686     SDValue Arg = CLI.OutVals[i];
687 
688     // Promote the value if needed.
689     switch (VA.getLocInfo()) {
690     default:
691       llvm_unreachable("Unknown location info!");
692     case CCValAssign::Full:
693       break;
694     case CCValAssign::SExt:
695       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
696       break;
697     case CCValAssign::ZExt:
698       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
699       break;
700     case CCValAssign::AExt:
701       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
702       break;
703     case CCValAssign::BCvt: {
704       // Convert a float argument to i64 with padding.
705       //     63     31   0
706       //    +------+------+
707       //    | float|   0  |
708       //    +------+------+
709       assert(VA.getLocVT() == MVT::i64);
710       assert(VA.getValVT() == MVT::f32);
711       SDValue Undef = SDValue(
712           DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i64), 0);
713       SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
714       Arg = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
715                                        MVT::i64, Undef, Arg, Sub_f32),
716                     0);
717       break;
718     }
719     }
720 
721     if (VA.isRegLoc()) {
722       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
723       if (!UseBoth)
724         continue;
725       VA = ArgLocs2[i];
726     }
727 
728     assert(VA.isMemLoc());
729 
730     // Create a store off the stack pointer for this argument.
731     SDValue StackPtr = DAG.getRegister(VE::SX11, PtrVT);
732     // The argument area starts at %fp/%sp + the size of reserved area.
733     SDValue PtrOff =
734         DAG.getIntPtrConstant(VA.getLocMemOffset() + ArgsBaseOffset, DL);
735     PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
736     MemOpChains.push_back(
737         DAG.getStore(Chain, DL, Arg, PtrOff, MachinePointerInfo()));
738   }
739 
740   // Emit all stores, make sure they occur before the call.
741   if (!MemOpChains.empty())
742     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
743 
744   // Build a sequence of CopyToReg nodes glued together with token chain and
745   // glue operands which copy the outgoing args into registers. The InGlue is
746   // necessary since all emitted instructions must be stuck together in order
747   // to pass the live physical registers.
748   SDValue InGlue;
749   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
750     Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first,
751                              RegsToPass[i].second, InGlue);
752     InGlue = Chain.getValue(1);
753   }
754 
755   // Build the operands for the call instruction itself.
756   SmallVector<SDValue, 8> Ops;
757   Ops.push_back(Chain);
758   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
759     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
760                                   RegsToPass[i].second.getValueType()));
761 
762   // Add a register mask operand representing the call-preserved registers.
763   const VERegisterInfo *TRI = Subtarget->getRegisterInfo();
764   const uint32_t *Mask =
765       TRI->getCallPreservedMask(DAG.getMachineFunction(), CLI.CallConv);
766   assert(Mask && "Missing call preserved mask for calling convention");
767   Ops.push_back(DAG.getRegisterMask(Mask));
768 
769   // Make sure the CopyToReg nodes are glued to the call instruction which
770   // consumes the registers.
771   if (InGlue.getNode())
772     Ops.push_back(InGlue);
773 
774   // Now the call itself.
775   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
776   Chain = DAG.getNode(VEISD::CALL, DL, NodeTys, Ops);
777   InGlue = Chain.getValue(1);
778 
779   // Revert the stack pointer immediately after the call.
780   Chain = DAG.getCALLSEQ_END(Chain, ArgsSize, 0, InGlue, DL);
781   InGlue = Chain.getValue(1);
782 
783   // Now extract the return values. This is more or less the same as
784   // LowerFormalArguments.
785 
786   // Assign locations to each value returned by this call.
787   SmallVector<CCValAssign, 16> RVLocs;
788   CCState RVInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(), RVLocs,
789                  *DAG.getContext());
790 
791   // Set inreg flag manually for codegen generated library calls that
792   // return float.
793   if (CLI.Ins.size() == 1 && CLI.Ins[0].VT == MVT::f32 && !CLI.CB)
794     CLI.Ins[0].Flags.setInReg();
795 
796   RVInfo.AnalyzeCallResult(CLI.Ins, getReturnCC(CLI.CallConv));
797 
798   // Copy all of the result registers out of their specified physreg.
799   for (unsigned i = 0; i != RVLocs.size(); ++i) {
800     CCValAssign &VA = RVLocs[i];
801     assert(!VA.needsCustom() && "Unexpected custom lowering");
802     Register Reg = VA.getLocReg();
803 
804     // When returning 'inreg {i32, i32 }', two consecutive i32 arguments can
805     // reside in the same register in the high and low bits. Reuse the
806     // CopyFromReg previous node to avoid duplicate copies.
807     SDValue RV;
808     if (RegisterSDNode *SrcReg = dyn_cast<RegisterSDNode>(Chain.getOperand(1)))
809       if (SrcReg->getReg() == Reg && Chain->getOpcode() == ISD::CopyFromReg)
810         RV = Chain.getValue(0);
811 
812     // But usually we'll create a new CopyFromReg for a different register.
813     if (!RV.getNode()) {
814       RV = DAG.getCopyFromReg(Chain, DL, Reg, RVLocs[i].getLocVT(), InGlue);
815       Chain = RV.getValue(1);
816       InGlue = Chain.getValue(2);
817     }
818 
819     // The callee promoted the return value, so insert an Assert?ext SDNode so
820     // we won't promote the value again in this function.
821     switch (VA.getLocInfo()) {
822     case CCValAssign::SExt:
823       RV = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), RV,
824                        DAG.getValueType(VA.getValVT()));
825       break;
826     case CCValAssign::ZExt:
827       RV = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), RV,
828                        DAG.getValueType(VA.getValVT()));
829       break;
830     case CCValAssign::BCvt: {
831       // Extract a float return value from i64 with padding.
832       //     63     31   0
833       //    +------+------+
834       //    | float|   0  |
835       //    +------+------+
836       assert(VA.getLocVT() == MVT::i64);
837       assert(VA.getValVT() == MVT::f32);
838       SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
839       RV = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
840                                       MVT::f32, RV, Sub_f32),
841                    0);
842       break;
843     }
844     default:
845       break;
846     }
847 
848     // Truncate the register down to the return value type.
849     if (VA.isExtInLoc())
850       RV = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), RV);
851 
852     InVals.push_back(RV);
853   }
854 
855   return Chain;
856 }
857 
858 bool VETargetLowering::isOffsetFoldingLegal(
859     const GlobalAddressSDNode *GA) const {
860   // VE uses 64 bit addressing, so we need multiple instructions to generate
861   // an address.  Folding address with offset increases the number of
862   // instructions, so that we disable it here.  Offsets will be folded in
863   // the DAG combine later if it worth to do so.
864   return false;
865 }
866 
867 /// isFPImmLegal - Returns true if the target can instruction select the
868 /// specified FP immediate natively. If false, the legalizer will
869 /// materialize the FP immediate as a load from a constant pool.
870 bool VETargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
871                                     bool ForCodeSize) const {
872   return VT == MVT::f32 || VT == MVT::f64;
873 }
874 
875 /// Determine if the target supports unaligned memory accesses.
876 ///
877 /// This function returns true if the target allows unaligned memory accesses
878 /// of the specified type in the given address space. If true, it also returns
879 /// whether the unaligned memory access is "fast" in the last argument by
880 /// reference. This is used, for example, in situations where an array
881 /// copy/move/set is converted to a sequence of store operations. Its use
882 /// helps to ensure that such replacements don't generate code that causes an
883 /// alignment error (trap) on the target machine.
884 bool VETargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
885                                                       unsigned AddrSpace,
886                                                       Align A,
887                                                       MachineMemOperand::Flags,
888                                                       unsigned *Fast) const {
889   if (Fast) {
890     // It's fast anytime on VE
891     *Fast = 1;
892   }
893   return true;
894 }
895 
896 VETargetLowering::VETargetLowering(const TargetMachine &TM,
897                                    const VESubtarget &STI)
898     : TargetLowering(TM), Subtarget(&STI) {
899   // Instructions which use registers as conditionals examine all the
900   // bits (as does the pseudo SELECT_CC expansion). I don't think it
901   // matters much whether it's ZeroOrOneBooleanContent, or
902   // ZeroOrNegativeOneBooleanContent, so, arbitrarily choose the
903   // former.
904   setBooleanContents(ZeroOrOneBooleanContent);
905   setBooleanVectorContents(ZeroOrOneBooleanContent);
906 
907   initRegisterClasses();
908   initSPUActions();
909   initVPUActions();
910 
911   setStackPointerRegisterToSaveRestore(VE::SX11);
912 
913   // We have target-specific dag combine patterns for the following nodes:
914   setTargetDAGCombine(ISD::TRUNCATE);
915   setTargetDAGCombine(ISD::SELECT);
916   setTargetDAGCombine(ISD::SELECT_CC);
917 
918   // Set function alignment to 16 bytes
919   setMinFunctionAlignment(Align(16));
920 
921   // VE stores all argument by 8 bytes alignment
922   setMinStackArgumentAlignment(Align(8));
923 
924   computeRegisterProperties(Subtarget->getRegisterInfo());
925 }
926 
/// Return the printable name ("VEISD::<NAME>") of a VE-specific SelectionDAG
/// opcode, or nullptr when \p Opcode is VEISD::FIRST_NUMBER or is not one of
/// the cases listed below.
const char *VETargetLowering::getTargetNodeName(unsigned Opcode) const {
// Expands to a case label that returns the stringified node name.
#define TARGET_NODE_CASE(NAME)                                                 \
  case VEISD::NAME:                                                            \
    return "VEISD::" #NAME;
  switch ((VEISD::NodeType)Opcode) {
  case VEISD::FIRST_NUMBER:
    // No name for the marker value; fall out to the nullptr return.
    break;
    TARGET_NODE_CASE(CMPI)
    TARGET_NODE_CASE(CMPU)
    TARGET_NODE_CASE(CMPF)
    TARGET_NODE_CASE(CMPQ)
    TARGET_NODE_CASE(CMOV)
    TARGET_NODE_CASE(CALL)
    TARGET_NODE_CASE(EH_SJLJ_LONGJMP)
    TARGET_NODE_CASE(EH_SJLJ_SETJMP)
    TARGET_NODE_CASE(EH_SJLJ_SETUP_DISPATCH)
    TARGET_NODE_CASE(GETFUNPLT)
    TARGET_NODE_CASE(GETSTACKTOP)
    TARGET_NODE_CASE(GETTLSADDR)
    TARGET_NODE_CASE(GLOBAL_BASE_REG)
    TARGET_NODE_CASE(Hi)
    TARGET_NODE_CASE(Lo)
    TARGET_NODE_CASE(RET_GLUE)
    TARGET_NODE_CASE(TS1AM)
    TARGET_NODE_CASE(VEC_UNPACK_LO)
    TARGET_NODE_CASE(VEC_UNPACK_HI)
    TARGET_NODE_CASE(VEC_PACK)
    TARGET_NODE_CASE(VEC_BROADCAST)
    TARGET_NODE_CASE(REPL_I32)
    TARGET_NODE_CASE(REPL_F32)

    TARGET_NODE_CASE(LEGALAVL)

    // Register the VVP_* SDNodes: each ADD_VVP_OP entry in VVPNodes.def
    // becomes one more case label via TARGET_NODE_CASE.
#define ADD_VVP_OP(VVP_NAME, ...) TARGET_NODE_CASE(VVP_NAME)
#include "VVPNodes.def"
  }
#undef TARGET_NODE_CASE
  return nullptr;
}
967 
968 EVT VETargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
969                                          EVT VT) const {
970   return MVT::i32;
971 }
972 
973 // Convert to a target node and set target flags.
974 SDValue VETargetLowering::withTargetFlags(SDValue Op, unsigned TF,
975                                           SelectionDAG &DAG) const {
976   if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op))
977     return DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(GA),
978                                       GA->getValueType(0), GA->getOffset(), TF);
979 
980   if (const BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(Op))
981     return DAG.getTargetBlockAddress(BA->getBlockAddress(), Op.getValueType(),
982                                      0, TF);
983 
984   if (const ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op))
985     return DAG.getTargetConstantPool(CP->getConstVal(), CP->getValueType(0),
986                                      CP->getAlign(), CP->getOffset(), TF);
987 
988   if (const ExternalSymbolSDNode *ES = dyn_cast<ExternalSymbolSDNode>(Op))
989     return DAG.getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0),
990                                        TF);
991 
992   if (const JumpTableSDNode *JT = dyn_cast<JumpTableSDNode>(Op))
993     return DAG.getTargetJumpTable(JT->getIndex(), JT->getValueType(0), TF);
994 
995   llvm_unreachable("Unhandled address SDNode");
996 }
997 
998 // Split Op into high and low parts according to HiTF and LoTF.
999 // Return an ADD node combining the parts.
1000 SDValue VETargetLowering::makeHiLoPair(SDValue Op, unsigned HiTF, unsigned LoTF,
1001                                        SelectionDAG &DAG) const {
1002   SDLoc DL(Op);
1003   EVT VT = Op.getValueType();
1004   SDValue Hi = DAG.getNode(VEISD::Hi, DL, VT, withTargetFlags(Op, HiTF, DAG));
1005   SDValue Lo = DAG.getNode(VEISD::Lo, DL, VT, withTargetFlags(Op, LoTF, DAG));
1006   return DAG.getNode(ISD::ADD, DL, VT, Hi, Lo);
1007 }
1008 
1009 // Build SDNodes for producing an address from a GlobalAddress, ConstantPool,
1010 // or ExternalSymbol SDNode.
1011 SDValue VETargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
1012   SDLoc DL(Op);
1013   EVT PtrVT = Op.getValueType();
1014 
1015   // Handle PIC mode first. VE needs a got load for every variable!
1016   if (isPositionIndependent()) {
1017     auto GlobalN = dyn_cast<GlobalAddressSDNode>(Op);
1018 
1019     if (isa<ConstantPoolSDNode>(Op) || isa<JumpTableSDNode>(Op) ||
1020         (GlobalN && GlobalN->getGlobal()->hasLocalLinkage())) {
1021       // Create following instructions for local linkage PIC code.
1022       //     lea %reg, label@gotoff_lo
1023       //     and %reg, %reg, (32)0
1024       //     lea.sl %reg, label@gotoff_hi(%reg, %got)
1025       SDValue HiLo = makeHiLoPair(Op, VEMCExpr::VK_VE_GOTOFF_HI32,
1026                                   VEMCExpr::VK_VE_GOTOFF_LO32, DAG);
1027       SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrVT);
1028       return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalBase, HiLo);
1029     }
1030     // Create following instructions for not local linkage PIC code.
1031     //     lea %reg, label@got_lo
1032     //     and %reg, %reg, (32)0
1033     //     lea.sl %reg, label@got_hi(%reg)
1034     //     ld %reg, (%reg, %got)
1035     SDValue HiLo = makeHiLoPair(Op, VEMCExpr::VK_VE_GOT_HI32,
1036                                 VEMCExpr::VK_VE_GOT_LO32, DAG);
1037     SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrVT);
1038     SDValue AbsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, GlobalBase, HiLo);
1039     return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), AbsAddr,
1040                        MachinePointerInfo::getGOT(DAG.getMachineFunction()));
1041   }
1042 
1043   // This is one of the absolute code models.
1044   switch (getTargetMachine().getCodeModel()) {
1045   default:
1046     llvm_unreachable("Unsupported absolute code model");
1047   case CodeModel::Small:
1048   case CodeModel::Medium:
1049   case CodeModel::Large:
1050     // abs64.
1051     return makeHiLoPair(Op, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
1052   }
1053 }
1054 
1055 /// Custom Lower {
1056 
1057 // The mappings for emitLeading/TrailingFence for VE is designed by following
1058 // http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
1059 Instruction *VETargetLowering::emitLeadingFence(IRBuilderBase &Builder,
1060                                                 Instruction *Inst,
1061                                                 AtomicOrdering Ord) const {
1062   switch (Ord) {
1063   case AtomicOrdering::NotAtomic:
1064   case AtomicOrdering::Unordered:
1065     llvm_unreachable("Invalid fence: unordered/non-atomic");
1066   case AtomicOrdering::Monotonic:
1067   case AtomicOrdering::Acquire:
1068     return nullptr; // Nothing to do
1069   case AtomicOrdering::Release:
1070   case AtomicOrdering::AcquireRelease:
1071     return Builder.CreateFence(AtomicOrdering::Release);
1072   case AtomicOrdering::SequentiallyConsistent:
1073     if (!Inst->hasAtomicStore())
1074       return nullptr; // Nothing to do
1075     return Builder.CreateFence(AtomicOrdering::SequentiallyConsistent);
1076   }
1077   llvm_unreachable("Unknown fence ordering in emitLeadingFence");
1078 }
1079 
1080 Instruction *VETargetLowering::emitTrailingFence(IRBuilderBase &Builder,
1081                                                  Instruction *Inst,
1082                                                  AtomicOrdering Ord) const {
1083   switch (Ord) {
1084   case AtomicOrdering::NotAtomic:
1085   case AtomicOrdering::Unordered:
1086     llvm_unreachable("Invalid fence: unordered/not-atomic");
1087   case AtomicOrdering::Monotonic:
1088   case AtomicOrdering::Release:
1089     return nullptr; // Nothing to do
1090   case AtomicOrdering::Acquire:
1091   case AtomicOrdering::AcquireRelease:
1092     return Builder.CreateFence(AtomicOrdering::Acquire);
1093   case AtomicOrdering::SequentiallyConsistent:
1094     return Builder.CreateFence(AtomicOrdering::SequentiallyConsistent);
1095   }
1096   llvm_unreachable("Unknown fence ordering in emitTrailingFence");
1097 }
1098 
1099 SDValue VETargetLowering::lowerATOMIC_FENCE(SDValue Op,
1100                                             SelectionDAG &DAG) const {
1101   SDLoc DL(Op);
1102   AtomicOrdering FenceOrdering =
1103       static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
1104   SyncScope::ID FenceSSID =
1105       static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
1106 
1107   // VE uses Release consistency, so need a fence instruction if it is a
1108   // cross-thread fence.
1109   if (FenceSSID == SyncScope::System) {
1110     switch (FenceOrdering) {
1111     case AtomicOrdering::NotAtomic:
1112     case AtomicOrdering::Unordered:
1113     case AtomicOrdering::Monotonic:
1114       // No need to generate fencem instruction here.
1115       break;
1116     case AtomicOrdering::Acquire:
1117       // Generate "fencem 2" as acquire fence.
1118       return SDValue(DAG.getMachineNode(VE::FENCEM, DL, MVT::Other,
1119                                         DAG.getTargetConstant(2, DL, MVT::i32),
1120                                         Op.getOperand(0)),
1121                      0);
1122     case AtomicOrdering::Release:
1123       // Generate "fencem 1" as release fence.
1124       return SDValue(DAG.getMachineNode(VE::FENCEM, DL, MVT::Other,
1125                                         DAG.getTargetConstant(1, DL, MVT::i32),
1126                                         Op.getOperand(0)),
1127                      0);
1128     case AtomicOrdering::AcquireRelease:
1129     case AtomicOrdering::SequentiallyConsistent:
1130       // Generate "fencem 3" as acq_rel and seq_cst fence.
1131       // FIXME: "fencem 3" doesn't wait for PCIe deveices accesses,
1132       //        so  seq_cst may require more instruction for them.
1133       return SDValue(DAG.getMachineNode(VE::FENCEM, DL, MVT::Other,
1134                                         DAG.getTargetConstant(3, DL, MVT::i32),
1135                                         Op.getOperand(0)),
1136                      0);
1137     }
1138   }
1139 
1140   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
1141   return DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0));
1142 }
1143 
1144 TargetLowering::AtomicExpansionKind
1145 VETargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
1146   // We have TS1AM implementation for i8/i16/i32/i64, so use it.
1147   if (AI->getOperation() == AtomicRMWInst::Xchg) {
1148     return AtomicExpansionKind::None;
1149   }
1150   // FIXME: Support "ATMAM" instruction for LOAD_ADD/SUB/AND/OR.
1151 
1152   // Otherwise, expand it using compare and exchange instruction to not call
1153   // __sync_fetch_and_* functions.
1154   return AtomicExpansionKind::CmpXChg;
1155 }
1156 
1157 static SDValue prepareTS1AM(SDValue Op, SelectionDAG &DAG, SDValue &Flag,
1158                             SDValue &Bits) {
1159   SDLoc DL(Op);
1160   AtomicSDNode *N = cast<AtomicSDNode>(Op);
1161   SDValue Ptr = N->getOperand(1);
1162   SDValue Val = N->getOperand(2);
1163   EVT PtrVT = Ptr.getValueType();
1164   bool Byte = N->getMemoryVT() == MVT::i8;
1165   //   Remainder = AND Ptr, 3
1166   //   Flag = 1 << Remainder  ; If Byte is true (1 byte swap flag)
1167   //   Flag = 3 << Remainder  ; If Byte is false (2 bytes swap flag)
1168   //   Bits = Remainder << 3
1169   //   NewVal = Val << Bits
1170   SDValue Const3 = DAG.getConstant(3, DL, PtrVT);
1171   SDValue Remainder = DAG.getNode(ISD::AND, DL, PtrVT, {Ptr, Const3});
1172   SDValue Mask = Byte ? DAG.getConstant(1, DL, MVT::i32)
1173                       : DAG.getConstant(3, DL, MVT::i32);
1174   Flag = DAG.getNode(ISD::SHL, DL, MVT::i32, {Mask, Remainder});
1175   Bits = DAG.getNode(ISD::SHL, DL, PtrVT, {Remainder, Const3});
1176   return DAG.getNode(ISD::SHL, DL, Val.getValueType(), {Val, Bits});
1177 }
1178 
1179 static SDValue finalizeTS1AM(SDValue Op, SelectionDAG &DAG, SDValue Data,
1180                              SDValue Bits) {
1181   SDLoc DL(Op);
1182   EVT VT = Data.getValueType();
1183   bool Byte = cast<AtomicSDNode>(Op)->getMemoryVT() == MVT::i8;
1184   //   NewData = Data >> Bits
1185   //   Result = NewData & 0xff   ; If Byte is true (1 byte)
1186   //   Result = NewData & 0xffff ; If Byte is false (2 bytes)
1187 
1188   SDValue NewData = DAG.getNode(ISD::SRL, DL, VT, Data, Bits);
1189   return DAG.getNode(ISD::AND, DL, VT,
1190                      {NewData, DAG.getConstant(Byte ? 0xff : 0xffff, DL, VT)});
1191 }
1192 
1193 SDValue VETargetLowering::lowerATOMIC_SWAP(SDValue Op,
1194                                            SelectionDAG &DAG) const {
1195   SDLoc DL(Op);
1196   AtomicSDNode *N = cast<AtomicSDNode>(Op);
1197 
1198   if (N->getMemoryVT() == MVT::i8) {
1199     // For i8, use "ts1am"
1200     //   Input:
1201     //     ATOMIC_SWAP Ptr, Val, Order
1202     //
1203     //   Output:
1204     //     Remainder = AND Ptr, 3
1205     //     Flag = 1 << Remainder   ; 1 byte swap flag for TS1AM inst.
1206     //     Bits = Remainder << 3
1207     //     NewVal = Val << Bits
1208     //
1209     //     Aligned = AND Ptr, -4
1210     //     Data = TS1AM Aligned, Flag, NewVal
1211     //
1212     //     NewData = Data >> Bits
1213     //     Result = NewData & 0xff ; 1 byte result
1214     SDValue Flag;
1215     SDValue Bits;
1216     SDValue NewVal = prepareTS1AM(Op, DAG, Flag, Bits);
1217 
1218     SDValue Ptr = N->getOperand(1);
1219     SDValue Aligned = DAG.getNode(ISD::AND, DL, Ptr.getValueType(),
1220                                   {Ptr, DAG.getConstant(-4, DL, MVT::i64)});
1221     SDValue TS1AM = DAG.getAtomic(VEISD::TS1AM, DL, N->getMemoryVT(),
1222                                   DAG.getVTList(Op.getNode()->getValueType(0),
1223                                                 Op.getNode()->getValueType(1)),
1224                                   {N->getChain(), Aligned, Flag, NewVal},
1225                                   N->getMemOperand());
1226 
1227     SDValue Result = finalizeTS1AM(Op, DAG, TS1AM, Bits);
1228     SDValue Chain = TS1AM.getValue(1);
1229     return DAG.getMergeValues({Result, Chain}, DL);
1230   }
1231   if (N->getMemoryVT() == MVT::i16) {
1232     // For i16, use "ts1am"
1233     SDValue Flag;
1234     SDValue Bits;
1235     SDValue NewVal = prepareTS1AM(Op, DAG, Flag, Bits);
1236 
1237     SDValue Ptr = N->getOperand(1);
1238     SDValue Aligned = DAG.getNode(ISD::AND, DL, Ptr.getValueType(),
1239                                   {Ptr, DAG.getConstant(-4, DL, MVT::i64)});
1240     SDValue TS1AM = DAG.getAtomic(VEISD::TS1AM, DL, N->getMemoryVT(),
1241                                   DAG.getVTList(Op.getNode()->getValueType(0),
1242                                                 Op.getNode()->getValueType(1)),
1243                                   {N->getChain(), Aligned, Flag, NewVal},
1244                                   N->getMemOperand());
1245 
1246     SDValue Result = finalizeTS1AM(Op, DAG, TS1AM, Bits);
1247     SDValue Chain = TS1AM.getValue(1);
1248     return DAG.getMergeValues({Result, Chain}, DL);
1249   }
1250   // Otherwise, let llvm legalize it.
1251   return Op;
1252 }
1253 
1254 SDValue VETargetLowering::lowerGlobalAddress(SDValue Op,
1255                                              SelectionDAG &DAG) const {
1256   return makeAddress(Op, DAG);
1257 }
1258 
1259 SDValue VETargetLowering::lowerBlockAddress(SDValue Op,
1260                                             SelectionDAG &DAG) const {
1261   return makeAddress(Op, DAG);
1262 }
1263 
1264 SDValue VETargetLowering::lowerConstantPool(SDValue Op,
1265                                             SelectionDAG &DAG) const {
1266   return makeAddress(Op, DAG);
1267 }
1268 
1269 SDValue
1270 VETargetLowering::lowerToTLSGeneralDynamicModel(SDValue Op,
1271                                                 SelectionDAG &DAG) const {
1272   SDLoc DL(Op);
1273 
1274   // Generate the following code:
1275   //   t1: ch,glue = callseq_start t0, 0, 0
1276   //   t2: i64,ch,glue = VEISD::GETTLSADDR t1, label, t1:1
1277   //   t3: ch,glue = callseq_end t2, 0, 0, t2:2
1278   //   t4: i64,ch,glue = CopyFromReg t3, Register:i64 $sx0, t3:1
1279   SDValue Label = withTargetFlags(Op, 0, DAG);
1280   EVT PtrVT = Op.getValueType();
1281 
1282   // Lowering the machine isd will make sure everything is in the right
1283   // location.
1284   SDValue Chain = DAG.getEntryNode();
1285   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
1286   const uint32_t *Mask = Subtarget->getRegisterInfo()->getCallPreservedMask(
1287       DAG.getMachineFunction(), CallingConv::C);
1288   Chain = DAG.getCALLSEQ_START(Chain, 64, 0, DL);
1289   SDValue Args[] = {Chain, Label, DAG.getRegisterMask(Mask), Chain.getValue(1)};
1290   Chain = DAG.getNode(VEISD::GETTLSADDR, DL, NodeTys, Args);
1291   Chain = DAG.getCALLSEQ_END(Chain, 64, 0, Chain.getValue(1), DL);
1292   Chain = DAG.getCopyFromReg(Chain, DL, VE::SX0, PtrVT, Chain.getValue(1));
1293 
1294   // GETTLSADDR will be codegen'ed as call. Inform MFI that function has calls.
1295   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
1296   MFI.setHasCalls(true);
1297 
1298   // Also generate code to prepare a GOT register if it is PIC.
1299   if (isPositionIndependent()) {
1300     MachineFunction &MF = DAG.getMachineFunction();
1301     Subtarget->getInstrInfo()->getGlobalBaseReg(&MF);
1302   }
1303 
1304   return Chain;
1305 }
1306 
1307 SDValue VETargetLowering::lowerGlobalTLSAddress(SDValue Op,
1308                                                 SelectionDAG &DAG) const {
1309   // The current implementation of nld (2.26) doesn't allow local exec model
1310   // code described in VE-tls_v1.1.pdf (*1) as its input. Instead, we always
1311   // generate the general dynamic model code sequence.
1312   //
1313   // *1: https://www.nec.com/en/global/prod/hpc/aurora/document/VE-tls_v1.1.pdf
1314   return lowerToTLSGeneralDynamicModel(Op, DAG);
1315 }
1316 
1317 SDValue VETargetLowering::lowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
1318   return makeAddress(Op, DAG);
1319 }
1320 
1321 // Lower a f128 load into two f64 loads.
1322 static SDValue lowerLoadF128(SDValue Op, SelectionDAG &DAG) {
1323   SDLoc DL(Op);
1324   LoadSDNode *LdNode = dyn_cast<LoadSDNode>(Op.getNode());
1325   assert(LdNode && LdNode->getOffset().isUndef() && "Unexpected node type");
1326   Align Alignment = LdNode->getAlign();
1327   if (Alignment > 8)
1328     Alignment = Align(8);
1329 
1330   SDValue Lo64 =
1331       DAG.getLoad(MVT::f64, DL, LdNode->getChain(), LdNode->getBasePtr(),
1332                   LdNode->getPointerInfo(), Alignment,
1333                   LdNode->isVolatile() ? MachineMemOperand::MOVolatile
1334                                        : MachineMemOperand::MONone);
1335   EVT AddrVT = LdNode->getBasePtr().getValueType();
1336   SDValue HiPtr = DAG.getNode(ISD::ADD, DL, AddrVT, LdNode->getBasePtr(),
1337                               DAG.getConstant(8, DL, AddrVT));
1338   SDValue Hi64 =
1339       DAG.getLoad(MVT::f64, DL, LdNode->getChain(), HiPtr,
1340                   LdNode->getPointerInfo(), Alignment,
1341                   LdNode->isVolatile() ? MachineMemOperand::MOVolatile
1342                                        : MachineMemOperand::MONone);
1343 
1344   SDValue SubRegEven = DAG.getTargetConstant(VE::sub_even, DL, MVT::i32);
1345   SDValue SubRegOdd = DAG.getTargetConstant(VE::sub_odd, DL, MVT::i32);
1346 
1347   // VE stores Hi64 to 8(addr) and Lo64 to 0(addr)
1348   SDNode *InFP128 =
1349       DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::f128);
1350   InFP128 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f128,
1351                                SDValue(InFP128, 0), Hi64, SubRegEven);
1352   InFP128 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f128,
1353                                SDValue(InFP128, 0), Lo64, SubRegOdd);
1354   SDValue OutChains[2] = {SDValue(Lo64.getNode(), 1),
1355                           SDValue(Hi64.getNode(), 1)};
1356   SDValue OutChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
1357   SDValue Ops[2] = {SDValue(InFP128, 0), OutChain};
1358   return DAG.getMergeValues(Ops, DL);
1359 }
1360 
1361 // Lower a vXi1 load into following instructions
1362 //   LDrii %1, (,%addr)
1363 //   LVMxir  %vm, 0, %1
1364 //   LDrii %2, 8(,%addr)
1365 //   LVMxir  %vm, 0, %2
1366 //   ...
1367 static SDValue lowerLoadI1(SDValue Op, SelectionDAG &DAG) {
1368   SDLoc DL(Op);
1369   LoadSDNode *LdNode = dyn_cast<LoadSDNode>(Op.getNode());
1370   assert(LdNode && LdNode->getOffset().isUndef() && "Unexpected node type");
1371 
1372   SDValue BasePtr = LdNode->getBasePtr();
1373   Align Alignment = LdNode->getAlign();
1374   if (Alignment > 8)
1375     Alignment = Align(8);
1376 
1377   EVT AddrVT = BasePtr.getValueType();
1378   EVT MemVT = LdNode->getMemoryVT();
1379   if (MemVT == MVT::v256i1 || MemVT == MVT::v4i64) {
1380     SDValue OutChains[4];
1381     SDNode *VM = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MemVT);
1382     for (int i = 0; i < 4; ++i) {
1383       // Generate load dag and prepare chains.
1384       SDValue Addr = DAG.getNode(ISD::ADD, DL, AddrVT, BasePtr,
1385                                  DAG.getConstant(8 * i, DL, AddrVT));
1386       SDValue Val =
1387           DAG.getLoad(MVT::i64, DL, LdNode->getChain(), Addr,
1388                       LdNode->getPointerInfo(), Alignment,
1389                       LdNode->isVolatile() ? MachineMemOperand::MOVolatile
1390                                            : MachineMemOperand::MONone);
1391       OutChains[i] = SDValue(Val.getNode(), 1);
1392 
1393       VM = DAG.getMachineNode(VE::LVMir_m, DL, MVT::i64,
1394                               DAG.getTargetConstant(i, DL, MVT::i64), Val,
1395                               SDValue(VM, 0));
1396     }
1397     SDValue OutChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
1398     SDValue Ops[2] = {SDValue(VM, 0), OutChain};
1399     return DAG.getMergeValues(Ops, DL);
1400   } else if (MemVT == MVT::v512i1 || MemVT == MVT::v8i64) {
1401     SDValue OutChains[8];
1402     SDNode *VM = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MemVT);
1403     for (int i = 0; i < 8; ++i) {
1404       // Generate load dag and prepare chains.
1405       SDValue Addr = DAG.getNode(ISD::ADD, DL, AddrVT, BasePtr,
1406                                  DAG.getConstant(8 * i, DL, AddrVT));
1407       SDValue Val =
1408           DAG.getLoad(MVT::i64, DL, LdNode->getChain(), Addr,
1409                       LdNode->getPointerInfo(), Alignment,
1410                       LdNode->isVolatile() ? MachineMemOperand::MOVolatile
1411                                            : MachineMemOperand::MONone);
1412       OutChains[i] = SDValue(Val.getNode(), 1);
1413 
1414       VM = DAG.getMachineNode(VE::LVMyir_y, DL, MVT::i64,
1415                               DAG.getTargetConstant(i, DL, MVT::i64), Val,
1416                               SDValue(VM, 0));
1417     }
1418     SDValue OutChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
1419     SDValue Ops[2] = {SDValue(VM, 0), OutChain};
1420     return DAG.getMergeValues(Ops, DL);
1421   } else {
1422     // Otherwise, ask llvm to expand it.
1423     return SDValue();
1424   }
1425 }
1426 
1427 SDValue VETargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const {
1428   LoadSDNode *LdNode = cast<LoadSDNode>(Op.getNode());
1429   EVT MemVT = LdNode->getMemoryVT();
1430 
1431   // If VPU is enabled, always expand non-mask vector loads to VVP
1432   if (Subtarget->enableVPU() && MemVT.isVector() && !isMaskType(MemVT))
1433     return lowerToVVP(Op, DAG);
1434 
1435   SDValue BasePtr = LdNode->getBasePtr();
1436   if (isa<FrameIndexSDNode>(BasePtr.getNode())) {
1437     // Do not expand store instruction with frame index here because of
1438     // dependency problems.  We expand it later in eliminateFrameIndex().
1439     return Op;
1440   }
1441 
1442   if (MemVT == MVT::f128)
1443     return lowerLoadF128(Op, DAG);
1444   if (isMaskType(MemVT))
1445     return lowerLoadI1(Op, DAG);
1446 
1447   return Op;
1448 }
1449 
1450 // Lower a f128 store into two f64 stores.
1451 static SDValue lowerStoreF128(SDValue Op, SelectionDAG &DAG) {
1452   SDLoc DL(Op);
1453   StoreSDNode *StNode = dyn_cast<StoreSDNode>(Op.getNode());
1454   assert(StNode && StNode->getOffset().isUndef() && "Unexpected node type");
1455 
1456   SDValue SubRegEven = DAG.getTargetConstant(VE::sub_even, DL, MVT::i32);
1457   SDValue SubRegOdd = DAG.getTargetConstant(VE::sub_odd, DL, MVT::i32);
1458 
1459   SDNode *Hi64 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::i64,
1460                                     StNode->getValue(), SubRegEven);
1461   SDNode *Lo64 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::i64,
1462                                     StNode->getValue(), SubRegOdd);
1463 
1464   Align Alignment = StNode->getAlign();
1465   if (Alignment > 8)
1466     Alignment = Align(8);
1467 
1468   // VE stores Hi64 to 8(addr) and Lo64 to 0(addr)
1469   SDValue OutChains[2];
1470   OutChains[0] =
1471       DAG.getStore(StNode->getChain(), DL, SDValue(Lo64, 0),
1472                    StNode->getBasePtr(), MachinePointerInfo(), Alignment,
1473                    StNode->isVolatile() ? MachineMemOperand::MOVolatile
1474                                         : MachineMemOperand::MONone);
1475   EVT AddrVT = StNode->getBasePtr().getValueType();
1476   SDValue HiPtr = DAG.getNode(ISD::ADD, DL, AddrVT, StNode->getBasePtr(),
1477                               DAG.getConstant(8, DL, AddrVT));
1478   OutChains[1] =
1479       DAG.getStore(StNode->getChain(), DL, SDValue(Hi64, 0), HiPtr,
1480                    MachinePointerInfo(), Alignment,
1481                    StNode->isVolatile() ? MachineMemOperand::MOVolatile
1482                                         : MachineMemOperand::MONone);
1483   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
1484 }
1485 
1486 // Lower a vXi1 store into following instructions
1487 //   SVMi  %1, %vm, 0
1488 //   STrii %1, (,%addr)
1489 //   SVMi  %2, %vm, 1
1490 //   STrii %2, 8(,%addr)
1491 //   ...
1492 static SDValue lowerStoreI1(SDValue Op, SelectionDAG &DAG) {
1493   SDLoc DL(Op);
1494   StoreSDNode *StNode = dyn_cast<StoreSDNode>(Op.getNode());
1495   assert(StNode && StNode->getOffset().isUndef() && "Unexpected node type");
1496 
1497   SDValue BasePtr = StNode->getBasePtr();
1498   Align Alignment = StNode->getAlign();
1499   if (Alignment > 8)
1500     Alignment = Align(8);
1501   EVT AddrVT = BasePtr.getValueType();
1502   EVT MemVT = StNode->getMemoryVT();
1503   if (MemVT == MVT::v256i1 || MemVT == MVT::v4i64) {
1504     SDValue OutChains[4];
1505     for (int i = 0; i < 4; ++i) {
1506       SDNode *V =
1507           DAG.getMachineNode(VE::SVMmi, DL, MVT::i64, StNode->getValue(),
1508                              DAG.getTargetConstant(i, DL, MVT::i64));
1509       SDValue Addr = DAG.getNode(ISD::ADD, DL, AddrVT, BasePtr,
1510                                  DAG.getConstant(8 * i, DL, AddrVT));
1511       OutChains[i] =
1512           DAG.getStore(StNode->getChain(), DL, SDValue(V, 0), Addr,
1513                        MachinePointerInfo(), Alignment,
1514                        StNode->isVolatile() ? MachineMemOperand::MOVolatile
1515                                             : MachineMemOperand::MONone);
1516     }
1517     return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
1518   } else if (MemVT == MVT::v512i1 || MemVT == MVT::v8i64) {
1519     SDValue OutChains[8];
1520     for (int i = 0; i < 8; ++i) {
1521       SDNode *V =
1522           DAG.getMachineNode(VE::SVMyi, DL, MVT::i64, StNode->getValue(),
1523                              DAG.getTargetConstant(i, DL, MVT::i64));
1524       SDValue Addr = DAG.getNode(ISD::ADD, DL, AddrVT, BasePtr,
1525                                  DAG.getConstant(8 * i, DL, AddrVT));
1526       OutChains[i] =
1527           DAG.getStore(StNode->getChain(), DL, SDValue(V, 0), Addr,
1528                        MachinePointerInfo(), Alignment,
1529                        StNode->isVolatile() ? MachineMemOperand::MOVolatile
1530                                             : MachineMemOperand::MONone);
1531     }
1532     return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
1533   } else {
1534     // Otherwise, ask llvm to expand it.
1535     return SDValue();
1536   }
1537 }
1538 
1539 SDValue VETargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1540   StoreSDNode *StNode = cast<StoreSDNode>(Op.getNode());
1541   assert(StNode && StNode->getOffset().isUndef() && "Unexpected node type");
1542   EVT MemVT = StNode->getMemoryVT();
1543 
1544   // If VPU is enabled, always expand non-mask vector stores to VVP
1545   if (Subtarget->enableVPU() && MemVT.isVector() && !isMaskType(MemVT))
1546     return lowerToVVP(Op, DAG);
1547 
1548   SDValue BasePtr = StNode->getBasePtr();
1549   if (isa<FrameIndexSDNode>(BasePtr.getNode())) {
1550     // Do not expand store instruction with frame index here because of
1551     // dependency problems.  We expand it later in eliminateFrameIndex().
1552     return Op;
1553   }
1554 
1555   if (MemVT == MVT::f128)
1556     return lowerStoreF128(Op, DAG);
1557   if (isMaskType(MemVT))
1558     return lowerStoreI1(Op, DAG);
1559 
1560   // Otherwise, ask llvm to expand it.
1561   return SDValue();
1562 }
1563 
1564 SDValue VETargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
1565   MachineFunction &MF = DAG.getMachineFunction();
1566   VEMachineFunctionInfo *FuncInfo = MF.getInfo<VEMachineFunctionInfo>();
1567   auto PtrVT = getPointerTy(DAG.getDataLayout());
1568 
1569   // Need frame address to find the address of VarArgsFrameIndex.
1570   MF.getFrameInfo().setFrameAddressIsTaken(true);
1571 
1572   // vastart just stores the address of the VarArgsFrameIndex slot into the
1573   // memory location argument.
1574   SDLoc DL(Op);
1575   SDValue Offset =
1576       DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getRegister(VE::SX9, PtrVT),
1577                   DAG.getIntPtrConstant(FuncInfo->getVarArgsFrameOffset(), DL));
1578   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
1579   return DAG.getStore(Op.getOperand(0), DL, Offset, Op.getOperand(1),
1580                       MachinePointerInfo(SV));
1581 }
1582 
1583 SDValue VETargetLowering::lowerVAARG(SDValue Op, SelectionDAG &DAG) const {
1584   SDNode *Node = Op.getNode();
1585   EVT VT = Node->getValueType(0);
1586   SDValue InChain = Node->getOperand(0);
1587   SDValue VAListPtr = Node->getOperand(1);
1588   EVT PtrVT = VAListPtr.getValueType();
1589   const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
1590   SDLoc DL(Node);
1591   SDValue VAList =
1592       DAG.getLoad(PtrVT, DL, InChain, VAListPtr, MachinePointerInfo(SV));
1593   SDValue Chain = VAList.getValue(1);
1594   SDValue NextPtr;
1595 
1596   if (VT == MVT::f128) {
1597     // VE f128 values must be stored with 16 bytes alignment.  We don't
1598     // know the actual alignment of VAList, so we take alignment of it
1599     // dynamically.
1600     int Align = 16;
1601     VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
1602                          DAG.getConstant(Align - 1, DL, PtrVT));
1603     VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
1604                          DAG.getConstant(-Align, DL, PtrVT));
1605     // Increment the pointer, VAList, by 16 to the next vaarg.
1606     NextPtr =
1607         DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getIntPtrConstant(16, DL));
1608   } else if (VT == MVT::f32) {
1609     // float --> need special handling like below.
1610     //    0      4
1611     //    +------+------+
1612     //    | empty| float|
1613     //    +------+------+
1614     // Increment the pointer, VAList, by 8 to the next vaarg.
1615     NextPtr =
1616         DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getIntPtrConstant(8, DL));
1617     // Then, adjust VAList.
1618     unsigned InternalOffset = 4;
1619     VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
1620                          DAG.getConstant(InternalOffset, DL, PtrVT));
1621   } else {
1622     // Increment the pointer, VAList, by 8 to the next vaarg.
1623     NextPtr =
1624         DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getIntPtrConstant(8, DL));
1625   }
1626 
1627   // Store the incremented VAList to the legalized pointer.
1628   InChain = DAG.getStore(Chain, DL, NextPtr, VAListPtr, MachinePointerInfo(SV));
1629 
1630   // Load the actual argument out of the pointer VAList.
1631   // We can't count on greater alignment than the word size.
1632   return DAG.getLoad(
1633       VT, DL, InChain, VAList, MachinePointerInfo(),
1634       Align(std::min(PtrVT.getSizeInBits(), VT.getSizeInBits()) / 8));
1635 }
1636 
1637 SDValue VETargetLowering::lowerDYNAMIC_STACKALLOC(SDValue Op,
1638                                                   SelectionDAG &DAG) const {
1639   // Generate following code.
1640   //   (void)__llvm_grow_stack(size);
1641   //   ret = GETSTACKTOP;        // pseudo instruction
1642   SDLoc DL(Op);
1643 
1644   // Get the inputs.
1645   SDNode *Node = Op.getNode();
1646   SDValue Chain = Op.getOperand(0);
1647   SDValue Size = Op.getOperand(1);
1648   MaybeAlign Alignment(Op.getConstantOperandVal(2));
1649   EVT VT = Node->getValueType(0);
1650 
1651   // Chain the dynamic stack allocation so that it doesn't modify the stack
1652   // pointer when other instructions are using the stack.
1653   Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
1654 
1655   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
1656   Align StackAlign = TFI.getStackAlign();
1657   bool NeedsAlign = Alignment.valueOrOne() > StackAlign;
1658 
1659   // Prepare arguments
1660   TargetLowering::ArgListTy Args;
1661   TargetLowering::ArgListEntry Entry;
1662   Entry.Node = Size;
1663   Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
1664   Args.push_back(Entry);
1665   if (NeedsAlign) {
1666     Entry.Node = DAG.getConstant(~(Alignment->value() - 1ULL), DL, VT);
1667     Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
1668     Args.push_back(Entry);
1669   }
1670   Type *RetTy = Type::getVoidTy(*DAG.getContext());
1671 
1672   EVT PtrVT = Op.getValueType();
1673   SDValue Callee;
1674   if (NeedsAlign) {
1675     Callee = DAG.getTargetExternalSymbol("__ve_grow_stack_align", PtrVT, 0);
1676   } else {
1677     Callee = DAG.getTargetExternalSymbol("__ve_grow_stack", PtrVT, 0);
1678   }
1679 
1680   TargetLowering::CallLoweringInfo CLI(DAG);
1681   CLI.setDebugLoc(DL)
1682       .setChain(Chain)
1683       .setCallee(CallingConv::PreserveAll, RetTy, Callee, std::move(Args))
1684       .setDiscardResult(true);
1685   std::pair<SDValue, SDValue> pair = LowerCallTo(CLI);
1686   Chain = pair.second;
1687   SDValue Result = DAG.getNode(VEISD::GETSTACKTOP, DL, VT, Chain);
1688   if (NeedsAlign) {
1689     Result = DAG.getNode(ISD::ADD, DL, VT, Result,
1690                          DAG.getConstant((Alignment->value() - 1ULL), DL, VT));
1691     Result = DAG.getNode(ISD::AND, DL, VT, Result,
1692                          DAG.getConstant(~(Alignment->value() - 1ULL), DL, VT));
1693   }
1694   //  Chain = Result.getValue(1);
1695   Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), DL);
1696 
1697   SDValue Ops[2] = {Result, Chain};
1698   return DAG.getMergeValues(Ops, DL);
1699 }
1700 
1701 SDValue VETargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
1702                                                SelectionDAG &DAG) const {
1703   SDLoc DL(Op);
1704   return DAG.getNode(VEISD::EH_SJLJ_LONGJMP, DL, MVT::Other, Op.getOperand(0),
1705                      Op.getOperand(1));
1706 }
1707 
1708 SDValue VETargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
1709                                               SelectionDAG &DAG) const {
1710   SDLoc DL(Op);
1711   return DAG.getNode(VEISD::EH_SJLJ_SETJMP, DL,
1712                      DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
1713                      Op.getOperand(1));
1714 }
1715 
1716 SDValue VETargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
1717                                                       SelectionDAG &DAG) const {
1718   SDLoc DL(Op);
1719   return DAG.getNode(VEISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
1720                      Op.getOperand(0));
1721 }
1722 
1723 static SDValue lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG,
1724                               const VETargetLowering &TLI,
1725                               const VESubtarget *Subtarget) {
1726   SDLoc DL(Op);
1727   MachineFunction &MF = DAG.getMachineFunction();
1728   EVT PtrVT = TLI.getPointerTy(MF.getDataLayout());
1729 
1730   MachineFrameInfo &MFI = MF.getFrameInfo();
1731   MFI.setFrameAddressIsTaken(true);
1732 
1733   unsigned Depth = Op.getConstantOperandVal(0);
1734   const VERegisterInfo *RegInfo = Subtarget->getRegisterInfo();
1735   Register FrameReg = RegInfo->getFrameRegister(MF);
1736   SDValue FrameAddr =
1737       DAG.getCopyFromReg(DAG.getEntryNode(), DL, FrameReg, PtrVT);
1738   while (Depth--)
1739     FrameAddr = DAG.getLoad(Op.getValueType(), DL, DAG.getEntryNode(),
1740                             FrameAddr, MachinePointerInfo());
1741   return FrameAddr;
1742 }
1743 
1744 static SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG,
1745                                const VETargetLowering &TLI,
1746                                const VESubtarget *Subtarget) {
1747   MachineFunction &MF = DAG.getMachineFunction();
1748   MachineFrameInfo &MFI = MF.getFrameInfo();
1749   MFI.setReturnAddressIsTaken(true);
1750 
1751   if (TLI.verifyReturnAddressArgumentIsConstant(Op, DAG))
1752     return SDValue();
1753 
1754   SDValue FrameAddr = lowerFRAMEADDR(Op, DAG, TLI, Subtarget);
1755 
1756   SDLoc DL(Op);
1757   EVT VT = Op.getValueType();
1758   SDValue Offset = DAG.getConstant(8, DL, VT);
1759   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
1760                      DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
1761                      MachinePointerInfo());
1762 }
1763 
1764 SDValue VETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
1765                                                   SelectionDAG &DAG) const {
1766   SDLoc DL(Op);
1767   unsigned IntNo = Op.getConstantOperandVal(0);
1768   switch (IntNo) {
1769   default: // Don't custom lower most intrinsics.
1770     return SDValue();
1771   case Intrinsic::eh_sjlj_lsda: {
1772     MachineFunction &MF = DAG.getMachineFunction();
1773     MVT VT = Op.getSimpleValueType();
1774     const VETargetMachine *TM =
1775         static_cast<const VETargetMachine *>(&DAG.getTarget());
1776 
1777     // Create GCC_except_tableXX string.  The real symbol for that will be
1778     // generated in EHStreamer::emitExceptionTable() later.  So, we just
1779     // borrow it's name here.
1780     TM->getStrList()->push_back(std::string(
1781         (Twine("GCC_except_table") + Twine(MF.getFunctionNumber())).str()));
1782     SDValue Addr =
1783         DAG.getTargetExternalSymbol(TM->getStrList()->back().c_str(), VT, 0);
1784     if (isPositionIndependent()) {
1785       Addr = makeHiLoPair(Addr, VEMCExpr::VK_VE_GOTOFF_HI32,
1786                           VEMCExpr::VK_VE_GOTOFF_LO32, DAG);
1787       SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, VT);
1788       return DAG.getNode(ISD::ADD, DL, VT, GlobalBase, Addr);
1789     }
1790     return makeHiLoPair(Addr, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
1791   }
1792   }
1793 }
1794 
1795 static bool getUniqueInsertion(SDNode *N, unsigned &UniqueIdx) {
1796   if (!isa<BuildVectorSDNode>(N))
1797     return false;
1798   const auto *BVN = cast<BuildVectorSDNode>(N);
1799 
1800   // Find first non-undef insertion.
1801   unsigned Idx;
1802   for (Idx = 0; Idx < BVN->getNumOperands(); ++Idx) {
1803     auto ElemV = BVN->getOperand(Idx);
1804     if (!ElemV->isUndef())
1805       break;
1806   }
1807   // Catch the (hypothetical) all-undef case.
1808   if (Idx == BVN->getNumOperands())
1809     return false;
1810   // Remember insertion.
1811   UniqueIdx = Idx++;
1812   // Verify that all other insertions are undef.
1813   for (; Idx < BVN->getNumOperands(); ++Idx) {
1814     auto ElemV = BVN->getOperand(Idx);
1815     if (!ElemV->isUndef())
1816       return false;
1817   }
1818   return true;
1819 }
1820 
1821 static SDValue getSplatValue(SDNode *N) {
1822   if (auto *BuildVec = dyn_cast<BuildVectorSDNode>(N)) {
1823     return BuildVec->getSplatValue();
1824   }
1825   return SDValue();
1826 }
1827 
1828 SDValue VETargetLowering::lowerBUILD_VECTOR(SDValue Op,
1829                                             SelectionDAG &DAG) const {
1830   VECustomDAG CDAG(DAG, Op);
1831   MVT ResultVT = Op.getSimpleValueType();
1832 
1833   // If there is just one element, expand to INSERT_VECTOR_ELT.
1834   unsigned UniqueIdx;
1835   if (getUniqueInsertion(Op.getNode(), UniqueIdx)) {
1836     SDValue AccuV = CDAG.getUNDEF(Op.getValueType());
1837     auto ElemV = Op->getOperand(UniqueIdx);
1838     SDValue IdxV = CDAG.getConstant(UniqueIdx, MVT::i64);
1839     return CDAG.getNode(ISD::INSERT_VECTOR_ELT, ResultVT, {AccuV, ElemV, IdxV});
1840   }
1841 
1842   // Else emit a broadcast.
1843   if (SDValue ScalarV = getSplatValue(Op.getNode())) {
1844     unsigned NumEls = ResultVT.getVectorNumElements();
1845     auto AVL = CDAG.getConstant(NumEls, MVT::i32);
1846     return CDAG.getBroadcast(ResultVT, ScalarV, AVL);
1847   }
1848 
1849   // Expand
1850   return SDValue();
1851 }
1852 
1853 TargetLowering::LegalizeAction
1854 VETargetLowering::getCustomOperationAction(SDNode &Op) const {
1855   // Custom legalization on VVP_* and VEC_* opcodes is required to pack-legalize
1856   // these operations (transform nodes such that their AVL parameter refers to
1857   // packs of 64bit, instead of number of elements.
1858 
1859   // Packing opcodes are created with a pack-legal AVL (LEGALAVL). No need to
1860   // re-visit them.
1861   if (isPackingSupportOpcode(Op.getOpcode()))
1862     return Legal;
1863 
1864   // Custom lower to legalize AVL for packed mode.
1865   if (isVVPOrVEC(Op.getOpcode()))
1866     return Custom;
1867   return Legal;
1868 }
1869 
1870 SDValue VETargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
1871   LLVM_DEBUG(dbgs() << "::LowerOperation "; Op.dump(&DAG));
1872   unsigned Opcode = Op.getOpcode();
1873 
1874   /// Scalar isel.
1875   switch (Opcode) {
1876   case ISD::ATOMIC_FENCE:
1877     return lowerATOMIC_FENCE(Op, DAG);
1878   case ISD::ATOMIC_SWAP:
1879     return lowerATOMIC_SWAP(Op, DAG);
1880   case ISD::BlockAddress:
1881     return lowerBlockAddress(Op, DAG);
1882   case ISD::ConstantPool:
1883     return lowerConstantPool(Op, DAG);
1884   case ISD::DYNAMIC_STACKALLOC:
1885     return lowerDYNAMIC_STACKALLOC(Op, DAG);
1886   case ISD::EH_SJLJ_LONGJMP:
1887     return lowerEH_SJLJ_LONGJMP(Op, DAG);
1888   case ISD::EH_SJLJ_SETJMP:
1889     return lowerEH_SJLJ_SETJMP(Op, DAG);
1890   case ISD::EH_SJLJ_SETUP_DISPATCH:
1891     return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
1892   case ISD::FRAMEADDR:
1893     return lowerFRAMEADDR(Op, DAG, *this, Subtarget);
1894   case ISD::GlobalAddress:
1895     return lowerGlobalAddress(Op, DAG);
1896   case ISD::GlobalTLSAddress:
1897     return lowerGlobalTLSAddress(Op, DAG);
1898   case ISD::INTRINSIC_WO_CHAIN:
1899     return lowerINTRINSIC_WO_CHAIN(Op, DAG);
1900   case ISD::JumpTable:
1901     return lowerJumpTable(Op, DAG);
1902   case ISD::LOAD:
1903     return lowerLOAD(Op, DAG);
1904   case ISD::RETURNADDR:
1905     return lowerRETURNADDR(Op, DAG, *this, Subtarget);
1906   case ISD::BUILD_VECTOR:
1907     return lowerBUILD_VECTOR(Op, DAG);
1908   case ISD::STORE:
1909     return lowerSTORE(Op, DAG);
1910   case ISD::VASTART:
1911     return lowerVASTART(Op, DAG);
1912   case ISD::VAARG:
1913     return lowerVAARG(Op, DAG);
1914 
1915   case ISD::INSERT_VECTOR_ELT:
1916     return lowerINSERT_VECTOR_ELT(Op, DAG);
1917   case ISD::EXTRACT_VECTOR_ELT:
1918     return lowerEXTRACT_VECTOR_ELT(Op, DAG);
1919   }
1920 
1921   /// Vector isel.
1922   if (ISD::isVPOpcode(Opcode))
1923     return lowerToVVP(Op, DAG);
1924 
1925   switch (Opcode) {
1926   default:
1927     llvm_unreachable("Should not custom lower this!");
1928 
1929   // Legalize the AVL of this internal node.
1930   case VEISD::VEC_BROADCAST:
1931 #define ADD_VVP_OP(VVP_NAME, ...) case VEISD::VVP_NAME:
1932 #include "VVPNodes.def"
1933     // AVL already legalized.
1934     if (getAnnotatedNodeAVL(Op).second)
1935       return Op;
1936     return legalizeInternalVectorOp(Op, DAG);
1937 
1938     // Translate into a VEC_*/VVP_* layer operation.
1939   case ISD::MLOAD:
1940   case ISD::MSTORE:
1941 #define ADD_VVP_OP(VVP_NAME, ISD_NAME) case ISD::ISD_NAME:
1942 #include "VVPNodes.def"
1943     if (isMaskArithmetic(Op) && isPackedVectorType(Op.getValueType()))
1944       return splitMaskArithmetic(Op, DAG);
1945     return lowerToVVP(Op, DAG);
1946   }
1947 }
1948 /// } Custom Lower
1949 
1950 void VETargetLowering::ReplaceNodeResults(SDNode *N,
1951                                           SmallVectorImpl<SDValue> &Results,
1952                                           SelectionDAG &DAG) const {
1953   switch (N->getOpcode()) {
1954   case ISD::ATOMIC_SWAP:
1955     // Let LLVM expand atomic swap instruction through LowerOperation.
1956     return;
1957   default:
1958     LLVM_DEBUG(N->dumpr(&DAG));
1959     llvm_unreachable("Do not know how to custom type legalize this operation!");
1960   }
1961 }
1962 
1963 /// JumpTable for VE.
1964 ///
1965 ///   VE cannot generate relocatable symbol in jump table.  VE cannot
1966 ///   generate expressions using symbols in both text segment and data
1967 ///   segment like below.
1968 ///             .4byte  .LBB0_2-.LJTI0_0
1969 ///   So, we generate offset from the top of function like below as
1970 ///   a custom label.
1971 ///             .4byte  .LBB0_2-<function name>
1972 
1973 unsigned VETargetLowering::getJumpTableEncoding() const {
1974   // Use custom label for PIC.
1975   if (isPositionIndependent())
1976     return MachineJumpTableInfo::EK_Custom32;
1977 
1978   // Otherwise, use the normal jump table encoding heuristics.
1979   return TargetLowering::getJumpTableEncoding();
1980 }
1981 
1982 const MCExpr *VETargetLowering::LowerCustomJumpTableEntry(
1983     const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB,
1984     unsigned Uid, MCContext &Ctx) const {
1985   assert(isPositionIndependent());
1986 
1987   // Generate custom label for PIC like below.
1988   //    .4bytes  .LBB0_2-<function name>
1989   const auto *Value = MCSymbolRefExpr::create(MBB->getSymbol(), Ctx);
1990   MCSymbol *Sym = Ctx.getOrCreateSymbol(MBB->getParent()->getName().data());
1991   const auto *Base = MCSymbolRefExpr::create(Sym, Ctx);
1992   return MCBinaryExpr::createSub(Value, Base, Ctx);
1993 }
1994 
1995 SDValue VETargetLowering::getPICJumpTableRelocBase(SDValue Table,
1996                                                    SelectionDAG &DAG) const {
1997   assert(isPositionIndependent());
1998   SDLoc DL(Table);
1999   Function *Function = &DAG.getMachineFunction().getFunction();
2000   assert(Function != nullptr);
2001   auto PtrTy = getPointerTy(DAG.getDataLayout(), Function->getAddressSpace());
2002 
2003   // In the jump table, we have following values in PIC mode.
2004   //    .4bytes  .LBB0_2-<function name>
2005   // We need to add this value and the address of this function to generate
2006   // .LBB0_2 label correctly under PIC mode.  So, we want to generate following
2007   // instructions:
2008   //     lea %reg, fun@gotoff_lo
2009   //     and %reg, %reg, (32)0
2010   //     lea.sl %reg, fun@gotoff_hi(%reg, %got)
2011   // In order to do so, we need to genarate correctly marked DAG node using
2012   // makeHiLoPair.
2013   SDValue Op = DAG.getGlobalAddress(Function, DL, PtrTy);
2014   SDValue HiLo = makeHiLoPair(Op, VEMCExpr::VK_VE_GOTOFF_HI32,
2015                               VEMCExpr::VK_VE_GOTOFF_LO32, DAG);
2016   SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrTy);
2017   return DAG.getNode(ISD::ADD, DL, PtrTy, GlobalBase, HiLo);
2018 }
2019 
2020 Register VETargetLowering::prepareMBB(MachineBasicBlock &MBB,
2021                                       MachineBasicBlock::iterator I,
2022                                       MachineBasicBlock *TargetBB,
2023                                       const DebugLoc &DL) const {
2024   MachineFunction *MF = MBB.getParent();
2025   MachineRegisterInfo &MRI = MF->getRegInfo();
2026   const VEInstrInfo *TII = Subtarget->getInstrInfo();
2027 
2028   const TargetRegisterClass *RC = &VE::I64RegClass;
2029   Register Tmp1 = MRI.createVirtualRegister(RC);
2030   Register Tmp2 = MRI.createVirtualRegister(RC);
2031   Register Result = MRI.createVirtualRegister(RC);
2032 
2033   if (isPositionIndependent()) {
2034     // Create following instructions for local linkage PIC code.
2035     //     lea %Tmp1, TargetBB@gotoff_lo
2036     //     and %Tmp2, %Tmp1, (32)0
2037     //     lea.sl %Result, TargetBB@gotoff_hi(%Tmp2, %s15) ; %s15 is GOT
2038     BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
2039         .addImm(0)
2040         .addImm(0)
2041         .addMBB(TargetBB, VEMCExpr::VK_VE_GOTOFF_LO32);
2042     BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
2043         .addReg(Tmp1, getKillRegState(true))
2044         .addImm(M0(32));
2045     BuildMI(MBB, I, DL, TII->get(VE::LEASLrri), Result)
2046         .addReg(VE::SX15)
2047         .addReg(Tmp2, getKillRegState(true))
2048         .addMBB(TargetBB, VEMCExpr::VK_VE_GOTOFF_HI32);
2049   } else {
2050     // Create following instructions for non-PIC code.
2051     //     lea     %Tmp1, TargetBB@lo
2052     //     and     %Tmp2, %Tmp1, (32)0
2053     //     lea.sl  %Result, TargetBB@hi(%Tmp2)
2054     BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
2055         .addImm(0)
2056         .addImm(0)
2057         .addMBB(TargetBB, VEMCExpr::VK_VE_LO32);
2058     BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
2059         .addReg(Tmp1, getKillRegState(true))
2060         .addImm(M0(32));
2061     BuildMI(MBB, I, DL, TII->get(VE::LEASLrii), Result)
2062         .addReg(Tmp2, getKillRegState(true))
2063         .addImm(0)
2064         .addMBB(TargetBB, VEMCExpr::VK_VE_HI32);
2065   }
2066   return Result;
2067 }
2068 
2069 Register VETargetLowering::prepareSymbol(MachineBasicBlock &MBB,
2070                                          MachineBasicBlock::iterator I,
2071                                          StringRef Symbol, const DebugLoc &DL,
2072                                          bool IsLocal = false,
2073                                          bool IsCall = false) const {
2074   MachineFunction *MF = MBB.getParent();
2075   MachineRegisterInfo &MRI = MF->getRegInfo();
2076   const VEInstrInfo *TII = Subtarget->getInstrInfo();
2077 
2078   const TargetRegisterClass *RC = &VE::I64RegClass;
2079   Register Result = MRI.createVirtualRegister(RC);
2080 
2081   if (isPositionIndependent()) {
2082     if (IsCall && !IsLocal) {
2083       // Create following instructions for non-local linkage PIC code function
2084       // calls.  These instructions uses IC and magic number -24, so we expand
2085       // them in VEAsmPrinter.cpp from GETFUNPLT pseudo instruction.
2086       //     lea %Reg, Symbol@plt_lo(-24)
2087       //     and %Reg, %Reg, (32)0
2088       //     sic %s16
2089       //     lea.sl %Result, Symbol@plt_hi(%Reg, %s16) ; %s16 is PLT
2090       BuildMI(MBB, I, DL, TII->get(VE::GETFUNPLT), Result)
2091           .addExternalSymbol("abort");
2092     } else if (IsLocal) {
2093       Register Tmp1 = MRI.createVirtualRegister(RC);
2094       Register Tmp2 = MRI.createVirtualRegister(RC);
2095       // Create following instructions for local linkage PIC code.
2096       //     lea %Tmp1, Symbol@gotoff_lo
2097       //     and %Tmp2, %Tmp1, (32)0
2098       //     lea.sl %Result, Symbol@gotoff_hi(%Tmp2, %s15) ; %s15 is GOT
2099       BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
2100           .addImm(0)
2101           .addImm(0)
2102           .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOTOFF_LO32);
2103       BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
2104           .addReg(Tmp1, getKillRegState(true))
2105           .addImm(M0(32));
2106       BuildMI(MBB, I, DL, TII->get(VE::LEASLrri), Result)
2107           .addReg(VE::SX15)
2108           .addReg(Tmp2, getKillRegState(true))
2109           .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOTOFF_HI32);
2110     } else {
2111       Register Tmp1 = MRI.createVirtualRegister(RC);
2112       Register Tmp2 = MRI.createVirtualRegister(RC);
2113       // Create following instructions for not local linkage PIC code.
2114       //     lea %Tmp1, Symbol@got_lo
2115       //     and %Tmp2, %Tmp1, (32)0
2116       //     lea.sl %Tmp3, Symbol@gotoff_hi(%Tmp2, %s15) ; %s15 is GOT
2117       //     ld %Result, 0(%Tmp3)
2118       Register Tmp3 = MRI.createVirtualRegister(RC);
2119       BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
2120           .addImm(0)
2121           .addImm(0)
2122           .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOT_LO32);
2123       BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
2124           .addReg(Tmp1, getKillRegState(true))
2125           .addImm(M0(32));
2126       BuildMI(MBB, I, DL, TII->get(VE::LEASLrri), Tmp3)
2127           .addReg(VE::SX15)
2128           .addReg(Tmp2, getKillRegState(true))
2129           .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOT_HI32);
2130       BuildMI(MBB, I, DL, TII->get(VE::LDrii), Result)
2131           .addReg(Tmp3, getKillRegState(true))
2132           .addImm(0)
2133           .addImm(0);
2134     }
2135   } else {
2136     Register Tmp1 = MRI.createVirtualRegister(RC);
2137     Register Tmp2 = MRI.createVirtualRegister(RC);
2138     // Create following instructions for non-PIC code.
2139     //     lea     %Tmp1, Symbol@lo
2140     //     and     %Tmp2, %Tmp1, (32)0
2141     //     lea.sl  %Result, Symbol@hi(%Tmp2)
2142     BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
2143         .addImm(0)
2144         .addImm(0)
2145         .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_LO32);
2146     BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
2147         .addReg(Tmp1, getKillRegState(true))
2148         .addImm(M0(32));
2149     BuildMI(MBB, I, DL, TII->get(VE::LEASLrii), Result)
2150         .addReg(Tmp2, getKillRegState(true))
2151         .addImm(0)
2152         .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_HI32);
2153   }
2154   return Result;
2155 }
2156 
2157 void VETargetLowering::setupEntryBlockForSjLj(MachineInstr &MI,
2158                                               MachineBasicBlock *MBB,
2159                                               MachineBasicBlock *DispatchBB,
2160                                               int FI, int Offset) const {
2161   DebugLoc DL = MI.getDebugLoc();
2162   const VEInstrInfo *TII = Subtarget->getInstrInfo();
2163 
2164   Register LabelReg =
2165       prepareMBB(*MBB, MachineBasicBlock::iterator(MI), DispatchBB, DL);
2166 
2167   // Store an address of DispatchBB to a given jmpbuf[1] where has next IC
2168   // referenced by longjmp (throw) later.
2169   MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(VE::STrii));
2170   addFrameReference(MIB, FI, Offset); // jmpbuf[1]
2171   MIB.addReg(LabelReg, getKillRegState(true));
2172 }
2173 
2174 MachineBasicBlock *
2175 VETargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
2176                                    MachineBasicBlock *MBB) const {
2177   DebugLoc DL = MI.getDebugLoc();
2178   MachineFunction *MF = MBB->getParent();
2179   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2180   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
2181   MachineRegisterInfo &MRI = MF->getRegInfo();
2182 
2183   const BasicBlock *BB = MBB->getBasicBlock();
2184   MachineFunction::iterator I = ++MBB->getIterator();
2185 
2186   // Memory Reference.
2187   SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
2188                                            MI.memoperands_end());
2189   Register BufReg = MI.getOperand(1).getReg();
2190 
2191   Register DstReg;
2192 
2193   DstReg = MI.getOperand(0).getReg();
2194   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
2195   assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
2196   (void)TRI;
2197   Register MainDestReg = MRI.createVirtualRegister(RC);
2198   Register RestoreDestReg = MRI.createVirtualRegister(RC);
2199 
2200   // For `v = call @llvm.eh.sjlj.setjmp(buf)`, we generate following
2201   // instructions.  SP/FP must be saved in jmpbuf before `llvm.eh.sjlj.setjmp`.
2202   //
2203   // ThisMBB:
2204   //   buf[3] = %s17 iff %s17 is used as BP
2205   //   buf[1] = RestoreMBB as IC after longjmp
2206   //   # SjLjSetup RestoreMBB
2207   //
2208   // MainMBB:
2209   //   v_main = 0
2210   //
2211   // SinkMBB:
2212   //   v = phi(v_main, MainMBB, v_restore, RestoreMBB)
2213   //   ...
2214   //
2215   // RestoreMBB:
2216   //   %s17 = buf[3] = iff %s17 is used as BP
2217   //   v_restore = 1
2218   //   goto SinkMBB
2219 
2220   MachineBasicBlock *ThisMBB = MBB;
2221   MachineBasicBlock *MainMBB = MF->CreateMachineBasicBlock(BB);
2222   MachineBasicBlock *SinkMBB = MF->CreateMachineBasicBlock(BB);
2223   MachineBasicBlock *RestoreMBB = MF->CreateMachineBasicBlock(BB);
2224   MF->insert(I, MainMBB);
2225   MF->insert(I, SinkMBB);
2226   MF->push_back(RestoreMBB);
2227   RestoreMBB->setMachineBlockAddressTaken();
2228 
2229   // Transfer the remainder of BB and its successor edges to SinkMBB.
2230   SinkMBB->splice(SinkMBB->begin(), MBB,
2231                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
2232   SinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
2233 
2234   // ThisMBB:
2235   Register LabelReg =
2236       prepareMBB(*MBB, MachineBasicBlock::iterator(MI), RestoreMBB, DL);
2237 
2238   // Store BP in buf[3] iff this function is using BP.
2239   const VEFrameLowering *TFI = Subtarget->getFrameLowering();
2240   if (TFI->hasBP(*MF)) {
2241     MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(VE::STrii));
2242     MIB.addReg(BufReg);
2243     MIB.addImm(0);
2244     MIB.addImm(24);
2245     MIB.addReg(VE::SX17);
2246     MIB.setMemRefs(MMOs);
2247   }
2248 
2249   // Store IP in buf[1].
2250   MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(VE::STrii));
2251   MIB.add(MI.getOperand(1)); // we can preserve the kill flags here.
2252   MIB.addImm(0);
2253   MIB.addImm(8);
2254   MIB.addReg(LabelReg, getKillRegState(true));
2255   MIB.setMemRefs(MMOs);
2256 
2257   // SP/FP are already stored in jmpbuf before `llvm.eh.sjlj.setjmp`.
2258 
2259   // Insert setup.
2260   MIB =
2261       BuildMI(*ThisMBB, MI, DL, TII->get(VE::EH_SjLj_Setup)).addMBB(RestoreMBB);
2262 
2263   const VERegisterInfo *RegInfo = Subtarget->getRegisterInfo();
2264   MIB.addRegMask(RegInfo->getNoPreservedMask());
2265   ThisMBB->addSuccessor(MainMBB);
2266   ThisMBB->addSuccessor(RestoreMBB);
2267 
2268   // MainMBB:
2269   BuildMI(MainMBB, DL, TII->get(VE::LEAzii), MainDestReg)
2270       .addImm(0)
2271       .addImm(0)
2272       .addImm(0);
2273   MainMBB->addSuccessor(SinkMBB);
2274 
2275   // SinkMBB:
2276   BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(VE::PHI), DstReg)
2277       .addReg(MainDestReg)
2278       .addMBB(MainMBB)
2279       .addReg(RestoreDestReg)
2280       .addMBB(RestoreMBB);
2281 
2282   // RestoreMBB:
2283   // Restore BP from buf[3] iff this function is using BP.  The address of
2284   // buf is in SX10.
2285   // FIXME: Better to not use SX10 here
2286   if (TFI->hasBP(*MF)) {
2287     MachineInstrBuilder MIB =
2288         BuildMI(RestoreMBB, DL, TII->get(VE::LDrii), VE::SX17);
2289     MIB.addReg(VE::SX10);
2290     MIB.addImm(0);
2291     MIB.addImm(24);
2292     MIB.setMemRefs(MMOs);
2293   }
2294   BuildMI(RestoreMBB, DL, TII->get(VE::LEAzii), RestoreDestReg)
2295       .addImm(0)
2296       .addImm(0)
2297       .addImm(1);
2298   BuildMI(RestoreMBB, DL, TII->get(VE::BRCFLa_t)).addMBB(SinkMBB);
2299   RestoreMBB->addSuccessor(SinkMBB);
2300 
2301   MI.eraseFromParent();
2302   return SinkMBB;
2303 }
2304 
2305 MachineBasicBlock *
2306 VETargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
2307                                     MachineBasicBlock *MBB) const {
2308   DebugLoc DL = MI.getDebugLoc();
2309   MachineFunction *MF = MBB->getParent();
2310   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2311   MachineRegisterInfo &MRI = MF->getRegInfo();
2312 
2313   // Memory Reference.
2314   SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
2315                                            MI.memoperands_end());
2316   Register BufReg = MI.getOperand(0).getReg();
2317 
2318   Register Tmp = MRI.createVirtualRegister(&VE::I64RegClass);
2319   // Since FP is only updated here but NOT referenced, it's treated as GPR.
2320   Register FP = VE::SX9;
2321   Register SP = VE::SX11;
2322 
2323   MachineInstrBuilder MIB;
2324 
2325   MachineBasicBlock *ThisMBB = MBB;
2326 
2327   // For `call @llvm.eh.sjlj.longjmp(buf)`, we generate following instructions.
2328   //
2329   // ThisMBB:
2330   //   %fp = load buf[0]
2331   //   %jmp = load buf[1]
2332   //   %s10 = buf        ; Store an address of buf to SX10 for RestoreMBB
2333   //   %sp = load buf[2] ; generated by llvm.eh.sjlj.setjmp.
2334   //   jmp %jmp
2335 
2336   // Reload FP.
2337   MIB = BuildMI(*ThisMBB, MI, DL, TII->get(VE::LDrii), FP);
2338   MIB.addReg(BufReg);
2339   MIB.addImm(0);
2340   MIB.addImm(0);
2341   MIB.setMemRefs(MMOs);
2342 
2343   // Reload IP.
2344   MIB = BuildMI(*ThisMBB, MI, DL, TII->get(VE::LDrii), Tmp);
2345   MIB.addReg(BufReg);
2346   MIB.addImm(0);
2347   MIB.addImm(8);
2348   MIB.setMemRefs(MMOs);
2349 
2350   // Copy BufReg to SX10 for later use in setjmp.
2351   // FIXME: Better to not use SX10 here
2352   BuildMI(*ThisMBB, MI, DL, TII->get(VE::ORri), VE::SX10)
2353       .addReg(BufReg)
2354       .addImm(0);
2355 
2356   // Reload SP.
2357   MIB = BuildMI(*ThisMBB, MI, DL, TII->get(VE::LDrii), SP);
2358   MIB.add(MI.getOperand(0)); // we can preserve the kill flags here.
2359   MIB.addImm(0);
2360   MIB.addImm(16);
2361   MIB.setMemRefs(MMOs);
2362 
2363   // Jump.
2364   BuildMI(*ThisMBB, MI, DL, TII->get(VE::BCFLari_t))
2365       .addReg(Tmp, getKillRegState(true))
2366       .addImm(0);
2367 
2368   MI.eraseFromParent();
2369   return ThisMBB;
2370 }
2371 
2372 MachineBasicBlock *
2373 VETargetLowering::emitSjLjDispatchBlock(MachineInstr &MI,
2374                                         MachineBasicBlock *BB) const {
2375   DebugLoc DL = MI.getDebugLoc();
2376   MachineFunction *MF = BB->getParent();
2377   MachineFrameInfo &MFI = MF->getFrameInfo();
2378   MachineRegisterInfo &MRI = MF->getRegInfo();
2379   const VEInstrInfo *TII = Subtarget->getInstrInfo();
2380   int FI = MFI.getFunctionContextIndex();
2381 
2382   // Get a mapping of the call site numbers to all of the landing pads they're
2383   // associated with.
2384   DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
2385   unsigned MaxCSNum = 0;
2386   for (auto &MBB : *MF) {
2387     if (!MBB.isEHPad())
2388       continue;
2389 
2390     MCSymbol *Sym = nullptr;
2391     for (const auto &MI : MBB) {
2392       if (MI.isDebugInstr())
2393         continue;
2394 
2395       assert(MI.isEHLabel() && "expected EH_LABEL");
2396       Sym = MI.getOperand(0).getMCSymbol();
2397       break;
2398     }
2399 
2400     if (!MF->hasCallSiteLandingPad(Sym))
2401       continue;
2402 
2403     for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
2404       CallSiteNumToLPad[CSI].push_back(&MBB);
2405       MaxCSNum = std::max(MaxCSNum, CSI);
2406     }
2407   }
2408 
2409   // Get an ordered list of the machine basic blocks for the jump table.
2410   std::vector<MachineBasicBlock *> LPadList;
2411   SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
2412   LPadList.reserve(CallSiteNumToLPad.size());
2413 
2414   for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
2415     for (auto &LP : CallSiteNumToLPad[CSI]) {
2416       LPadList.push_back(LP);
2417       InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
2418     }
2419   }
2420 
2421   assert(!LPadList.empty() &&
2422          "No landing pad destinations for the dispatch jump table!");
2423 
2424   // The %fn_context is allocated like below (from --print-after=sjljehprepare):
2425   //   %fn_context = alloca { i8*, i64, [4 x i64], i8*, i8*, [5 x i8*] }
2426   //
2427   // This `[5 x i8*]` is jmpbuf, so jmpbuf[1] is FI+72.
2428   // First `i64` is callsite, so callsite is FI+8.
2429   static const int OffsetIC = 72;
2430   static const int OffsetCS = 8;
2431 
2432   // Create the MBBs for the dispatch code like following:
2433   //
2434   // ThisMBB:
2435   //   Prepare DispatchBB address and store it to buf[1].
2436   //   ...
2437   //
2438   // DispatchBB:
2439   //   %s15 = GETGOT iff isPositionIndependent
2440   //   %callsite = load callsite
2441   //   brgt.l.t #size of callsites, %callsite, DispContBB
2442   //
2443   // TrapBB:
2444   //   Call abort.
2445   //
2446   // DispContBB:
2447   //   %breg = address of jump table
2448   //   %pc = load and calculate next pc from %breg and %callsite
2449   //   jmp %pc
2450 
2451   // Shove the dispatch's address into the return slot in the function context.
2452   MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
2453   DispatchBB->setIsEHPad(true);
2454 
2455   // Trap BB will causes trap like `assert(0)`.
2456   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
2457   DispatchBB->addSuccessor(TrapBB);
2458 
2459   MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
2460   DispatchBB->addSuccessor(DispContBB);
2461 
2462   // Insert MBBs.
2463   MF->push_back(DispatchBB);
2464   MF->push_back(DispContBB);
2465   MF->push_back(TrapBB);
2466 
2467   // Insert code to call abort in the TrapBB.
2468   Register Abort = prepareSymbol(*TrapBB, TrapBB->end(), "abort", DL,
2469                                  /* Local */ false, /* Call */ true);
2470   BuildMI(TrapBB, DL, TII->get(VE::BSICrii), VE::SX10)
2471       .addReg(Abort, getKillRegState(true))
2472       .addImm(0)
2473       .addImm(0);
2474 
2475   // Insert code into the entry block that creates and registers the function
2476   // context.
2477   setupEntryBlockForSjLj(MI, BB, DispatchBB, FI, OffsetIC);
2478 
2479   // Create the jump table and associated information
2480   unsigned JTE = getJumpTableEncoding();
2481   MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
2482   unsigned MJTI = JTI->createJumpTableIndex(LPadList);
2483 
2484   const VERegisterInfo &RI = TII->getRegisterInfo();
2485   // Add a register mask with no preserved registers.  This results in all
2486   // registers being marked as clobbered.
2487   BuildMI(DispatchBB, DL, TII->get(VE::NOP))
2488       .addRegMask(RI.getNoPreservedMask());
2489 
2490   if (isPositionIndependent()) {
2491     // Force to generate GETGOT, since current implementation doesn't store GOT
2492     // register.
2493     BuildMI(DispatchBB, DL, TII->get(VE::GETGOT), VE::SX15);
2494   }
2495 
2496   // IReg is used as an index in a memory operand and therefore can't be SP
2497   const TargetRegisterClass *RC = &VE::I64RegClass;
2498   Register IReg = MRI.createVirtualRegister(RC);
2499   addFrameReference(BuildMI(DispatchBB, DL, TII->get(VE::LDLZXrii), IReg), FI,
2500                     OffsetCS);
2501   if (LPadList.size() < 64) {
2502     BuildMI(DispatchBB, DL, TII->get(VE::BRCFLir_t))
2503         .addImm(VECC::CC_ILE)
2504         .addImm(LPadList.size())
2505         .addReg(IReg)
2506         .addMBB(TrapBB);
2507   } else {
2508     assert(LPadList.size() <= 0x7FFFFFFF && "Too large Landing Pad!");
2509     Register TmpReg = MRI.createVirtualRegister(RC);
2510     BuildMI(DispatchBB, DL, TII->get(VE::LEAzii), TmpReg)
2511         .addImm(0)
2512         .addImm(0)
2513         .addImm(LPadList.size());
2514     BuildMI(DispatchBB, DL, TII->get(VE::BRCFLrr_t))
2515         .addImm(VECC::CC_ILE)
2516         .addReg(TmpReg, getKillRegState(true))
2517         .addReg(IReg)
2518         .addMBB(TrapBB);
2519   }
2520 
2521   Register BReg = MRI.createVirtualRegister(RC);
2522   Register Tmp1 = MRI.createVirtualRegister(RC);
2523   Register Tmp2 = MRI.createVirtualRegister(RC);
2524 
2525   if (isPositionIndependent()) {
2526     // Create following instructions for local linkage PIC code.
2527     //     lea    %Tmp1, .LJTI0_0@gotoff_lo
2528     //     and    %Tmp2, %Tmp1, (32)0
2529     //     lea.sl %BReg, .LJTI0_0@gotoff_hi(%Tmp2, %s15) ; %s15 is GOT
2530     BuildMI(DispContBB, DL, TII->get(VE::LEAzii), Tmp1)
2531         .addImm(0)
2532         .addImm(0)
2533         .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_GOTOFF_LO32);
2534     BuildMI(DispContBB, DL, TII->get(VE::ANDrm), Tmp2)
2535         .addReg(Tmp1, getKillRegState(true))
2536         .addImm(M0(32));
2537     BuildMI(DispContBB, DL, TII->get(VE::LEASLrri), BReg)
2538         .addReg(VE::SX15)
2539         .addReg(Tmp2, getKillRegState(true))
2540         .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_GOTOFF_HI32);
2541   } else {
2542     // Create following instructions for non-PIC code.
2543     //     lea     %Tmp1, .LJTI0_0@lo
2544     //     and     %Tmp2, %Tmp1, (32)0
2545     //     lea.sl  %BReg, .LJTI0_0@hi(%Tmp2)
2546     BuildMI(DispContBB, DL, TII->get(VE::LEAzii), Tmp1)
2547         .addImm(0)
2548         .addImm(0)
2549         .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_LO32);
2550     BuildMI(DispContBB, DL, TII->get(VE::ANDrm), Tmp2)
2551         .addReg(Tmp1, getKillRegState(true))
2552         .addImm(M0(32));
2553     BuildMI(DispContBB, DL, TII->get(VE::LEASLrii), BReg)
2554         .addReg(Tmp2, getKillRegState(true))
2555         .addImm(0)
2556         .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_HI32);
2557   }
2558 
2559   switch (JTE) {
2560   case MachineJumpTableInfo::EK_BlockAddress: {
2561     // Generate simple block address code for no-PIC model.
2562     //     sll %Tmp1, %IReg, 3
2563     //     lds %TReg, 0(%Tmp1, %BReg)
2564     //     bcfla %TReg
2565 
2566     Register TReg = MRI.createVirtualRegister(RC);
2567     Register Tmp1 = MRI.createVirtualRegister(RC);
2568 
2569     BuildMI(DispContBB, DL, TII->get(VE::SLLri), Tmp1)
2570         .addReg(IReg, getKillRegState(true))
2571         .addImm(3);
2572     BuildMI(DispContBB, DL, TII->get(VE::LDrri), TReg)
2573         .addReg(BReg, getKillRegState(true))
2574         .addReg(Tmp1, getKillRegState(true))
2575         .addImm(0);
2576     BuildMI(DispContBB, DL, TII->get(VE::BCFLari_t))
2577         .addReg(TReg, getKillRegState(true))
2578         .addImm(0);
2579     break;
2580   }
2581   case MachineJumpTableInfo::EK_Custom32: {
2582     // Generate block address code using differences from the function pointer
2583     // for PIC model.
2584     //     sll %Tmp1, %IReg, 2
2585     //     ldl.zx %OReg, 0(%Tmp1, %BReg)
2586     //     Prepare function address in BReg2.
2587     //     adds.l %TReg, %BReg2, %OReg
2588     //     bcfla %TReg
2589 
2590     assert(isPositionIndependent());
2591     Register OReg = MRI.createVirtualRegister(RC);
2592     Register TReg = MRI.createVirtualRegister(RC);
2593     Register Tmp1 = MRI.createVirtualRegister(RC);
2594 
2595     BuildMI(DispContBB, DL, TII->get(VE::SLLri), Tmp1)
2596         .addReg(IReg, getKillRegState(true))
2597         .addImm(2);
2598     BuildMI(DispContBB, DL, TII->get(VE::LDLZXrri), OReg)
2599         .addReg(BReg, getKillRegState(true))
2600         .addReg(Tmp1, getKillRegState(true))
2601         .addImm(0);
2602     Register BReg2 =
2603         prepareSymbol(*DispContBB, DispContBB->end(),
2604                       DispContBB->getParent()->getName(), DL, /* Local */ true);
2605     BuildMI(DispContBB, DL, TII->get(VE::ADDSLrr), TReg)
2606         .addReg(OReg, getKillRegState(true))
2607         .addReg(BReg2, getKillRegState(true));
2608     BuildMI(DispContBB, DL, TII->get(VE::BCFLari_t))
2609         .addReg(TReg, getKillRegState(true))
2610         .addImm(0);
2611     break;
2612   }
2613   default:
2614     llvm_unreachable("Unexpected jump table encoding");
2615   }
2616 
2617   // Add the jump table entries as successors to the MBB.
2618   SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
2619   for (auto &LP : LPadList)
2620     if (SeenMBBs.insert(LP).second)
2621       DispContBB->addSuccessor(LP);
2622 
2623   // N.B. the order the invoke BBs are processed in doesn't matter here.
2624   SmallVector<MachineBasicBlock *, 64> MBBLPads;
2625   const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
2626   for (MachineBasicBlock *MBB : InvokeBBs) {
2627     // Remove the landing pad successor from the invoke block and replace it
2628     // with the new dispatch block.
2629     // Keep a copy of Successors since it's modified inside the loop.
2630     SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
2631                                                    MBB->succ_rend());
2632     // FIXME: Avoid quadratic complexity.
2633     for (auto *MBBS : Successors) {
2634       if (MBBS->isEHPad()) {
2635         MBB->removeSuccessor(MBBS);
2636         MBBLPads.push_back(MBBS);
2637       }
2638     }
2639 
2640     MBB->addSuccessor(DispatchBB);
2641 
2642     // Find the invoke call and mark all of the callee-saved registers as
2643     // 'implicit defined' so that they're spilled.  This prevents code from
2644     // moving instructions to before the EH block, where they will never be
2645     // executed.
2646     for (auto &II : reverse(*MBB)) {
2647       if (!II.isCall())
2648         continue;
2649 
2650       DenseMap<Register, bool> DefRegs;
2651       for (auto &MOp : II.operands())
2652         if (MOp.isReg())
2653           DefRegs[MOp.getReg()] = true;
2654 
2655       MachineInstrBuilder MIB(*MF, &II);
2656       for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
2657         Register Reg = SavedRegs[RI];
2658         if (!DefRegs[Reg])
2659           MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
2660       }
2661 
2662       break;
2663     }
2664   }
2665 
2666   // Mark all former landing pads as non-landing pads.  The dispatch is the only
2667   // landing pad now.
2668   for (auto &LP : MBBLPads)
2669     LP->setIsEHPad(false);
2670 
2671   // The instruction is gone now.
2672   MI.eraseFromParent();
2673   return BB;
2674 }
2675 
2676 MachineBasicBlock *
2677 VETargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
2678                                               MachineBasicBlock *BB) const {
2679   switch (MI.getOpcode()) {
2680   default:
2681     llvm_unreachable("Unknown Custom Instruction!");
2682   case VE::EH_SjLj_LongJmp:
2683     return emitEHSjLjLongJmp(MI, BB);
2684   case VE::EH_SjLj_SetJmp:
2685     return emitEHSjLjSetJmp(MI, BB);
2686   case VE::EH_SjLj_Setup_Dispatch:
2687     return emitSjLjDispatchBlock(MI, BB);
2688   }
2689 }
2690 
2691 static bool isSimm7(SDValue V) {
2692   EVT VT = V.getValueType();
2693   if (VT.isVector())
2694     return false;
2695 
2696   if (VT.isInteger()) {
2697     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(V))
2698       return isInt<7>(C->getSExtValue());
2699   } else if (VT.isFloatingPoint()) {
2700     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(V)) {
2701       if (VT == MVT::f32 || VT == MVT::f64) {
2702         const APInt &Imm = C->getValueAPF().bitcastToAPInt();
2703         uint64_t Val = Imm.getSExtValue();
2704         if (Imm.getBitWidth() == 32)
2705           Val <<= 32; // Immediate value of float place at higher bits on VE.
2706         return isInt<7>(Val);
2707       }
2708     }
2709   }
2710   return false;
2711 }
2712 
2713 static bool isMImm(SDValue V) {
2714   EVT VT = V.getValueType();
2715   if (VT.isVector())
2716     return false;
2717 
2718   if (VT.isInteger()) {
2719     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(V))
2720       return isMImmVal(getImmVal(C));
2721   } else if (VT.isFloatingPoint()) {
2722     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(V)) {
2723       if (VT == MVT::f32) {
2724         // Float value places at higher bits, so ignore lower 32 bits.
2725         return isMImm32Val(getFpImmVal(C) >> 32);
2726       } else if (VT == MVT::f64) {
2727         return isMImmVal(getFpImmVal(C));
2728       }
2729     }
2730   }
2731   return false;
2732 }
2733 
2734 static unsigned decideComp(EVT SrcVT, ISD::CondCode CC) {
2735   if (SrcVT.isFloatingPoint()) {
2736     if (SrcVT == MVT::f128)
2737       return VEISD::CMPQ;
2738     return VEISD::CMPF;
2739   }
2740   return isSignedIntSetCC(CC) ? VEISD::CMPI : VEISD::CMPU;
2741 }
2742 
2743 static EVT decideCompType(EVT SrcVT) {
2744   if (SrcVT == MVT::f128)
2745     return MVT::f64;
2746   return SrcVT;
2747 }
2748 
2749 static bool safeWithoutCompWithNull(EVT SrcVT, ISD::CondCode CC,
2750                                     bool WithCMov) {
2751   if (SrcVT.isFloatingPoint()) {
2752     // For the case of floating point setcc, only unordered comparison
2753     // or general comparison with -enable-no-nans-fp-math option reach
2754     // here, so it is safe even if values are NaN.  Only f128 doesn't
2755     // safe since VE uses f64 result of f128 comparison.
2756     return SrcVT != MVT::f128;
2757   }
2758   if (isIntEqualitySetCC(CC)) {
2759     // For the case of equal or not equal, it is safe without comparison with 0.
2760     return true;
2761   }
2762   if (WithCMov) {
2763     // For the case of integer setcc with cmov, all signed comparison with 0
2764     // are safe.
2765     return isSignedIntSetCC(CC);
2766   }
2767   // For the case of integer setcc, only signed 64 bits comparison is safe.
2768   // For unsigned, "CMPU 0x80000000, 0" has to be greater than 0, but it becomes
2769   // less than 0 witout CMPU.  For 32 bits, other half of 32 bits are
2770   // uncoditional, so it is not safe too without CMPI..
2771   return isSignedIntSetCC(CC) && SrcVT == MVT::i64;
2772 }
2773 
2774 static SDValue generateComparison(EVT VT, SDValue LHS, SDValue RHS,
2775                                   ISD::CondCode CC, bool WithCMov,
2776                                   const SDLoc &DL, SelectionDAG &DAG) {
2777   // Compare values.  If RHS is 0 and it is safe to calculate without
2778   // comparison, we don't generate an instruction for comparison.
2779   EVT CompVT = decideCompType(VT);
2780   if (CompVT == VT && safeWithoutCompWithNull(VT, CC, WithCMov) &&
2781       (isNullConstant(RHS) || isNullFPConstant(RHS))) {
2782     return LHS;
2783   }
2784   return DAG.getNode(decideComp(VT, CC), DL, CompVT, LHS, RHS);
2785 }
2786 
2787 SDValue VETargetLowering::combineSelect(SDNode *N,
2788                                         DAGCombinerInfo &DCI) const {
2789   assert(N->getOpcode() == ISD::SELECT &&
2790          "Should be called with a SELECT node");
2791   ISD::CondCode CC = ISD::CondCode::SETNE;
2792   SDValue Cond = N->getOperand(0);
2793   SDValue True = N->getOperand(1);
2794   SDValue False = N->getOperand(2);
2795 
2796   // We handle only scalar SELECT.
2797   EVT VT = N->getValueType(0);
2798   if (VT.isVector())
2799     return SDValue();
2800 
2801   // Peform combineSelect after leagalize DAG.
2802   if (!DCI.isAfterLegalizeDAG())
2803     return SDValue();
2804 
2805   EVT VT0 = Cond.getValueType();
2806   if (isMImm(True)) {
2807     // VE's condition move can handle MImm in True clause, so nothing to do.
2808   } else if (isMImm(False)) {
2809     // VE's condition move can handle MImm in True clause, so swap True and
2810     // False clauses if False has MImm value.  And, update condition code.
2811     std::swap(True, False);
2812     CC = getSetCCInverse(CC, VT0);
2813   }
2814 
2815   SDLoc DL(N);
2816   SelectionDAG &DAG = DCI.DAG;
2817   VECC::CondCode VECCVal;
2818   if (VT0.isFloatingPoint()) {
2819     VECCVal = fpCondCode2Fcc(CC);
2820   } else {
2821     VECCVal = intCondCode2Icc(CC);
2822   }
2823   SDValue Ops[] = {Cond, True, False,
2824                    DAG.getConstant(VECCVal, DL, MVT::i32)};
2825   return DAG.getNode(VEISD::CMOV, DL, VT, Ops);
2826 }
2827 
2828 SDValue VETargetLowering::combineSelectCC(SDNode *N,
2829                                           DAGCombinerInfo &DCI) const {
2830   assert(N->getOpcode() == ISD::SELECT_CC &&
2831          "Should be called with a SELECT_CC node");
2832   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
2833   SDValue LHS = N->getOperand(0);
2834   SDValue RHS = N->getOperand(1);
2835   SDValue True = N->getOperand(2);
2836   SDValue False = N->getOperand(3);
2837 
2838   // We handle only scalar SELECT_CC.
2839   EVT VT = N->getValueType(0);
2840   if (VT.isVector())
2841     return SDValue();
2842 
2843   // Peform combineSelectCC after leagalize DAG.
2844   if (!DCI.isAfterLegalizeDAG())
2845     return SDValue();
2846 
2847   // We handle only i32/i64/f32/f64/f128 comparisons.
2848   EVT LHSVT = LHS.getValueType();
2849   assert(LHSVT == RHS.getValueType());
2850   switch (LHSVT.getSimpleVT().SimpleTy) {
2851   case MVT::i32:
2852   case MVT::i64:
2853   case MVT::f32:
2854   case MVT::f64:
2855   case MVT::f128:
2856     break;
2857   default:
2858     // Return SDValue to let llvm handle other types.
2859     return SDValue();
2860   }
2861 
2862   if (isMImm(RHS)) {
2863     // VE's comparison can handle MImm in RHS, so nothing to do.
2864   } else if (isSimm7(RHS)) {
2865     // VE's comparison can handle Simm7 in LHS, so swap LHS and RHS, and
2866     // update condition code.
2867     std::swap(LHS, RHS);
2868     CC = getSetCCSwappedOperands(CC);
2869   }
2870   if (isMImm(True)) {
2871     // VE's condition move can handle MImm in True clause, so nothing to do.
2872   } else if (isMImm(False)) {
2873     // VE's condition move can handle MImm in True clause, so swap True and
2874     // False clauses if False has MImm value.  And, update condition code.
2875     std::swap(True, False);
2876     CC = getSetCCInverse(CC, LHSVT);
2877   }
2878 
2879   SDLoc DL(N);
2880   SelectionDAG &DAG = DCI.DAG;
2881 
2882   bool WithCMov = true;
2883   SDValue CompNode = generateComparison(LHSVT, LHS, RHS, CC, WithCMov, DL, DAG);
2884 
2885   VECC::CondCode VECCVal;
2886   if (LHSVT.isFloatingPoint()) {
2887     VECCVal = fpCondCode2Fcc(CC);
2888   } else {
2889     VECCVal = intCondCode2Icc(CC);
2890   }
2891   SDValue Ops[] = {CompNode, True, False,
2892                    DAG.getConstant(VECCVal, DL, MVT::i32)};
2893   return DAG.getNode(VEISD::CMOV, DL, VT, Ops);
2894 }
2895 
2896 static bool isI32InsnAllUses(const SDNode *User, const SDNode *N);
2897 static bool isI32Insn(const SDNode *User, const SDNode *N) {
2898   switch (User->getOpcode()) {
2899   default:
2900     return false;
2901   case ISD::ADD:
2902   case ISD::SUB:
2903   case ISD::MUL:
2904   case ISD::SDIV:
2905   case ISD::UDIV:
2906   case ISD::SETCC:
2907   case ISD::SMIN:
2908   case ISD::SMAX:
2909   case ISD::SHL:
2910   case ISD::SRA:
2911   case ISD::BSWAP:
2912   case ISD::SINT_TO_FP:
2913   case ISD::UINT_TO_FP:
2914   case ISD::BR_CC:
2915   case ISD::BITCAST:
2916   case ISD::ATOMIC_CMP_SWAP:
2917   case ISD::ATOMIC_SWAP:
2918   case VEISD::CMPU:
2919   case VEISD::CMPI:
2920     return true;
2921   case ISD::SRL:
2922     if (N->getOperand(0).getOpcode() != ISD::SRL)
2923       return true;
2924     // (srl (trunc (srl ...))) may be optimized by combining srl, so
2925     // doesn't optimize trunc now.
2926     return false;
2927   case ISD::SELECT_CC:
2928     if (User->getOperand(2).getNode() != N &&
2929         User->getOperand(3).getNode() != N)
2930       return true;
2931     return isI32InsnAllUses(User, N);
2932   case VEISD::CMOV:
2933     // CMOV in (cmov (trunc ...), true, false, int-comparison) is safe.
2934     // However, trunc in true or false clauses is not safe.
2935     if (User->getOperand(1).getNode() != N &&
2936         User->getOperand(2).getNode() != N &&
2937         isa<ConstantSDNode>(User->getOperand(3))) {
2938       VECC::CondCode VECCVal =
2939           static_cast<VECC::CondCode>(User->getConstantOperandVal(3));
2940       return isIntVECondCode(VECCVal);
2941     }
2942     [[fallthrough]];
2943   case ISD::AND:
2944   case ISD::OR:
2945   case ISD::XOR:
2946   case ISD::SELECT:
2947   case ISD::CopyToReg:
2948     // Check all use of selections, bit operations, and copies.  If all of them
2949     // are safe, optimize truncate to extract_subreg.
2950     return isI32InsnAllUses(User, N);
2951   }
2952 }
2953 
2954 static bool isI32InsnAllUses(const SDNode *User, const SDNode *N) {
2955   // Check all use of User node.  If all of them are safe, optimize
2956   // truncate to extract_subreg.
2957   for (const SDNode *U : User->uses()) {
2958     switch (U->getOpcode()) {
2959     default:
2960       // If the use is an instruction which treats the source operand as i32,
2961       // it is safe to avoid truncate here.
2962       if (isI32Insn(U, N))
2963         continue;
2964       break;
2965     case ISD::ANY_EXTEND:
2966     case ISD::SIGN_EXTEND:
2967     case ISD::ZERO_EXTEND: {
2968       // Special optimizations to the combination of ext and trunc.
2969       // (ext ... (select ... (trunc ...))) is safe to avoid truncate here
2970       // since this truncate instruction clears higher 32 bits which is filled
2971       // by one of ext instructions later.
2972       assert(N->getValueType(0) == MVT::i32 &&
2973              "find truncate to not i32 integer");
2974       if (User->getOpcode() == ISD::SELECT_CC ||
2975           User->getOpcode() == ISD::SELECT || User->getOpcode() == VEISD::CMOV)
2976         continue;
2977       break;
2978     }
2979     }
2980     return false;
2981   }
2982   return true;
2983 }
2984 
2985 // Optimize TRUNCATE in DAG combining.  Optimizing it in CUSTOM lower is
2986 // sometime too early.  Optimizing it in DAG pattern matching in VEInstrInfo.td
2987 // is sometime too late.  So, doing it at here.
2988 SDValue VETargetLowering::combineTRUNCATE(SDNode *N,
2989                                           DAGCombinerInfo &DCI) const {
2990   assert(N->getOpcode() == ISD::TRUNCATE &&
2991          "Should be called with a TRUNCATE node");
2992 
2993   SelectionDAG &DAG = DCI.DAG;
2994   SDLoc DL(N);
2995   EVT VT = N->getValueType(0);
2996 
2997   // We prefer to do this when all types are legal.
2998   if (!DCI.isAfterLegalizeDAG())
2999     return SDValue();
3000 
3001   // Skip combine TRUNCATE atm if the operand of TRUNCATE might be a constant.
3002   if (N->getOperand(0)->getOpcode() == ISD::SELECT_CC &&
3003       isa<ConstantSDNode>(N->getOperand(0)->getOperand(0)) &&
3004       isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
3005     return SDValue();
3006 
3007   // Check all use of this TRUNCATE.
3008   for (const SDNode *User : N->uses()) {
3009     // Make sure that we're not going to replace TRUNCATE for non i32
3010     // instructions.
3011     //
3012     // FIXME: Although we could sometimes handle this, and it does occur in
3013     // practice that one of the condition inputs to the select is also one of
3014     // the outputs, we currently can't deal with this.
3015     if (isI32Insn(User, N))
3016       continue;
3017 
3018     return SDValue();
3019   }
3020 
3021   SDValue SubI32 = DAG.getTargetConstant(VE::sub_i32, DL, MVT::i32);
3022   return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT,
3023                                     N->getOperand(0), SubI32),
3024                  0);
3025 }
3026 
3027 SDValue VETargetLowering::PerformDAGCombine(SDNode *N,
3028                                             DAGCombinerInfo &DCI) const {
3029   switch (N->getOpcode()) {
3030   default:
3031     break;
3032   case ISD::SELECT:
3033     return combineSelect(N, DCI);
3034   case ISD::SELECT_CC:
3035     return combineSelectCC(N, DCI);
3036   case ISD::TRUNCATE:
3037     return combineTRUNCATE(N, DCI);
3038   }
3039 
3040   return SDValue();
3041 }
3042 
3043 //===----------------------------------------------------------------------===//
3044 // VE Inline Assembly Support
3045 //===----------------------------------------------------------------------===//
3046 
3047 VETargetLowering::ConstraintType
3048 VETargetLowering::getConstraintType(StringRef Constraint) const {
3049   if (Constraint.size() == 1) {
3050     switch (Constraint[0]) {
3051     default:
3052       break;
3053     case 'v': // vector registers
3054       return C_RegisterClass;
3055     }
3056   }
3057   return TargetLowering::getConstraintType(Constraint);
3058 }
3059 
3060 std::pair<unsigned, const TargetRegisterClass *>
3061 VETargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
3062                                                StringRef Constraint,
3063                                                MVT VT) const {
3064   const TargetRegisterClass *RC = nullptr;
3065   if (Constraint.size() == 1) {
3066     switch (Constraint[0]) {
3067     default:
3068       return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
3069     case 'r':
3070       RC = &VE::I64RegClass;
3071       break;
3072     case 'v':
3073       RC = &VE::V64RegClass;
3074       break;
3075     }
3076     return std::make_pair(0U, RC);
3077   }
3078 
3079   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
3080 }
3081 
3082 //===----------------------------------------------------------------------===//
3083 // VE Target Optimization Support
3084 //===----------------------------------------------------------------------===//
3085 
3086 unsigned VETargetLowering::getMinimumJumpTableEntries() const {
3087   // Specify 8 for PIC model to relieve the impact of PIC load instructions.
3088   if (isJumpTableRelative())
3089     return 8;
3090 
3091   return TargetLowering::getMinimumJumpTableEntries();
3092 }
3093 
3094 bool VETargetLowering::hasAndNot(SDValue Y) const {
3095   EVT VT = Y.getValueType();
3096 
3097   // VE doesn't have vector and not instruction.
3098   if (VT.isVector())
3099     return false;
3100 
3101   // VE allows different immediate values for X and Y where ~X & Y.
3102   // Only simm7 works for X, and only mimm works for Y on VE.  However, this
3103   // function is used to check whether an immediate value is OK for and-not
3104   // instruction as both X and Y.  Generating additional instruction to
3105   // retrieve an immediate value is no good since the purpose of this
3106   // function is to convert a series of 3 instructions to another series of
3107   // 3 instructions with better parallelism.  Therefore, we return false
3108   // for all immediate values now.
3109   // FIXME: Change hasAndNot function to have two operands to make it work
3110   //        correctly with Aurora VE.
3111   if (isa<ConstantSDNode>(Y))
3112     return false;
3113 
3114   // It's ok for generic registers.
3115   return true;
3116 }
3117 
3118 SDValue VETargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
3119                                                   SelectionDAG &DAG) const {
3120   assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
3121   MVT VT = Op.getOperand(0).getSimpleValueType();
3122 
3123   // Special treatment for packed V64 types.
3124   assert(VT == MVT::v512i32 || VT == MVT::v512f32);
3125   (void)VT;
3126   // Example of codes:
3127   //   %packed_v = extractelt %vr, %idx / 2
3128   //   %v = %packed_v >> (%idx % 2 * 32)
3129   //   %res = %v & 0xffffffff
3130 
3131   SDValue Vec = Op.getOperand(0);
3132   SDValue Idx = Op.getOperand(1);
3133   SDLoc DL(Op);
3134   SDValue Result = Op;
3135   if (false /* Idx->isConstant() */) {
3136     // TODO: optimized implementation using constant values
3137   } else {
3138     SDValue Const1 = DAG.getConstant(1, DL, MVT::i64);
3139     SDValue HalfIdx = DAG.getNode(ISD::SRL, DL, MVT::i64, {Idx, Const1});
3140     SDValue PackedElt =
3141         SDValue(DAG.getMachineNode(VE::LVSvr, DL, MVT::i64, {Vec, HalfIdx}), 0);
3142     SDValue AndIdx = DAG.getNode(ISD::AND, DL, MVT::i64, {Idx, Const1});
3143     SDValue Shift = DAG.getNode(ISD::XOR, DL, MVT::i64, {AndIdx, Const1});
3144     SDValue Const5 = DAG.getConstant(5, DL, MVT::i64);
3145     Shift = DAG.getNode(ISD::SHL, DL, MVT::i64, {Shift, Const5});
3146     PackedElt = DAG.getNode(ISD::SRL, DL, MVT::i64, {PackedElt, Shift});
3147     SDValue Mask = DAG.getConstant(0xFFFFFFFFL, DL, MVT::i64);
3148     PackedElt = DAG.getNode(ISD::AND, DL, MVT::i64, {PackedElt, Mask});
3149     SDValue SubI32 = DAG.getTargetConstant(VE::sub_i32, DL, MVT::i32);
3150     Result = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
3151                                         MVT::i32, PackedElt, SubI32),
3152                      0);
3153 
3154     if (Op.getSimpleValueType() == MVT::f32) {
3155       Result = DAG.getBitcast(MVT::f32, Result);
3156     } else {
3157       assert(Op.getSimpleValueType() == MVT::i32);
3158     }
3159   }
3160   return Result;
3161 }
3162 
3163 SDValue VETargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
3164                                                  SelectionDAG &DAG) const {
3165   assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
3166   MVT VT = Op.getOperand(0).getSimpleValueType();
3167 
3168   // Special treatment for packed V64 types.
3169   assert(VT == MVT::v512i32 || VT == MVT::v512f32);
3170   (void)VT;
3171   // The v512i32 and v512f32 starts from upper bits (0..31).  This "upper
3172   // bits" required `val << 32` from C implementation's point of view.
3173   //
3174   // Example of codes:
3175   //   %packed_elt = extractelt %vr, (%idx >> 1)
3176   //   %shift = ((%idx & 1) ^ 1) << 5
3177   //   %packed_elt &= 0xffffffff00000000 >> shift
3178   //   %packed_elt |= (zext %val) << shift
3179   //   %vr = insertelt %vr, %packed_elt, (%idx >> 1)
3180 
3181   SDLoc DL(Op);
3182   SDValue Vec = Op.getOperand(0);
3183   SDValue Val = Op.getOperand(1);
3184   SDValue Idx = Op.getOperand(2);
3185   if (Idx.getSimpleValueType() == MVT::i32)
3186     Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx);
3187   if (Val.getSimpleValueType() == MVT::f32)
3188     Val = DAG.getBitcast(MVT::i32, Val);
3189   assert(Val.getSimpleValueType() == MVT::i32);
3190   Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
3191 
3192   SDValue Result = Op;
3193   if (false /* Idx->isConstant()*/) {
3194     // TODO: optimized implementation using constant values
3195   } else {
3196     SDValue Const1 = DAG.getConstant(1, DL, MVT::i64);
3197     SDValue HalfIdx = DAG.getNode(ISD::SRL, DL, MVT::i64, {Idx, Const1});
3198     SDValue PackedElt =
3199         SDValue(DAG.getMachineNode(VE::LVSvr, DL, MVT::i64, {Vec, HalfIdx}), 0);
3200     SDValue AndIdx = DAG.getNode(ISD::AND, DL, MVT::i64, {Idx, Const1});
3201     SDValue Shift = DAG.getNode(ISD::XOR, DL, MVT::i64, {AndIdx, Const1});
3202     SDValue Const5 = DAG.getConstant(5, DL, MVT::i64);
3203     Shift = DAG.getNode(ISD::SHL, DL, MVT::i64, {Shift, Const5});
3204     SDValue Mask = DAG.getConstant(0xFFFFFFFF00000000L, DL, MVT::i64);
3205     Mask = DAG.getNode(ISD::SRL, DL, MVT::i64, {Mask, Shift});
3206     PackedElt = DAG.getNode(ISD::AND, DL, MVT::i64, {PackedElt, Mask});
3207     Val = DAG.getNode(ISD::SHL, DL, MVT::i64, {Val, Shift});
3208     PackedElt = DAG.getNode(ISD::OR, DL, MVT::i64, {PackedElt, Val});
3209     Result =
3210         SDValue(DAG.getMachineNode(VE::LSVrr_v, DL, Vec.getSimpleValueType(),
3211                                    {HalfIdx, PackedElt, Vec}),
3212                 0);
3213   }
3214   return Result;
3215 }
3216