xref: /freebsd/contrib/llvm-project/llvm/lib/Target/VE/VEISelLowering.cpp (revision a2464ee12761660f50d0b6f59f233949ebcacc87)
1 //===-- VEISelLowering.cpp - VE DAG Lowering Implementation ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the interfaces that VE uses to lower LLVM code into a
10 // selection DAG.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "VEISelLowering.h"
15 #include "MCTargetDesc/VEMCExpr.h"
16 #include "VECustomDAG.h"
17 #include "VEInstrBuilder.h"
18 #include "VEMachineFunctionInfo.h"
19 #include "VERegisterInfo.h"
20 #include "VETargetMachine.h"
21 #include "llvm/ADT/StringSwitch.h"
22 #include "llvm/CodeGen/CallingConvLower.h"
23 #include "llvm/CodeGen/MachineFrameInfo.h"
24 #include "llvm/CodeGen/MachineFunction.h"
25 #include "llvm/CodeGen/MachineInstrBuilder.h"
26 #include "llvm/CodeGen/MachineJumpTableInfo.h"
27 #include "llvm/CodeGen/MachineModuleInfo.h"
28 #include "llvm/CodeGen/MachineRegisterInfo.h"
29 #include "llvm/CodeGen/SelectionDAG.h"
30 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
31 #include "llvm/IR/DerivedTypes.h"
32 #include "llvm/IR/Function.h"
33 #include "llvm/IR/IRBuilder.h"
34 #include "llvm/IR/Module.h"
35 #include "llvm/Support/ErrorHandling.h"
36 #include "llvm/Support/KnownBits.h"
37 using namespace llvm;
38 
39 #define DEBUG_TYPE "ve-lower"
40 
41 //===----------------------------------------------------------------------===//
42 // Calling Convention Implementation
43 //===----------------------------------------------------------------------===//
44 
45 #include "VEGenCallingConv.inc"
46 
47 CCAssignFn *getReturnCC(CallingConv::ID CallConv) {
48   switch (CallConv) {
49   default:
50     return RetCC_VE_C;
51   case CallingConv::Fast:
52     return RetCC_VE_Fast;
53   }
54 }
55 
56 CCAssignFn *getParamCC(CallingConv::ID CallConv, bool IsVarArg) {
57   if (IsVarArg)
58     return CC_VE2;
59   switch (CallConv) {
60   default:
61     return CC_VE_C;
62   case CallingConv::Fast:
63     return CC_VE_Fast;
64   }
65 }
66 
67 bool VETargetLowering::CanLowerReturn(
68     CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
69     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
70   CCAssignFn *RetCC = getReturnCC(CallConv);
71   SmallVector<CCValAssign, 16> RVLocs;
72   CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
73   return CCInfo.CheckReturn(Outs, RetCC);
74 }
75 
76 static const MVT AllVectorVTs[] = {MVT::v256i32, MVT::v512i32, MVT::v256i64,
77                                    MVT::v256f32, MVT::v512f32, MVT::v256f64};
78 
79 static const MVT AllPackedVTs[] = {MVT::v512i32, MVT::v512f32};
80 
81 void VETargetLowering::initRegisterClasses() {
82   // Set up the register classes.
83   addRegisterClass(MVT::i32, &VE::I32RegClass);
84   addRegisterClass(MVT::i64, &VE::I64RegClass);
85   addRegisterClass(MVT::f32, &VE::F32RegClass);
86   addRegisterClass(MVT::f64, &VE::I64RegClass);
87   addRegisterClass(MVT::f128, &VE::F128RegClass);
88 
89   if (Subtarget->enableVPU()) {
90     for (MVT VecVT : AllVectorVTs)
91       addRegisterClass(VecVT, &VE::V64RegClass);
92     addRegisterClass(MVT::v256i1, &VE::VMRegClass);
93     addRegisterClass(MVT::v512i1, &VE::VM512RegClass);
94   }
95 }
96 
97 void VETargetLowering::initSPUActions() {
98   const auto &TM = getTargetMachine();
99   /// Load & Store {
100 
101   // VE doesn't have i1 sign extending load.
102   for (MVT VT : MVT::integer_valuetypes()) {
103     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
104     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
105     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
106     setTruncStoreAction(VT, MVT::i1, Expand);
107   }
108 
109   // VE doesn't have floating point extload/truncstore, so expand them.
110   for (MVT FPVT : MVT::fp_valuetypes()) {
111     for (MVT OtherFPVT : MVT::fp_valuetypes()) {
112       setLoadExtAction(ISD::EXTLOAD, FPVT, OtherFPVT, Expand);
113       setTruncStoreAction(FPVT, OtherFPVT, Expand);
114     }
115   }
116 
117   // VE doesn't have fp128 load/store, so expand them in custom lower.
118   setOperationAction(ISD::LOAD, MVT::f128, Custom);
119   setOperationAction(ISD::STORE, MVT::f128, Custom);
120 
121   /// } Load & Store
122 
123   // Custom legalize address nodes into LO/HI parts.
124   MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
125   setOperationAction(ISD::BlockAddress, PtrVT, Custom);
126   setOperationAction(ISD::GlobalAddress, PtrVT, Custom);
127   setOperationAction(ISD::GlobalTLSAddress, PtrVT, Custom);
128   setOperationAction(ISD::ConstantPool, PtrVT, Custom);
129   setOperationAction(ISD::JumpTable, PtrVT, Custom);
130 
131   /// VAARG handling {
132   setOperationAction(ISD::VASTART, MVT::Other, Custom);
133   // VAARG needs to be lowered to access with 8 bytes alignment.
134   setOperationAction(ISD::VAARG, MVT::Other, Custom);
135   // Use the default implementation.
136   setOperationAction(ISD::VACOPY, MVT::Other, Expand);
137   setOperationAction(ISD::VAEND, MVT::Other, Expand);
138   /// } VAARG handling
139 
140   /// Stack {
141   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
142   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
143 
144   // Use the default implementation.
145   setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
146   setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
147   /// } Stack
148 
149   /// Branch {
150 
151   // VE doesn't have BRCOND
152   setOperationAction(ISD::BRCOND, MVT::Other, Expand);
153 
154   // BR_JT is not implemented yet.
155   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
156 
157   /// } Branch
158 
159   /// Int Ops {
160   for (MVT IntVT : {MVT::i32, MVT::i64}) {
161     // VE has no REM or DIVREM operations.
162     setOperationAction(ISD::UREM, IntVT, Expand);
163     setOperationAction(ISD::SREM, IntVT, Expand);
164     setOperationAction(ISD::SDIVREM, IntVT, Expand);
165     setOperationAction(ISD::UDIVREM, IntVT, Expand);
166 
167     // VE has no SHL_PARTS/SRA_PARTS/SRL_PARTS operations.
168     setOperationAction(ISD::SHL_PARTS, IntVT, Expand);
169     setOperationAction(ISD::SRA_PARTS, IntVT, Expand);
170     setOperationAction(ISD::SRL_PARTS, IntVT, Expand);
171 
172     // VE has no MULHU/S or U/SMUL_LOHI operations.
173     // TODO: Use MPD instruction to implement SMUL_LOHI for i32 type.
174     setOperationAction(ISD::MULHU, IntVT, Expand);
175     setOperationAction(ISD::MULHS, IntVT, Expand);
176     setOperationAction(ISD::UMUL_LOHI, IntVT, Expand);
177     setOperationAction(ISD::SMUL_LOHI, IntVT, Expand);
178 
179     // VE has no CTTZ, ROTL, ROTR operations.
180     setOperationAction(ISD::CTTZ, IntVT, Expand);
181     setOperationAction(ISD::ROTL, IntVT, Expand);
182     setOperationAction(ISD::ROTR, IntVT, Expand);
183 
184     // VE has 64 bits instruction which works as i64 BSWAP operation.  This
185     // instruction works fine as i32 BSWAP operation with an additional
186     // parameter.  Use isel patterns to lower BSWAP.
187     setOperationAction(ISD::BSWAP, IntVT, Legal);
188 
189     // VE has only 64 bits instructions which work as i64 BITREVERSE/CTLZ/CTPOP
190     // operations.  Use isel patterns for i64, promote for i32.
191     LegalizeAction Act = (IntVT == MVT::i32) ? Promote : Legal;
192     setOperationAction(ISD::BITREVERSE, IntVT, Act);
193     setOperationAction(ISD::CTLZ, IntVT, Act);
194     setOperationAction(ISD::CTLZ_ZERO_UNDEF, IntVT, Act);
195     setOperationAction(ISD::CTPOP, IntVT, Act);
196 
197     // VE has only 64 bits instructions which work as i64 AND/OR/XOR operations.
198     // Use isel patterns for i64, promote for i32.
199     setOperationAction(ISD::AND, IntVT, Act);
200     setOperationAction(ISD::OR, IntVT, Act);
201     setOperationAction(ISD::XOR, IntVT, Act);
202   }
203   /// } Int Ops
204 
205   /// Conversion {
206   // VE doesn't have instructions for fp<->uint, so expand them by llvm
207   setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote); // use i64
208   setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote); // use i64
209   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
210   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
211 
212   // fp16 not supported
213   for (MVT FPVT : MVT::fp_valuetypes()) {
214     setOperationAction(ISD::FP16_TO_FP, FPVT, Expand);
215     setOperationAction(ISD::FP_TO_FP16, FPVT, Expand);
216   }
217   /// } Conversion
218 
219   /// Floating-point Ops {
220   /// Note: Floating-point operations are fneg, fadd, fsub, fmul, fdiv, frem,
221   ///       and fcmp.
222 
223   // VE doesn't have following floating point operations.
224   for (MVT VT : MVT::fp_valuetypes()) {
225     setOperationAction(ISD::FNEG, VT, Expand);
226     setOperationAction(ISD::FREM, VT, Expand);
227   }
228 
229   // VE doesn't have fdiv of f128.
230   setOperationAction(ISD::FDIV, MVT::f128, Expand);
231 
232   for (MVT FPVT : {MVT::f32, MVT::f64}) {
233     // f32 and f64 uses ConstantFP.  f128 uses ConstantPool.
234     setOperationAction(ISD::ConstantFP, FPVT, Legal);
235   }
236   /// } Floating-point Ops
237 
238   /// Floating-point math functions {
239 
240   // VE doesn't have following floating point math functions.
241   for (MVT VT : MVT::fp_valuetypes()) {
242     setOperationAction(ISD::FABS, VT, Expand);
243     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
244     setOperationAction(ISD::FCOS, VT, Expand);
245     setOperationAction(ISD::FSIN, VT, Expand);
246     setOperationAction(ISD::FSQRT, VT, Expand);
247   }
248 
249   /// } Floating-point math functions
250 
251   /// Atomic instructions {
252 
253   setMaxAtomicSizeInBitsSupported(64);
254   setMinCmpXchgSizeInBits(32);
255   setSupportsUnalignedAtomics(false);
256 
257   // Use custom inserter for ATOMIC_FENCE.
258   setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
259 
260   // Other atomic instructions.
261   for (MVT VT : MVT::integer_valuetypes()) {
262     // Support i8/i16 atomic swap.
263     setOperationAction(ISD::ATOMIC_SWAP, VT, Custom);
264 
265     // FIXME: Support "atmam" instructions.
266     setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Expand);
267     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Expand);
268     setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Expand);
269     setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Expand);
270 
271     // VE doesn't have follwing instructions.
272     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Expand);
273     setOperationAction(ISD::ATOMIC_LOAD_CLR, VT, Expand);
274     setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Expand);
275     setOperationAction(ISD::ATOMIC_LOAD_NAND, VT, Expand);
276     setOperationAction(ISD::ATOMIC_LOAD_MIN, VT, Expand);
277     setOperationAction(ISD::ATOMIC_LOAD_MAX, VT, Expand);
278     setOperationAction(ISD::ATOMIC_LOAD_UMIN, VT, Expand);
279     setOperationAction(ISD::ATOMIC_LOAD_UMAX, VT, Expand);
280   }
281 
282   /// } Atomic instructions
283 
284   /// SJLJ instructions {
285   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
286   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
287   setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
288   if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
289     setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
290   /// } SJLJ instructions
291 
292   // Intrinsic instructions
293   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
294 }
295 
296 void VETargetLowering::initVPUActions() {
297   for (MVT LegalVecVT : AllVectorVTs) {
298     setOperationAction(ISD::BUILD_VECTOR, LegalVecVT, Custom);
299     setOperationAction(ISD::INSERT_VECTOR_ELT, LegalVecVT, Legal);
300     setOperationAction(ISD::EXTRACT_VECTOR_ELT, LegalVecVT, Legal);
301     // Translate all vector instructions with legal element types to VVP_*
302     // nodes.
303     // TODO We will custom-widen into VVP_* nodes in the future. While we are
304     // buildling the infrastructure for this, we only do this for legal vector
305     // VTs.
306 #define HANDLE_VP_TO_VVP(VP_OPC, VVP_NAME)                                     \
307   setOperationAction(ISD::VP_OPC, LegalVecVT, Custom);
308 #define ADD_VVP_OP(VVP_NAME, ISD_NAME)                                         \
309   setOperationAction(ISD::ISD_NAME, LegalVecVT, Custom);
310 #include "VVPNodes.def"
311   }
312 
313   for (MVT LegalPackedVT : AllPackedVTs) {
314     setOperationAction(ISD::INSERT_VECTOR_ELT, LegalPackedVT, Custom);
315     setOperationAction(ISD::EXTRACT_VECTOR_ELT, LegalPackedVT, Custom);
316   }
317 }
318 
319 SDValue
320 VETargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
321                               bool IsVarArg,
322                               const SmallVectorImpl<ISD::OutputArg> &Outs,
323                               const SmallVectorImpl<SDValue> &OutVals,
324                               const SDLoc &DL, SelectionDAG &DAG) const {
325   // CCValAssign - represent the assignment of the return value to locations.
326   SmallVector<CCValAssign, 16> RVLocs;
327 
328   // CCState - Info about the registers and stack slot.
329   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
330                  *DAG.getContext());
331 
332   // Analyze return values.
333   CCInfo.AnalyzeReturn(Outs, getReturnCC(CallConv));
334 
335   SDValue Flag;
336   SmallVector<SDValue, 4> RetOps(1, Chain);
337 
338   // Copy the result values into the output registers.
339   for (unsigned i = 0; i != RVLocs.size(); ++i) {
340     CCValAssign &VA = RVLocs[i];
341     assert(VA.isRegLoc() && "Can only return in registers!");
342     assert(!VA.needsCustom() && "Unexpected custom lowering");
343     SDValue OutVal = OutVals[i];
344 
345     // Integer return values must be sign or zero extended by the callee.
346     switch (VA.getLocInfo()) {
347     case CCValAssign::Full:
348       break;
349     case CCValAssign::SExt:
350       OutVal = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), OutVal);
351       break;
352     case CCValAssign::ZExt:
353       OutVal = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), OutVal);
354       break;
355     case CCValAssign::AExt:
356       OutVal = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), OutVal);
357       break;
358     case CCValAssign::BCvt: {
359       // Convert a float return value to i64 with padding.
360       //     63     31   0
361       //    +------+------+
362       //    | float|   0  |
363       //    +------+------+
364       assert(VA.getLocVT() == MVT::i64);
365       assert(VA.getValVT() == MVT::f32);
366       SDValue Undef = SDValue(
367           DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i64), 0);
368       SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
369       OutVal = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
370                                           MVT::i64, Undef, OutVal, Sub_f32),
371                        0);
372       break;
373     }
374     default:
375       llvm_unreachable("Unknown loc info!");
376     }
377 
378     Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), OutVal, Flag);
379 
380     // Guarantee that all emitted copies are stuck together with flags.
381     Flag = Chain.getValue(1);
382     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
383   }
384 
385   RetOps[0] = Chain; // Update chain.
386 
387   // Add the flag if we have it.
388   if (Flag.getNode())
389     RetOps.push_back(Flag);
390 
391   return DAG.getNode(VEISD::RET_FLAG, DL, MVT::Other, RetOps);
392 }
393 
394 SDValue VETargetLowering::LowerFormalArguments(
395     SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
396     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
397     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
398   MachineFunction &MF = DAG.getMachineFunction();
399 
400   // Get the base offset of the incoming arguments stack space.
401   unsigned ArgsBaseOffset = Subtarget->getRsaSize();
402   // Get the size of the preserved arguments area
403   unsigned ArgsPreserved = 64;
404 
405   // Analyze arguments according to CC_VE.
406   SmallVector<CCValAssign, 16> ArgLocs;
407   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
408                  *DAG.getContext());
409   // Allocate the preserved area first.
410   CCInfo.AllocateStack(ArgsPreserved, Align(8));
411   // We already allocated the preserved area, so the stack offset computed
412   // by CC_VE would be correct now.
413   CCInfo.AnalyzeFormalArguments(Ins, getParamCC(CallConv, false));
414 
415   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
416     CCValAssign &VA = ArgLocs[i];
417     assert(!VA.needsCustom() && "Unexpected custom lowering");
418     if (VA.isRegLoc()) {
419       // This argument is passed in a register.
420       // All integer register arguments are promoted by the caller to i64.
421 
422       // Create a virtual register for the promoted live-in value.
423       Register VReg =
424           MF.addLiveIn(VA.getLocReg(), getRegClassFor(VA.getLocVT()));
425       SDValue Arg = DAG.getCopyFromReg(Chain, DL, VReg, VA.getLocVT());
426 
427       // The caller promoted the argument, so insert an Assert?ext SDNode so we
428       // won't promote the value again in this function.
429       switch (VA.getLocInfo()) {
430       case CCValAssign::SExt:
431         Arg = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Arg,
432                           DAG.getValueType(VA.getValVT()));
433         break;
434       case CCValAssign::ZExt:
435         Arg = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Arg,
436                           DAG.getValueType(VA.getValVT()));
437         break;
438       case CCValAssign::BCvt: {
439         // Extract a float argument from i64 with padding.
440         //     63     31   0
441         //    +------+------+
442         //    | float|   0  |
443         //    +------+------+
444         assert(VA.getLocVT() == MVT::i64);
445         assert(VA.getValVT() == MVT::f32);
446         SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
447         Arg = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
448                                          MVT::f32, Arg, Sub_f32),
449                       0);
450         break;
451       }
452       default:
453         break;
454       }
455 
456       // Truncate the register down to the argument type.
457       if (VA.isExtInLoc())
458         Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
459 
460       InVals.push_back(Arg);
461       continue;
462     }
463 
464     // The registers are exhausted. This argument was passed on the stack.
465     assert(VA.isMemLoc());
466     // The CC_VE_Full/Half functions compute stack offsets relative to the
467     // beginning of the arguments area at %fp + the size of reserved area.
468     unsigned Offset = VA.getLocMemOffset() + ArgsBaseOffset;
469     unsigned ValSize = VA.getValVT().getSizeInBits() / 8;
470 
471     // Adjust offset for a float argument by adding 4 since the argument is
472     // stored in 8 bytes buffer with offset like below.  LLVM generates
473     // 4 bytes load instruction, so need to adjust offset here.  This
474     // adjustment is required in only LowerFormalArguments.  In LowerCall,
475     // a float argument is converted to i64 first, and stored as 8 bytes
476     // data, which is required by ABI, so no need for adjustment.
477     //    0      4
478     //    +------+------+
479     //    | empty| float|
480     //    +------+------+
481     if (VA.getValVT() == MVT::f32)
482       Offset += 4;
483 
484     int FI = MF.getFrameInfo().CreateFixedObject(ValSize, Offset, true);
485     InVals.push_back(
486         DAG.getLoad(VA.getValVT(), DL, Chain,
487                     DAG.getFrameIndex(FI, getPointerTy(MF.getDataLayout())),
488                     MachinePointerInfo::getFixedStack(MF, FI)));
489   }
490 
491   if (!IsVarArg)
492     return Chain;
493 
494   // This function takes variable arguments, some of which may have been passed
495   // in registers %s0-%s8.
496   //
497   // The va_start intrinsic needs to know the offset to the first variable
498   // argument.
499   // TODO: need to calculate offset correctly once we support f128.
500   unsigned ArgOffset = ArgLocs.size() * 8;
501   VEMachineFunctionInfo *FuncInfo = MF.getInfo<VEMachineFunctionInfo>();
502   // Skip the reserved area at the top of stack.
503   FuncInfo->setVarArgsFrameOffset(ArgOffset + ArgsBaseOffset);
504 
505   return Chain;
506 }
507 
508 // FIXME? Maybe this could be a TableGen attribute on some registers and
509 // this table could be generated automatically from RegInfo.
510 Register VETargetLowering::getRegisterByName(const char *RegName, LLT VT,
511                                              const MachineFunction &MF) const {
512   Register Reg = StringSwitch<Register>(RegName)
513                      .Case("sp", VE::SX11)    // Stack pointer
514                      .Case("fp", VE::SX9)     // Frame pointer
515                      .Case("sl", VE::SX8)     // Stack limit
516                      .Case("lr", VE::SX10)    // Link register
517                      .Case("tp", VE::SX14)    // Thread pointer
518                      .Case("outer", VE::SX12) // Outer regiser
519                      .Case("info", VE::SX17)  // Info area register
520                      .Case("got", VE::SX15)   // Global offset table register
521                      .Case("plt", VE::SX16) // Procedure linkage table register
522                      .Default(0);
523 
524   if (Reg)
525     return Reg;
526 
527   report_fatal_error("Invalid register name global variable");
528 }
529 
530 //===----------------------------------------------------------------------===//
531 // TargetLowering Implementation
532 //===----------------------------------------------------------------------===//
533 
534 SDValue VETargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
535                                     SmallVectorImpl<SDValue> &InVals) const {
536   SelectionDAG &DAG = CLI.DAG;
537   SDLoc DL = CLI.DL;
538   SDValue Chain = CLI.Chain;
539   auto PtrVT = getPointerTy(DAG.getDataLayout());
540 
541   // VE target does not yet support tail call optimization.
542   CLI.IsTailCall = false;
543 
544   // Get the base offset of the outgoing arguments stack space.
545   unsigned ArgsBaseOffset = Subtarget->getRsaSize();
546   // Get the size of the preserved arguments area
547   unsigned ArgsPreserved = 8 * 8u;
548 
549   // Analyze operands of the call, assigning locations to each operand.
550   SmallVector<CCValAssign, 16> ArgLocs;
551   CCState CCInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(), ArgLocs,
552                  *DAG.getContext());
553   // Allocate the preserved area first.
554   CCInfo.AllocateStack(ArgsPreserved, Align(8));
555   // We already allocated the preserved area, so the stack offset computed
556   // by CC_VE would be correct now.
557   CCInfo.AnalyzeCallOperands(CLI.Outs, getParamCC(CLI.CallConv, false));
558 
559   // VE requires to use both register and stack for varargs or no-prototyped
560   // functions.
561   bool UseBoth = CLI.IsVarArg;
562 
563   // Analyze operands again if it is required to store BOTH.
564   SmallVector<CCValAssign, 16> ArgLocs2;
565   CCState CCInfo2(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(),
566                   ArgLocs2, *DAG.getContext());
567   if (UseBoth)
568     CCInfo2.AnalyzeCallOperands(CLI.Outs, getParamCC(CLI.CallConv, true));
569 
570   // Get the size of the outgoing arguments stack space requirement.
571   unsigned ArgsSize = CCInfo.getNextStackOffset();
572 
573   // Keep stack frames 16-byte aligned.
574   ArgsSize = alignTo(ArgsSize, 16);
575 
576   // Adjust the stack pointer to make room for the arguments.
577   // FIXME: Use hasReservedCallFrame to avoid %sp adjustments around all calls
578   // with more than 6 arguments.
579   Chain = DAG.getCALLSEQ_START(Chain, ArgsSize, 0, DL);
580 
581   // Collect the set of registers to pass to the function and their values.
582   // This will be emitted as a sequence of CopyToReg nodes glued to the call
583   // instruction.
584   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
585 
  // Collect chains from all the memory operations that copy arguments to the
587   // stack. They must follow the stack pointer adjustment above and precede the
588   // call instruction itself.
589   SmallVector<SDValue, 8> MemOpChains;
590 
591   // VE needs to get address of callee function in a register
592   // So, prepare to copy it to SX12 here.
593 
594   // If the callee is a GlobalAddress node (quite common, every direct call is)
595   // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
596   // Likewise ExternalSymbol -> TargetExternalSymbol.
597   SDValue Callee = CLI.Callee;
598 
599   bool IsPICCall = isPositionIndependent();
600 
601   // PC-relative references to external symbols should go through $stub.
602   // If so, we need to prepare GlobalBaseReg first.
603   const TargetMachine &TM = DAG.getTarget();
604   const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
605   const GlobalValue *GV = nullptr;
606   auto *CalleeG = dyn_cast<GlobalAddressSDNode>(Callee);
607   if (CalleeG)
608     GV = CalleeG->getGlobal();
609   bool Local = TM.shouldAssumeDSOLocal(*Mod, GV);
610   bool UsePlt = !Local;
611   MachineFunction &MF = DAG.getMachineFunction();
612 
613   // Turn GlobalAddress/ExternalSymbol node into a value node
614   // containing the address of them here.
615   if (CalleeG) {
616     if (IsPICCall) {
617       if (UsePlt)
618         Subtarget->getInstrInfo()->getGlobalBaseReg(&MF);
619       Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
620       Callee = DAG.getNode(VEISD::GETFUNPLT, DL, PtrVT, Callee);
621     } else {
622       Callee =
623           makeHiLoPair(Callee, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
624     }
625   } else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee)) {
626     if (IsPICCall) {
627       if (UsePlt)
628         Subtarget->getInstrInfo()->getGlobalBaseReg(&MF);
629       Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT, 0);
630       Callee = DAG.getNode(VEISD::GETFUNPLT, DL, PtrVT, Callee);
631     } else {
632       Callee =
633           makeHiLoPair(Callee, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
634     }
635   }
636 
637   RegsToPass.push_back(std::make_pair(VE::SX12, Callee));
638 
639   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
640     CCValAssign &VA = ArgLocs[i];
641     SDValue Arg = CLI.OutVals[i];
642 
643     // Promote the value if needed.
644     switch (VA.getLocInfo()) {
645     default:
646       llvm_unreachable("Unknown location info!");
647     case CCValAssign::Full:
648       break;
649     case CCValAssign::SExt:
650       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
651       break;
652     case CCValAssign::ZExt:
653       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
654       break;
655     case CCValAssign::AExt:
656       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
657       break;
658     case CCValAssign::BCvt: {
659       // Convert a float argument to i64 with padding.
660       //     63     31   0
661       //    +------+------+
662       //    | float|   0  |
663       //    +------+------+
664       assert(VA.getLocVT() == MVT::i64);
665       assert(VA.getValVT() == MVT::f32);
666       SDValue Undef = SDValue(
667           DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i64), 0);
668       SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
669       Arg = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
670                                        MVT::i64, Undef, Arg, Sub_f32),
671                     0);
672       break;
673     }
674     }
675 
676     if (VA.isRegLoc()) {
677       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
678       if (!UseBoth)
679         continue;
680       VA = ArgLocs2[i];
681     }
682 
683     assert(VA.isMemLoc());
684 
685     // Create a store off the stack pointer for this argument.
686     SDValue StackPtr = DAG.getRegister(VE::SX11, PtrVT);
687     // The argument area starts at %fp/%sp + the size of reserved area.
688     SDValue PtrOff =
689         DAG.getIntPtrConstant(VA.getLocMemOffset() + ArgsBaseOffset, DL);
690     PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
691     MemOpChains.push_back(
692         DAG.getStore(Chain, DL, Arg, PtrOff, MachinePointerInfo()));
693   }
694 
695   // Emit all stores, make sure they occur before the call.
696   if (!MemOpChains.empty())
697     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
698 
699   // Build a sequence of CopyToReg nodes glued together with token chain and
700   // glue operands which copy the outgoing args into registers. The InGlue is
701   // necessary since all emitted instructions must be stuck together in order
702   // to pass the live physical registers.
703   SDValue InGlue;
704   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
705     Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first,
706                              RegsToPass[i].second, InGlue);
707     InGlue = Chain.getValue(1);
708   }
709 
710   // Build the operands for the call instruction itself.
711   SmallVector<SDValue, 8> Ops;
712   Ops.push_back(Chain);
713   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
714     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
715                                   RegsToPass[i].second.getValueType()));
716 
717   // Add a register mask operand representing the call-preserved registers.
718   const VERegisterInfo *TRI = Subtarget->getRegisterInfo();
719   const uint32_t *Mask =
720       TRI->getCallPreservedMask(DAG.getMachineFunction(), CLI.CallConv);
721   assert(Mask && "Missing call preserved mask for calling convention");
722   Ops.push_back(DAG.getRegisterMask(Mask));
723 
724   // Make sure the CopyToReg nodes are glued to the call instruction which
725   // consumes the registers.
726   if (InGlue.getNode())
727     Ops.push_back(InGlue);
728 
729   // Now the call itself.
730   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
731   Chain = DAG.getNode(VEISD::CALL, DL, NodeTys, Ops);
732   InGlue = Chain.getValue(1);
733 
734   // Revert the stack pointer immediately after the call.
735   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(ArgsSize, DL, true),
736                              DAG.getIntPtrConstant(0, DL, true), InGlue, DL);
737   InGlue = Chain.getValue(1);
738 
739   // Now extract the return values. This is more or less the same as
740   // LowerFormalArguments.
741 
742   // Assign locations to each value returned by this call.
743   SmallVector<CCValAssign, 16> RVLocs;
744   CCState RVInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(), RVLocs,
745                  *DAG.getContext());
746 
747   // Set inreg flag manually for codegen generated library calls that
748   // return float.
749   if (CLI.Ins.size() == 1 && CLI.Ins[0].VT == MVT::f32 && !CLI.CB)
750     CLI.Ins[0].Flags.setInReg();
751 
752   RVInfo.AnalyzeCallResult(CLI.Ins, getReturnCC(CLI.CallConv));
753 
754   // Copy all of the result registers out of their specified physreg.
755   for (unsigned i = 0; i != RVLocs.size(); ++i) {
756     CCValAssign &VA = RVLocs[i];
757     assert(!VA.needsCustom() && "Unexpected custom lowering");
758     Register Reg = VA.getLocReg();
759 
760     // When returning 'inreg {i32, i32 }', two consecutive i32 arguments can
761     // reside in the same register in the high and low bits. Reuse the
762     // CopyFromReg previous node to avoid duplicate copies.
763     SDValue RV;
764     if (RegisterSDNode *SrcReg = dyn_cast<RegisterSDNode>(Chain.getOperand(1)))
765       if (SrcReg->getReg() == Reg && Chain->getOpcode() == ISD::CopyFromReg)
766         RV = Chain.getValue(0);
767 
768     // But usually we'll create a new CopyFromReg for a different register.
769     if (!RV.getNode()) {
770       RV = DAG.getCopyFromReg(Chain, DL, Reg, RVLocs[i].getLocVT(), InGlue);
771       Chain = RV.getValue(1);
772       InGlue = Chain.getValue(2);
773     }
774 
775     // The callee promoted the return value, so insert an Assert?ext SDNode so
776     // we won't promote the value again in this function.
777     switch (VA.getLocInfo()) {
778     case CCValAssign::SExt:
779       RV = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), RV,
780                        DAG.getValueType(VA.getValVT()));
781       break;
782     case CCValAssign::ZExt:
783       RV = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), RV,
784                        DAG.getValueType(VA.getValVT()));
785       break;
786     case CCValAssign::BCvt: {
787       // Extract a float return value from i64 with padding.
788       //     63     31   0
789       //    +------+------+
790       //    | float|   0  |
791       //    +------+------+
792       assert(VA.getLocVT() == MVT::i64);
793       assert(VA.getValVT() == MVT::f32);
794       SDValue Sub_f32 = DAG.getTargetConstant(VE::sub_f32, DL, MVT::i32);
795       RV = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
796                                       MVT::f32, RV, Sub_f32),
797                    0);
798       break;
799     }
800     default:
801       break;
802     }
803 
804     // Truncate the register down to the return value type.
805     if (VA.isExtInLoc())
806       RV = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), RV);
807 
808     InVals.push_back(RV);
809   }
810 
811   return Chain;
812 }
813 
814 bool VETargetLowering::isOffsetFoldingLegal(
815     const GlobalAddressSDNode *GA) const {
816   // VE uses 64 bit addressing, so we need multiple instructions to generate
817   // an address.  Folding address with offset increases the number of
818   // instructions, so that we disable it here.  Offsets will be folded in
819   // the DAG combine later if it worth to do so.
820   return false;
821 }
822 
823 /// isFPImmLegal - Returns true if the target can instruction select the
824 /// specified FP immediate natively. If false, the legalizer will
825 /// materialize the FP immediate as a load from a constant pool.
826 bool VETargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
827                                     bool ForCodeSize) const {
828   return VT == MVT::f32 || VT == MVT::f64;
829 }
830 
831 /// Determine if the target supports unaligned memory accesses.
832 ///
833 /// This function returns true if the target allows unaligned memory accesses
834 /// of the specified type in the given address space. If true, it also returns
835 /// whether the unaligned memory access is "fast" in the last argument by
836 /// reference. This is used, for example, in situations where an array
837 /// copy/move/set is converted to a sequence of store operations. Its use
838 /// helps to ensure that such replacements don't generate code that causes an
839 /// alignment error (trap) on the target machine.
840 bool VETargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
841                                                       unsigned AddrSpace,
842                                                       Align A,
843                                                       MachineMemOperand::Flags,
844                                                       bool *Fast) const {
845   if (Fast) {
846     // It's fast anytime on VE
847     *Fast = true;
848   }
849   return true;
850 }
851 
852 VETargetLowering::VETargetLowering(const TargetMachine &TM,
853                                    const VESubtarget &STI)
854     : TargetLowering(TM), Subtarget(&STI) {
855   // Instructions which use registers as conditionals examine all the
856   // bits (as does the pseudo SELECT_CC expansion). I don't think it
857   // matters much whether it's ZeroOrOneBooleanContent, or
858   // ZeroOrNegativeOneBooleanContent, so, arbitrarily choose the
859   // former.
860   setBooleanContents(ZeroOrOneBooleanContent);
861   setBooleanVectorContents(ZeroOrOneBooleanContent);
862 
863   initRegisterClasses();
864   initSPUActions();
865   initVPUActions();
866 
867   setStackPointerRegisterToSaveRestore(VE::SX11);
868 
869   // We have target-specific dag combine patterns for the following nodes:
870   setTargetDAGCombine(ISD::TRUNCATE);
871 
872   // Set function alignment to 16 bytes
873   setMinFunctionAlignment(Align(16));
874 
875   // VE stores all argument by 8 bytes alignment
876   setMinStackArgumentAlignment(Align(8));
877 
878   computeRegisterProperties(Subtarget->getRegisterInfo());
879 }
880 
881 const char *VETargetLowering::getTargetNodeName(unsigned Opcode) const {
882 #define TARGET_NODE_CASE(NAME)                                                 \
883   case VEISD::NAME:                                                            \
884     return "VEISD::" #NAME;
885   switch ((VEISD::NodeType)Opcode) {
886   case VEISD::FIRST_NUMBER:
887     break;
888     TARGET_NODE_CASE(CALL)
889     TARGET_NODE_CASE(EH_SJLJ_LONGJMP)
890     TARGET_NODE_CASE(EH_SJLJ_SETJMP)
891     TARGET_NODE_CASE(EH_SJLJ_SETUP_DISPATCH)
892     TARGET_NODE_CASE(GETFUNPLT)
893     TARGET_NODE_CASE(GETSTACKTOP)
894     TARGET_NODE_CASE(GETTLSADDR)
895     TARGET_NODE_CASE(GLOBAL_BASE_REG)
896     TARGET_NODE_CASE(Hi)
897     TARGET_NODE_CASE(Lo)
898     TARGET_NODE_CASE(MEMBARRIER)
899     TARGET_NODE_CASE(RET_FLAG)
900     TARGET_NODE_CASE(TS1AM)
901     TARGET_NODE_CASE(VEC_BROADCAST)
902     TARGET_NODE_CASE(REPL_I32)
903     TARGET_NODE_CASE(REPL_F32)
904 
905     // Register the VVP_* SDNodes.
906 #define ADD_VVP_OP(VVP_NAME, ...) TARGET_NODE_CASE(VVP_NAME)
907 #include "VVPNodes.def"
908   }
909 #undef TARGET_NODE_CASE
910   return nullptr;
911 }
912 
913 EVT VETargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
914                                          EVT VT) const {
915   return MVT::i32;
916 }
917 
918 // Convert to a target node and set target flags.
919 SDValue VETargetLowering::withTargetFlags(SDValue Op, unsigned TF,
920                                           SelectionDAG &DAG) const {
921   if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op))
922     return DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(GA),
923                                       GA->getValueType(0), GA->getOffset(), TF);
924 
925   if (const BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(Op))
926     return DAG.getTargetBlockAddress(BA->getBlockAddress(), Op.getValueType(),
927                                      0, TF);
928 
929   if (const ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op))
930     return DAG.getTargetConstantPool(CP->getConstVal(), CP->getValueType(0),
931                                      CP->getAlign(), CP->getOffset(), TF);
932 
933   if (const ExternalSymbolSDNode *ES = dyn_cast<ExternalSymbolSDNode>(Op))
934     return DAG.getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0),
935                                        TF);
936 
937   if (const JumpTableSDNode *JT = dyn_cast<JumpTableSDNode>(Op))
938     return DAG.getTargetJumpTable(JT->getIndex(), JT->getValueType(0), TF);
939 
940   llvm_unreachable("Unhandled address SDNode");
941 }
942 
943 // Split Op into high and low parts according to HiTF and LoTF.
944 // Return an ADD node combining the parts.
945 SDValue VETargetLowering::makeHiLoPair(SDValue Op, unsigned HiTF, unsigned LoTF,
946                                        SelectionDAG &DAG) const {
947   SDLoc DL(Op);
948   EVT VT = Op.getValueType();
949   SDValue Hi = DAG.getNode(VEISD::Hi, DL, VT, withTargetFlags(Op, HiTF, DAG));
950   SDValue Lo = DAG.getNode(VEISD::Lo, DL, VT, withTargetFlags(Op, LoTF, DAG));
951   return DAG.getNode(ISD::ADD, DL, VT, Hi, Lo);
952 }
953 
954 // Build SDNodes for producing an address from a GlobalAddress, ConstantPool,
955 // or ExternalSymbol SDNode.
956 SDValue VETargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
957   SDLoc DL(Op);
958   EVT PtrVT = Op.getValueType();
959 
960   // Handle PIC mode first. VE needs a got load for every variable!
961   if (isPositionIndependent()) {
962     auto GlobalN = dyn_cast<GlobalAddressSDNode>(Op);
963 
964     if (isa<ConstantPoolSDNode>(Op) || isa<JumpTableSDNode>(Op) ||
965         (GlobalN && GlobalN->getGlobal()->hasLocalLinkage())) {
966       // Create following instructions for local linkage PIC code.
967       //     lea %reg, label@gotoff_lo
968       //     and %reg, %reg, (32)0
969       //     lea.sl %reg, label@gotoff_hi(%reg, %got)
970       SDValue HiLo = makeHiLoPair(Op, VEMCExpr::VK_VE_GOTOFF_HI32,
971                                   VEMCExpr::VK_VE_GOTOFF_LO32, DAG);
972       SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrVT);
973       return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalBase, HiLo);
974     }
975     // Create following instructions for not local linkage PIC code.
976     //     lea %reg, label@got_lo
977     //     and %reg, %reg, (32)0
978     //     lea.sl %reg, label@got_hi(%reg)
979     //     ld %reg, (%reg, %got)
980     SDValue HiLo = makeHiLoPair(Op, VEMCExpr::VK_VE_GOT_HI32,
981                                 VEMCExpr::VK_VE_GOT_LO32, DAG);
982     SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrVT);
983     SDValue AbsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, GlobalBase, HiLo);
984     return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), AbsAddr,
985                        MachinePointerInfo::getGOT(DAG.getMachineFunction()));
986   }
987 
988   // This is one of the absolute code models.
989   switch (getTargetMachine().getCodeModel()) {
990   default:
991     llvm_unreachable("Unsupported absolute code model");
992   case CodeModel::Small:
993   case CodeModel::Medium:
994   case CodeModel::Large:
995     // abs64.
996     return makeHiLoPair(Op, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
997   }
998 }
999 
1000 /// Custom Lower {
1001 
1002 // The mappings for emitLeading/TrailingFence for VE is designed by following
1003 // http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
1004 Instruction *VETargetLowering::emitLeadingFence(IRBuilderBase &Builder,
1005                                                 Instruction *Inst,
1006                                                 AtomicOrdering Ord) const {
1007   switch (Ord) {
1008   case AtomicOrdering::NotAtomic:
1009   case AtomicOrdering::Unordered:
1010     llvm_unreachable("Invalid fence: unordered/non-atomic");
1011   case AtomicOrdering::Monotonic:
1012   case AtomicOrdering::Acquire:
1013     return nullptr; // Nothing to do
1014   case AtomicOrdering::Release:
1015   case AtomicOrdering::AcquireRelease:
1016     return Builder.CreateFence(AtomicOrdering::Release);
1017   case AtomicOrdering::SequentiallyConsistent:
1018     if (!Inst->hasAtomicStore())
1019       return nullptr; // Nothing to do
1020     return Builder.CreateFence(AtomicOrdering::SequentiallyConsistent);
1021   }
1022   llvm_unreachable("Unknown fence ordering in emitLeadingFence");
1023 }
1024 
1025 Instruction *VETargetLowering::emitTrailingFence(IRBuilderBase &Builder,
1026                                                  Instruction *Inst,
1027                                                  AtomicOrdering Ord) const {
1028   switch (Ord) {
1029   case AtomicOrdering::NotAtomic:
1030   case AtomicOrdering::Unordered:
1031     llvm_unreachable("Invalid fence: unordered/not-atomic");
1032   case AtomicOrdering::Monotonic:
1033   case AtomicOrdering::Release:
1034     return nullptr; // Nothing to do
1035   case AtomicOrdering::Acquire:
1036   case AtomicOrdering::AcquireRelease:
1037     return Builder.CreateFence(AtomicOrdering::Acquire);
1038   case AtomicOrdering::SequentiallyConsistent:
1039     return Builder.CreateFence(AtomicOrdering::SequentiallyConsistent);
1040   }
1041   llvm_unreachable("Unknown fence ordering in emitTrailingFence");
1042 }
1043 
1044 SDValue VETargetLowering::lowerATOMIC_FENCE(SDValue Op,
1045                                             SelectionDAG &DAG) const {
1046   SDLoc DL(Op);
1047   AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
1048       cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
1049   SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
1050       cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
1051 
1052   // VE uses Release consistency, so need a fence instruction if it is a
1053   // cross-thread fence.
1054   if (FenceSSID == SyncScope::System) {
1055     switch (FenceOrdering) {
1056     case AtomicOrdering::NotAtomic:
1057     case AtomicOrdering::Unordered:
1058     case AtomicOrdering::Monotonic:
1059       // No need to generate fencem instruction here.
1060       break;
1061     case AtomicOrdering::Acquire:
1062       // Generate "fencem 2" as acquire fence.
1063       return SDValue(DAG.getMachineNode(VE::FENCEM, DL, MVT::Other,
1064                                         DAG.getTargetConstant(2, DL, MVT::i32),
1065                                         Op.getOperand(0)),
1066                      0);
1067     case AtomicOrdering::Release:
1068       // Generate "fencem 1" as release fence.
1069       return SDValue(DAG.getMachineNode(VE::FENCEM, DL, MVT::Other,
1070                                         DAG.getTargetConstant(1, DL, MVT::i32),
1071                                         Op.getOperand(0)),
1072                      0);
1073     case AtomicOrdering::AcquireRelease:
1074     case AtomicOrdering::SequentiallyConsistent:
1075       // Generate "fencem 3" as acq_rel and seq_cst fence.
1076       // FIXME: "fencem 3" doesn't wait for for PCIe deveices accesses,
1077       //        so  seq_cst may require more instruction for them.
1078       return SDValue(DAG.getMachineNode(VE::FENCEM, DL, MVT::Other,
1079                                         DAG.getTargetConstant(3, DL, MVT::i32),
1080                                         Op.getOperand(0)),
1081                      0);
1082     }
1083   }
1084 
1085   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
1086   return DAG.getNode(VEISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0));
1087 }
1088 
1089 TargetLowering::AtomicExpansionKind
1090 VETargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
1091   // We have TS1AM implementation for i8/i16/i32/i64, so use it.
1092   if (AI->getOperation() == AtomicRMWInst::Xchg) {
1093     return AtomicExpansionKind::None;
1094   }
1095   // FIXME: Support "ATMAM" instruction for LOAD_ADD/SUB/AND/OR.
1096 
1097   // Otherwise, expand it using compare and exchange instruction to not call
1098   // __sync_fetch_and_* functions.
1099   return AtomicExpansionKind::CmpXChg;
1100 }
1101 
1102 static SDValue prepareTS1AM(SDValue Op, SelectionDAG &DAG, SDValue &Flag,
1103                             SDValue &Bits) {
1104   SDLoc DL(Op);
1105   AtomicSDNode *N = cast<AtomicSDNode>(Op);
1106   SDValue Ptr = N->getOperand(1);
1107   SDValue Val = N->getOperand(2);
1108   EVT PtrVT = Ptr.getValueType();
1109   bool Byte = N->getMemoryVT() == MVT::i8;
1110   //   Remainder = AND Ptr, 3
1111   //   Flag = 1 << Remainder  ; If Byte is true (1 byte swap flag)
1112   //   Flag = 3 << Remainder  ; If Byte is false (2 bytes swap flag)
1113   //   Bits = Remainder << 3
1114   //   NewVal = Val << Bits
1115   SDValue Const3 = DAG.getConstant(3, DL, PtrVT);
1116   SDValue Remainder = DAG.getNode(ISD::AND, DL, PtrVT, {Ptr, Const3});
1117   SDValue Mask = Byte ? DAG.getConstant(1, DL, MVT::i32)
1118                       : DAG.getConstant(3, DL, MVT::i32);
1119   Flag = DAG.getNode(ISD::SHL, DL, MVT::i32, {Mask, Remainder});
1120   Bits = DAG.getNode(ISD::SHL, DL, PtrVT, {Remainder, Const3});
1121   return DAG.getNode(ISD::SHL, DL, Val.getValueType(), {Val, Bits});
1122 }
1123 
1124 static SDValue finalizeTS1AM(SDValue Op, SelectionDAG &DAG, SDValue Data,
1125                              SDValue Bits) {
1126   SDLoc DL(Op);
1127   EVT VT = Data.getValueType();
1128   bool Byte = cast<AtomicSDNode>(Op)->getMemoryVT() == MVT::i8;
1129   //   NewData = Data >> Bits
1130   //   Result = NewData & 0xff   ; If Byte is true (1 byte)
1131   //   Result = NewData & 0xffff ; If Byte is false (2 bytes)
1132 
1133   SDValue NewData = DAG.getNode(ISD::SRL, DL, VT, Data, Bits);
1134   return DAG.getNode(ISD::AND, DL, VT,
1135                      {NewData, DAG.getConstant(Byte ? 0xff : 0xffff, DL, VT)});
1136 }
1137 
1138 SDValue VETargetLowering::lowerATOMIC_SWAP(SDValue Op,
1139                                            SelectionDAG &DAG) const {
1140   SDLoc DL(Op);
1141   AtomicSDNode *N = cast<AtomicSDNode>(Op);
1142 
1143   if (N->getMemoryVT() == MVT::i8) {
1144     // For i8, use "ts1am"
1145     //   Input:
1146     //     ATOMIC_SWAP Ptr, Val, Order
1147     //
1148     //   Output:
1149     //     Remainder = AND Ptr, 3
1150     //     Flag = 1 << Remainder   ; 1 byte swap flag for TS1AM inst.
1151     //     Bits = Remainder << 3
1152     //     NewVal = Val << Bits
1153     //
1154     //     Aligned = AND Ptr, -4
1155     //     Data = TS1AM Aligned, Flag, NewVal
1156     //
1157     //     NewData = Data >> Bits
1158     //     Result = NewData & 0xff ; 1 byte result
1159     SDValue Flag;
1160     SDValue Bits;
1161     SDValue NewVal = prepareTS1AM(Op, DAG, Flag, Bits);
1162 
1163     SDValue Ptr = N->getOperand(1);
1164     SDValue Aligned = DAG.getNode(ISD::AND, DL, Ptr.getValueType(),
1165                                   {Ptr, DAG.getConstant(-4, DL, MVT::i64)});
1166     SDValue TS1AM = DAG.getAtomic(VEISD::TS1AM, DL, N->getMemoryVT(),
1167                                   DAG.getVTList(Op.getNode()->getValueType(0),
1168                                                 Op.getNode()->getValueType(1)),
1169                                   {N->getChain(), Aligned, Flag, NewVal},
1170                                   N->getMemOperand());
1171 
1172     SDValue Result = finalizeTS1AM(Op, DAG, TS1AM, Bits);
1173     SDValue Chain = TS1AM.getValue(1);
1174     return DAG.getMergeValues({Result, Chain}, DL);
1175   }
1176   if (N->getMemoryVT() == MVT::i16) {
1177     // For i16, use "ts1am"
1178     SDValue Flag;
1179     SDValue Bits;
1180     SDValue NewVal = prepareTS1AM(Op, DAG, Flag, Bits);
1181 
1182     SDValue Ptr = N->getOperand(1);
1183     SDValue Aligned = DAG.getNode(ISD::AND, DL, Ptr.getValueType(),
1184                                   {Ptr, DAG.getConstant(-4, DL, MVT::i64)});
1185     SDValue TS1AM = DAG.getAtomic(VEISD::TS1AM, DL, N->getMemoryVT(),
1186                                   DAG.getVTList(Op.getNode()->getValueType(0),
1187                                                 Op.getNode()->getValueType(1)),
1188                                   {N->getChain(), Aligned, Flag, NewVal},
1189                                   N->getMemOperand());
1190 
1191     SDValue Result = finalizeTS1AM(Op, DAG, TS1AM, Bits);
1192     SDValue Chain = TS1AM.getValue(1);
1193     return DAG.getMergeValues({Result, Chain}, DL);
1194   }
1195   // Otherwise, let llvm legalize it.
1196   return Op;
1197 }
1198 
1199 SDValue VETargetLowering::lowerGlobalAddress(SDValue Op,
1200                                              SelectionDAG &DAG) const {
1201   return makeAddress(Op, DAG);
1202 }
1203 
1204 SDValue VETargetLowering::lowerBlockAddress(SDValue Op,
1205                                             SelectionDAG &DAG) const {
1206   return makeAddress(Op, DAG);
1207 }
1208 
1209 SDValue VETargetLowering::lowerConstantPool(SDValue Op,
1210                                             SelectionDAG &DAG) const {
1211   return makeAddress(Op, DAG);
1212 }
1213 
1214 SDValue
1215 VETargetLowering::lowerToTLSGeneralDynamicModel(SDValue Op,
1216                                                 SelectionDAG &DAG) const {
1217   SDLoc DL(Op);
1218 
1219   // Generate the following code:
1220   //   t1: ch,glue = callseq_start t0, 0, 0
1221   //   t2: i64,ch,glue = VEISD::GETTLSADDR t1, label, t1:1
1222   //   t3: ch,glue = callseq_end t2, 0, 0, t2:2
1223   //   t4: i64,ch,glue = CopyFromReg t3, Register:i64 $sx0, t3:1
1224   SDValue Label = withTargetFlags(Op, 0, DAG);
1225   EVT PtrVT = Op.getValueType();
1226 
1227   // Lowering the machine isd will make sure everything is in the right
1228   // location.
1229   SDValue Chain = DAG.getEntryNode();
1230   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
1231   const uint32_t *Mask = Subtarget->getRegisterInfo()->getCallPreservedMask(
1232       DAG.getMachineFunction(), CallingConv::C);
1233   Chain = DAG.getCALLSEQ_START(Chain, 64, 0, DL);
1234   SDValue Args[] = {Chain, Label, DAG.getRegisterMask(Mask), Chain.getValue(1)};
1235   Chain = DAG.getNode(VEISD::GETTLSADDR, DL, NodeTys, Args);
1236   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(64, DL, true),
1237                              DAG.getIntPtrConstant(0, DL, true),
1238                              Chain.getValue(1), DL);
1239   Chain = DAG.getCopyFromReg(Chain, DL, VE::SX0, PtrVT, Chain.getValue(1));
1240 
1241   // GETTLSADDR will be codegen'ed as call. Inform MFI that function has calls.
1242   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
1243   MFI.setHasCalls(true);
1244 
1245   // Also generate code to prepare a GOT register if it is PIC.
1246   if (isPositionIndependent()) {
1247     MachineFunction &MF = DAG.getMachineFunction();
1248     Subtarget->getInstrInfo()->getGlobalBaseReg(&MF);
1249   }
1250 
1251   return Chain;
1252 }
1253 
1254 SDValue VETargetLowering::lowerGlobalTLSAddress(SDValue Op,
1255                                                 SelectionDAG &DAG) const {
1256   // The current implementation of nld (2.26) doesn't allow local exec model
1257   // code described in VE-tls_v1.1.pdf (*1) as its input. Instead, we always
1258   // generate the general dynamic model code sequence.
1259   //
1260   // *1: https://www.nec.com/en/global/prod/hpc/aurora/document/VE-tls_v1.1.pdf
1261   return lowerToTLSGeneralDynamicModel(Op, DAG);
1262 }
1263 
1264 SDValue VETargetLowering::lowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
1265   return makeAddress(Op, DAG);
1266 }
1267 
1268 // Lower a f128 load into two f64 loads.
1269 static SDValue lowerLoadF128(SDValue Op, SelectionDAG &DAG) {
1270   SDLoc DL(Op);
1271   LoadSDNode *LdNode = dyn_cast<LoadSDNode>(Op.getNode());
1272   assert(LdNode && LdNode->getOffset().isUndef() && "Unexpected node type");
1273   unsigned Alignment = LdNode->getAlign().value();
1274   if (Alignment > 8)
1275     Alignment = 8;
1276 
1277   SDValue Lo64 =
1278       DAG.getLoad(MVT::f64, DL, LdNode->getChain(), LdNode->getBasePtr(),
1279                   LdNode->getPointerInfo(), Alignment,
1280                   LdNode->isVolatile() ? MachineMemOperand::MOVolatile
1281                                        : MachineMemOperand::MONone);
1282   EVT AddrVT = LdNode->getBasePtr().getValueType();
1283   SDValue HiPtr = DAG.getNode(ISD::ADD, DL, AddrVT, LdNode->getBasePtr(),
1284                               DAG.getConstant(8, DL, AddrVT));
1285   SDValue Hi64 =
1286       DAG.getLoad(MVT::f64, DL, LdNode->getChain(), HiPtr,
1287                   LdNode->getPointerInfo(), Alignment,
1288                   LdNode->isVolatile() ? MachineMemOperand::MOVolatile
1289                                        : MachineMemOperand::MONone);
1290 
1291   SDValue SubRegEven = DAG.getTargetConstant(VE::sub_even, DL, MVT::i32);
1292   SDValue SubRegOdd = DAG.getTargetConstant(VE::sub_odd, DL, MVT::i32);
1293 
1294   // VE stores Hi64 to 8(addr) and Lo64 to 0(addr)
1295   SDNode *InFP128 =
1296       DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::f128);
1297   InFP128 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f128,
1298                                SDValue(InFP128, 0), Hi64, SubRegEven);
1299   InFP128 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f128,
1300                                SDValue(InFP128, 0), Lo64, SubRegOdd);
1301   SDValue OutChains[2] = {SDValue(Lo64.getNode(), 1),
1302                           SDValue(Hi64.getNode(), 1)};
1303   SDValue OutChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
1304   SDValue Ops[2] = {SDValue(InFP128, 0), OutChain};
1305   return DAG.getMergeValues(Ops, DL);
1306 }
1307 
1308 SDValue VETargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const {
1309   LoadSDNode *LdNode = cast<LoadSDNode>(Op.getNode());
1310 
1311   SDValue BasePtr = LdNode->getBasePtr();
1312   if (isa<FrameIndexSDNode>(BasePtr.getNode())) {
1313     // Do not expand store instruction with frame index here because of
1314     // dependency problems.  We expand it later in eliminateFrameIndex().
1315     return Op;
1316   }
1317 
1318   EVT MemVT = LdNode->getMemoryVT();
1319   if (MemVT == MVT::f128)
1320     return lowerLoadF128(Op, DAG);
1321 
1322   return Op;
1323 }
1324 
1325 // Lower a f128 store into two f64 stores.
1326 static SDValue lowerStoreF128(SDValue Op, SelectionDAG &DAG) {
1327   SDLoc DL(Op);
1328   StoreSDNode *StNode = dyn_cast<StoreSDNode>(Op.getNode());
1329   assert(StNode && StNode->getOffset().isUndef() && "Unexpected node type");
1330 
1331   SDValue SubRegEven = DAG.getTargetConstant(VE::sub_even, DL, MVT::i32);
1332   SDValue SubRegOdd = DAG.getTargetConstant(VE::sub_odd, DL, MVT::i32);
1333 
1334   SDNode *Hi64 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::i64,
1335                                     StNode->getValue(), SubRegEven);
1336   SDNode *Lo64 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::i64,
1337                                     StNode->getValue(), SubRegOdd);
1338 
1339   unsigned Alignment = StNode->getAlign().value();
1340   if (Alignment > 8)
1341     Alignment = 8;
1342 
1343   // VE stores Hi64 to 8(addr) and Lo64 to 0(addr)
1344   SDValue OutChains[2];
1345   OutChains[0] =
1346       DAG.getStore(StNode->getChain(), DL, SDValue(Lo64, 0),
1347                    StNode->getBasePtr(), MachinePointerInfo(), Alignment,
1348                    StNode->isVolatile() ? MachineMemOperand::MOVolatile
1349                                         : MachineMemOperand::MONone);
1350   EVT AddrVT = StNode->getBasePtr().getValueType();
1351   SDValue HiPtr = DAG.getNode(ISD::ADD, DL, AddrVT, StNode->getBasePtr(),
1352                               DAG.getConstant(8, DL, AddrVT));
1353   OutChains[1] =
1354       DAG.getStore(StNode->getChain(), DL, SDValue(Hi64, 0), HiPtr,
1355                    MachinePointerInfo(), Alignment,
1356                    StNode->isVolatile() ? MachineMemOperand::MOVolatile
1357                                         : MachineMemOperand::MONone);
1358   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
1359 }
1360 
1361 SDValue VETargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1362   StoreSDNode *StNode = cast<StoreSDNode>(Op.getNode());
1363   assert(StNode && StNode->getOffset().isUndef() && "Unexpected node type");
1364 
1365   SDValue BasePtr = StNode->getBasePtr();
1366   if (isa<FrameIndexSDNode>(BasePtr.getNode())) {
1367     // Do not expand store instruction with frame index here because of
1368     // dependency problems.  We expand it later in eliminateFrameIndex().
1369     return Op;
1370   }
1371 
1372   EVT MemVT = StNode->getMemoryVT();
1373   if (MemVT == MVT::f128)
1374     return lowerStoreF128(Op, DAG);
1375 
1376   // Otherwise, ask llvm to expand it.
1377   return SDValue();
1378 }
1379 
1380 SDValue VETargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
1381   MachineFunction &MF = DAG.getMachineFunction();
1382   VEMachineFunctionInfo *FuncInfo = MF.getInfo<VEMachineFunctionInfo>();
1383   auto PtrVT = getPointerTy(DAG.getDataLayout());
1384 
1385   // Need frame address to find the address of VarArgsFrameIndex.
1386   MF.getFrameInfo().setFrameAddressIsTaken(true);
1387 
1388   // vastart just stores the address of the VarArgsFrameIndex slot into the
1389   // memory location argument.
1390   SDLoc DL(Op);
1391   SDValue Offset =
1392       DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getRegister(VE::SX9, PtrVT),
1393                   DAG.getIntPtrConstant(FuncInfo->getVarArgsFrameOffset(), DL));
1394   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
1395   return DAG.getStore(Op.getOperand(0), DL, Offset, Op.getOperand(1),
1396                       MachinePointerInfo(SV));
1397 }
1398 
1399 SDValue VETargetLowering::lowerVAARG(SDValue Op, SelectionDAG &DAG) const {
1400   SDNode *Node = Op.getNode();
1401   EVT VT = Node->getValueType(0);
1402   SDValue InChain = Node->getOperand(0);
1403   SDValue VAListPtr = Node->getOperand(1);
1404   EVT PtrVT = VAListPtr.getValueType();
1405   const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
1406   SDLoc DL(Node);
1407   SDValue VAList =
1408       DAG.getLoad(PtrVT, DL, InChain, VAListPtr, MachinePointerInfo(SV));
1409   SDValue Chain = VAList.getValue(1);
1410   SDValue NextPtr;
1411 
1412   if (VT == MVT::f128) {
1413     // VE f128 values must be stored with 16 bytes alignment.  We doesn't
1414     // know the actual alignment of VAList, so we take alignment of it
1415     // dyanmically.
1416     int Align = 16;
1417     VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
1418                          DAG.getConstant(Align - 1, DL, PtrVT));
1419     VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
1420                          DAG.getConstant(-Align, DL, PtrVT));
1421     // Increment the pointer, VAList, by 16 to the next vaarg.
1422     NextPtr =
1423         DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getIntPtrConstant(16, DL));
1424   } else if (VT == MVT::f32) {
1425     // float --> need special handling like below.
1426     //    0      4
1427     //    +------+------+
1428     //    | empty| float|
1429     //    +------+------+
1430     // Increment the pointer, VAList, by 8 to the next vaarg.
1431     NextPtr =
1432         DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getIntPtrConstant(8, DL));
1433     // Then, adjust VAList.
1434     unsigned InternalOffset = 4;
1435     VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
1436                          DAG.getConstant(InternalOffset, DL, PtrVT));
1437   } else {
1438     // Increment the pointer, VAList, by 8 to the next vaarg.
1439     NextPtr =
1440         DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getIntPtrConstant(8, DL));
1441   }
1442 
1443   // Store the incremented VAList to the legalized pointer.
1444   InChain = DAG.getStore(Chain, DL, NextPtr, VAListPtr, MachinePointerInfo(SV));
1445 
1446   // Load the actual argument out of the pointer VAList.
1447   // We can't count on greater alignment than the word size.
1448   return DAG.getLoad(VT, DL, InChain, VAList, MachinePointerInfo(),
1449                      std::min(PtrVT.getSizeInBits(), VT.getSizeInBits()) / 8);
1450 }
1451 
1452 SDValue VETargetLowering::lowerDYNAMIC_STACKALLOC(SDValue Op,
1453                                                   SelectionDAG &DAG) const {
1454   // Generate following code.
1455   //   (void)__llvm_grow_stack(size);
1456   //   ret = GETSTACKTOP;        // pseudo instruction
1457   SDLoc DL(Op);
1458 
1459   // Get the inputs.
1460   SDNode *Node = Op.getNode();
1461   SDValue Chain = Op.getOperand(0);
1462   SDValue Size = Op.getOperand(1);
1463   MaybeAlign Alignment(Op.getConstantOperandVal(2));
1464   EVT VT = Node->getValueType(0);
1465 
1466   // Chain the dynamic stack allocation so that it doesn't modify the stack
1467   // pointer when other instructions are using the stack.
1468   Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
1469 
1470   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
1471   Align StackAlign = TFI.getStackAlign();
1472   bool NeedsAlign = Alignment.valueOrOne() > StackAlign;
1473 
1474   // Prepare arguments
1475   TargetLowering::ArgListTy Args;
1476   TargetLowering::ArgListEntry Entry;
1477   Entry.Node = Size;
1478   Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
1479   Args.push_back(Entry);
1480   if (NeedsAlign) {
1481     Entry.Node = DAG.getConstant(~(Alignment->value() - 1ULL), DL, VT);
1482     Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
1483     Args.push_back(Entry);
1484   }
1485   Type *RetTy = Type::getVoidTy(*DAG.getContext());
1486 
1487   EVT PtrVT = Op.getValueType();
1488   SDValue Callee;
1489   if (NeedsAlign) {
1490     Callee = DAG.getTargetExternalSymbol("__ve_grow_stack_align", PtrVT, 0);
1491   } else {
1492     Callee = DAG.getTargetExternalSymbol("__ve_grow_stack", PtrVT, 0);
1493   }
1494 
1495   TargetLowering::CallLoweringInfo CLI(DAG);
1496   CLI.setDebugLoc(DL)
1497       .setChain(Chain)
1498       .setCallee(CallingConv::PreserveAll, RetTy, Callee, std::move(Args))
1499       .setDiscardResult(true);
1500   std::pair<SDValue, SDValue> pair = LowerCallTo(CLI);
1501   Chain = pair.second;
1502   SDValue Result = DAG.getNode(VEISD::GETSTACKTOP, DL, VT, Chain);
1503   if (NeedsAlign) {
1504     Result = DAG.getNode(ISD::ADD, DL, VT, Result,
1505                          DAG.getConstant((Alignment->value() - 1ULL), DL, VT));
1506     Result = DAG.getNode(ISD::AND, DL, VT, Result,
1507                          DAG.getConstant(~(Alignment->value() - 1ULL), DL, VT));
1508   }
1509   //  Chain = Result.getValue(1);
1510   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
1511                              DAG.getIntPtrConstant(0, DL, true), SDValue(), DL);
1512 
1513   SDValue Ops[2] = {Result, Chain};
1514   return DAG.getMergeValues(Ops, DL);
1515 }
1516 
1517 SDValue VETargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
1518                                                SelectionDAG &DAG) const {
1519   SDLoc DL(Op);
1520   return DAG.getNode(VEISD::EH_SJLJ_LONGJMP, DL, MVT::Other, Op.getOperand(0),
1521                      Op.getOperand(1));
1522 }
1523 
1524 SDValue VETargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
1525                                               SelectionDAG &DAG) const {
1526   SDLoc DL(Op);
1527   return DAG.getNode(VEISD::EH_SJLJ_SETJMP, DL,
1528                      DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
1529                      Op.getOperand(1));
1530 }
1531 
1532 SDValue VETargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
1533                                                       SelectionDAG &DAG) const {
1534   SDLoc DL(Op);
1535   return DAG.getNode(VEISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
1536                      Op.getOperand(0));
1537 }
1538 
1539 static SDValue lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG,
1540                               const VETargetLowering &TLI,
1541                               const VESubtarget *Subtarget) {
1542   SDLoc DL(Op);
1543   MachineFunction &MF = DAG.getMachineFunction();
1544   EVT PtrVT = TLI.getPointerTy(MF.getDataLayout());
1545 
1546   MachineFrameInfo &MFI = MF.getFrameInfo();
1547   MFI.setFrameAddressIsTaken(true);
1548 
1549   unsigned Depth = Op.getConstantOperandVal(0);
1550   const VERegisterInfo *RegInfo = Subtarget->getRegisterInfo();
1551   Register FrameReg = RegInfo->getFrameRegister(MF);
1552   SDValue FrameAddr =
1553       DAG.getCopyFromReg(DAG.getEntryNode(), DL, FrameReg, PtrVT);
1554   while (Depth--)
1555     FrameAddr = DAG.getLoad(Op.getValueType(), DL, DAG.getEntryNode(),
1556                             FrameAddr, MachinePointerInfo());
1557   return FrameAddr;
1558 }
1559 
1560 static SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG,
1561                                const VETargetLowering &TLI,
1562                                const VESubtarget *Subtarget) {
1563   MachineFunction &MF = DAG.getMachineFunction();
1564   MachineFrameInfo &MFI = MF.getFrameInfo();
1565   MFI.setReturnAddressIsTaken(true);
1566 
1567   if (TLI.verifyReturnAddressArgumentIsConstant(Op, DAG))
1568     return SDValue();
1569 
1570   SDValue FrameAddr = lowerFRAMEADDR(Op, DAG, TLI, Subtarget);
1571 
1572   SDLoc DL(Op);
1573   EVT VT = Op.getValueType();
1574   SDValue Offset = DAG.getConstant(8, DL, VT);
1575   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
1576                      DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
1577                      MachinePointerInfo());
1578 }
1579 
1580 SDValue VETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
1581                                                   SelectionDAG &DAG) const {
1582   SDLoc DL(Op);
1583   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1584   switch (IntNo) {
1585   default: // Don't custom lower most intrinsics.
1586     return SDValue();
1587   case Intrinsic::eh_sjlj_lsda: {
1588     MachineFunction &MF = DAG.getMachineFunction();
1589     MVT VT = Op.getSimpleValueType();
1590     const VETargetMachine *TM =
1591         static_cast<const VETargetMachine *>(&DAG.getTarget());
1592 
1593     // Create GCC_except_tableXX string.  The real symbol for that will be
1594     // generated in EHStreamer::emitExceptionTable() later.  So, we just
1595     // borrow it's name here.
1596     TM->getStrList()->push_back(std::string(
1597         (Twine("GCC_except_table") + Twine(MF.getFunctionNumber())).str()));
1598     SDValue Addr =
1599         DAG.getTargetExternalSymbol(TM->getStrList()->back().c_str(), VT, 0);
1600     if (isPositionIndependent()) {
1601       Addr = makeHiLoPair(Addr, VEMCExpr::VK_VE_GOTOFF_HI32,
1602                           VEMCExpr::VK_VE_GOTOFF_LO32, DAG);
1603       SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, VT);
1604       return DAG.getNode(ISD::ADD, DL, VT, GlobalBase, Addr);
1605     }
1606     return makeHiLoPair(Addr, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
1607   }
1608   }
1609 }
1610 
1611 static bool getUniqueInsertion(SDNode *N, unsigned &UniqueIdx) {
1612   if (!isa<BuildVectorSDNode>(N))
1613     return false;
1614   const auto *BVN = cast<BuildVectorSDNode>(N);
1615 
1616   // Find first non-undef insertion.
1617   unsigned Idx;
1618   for (Idx = 0; Idx < BVN->getNumOperands(); ++Idx) {
1619     auto ElemV = BVN->getOperand(Idx);
1620     if (!ElemV->isUndef())
1621       break;
1622   }
1623   // Catch the (hypothetical) all-undef case.
1624   if (Idx == BVN->getNumOperands())
1625     return false;
1626   // Remember insertion.
1627   UniqueIdx = Idx++;
1628   // Verify that all other insertions are undef.
1629   for (; Idx < BVN->getNumOperands(); ++Idx) {
1630     auto ElemV = BVN->getOperand(Idx);
1631     if (!ElemV->isUndef())
1632       return false;
1633   }
1634   return true;
1635 }
1636 
1637 static SDValue getSplatValue(SDNode *N) {
1638   if (auto *BuildVec = dyn_cast<BuildVectorSDNode>(N)) {
1639     return BuildVec->getSplatValue();
1640   }
1641   return SDValue();
1642 }
1643 
1644 SDValue VETargetLowering::lowerBUILD_VECTOR(SDValue Op,
1645                                             SelectionDAG &DAG) const {
1646   VECustomDAG CDAG(DAG, Op);
1647   MVT ResultVT = Op.getSimpleValueType();
1648 
1649   // If there is just one element, expand to INSERT_VECTOR_ELT.
1650   unsigned UniqueIdx;
1651   if (getUniqueInsertion(Op.getNode(), UniqueIdx)) {
1652     SDValue AccuV = CDAG.getUNDEF(Op.getValueType());
1653     auto ElemV = Op->getOperand(UniqueIdx);
1654     SDValue IdxV = CDAG.getConstant(UniqueIdx, MVT::i64);
1655     return CDAG.getNode(ISD::INSERT_VECTOR_ELT, ResultVT, {AccuV, ElemV, IdxV});
1656   }
1657 
1658   // Else emit a broadcast.
1659   if (SDValue ScalarV = getSplatValue(Op.getNode())) {
1660     unsigned NumEls = ResultVT.getVectorNumElements();
1661     // TODO: Legalize packed-mode AVL.
1662     //       For now, cap the AVL at 256.
1663     auto CappedLength = std::min<unsigned>(256, NumEls);
1664     auto AVL = CDAG.getConstant(CappedLength, MVT::i32);
1665     return CDAG.getBroadcast(ResultVT, Op.getOperand(0), AVL);
1666   }
1667 
1668   // Expand
1669   return SDValue();
1670 }
1671 
1672 SDValue VETargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
1673   unsigned Opcode = Op.getOpcode();
1674   if (ISD::isVPOpcode(Opcode))
1675     return lowerToVVP(Op, DAG);
1676 
1677   switch (Opcode) {
1678   default:
1679     llvm_unreachable("Should not custom lower this!");
1680   case ISD::ATOMIC_FENCE:
1681     return lowerATOMIC_FENCE(Op, DAG);
1682   case ISD::ATOMIC_SWAP:
1683     return lowerATOMIC_SWAP(Op, DAG);
1684   case ISD::BlockAddress:
1685     return lowerBlockAddress(Op, DAG);
1686   case ISD::ConstantPool:
1687     return lowerConstantPool(Op, DAG);
1688   case ISD::DYNAMIC_STACKALLOC:
1689     return lowerDYNAMIC_STACKALLOC(Op, DAG);
1690   case ISD::EH_SJLJ_LONGJMP:
1691     return lowerEH_SJLJ_LONGJMP(Op, DAG);
1692   case ISD::EH_SJLJ_SETJMP:
1693     return lowerEH_SJLJ_SETJMP(Op, DAG);
1694   case ISD::EH_SJLJ_SETUP_DISPATCH:
1695     return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
1696   case ISD::FRAMEADDR:
1697     return lowerFRAMEADDR(Op, DAG, *this, Subtarget);
1698   case ISD::GlobalAddress:
1699     return lowerGlobalAddress(Op, DAG);
1700   case ISD::GlobalTLSAddress:
1701     return lowerGlobalTLSAddress(Op, DAG);
1702   case ISD::INTRINSIC_WO_CHAIN:
1703     return lowerINTRINSIC_WO_CHAIN(Op, DAG);
1704   case ISD::JumpTable:
1705     return lowerJumpTable(Op, DAG);
1706   case ISD::LOAD:
1707     return lowerLOAD(Op, DAG);
1708   case ISD::RETURNADDR:
1709     return lowerRETURNADDR(Op, DAG, *this, Subtarget);
1710   case ISD::BUILD_VECTOR:
1711     return lowerBUILD_VECTOR(Op, DAG);
1712   case ISD::STORE:
1713     return lowerSTORE(Op, DAG);
1714   case ISD::VASTART:
1715     return lowerVASTART(Op, DAG);
1716   case ISD::VAARG:
1717     return lowerVAARG(Op, DAG);
1718 
1719   case ISD::INSERT_VECTOR_ELT:
1720     return lowerINSERT_VECTOR_ELT(Op, DAG);
1721   case ISD::EXTRACT_VECTOR_ELT:
1722     return lowerEXTRACT_VECTOR_ELT(Op, DAG);
1723 
1724 #define ADD_VVP_OP(VVP_NAME, ISD_NAME) case ISD::ISD_NAME:
1725 #include "VVPNodes.def"
1726     return lowerToVVP(Op, DAG);
1727   }
1728 }
1729 /// } Custom Lower
1730 
1731 void VETargetLowering::ReplaceNodeResults(SDNode *N,
1732                                           SmallVectorImpl<SDValue> &Results,
1733                                           SelectionDAG &DAG) const {
1734   switch (N->getOpcode()) {
1735   case ISD::ATOMIC_SWAP:
1736     // Let LLVM expand atomic swap instruction through LowerOperation.
1737     return;
1738   default:
1739     LLVM_DEBUG(N->dumpr(&DAG));
1740     llvm_unreachable("Do not know how to custom type legalize this operation!");
1741   }
1742 }
1743 
/// JumpTable for VE.
///
///   VE cannot emit relocatable symbols in a jump table.  In particular, VE
///   cannot emit expressions that mix symbols from the text segment and the
///   data segment, such as:
///             .4byte  .LBB0_2-.LJTI0_0
///   So, each entry is emitted as an offset from the top of the function,
///   using a custom label:
///             .4byte  .LBB0_2-<function name>
1753 
1754 unsigned VETargetLowering::getJumpTableEncoding() const {
1755   // Use custom label for PIC.
1756   if (isPositionIndependent())
1757     return MachineJumpTableInfo::EK_Custom32;
1758 
1759   // Otherwise, use the normal jump table encoding heuristics.
1760   return TargetLowering::getJumpTableEncoding();
1761 }
1762 
1763 const MCExpr *VETargetLowering::LowerCustomJumpTableEntry(
1764     const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB,
1765     unsigned Uid, MCContext &Ctx) const {
1766   assert(isPositionIndependent());
1767 
1768   // Generate custom label for PIC like below.
1769   //    .4bytes  .LBB0_2-<function name>
1770   const auto *Value = MCSymbolRefExpr::create(MBB->getSymbol(), Ctx);
1771   MCSymbol *Sym = Ctx.getOrCreateSymbol(MBB->getParent()->getName().data());
1772   const auto *Base = MCSymbolRefExpr::create(Sym, Ctx);
1773   return MCBinaryExpr::createSub(Value, Base, Ctx);
1774 }
1775 
1776 SDValue VETargetLowering::getPICJumpTableRelocBase(SDValue Table,
1777                                                    SelectionDAG &DAG) const {
1778   assert(isPositionIndependent());
1779   SDLoc DL(Table);
1780   Function *Function = &DAG.getMachineFunction().getFunction();
1781   assert(Function != nullptr);
1782   auto PtrTy = getPointerTy(DAG.getDataLayout(), Function->getAddressSpace());
1783 
1784   // In the jump table, we have following values in PIC mode.
1785   //    .4bytes  .LBB0_2-<function name>
1786   // We need to add this value and the address of this function to generate
1787   // .LBB0_2 label correctly under PIC mode.  So, we want to generate following
1788   // instructions:
1789   //     lea %reg, fun@gotoff_lo
1790   //     and %reg, %reg, (32)0
1791   //     lea.sl %reg, fun@gotoff_hi(%reg, %got)
1792   // In order to do so, we need to genarate correctly marked DAG node using
1793   // makeHiLoPair.
1794   SDValue Op = DAG.getGlobalAddress(Function, DL, PtrTy);
1795   SDValue HiLo = makeHiLoPair(Op, VEMCExpr::VK_VE_GOTOFF_HI32,
1796                               VEMCExpr::VK_VE_GOTOFF_LO32, DAG);
1797   SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrTy);
1798   return DAG.getNode(ISD::ADD, DL, PtrTy, GlobalBase, HiLo);
1799 }
1800 
1801 Register VETargetLowering::prepareMBB(MachineBasicBlock &MBB,
1802                                       MachineBasicBlock::iterator I,
1803                                       MachineBasicBlock *TargetBB,
1804                                       const DebugLoc &DL) const {
1805   MachineFunction *MF = MBB.getParent();
1806   MachineRegisterInfo &MRI = MF->getRegInfo();
1807   const VEInstrInfo *TII = Subtarget->getInstrInfo();
1808 
1809   const TargetRegisterClass *RC = &VE::I64RegClass;
1810   Register Tmp1 = MRI.createVirtualRegister(RC);
1811   Register Tmp2 = MRI.createVirtualRegister(RC);
1812   Register Result = MRI.createVirtualRegister(RC);
1813 
1814   if (isPositionIndependent()) {
1815     // Create following instructions for local linkage PIC code.
1816     //     lea %Tmp1, TargetBB@gotoff_lo
1817     //     and %Tmp2, %Tmp1, (32)0
1818     //     lea.sl %Result, TargetBB@gotoff_hi(%Tmp2, %s15) ; %s15 is GOT
1819     BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
1820         .addImm(0)
1821         .addImm(0)
1822         .addMBB(TargetBB, VEMCExpr::VK_VE_GOTOFF_LO32);
1823     BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
1824         .addReg(Tmp1, getKillRegState(true))
1825         .addImm(M0(32));
1826     BuildMI(MBB, I, DL, TII->get(VE::LEASLrri), Result)
1827         .addReg(VE::SX15)
1828         .addReg(Tmp2, getKillRegState(true))
1829         .addMBB(TargetBB, VEMCExpr::VK_VE_GOTOFF_HI32);
1830   } else {
1831     // Create following instructions for non-PIC code.
1832     //     lea     %Tmp1, TargetBB@lo
1833     //     and     %Tmp2, %Tmp1, (32)0
1834     //     lea.sl  %Result, TargetBB@hi(%Tmp2)
1835     BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
1836         .addImm(0)
1837         .addImm(0)
1838         .addMBB(TargetBB, VEMCExpr::VK_VE_LO32);
1839     BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
1840         .addReg(Tmp1, getKillRegState(true))
1841         .addImm(M0(32));
1842     BuildMI(MBB, I, DL, TII->get(VE::LEASLrii), Result)
1843         .addReg(Tmp2, getKillRegState(true))
1844         .addImm(0)
1845         .addMBB(TargetBB, VEMCExpr::VK_VE_HI32);
1846   }
1847   return Result;
1848 }
1849 
1850 Register VETargetLowering::prepareSymbol(MachineBasicBlock &MBB,
1851                                          MachineBasicBlock::iterator I,
1852                                          StringRef Symbol, const DebugLoc &DL,
1853                                          bool IsLocal = false,
1854                                          bool IsCall = false) const {
1855   MachineFunction *MF = MBB.getParent();
1856   MachineRegisterInfo &MRI = MF->getRegInfo();
1857   const VEInstrInfo *TII = Subtarget->getInstrInfo();
1858 
1859   const TargetRegisterClass *RC = &VE::I64RegClass;
1860   Register Result = MRI.createVirtualRegister(RC);
1861 
1862   if (isPositionIndependent()) {
1863     if (IsCall && !IsLocal) {
1864       // Create following instructions for non-local linkage PIC code function
1865       // calls.  These instructions uses IC and magic number -24, so we expand
1866       // them in VEAsmPrinter.cpp from GETFUNPLT pseudo instruction.
1867       //     lea %Reg, Symbol@plt_lo(-24)
1868       //     and %Reg, %Reg, (32)0
1869       //     sic %s16
1870       //     lea.sl %Result, Symbol@plt_hi(%Reg, %s16) ; %s16 is PLT
1871       BuildMI(MBB, I, DL, TII->get(VE::GETFUNPLT), Result)
1872           .addExternalSymbol("abort");
1873     } else if (IsLocal) {
1874       Register Tmp1 = MRI.createVirtualRegister(RC);
1875       Register Tmp2 = MRI.createVirtualRegister(RC);
1876       // Create following instructions for local linkage PIC code.
1877       //     lea %Tmp1, Symbol@gotoff_lo
1878       //     and %Tmp2, %Tmp1, (32)0
1879       //     lea.sl %Result, Symbol@gotoff_hi(%Tmp2, %s15) ; %s15 is GOT
1880       BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
1881           .addImm(0)
1882           .addImm(0)
1883           .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOTOFF_LO32);
1884       BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
1885           .addReg(Tmp1, getKillRegState(true))
1886           .addImm(M0(32));
1887       BuildMI(MBB, I, DL, TII->get(VE::LEASLrri), Result)
1888           .addReg(VE::SX15)
1889           .addReg(Tmp2, getKillRegState(true))
1890           .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOTOFF_HI32);
1891     } else {
1892       Register Tmp1 = MRI.createVirtualRegister(RC);
1893       Register Tmp2 = MRI.createVirtualRegister(RC);
1894       // Create following instructions for not local linkage PIC code.
1895       //     lea %Tmp1, Symbol@got_lo
1896       //     and %Tmp2, %Tmp1, (32)0
1897       //     lea.sl %Tmp3, Symbol@gotoff_hi(%Tmp2, %s15) ; %s15 is GOT
1898       //     ld %Result, 0(%Tmp3)
1899       Register Tmp3 = MRI.createVirtualRegister(RC);
1900       BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
1901           .addImm(0)
1902           .addImm(0)
1903           .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOT_LO32);
1904       BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
1905           .addReg(Tmp1, getKillRegState(true))
1906           .addImm(M0(32));
1907       BuildMI(MBB, I, DL, TII->get(VE::LEASLrri), Tmp3)
1908           .addReg(VE::SX15)
1909           .addReg(Tmp2, getKillRegState(true))
1910           .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_GOT_HI32);
1911       BuildMI(MBB, I, DL, TII->get(VE::LDrii), Result)
1912           .addReg(Tmp3, getKillRegState(true))
1913           .addImm(0)
1914           .addImm(0);
1915     }
1916   } else {
1917     Register Tmp1 = MRI.createVirtualRegister(RC);
1918     Register Tmp2 = MRI.createVirtualRegister(RC);
1919     // Create following instructions for non-PIC code.
1920     //     lea     %Tmp1, Symbol@lo
1921     //     and     %Tmp2, %Tmp1, (32)0
1922     //     lea.sl  %Result, Symbol@hi(%Tmp2)
1923     BuildMI(MBB, I, DL, TII->get(VE::LEAzii), Tmp1)
1924         .addImm(0)
1925         .addImm(0)
1926         .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_LO32);
1927     BuildMI(MBB, I, DL, TII->get(VE::ANDrm), Tmp2)
1928         .addReg(Tmp1, getKillRegState(true))
1929         .addImm(M0(32));
1930     BuildMI(MBB, I, DL, TII->get(VE::LEASLrii), Result)
1931         .addReg(Tmp2, getKillRegState(true))
1932         .addImm(0)
1933         .addExternalSymbol(Symbol.data(), VEMCExpr::VK_VE_HI32);
1934   }
1935   return Result;
1936 }
1937 
1938 void VETargetLowering::setupEntryBlockForSjLj(MachineInstr &MI,
1939                                               MachineBasicBlock *MBB,
1940                                               MachineBasicBlock *DispatchBB,
1941                                               int FI, int Offset) const {
1942   DebugLoc DL = MI.getDebugLoc();
1943   const VEInstrInfo *TII = Subtarget->getInstrInfo();
1944 
1945   Register LabelReg =
1946       prepareMBB(*MBB, MachineBasicBlock::iterator(MI), DispatchBB, DL);
1947 
1948   // Store an address of DispatchBB to a given jmpbuf[1] where has next IC
1949   // referenced by longjmp (throw) later.
1950   MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(VE::STrii));
1951   addFrameReference(MIB, FI, Offset); // jmpbuf[1]
1952   MIB.addReg(LabelReg, getKillRegState(true));
1953 }
1954 
1955 MachineBasicBlock *
1956 VETargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
1957                                    MachineBasicBlock *MBB) const {
1958   DebugLoc DL = MI.getDebugLoc();
1959   MachineFunction *MF = MBB->getParent();
1960   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1961   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
1962   MachineRegisterInfo &MRI = MF->getRegInfo();
1963 
1964   const BasicBlock *BB = MBB->getBasicBlock();
1965   MachineFunction::iterator I = ++MBB->getIterator();
1966 
1967   // Memory Reference.
1968   SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
1969                                            MI.memoperands_end());
1970   Register BufReg = MI.getOperand(1).getReg();
1971 
1972   Register DstReg;
1973 
1974   DstReg = MI.getOperand(0).getReg();
1975   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
1976   assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
1977   (void)TRI;
1978   Register MainDestReg = MRI.createVirtualRegister(RC);
1979   Register RestoreDestReg = MRI.createVirtualRegister(RC);
1980 
1981   // For `v = call @llvm.eh.sjlj.setjmp(buf)`, we generate following
1982   // instructions.  SP/FP must be saved in jmpbuf before `llvm.eh.sjlj.setjmp`.
1983   //
1984   // ThisMBB:
1985   //   buf[3] = %s17 iff %s17 is used as BP
1986   //   buf[1] = RestoreMBB as IC after longjmp
1987   //   # SjLjSetup RestoreMBB
1988   //
1989   // MainMBB:
1990   //   v_main = 0
1991   //
1992   // SinkMBB:
1993   //   v = phi(v_main, MainMBB, v_restore, RestoreMBB)
1994   //   ...
1995   //
1996   // RestoreMBB:
1997   //   %s17 = buf[3] = iff %s17 is used as BP
1998   //   v_restore = 1
1999   //   goto SinkMBB
2000 
2001   MachineBasicBlock *ThisMBB = MBB;
2002   MachineBasicBlock *MainMBB = MF->CreateMachineBasicBlock(BB);
2003   MachineBasicBlock *SinkMBB = MF->CreateMachineBasicBlock(BB);
2004   MachineBasicBlock *RestoreMBB = MF->CreateMachineBasicBlock(BB);
2005   MF->insert(I, MainMBB);
2006   MF->insert(I, SinkMBB);
2007   MF->push_back(RestoreMBB);
2008   RestoreMBB->setHasAddressTaken();
2009 
2010   // Transfer the remainder of BB and its successor edges to SinkMBB.
2011   SinkMBB->splice(SinkMBB->begin(), MBB,
2012                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
2013   SinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
2014 
2015   // ThisMBB:
2016   Register LabelReg =
2017       prepareMBB(*MBB, MachineBasicBlock::iterator(MI), RestoreMBB, DL);
2018 
2019   // Store BP in buf[3] iff this function is using BP.
2020   const VEFrameLowering *TFI = Subtarget->getFrameLowering();
2021   if (TFI->hasBP(*MF)) {
2022     MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(VE::STrii));
2023     MIB.addReg(BufReg);
2024     MIB.addImm(0);
2025     MIB.addImm(24);
2026     MIB.addReg(VE::SX17);
2027     MIB.setMemRefs(MMOs);
2028   }
2029 
2030   // Store IP in buf[1].
2031   MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(VE::STrii));
2032   MIB.add(MI.getOperand(1)); // we can preserve the kill flags here.
2033   MIB.addImm(0);
2034   MIB.addImm(8);
2035   MIB.addReg(LabelReg, getKillRegState(true));
2036   MIB.setMemRefs(MMOs);
2037 
2038   // SP/FP are already stored in jmpbuf before `llvm.eh.sjlj.setjmp`.
2039 
2040   // Insert setup.
2041   MIB =
2042       BuildMI(*ThisMBB, MI, DL, TII->get(VE::EH_SjLj_Setup)).addMBB(RestoreMBB);
2043 
2044   const VERegisterInfo *RegInfo = Subtarget->getRegisterInfo();
2045   MIB.addRegMask(RegInfo->getNoPreservedMask());
2046   ThisMBB->addSuccessor(MainMBB);
2047   ThisMBB->addSuccessor(RestoreMBB);
2048 
2049   // MainMBB:
2050   BuildMI(MainMBB, DL, TII->get(VE::LEAzii), MainDestReg)
2051       .addImm(0)
2052       .addImm(0)
2053       .addImm(0);
2054   MainMBB->addSuccessor(SinkMBB);
2055 
2056   // SinkMBB:
2057   BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(VE::PHI), DstReg)
2058       .addReg(MainDestReg)
2059       .addMBB(MainMBB)
2060       .addReg(RestoreDestReg)
2061       .addMBB(RestoreMBB);
2062 
2063   // RestoreMBB:
2064   // Restore BP from buf[3] iff this function is using BP.  The address of
2065   // buf is in SX10.
2066   // FIXME: Better to not use SX10 here
2067   if (TFI->hasBP(*MF)) {
2068     MachineInstrBuilder MIB =
2069         BuildMI(RestoreMBB, DL, TII->get(VE::LDrii), VE::SX17);
2070     MIB.addReg(VE::SX10);
2071     MIB.addImm(0);
2072     MIB.addImm(24);
2073     MIB.setMemRefs(MMOs);
2074   }
2075   BuildMI(RestoreMBB, DL, TII->get(VE::LEAzii), RestoreDestReg)
2076       .addImm(0)
2077       .addImm(0)
2078       .addImm(1);
2079   BuildMI(RestoreMBB, DL, TII->get(VE::BRCFLa_t)).addMBB(SinkMBB);
2080   RestoreMBB->addSuccessor(SinkMBB);
2081 
2082   MI.eraseFromParent();
2083   return SinkMBB;
2084 }
2085 
2086 MachineBasicBlock *
2087 VETargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
2088                                     MachineBasicBlock *MBB) const {
2089   DebugLoc DL = MI.getDebugLoc();
2090   MachineFunction *MF = MBB->getParent();
2091   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2092   MachineRegisterInfo &MRI = MF->getRegInfo();
2093 
2094   // Memory Reference.
2095   SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
2096                                            MI.memoperands_end());
2097   Register BufReg = MI.getOperand(0).getReg();
2098 
2099   Register Tmp = MRI.createVirtualRegister(&VE::I64RegClass);
2100   // Since FP is only updated here but NOT referenced, it's treated as GPR.
2101   Register FP = VE::SX9;
2102   Register SP = VE::SX11;
2103 
2104   MachineInstrBuilder MIB;
2105 
2106   MachineBasicBlock *ThisMBB = MBB;
2107 
2108   // For `call @llvm.eh.sjlj.longjmp(buf)`, we generate following instructions.
2109   //
2110   // ThisMBB:
2111   //   %fp = load buf[0]
2112   //   %jmp = load buf[1]
2113   //   %s10 = buf        ; Store an address of buf to SX10 for RestoreMBB
2114   //   %sp = load buf[2] ; generated by llvm.eh.sjlj.setjmp.
2115   //   jmp %jmp
2116 
2117   // Reload FP.
2118   MIB = BuildMI(*ThisMBB, MI, DL, TII->get(VE::LDrii), FP);
2119   MIB.addReg(BufReg);
2120   MIB.addImm(0);
2121   MIB.addImm(0);
2122   MIB.setMemRefs(MMOs);
2123 
2124   // Reload IP.
2125   MIB = BuildMI(*ThisMBB, MI, DL, TII->get(VE::LDrii), Tmp);
2126   MIB.addReg(BufReg);
2127   MIB.addImm(0);
2128   MIB.addImm(8);
2129   MIB.setMemRefs(MMOs);
2130 
2131   // Copy BufReg to SX10 for later use in setjmp.
2132   // FIXME: Better to not use SX10 here
2133   BuildMI(*ThisMBB, MI, DL, TII->get(VE::ORri), VE::SX10)
2134       .addReg(BufReg)
2135       .addImm(0);
2136 
2137   // Reload SP.
2138   MIB = BuildMI(*ThisMBB, MI, DL, TII->get(VE::LDrii), SP);
2139   MIB.add(MI.getOperand(0)); // we can preserve the kill flags here.
2140   MIB.addImm(0);
2141   MIB.addImm(16);
2142   MIB.setMemRefs(MMOs);
2143 
2144   // Jump.
2145   BuildMI(*ThisMBB, MI, DL, TII->get(VE::BCFLari_t))
2146       .addReg(Tmp, getKillRegState(true))
2147       .addImm(0);
2148 
2149   MI.eraseFromParent();
2150   return ThisMBB;
2151 }
2152 
2153 MachineBasicBlock *
2154 VETargetLowering::emitSjLjDispatchBlock(MachineInstr &MI,
2155                                         MachineBasicBlock *BB) const {
2156   DebugLoc DL = MI.getDebugLoc();
2157   MachineFunction *MF = BB->getParent();
2158   MachineFrameInfo &MFI = MF->getFrameInfo();
2159   MachineRegisterInfo &MRI = MF->getRegInfo();
2160   const VEInstrInfo *TII = Subtarget->getInstrInfo();
2161   int FI = MFI.getFunctionContextIndex();
2162 
2163   // Get a mapping of the call site numbers to all of the landing pads they're
2164   // associated with.
2165   DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
2166   unsigned MaxCSNum = 0;
2167   for (auto &MBB : *MF) {
2168     if (!MBB.isEHPad())
2169       continue;
2170 
2171     MCSymbol *Sym = nullptr;
2172     for (const auto &MI : MBB) {
2173       if (MI.isDebugInstr())
2174         continue;
2175 
2176       assert(MI.isEHLabel() && "expected EH_LABEL");
2177       Sym = MI.getOperand(0).getMCSymbol();
2178       break;
2179     }
2180 
2181     if (!MF->hasCallSiteLandingPad(Sym))
2182       continue;
2183 
2184     for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
2185       CallSiteNumToLPad[CSI].push_back(&MBB);
2186       MaxCSNum = std::max(MaxCSNum, CSI);
2187     }
2188   }
2189 
2190   // Get an ordered list of the machine basic blocks for the jump table.
2191   std::vector<MachineBasicBlock *> LPadList;
2192   SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
2193   LPadList.reserve(CallSiteNumToLPad.size());
2194 
2195   for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
2196     for (auto &LP : CallSiteNumToLPad[CSI]) {
2197       LPadList.push_back(LP);
2198       InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
2199     }
2200   }
2201 
2202   assert(!LPadList.empty() &&
2203          "No landing pad destinations for the dispatch jump table!");
2204 
2205   // The %fn_context is allocated like below (from --print-after=sjljehprepare):
2206   //   %fn_context = alloca { i8*, i64, [4 x i64], i8*, i8*, [5 x i8*] }
2207   //
2208   // This `[5 x i8*]` is jmpbuf, so jmpbuf[1] is FI+72.
2209   // First `i64` is callsite, so callsite is FI+8.
2210   static const int OffsetIC = 72;
2211   static const int OffsetCS = 8;
2212 
2213   // Create the MBBs for the dispatch code like following:
2214   //
2215   // ThisMBB:
2216   //   Prepare DispatchBB address and store it to buf[1].
2217   //   ...
2218   //
2219   // DispatchBB:
2220   //   %s15 = GETGOT iff isPositionIndependent
2221   //   %callsite = load callsite
2222   //   brgt.l.t #size of callsites, %callsite, DispContBB
2223   //
2224   // TrapBB:
2225   //   Call abort.
2226   //
2227   // DispContBB:
2228   //   %breg = address of jump table
2229   //   %pc = load and calculate next pc from %breg and %callsite
2230   //   jmp %pc
2231 
2232   // Shove the dispatch's address into the return slot in the function context.
2233   MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
2234   DispatchBB->setIsEHPad(true);
2235 
2236   // Trap BB will causes trap like `assert(0)`.
2237   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
2238   DispatchBB->addSuccessor(TrapBB);
2239 
2240   MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
2241   DispatchBB->addSuccessor(DispContBB);
2242 
2243   // Insert MBBs.
2244   MF->push_back(DispatchBB);
2245   MF->push_back(DispContBB);
2246   MF->push_back(TrapBB);
2247 
2248   // Insert code to call abort in the TrapBB.
2249   Register Abort = prepareSymbol(*TrapBB, TrapBB->end(), "abort", DL,
2250                                  /* Local */ false, /* Call */ true);
2251   BuildMI(TrapBB, DL, TII->get(VE::BSICrii), VE::SX10)
2252       .addReg(Abort, getKillRegState(true))
2253       .addImm(0)
2254       .addImm(0);
2255 
2256   // Insert code into the entry block that creates and registers the function
2257   // context.
2258   setupEntryBlockForSjLj(MI, BB, DispatchBB, FI, OffsetIC);
2259 
2260   // Create the jump table and associated information
2261   unsigned JTE = getJumpTableEncoding();
2262   MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
2263   unsigned MJTI = JTI->createJumpTableIndex(LPadList);
2264 
2265   const VERegisterInfo &RI = TII->getRegisterInfo();
2266   // Add a register mask with no preserved registers.  This results in all
2267   // registers being marked as clobbered.
2268   BuildMI(DispatchBB, DL, TII->get(VE::NOP))
2269       .addRegMask(RI.getNoPreservedMask());
2270 
2271   if (isPositionIndependent()) {
2272     // Force to generate GETGOT, since current implementation doesn't store GOT
2273     // register.
2274     BuildMI(DispatchBB, DL, TII->get(VE::GETGOT), VE::SX15);
2275   }
2276 
2277   // IReg is used as an index in a memory operand and therefore can't be SP
2278   const TargetRegisterClass *RC = &VE::I64RegClass;
2279   Register IReg = MRI.createVirtualRegister(RC);
2280   addFrameReference(BuildMI(DispatchBB, DL, TII->get(VE::LDLZXrii), IReg), FI,
2281                     OffsetCS);
2282   if (LPadList.size() < 64) {
2283     BuildMI(DispatchBB, DL, TII->get(VE::BRCFLir_t))
2284         .addImm(VECC::CC_ILE)
2285         .addImm(LPadList.size())
2286         .addReg(IReg)
2287         .addMBB(TrapBB);
2288   } else {
2289     assert(LPadList.size() <= 0x7FFFFFFF && "Too large Landing Pad!");
2290     Register TmpReg = MRI.createVirtualRegister(RC);
2291     BuildMI(DispatchBB, DL, TII->get(VE::LEAzii), TmpReg)
2292         .addImm(0)
2293         .addImm(0)
2294         .addImm(LPadList.size());
2295     BuildMI(DispatchBB, DL, TII->get(VE::BRCFLrr_t))
2296         .addImm(VECC::CC_ILE)
2297         .addReg(TmpReg, getKillRegState(true))
2298         .addReg(IReg)
2299         .addMBB(TrapBB);
2300   }
2301 
2302   Register BReg = MRI.createVirtualRegister(RC);
2303   Register Tmp1 = MRI.createVirtualRegister(RC);
2304   Register Tmp2 = MRI.createVirtualRegister(RC);
2305 
2306   if (isPositionIndependent()) {
2307     // Create following instructions for local linkage PIC code.
2308     //     lea    %Tmp1, .LJTI0_0@gotoff_lo
2309     //     and    %Tmp2, %Tmp1, (32)0
2310     //     lea.sl %BReg, .LJTI0_0@gotoff_hi(%Tmp2, %s15) ; %s15 is GOT
2311     BuildMI(DispContBB, DL, TII->get(VE::LEAzii), Tmp1)
2312         .addImm(0)
2313         .addImm(0)
2314         .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_GOTOFF_LO32);
2315     BuildMI(DispContBB, DL, TII->get(VE::ANDrm), Tmp2)
2316         .addReg(Tmp1, getKillRegState(true))
2317         .addImm(M0(32));
2318     BuildMI(DispContBB, DL, TII->get(VE::LEASLrri), BReg)
2319         .addReg(VE::SX15)
2320         .addReg(Tmp2, getKillRegState(true))
2321         .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_GOTOFF_HI32);
2322   } else {
2323     // Create following instructions for non-PIC code.
2324     //     lea     %Tmp1, .LJTI0_0@lo
2325     //     and     %Tmp2, %Tmp1, (32)0
2326     //     lea.sl  %BReg, .LJTI0_0@hi(%Tmp2)
2327     BuildMI(DispContBB, DL, TII->get(VE::LEAzii), Tmp1)
2328         .addImm(0)
2329         .addImm(0)
2330         .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_LO32);
2331     BuildMI(DispContBB, DL, TII->get(VE::ANDrm), Tmp2)
2332         .addReg(Tmp1, getKillRegState(true))
2333         .addImm(M0(32));
2334     BuildMI(DispContBB, DL, TII->get(VE::LEASLrii), BReg)
2335         .addReg(Tmp2, getKillRegState(true))
2336         .addImm(0)
2337         .addJumpTableIndex(MJTI, VEMCExpr::VK_VE_HI32);
2338   }
2339 
2340   switch (JTE) {
2341   case MachineJumpTableInfo::EK_BlockAddress: {
2342     // Generate simple block address code for no-PIC model.
2343     //     sll %Tmp1, %IReg, 3
2344     //     lds %TReg, 0(%Tmp1, %BReg)
2345     //     bcfla %TReg
2346 
2347     Register TReg = MRI.createVirtualRegister(RC);
2348     Register Tmp1 = MRI.createVirtualRegister(RC);
2349 
2350     BuildMI(DispContBB, DL, TII->get(VE::SLLri), Tmp1)
2351         .addReg(IReg, getKillRegState(true))
2352         .addImm(3);
2353     BuildMI(DispContBB, DL, TII->get(VE::LDrri), TReg)
2354         .addReg(BReg, getKillRegState(true))
2355         .addReg(Tmp1, getKillRegState(true))
2356         .addImm(0);
2357     BuildMI(DispContBB, DL, TII->get(VE::BCFLari_t))
2358         .addReg(TReg, getKillRegState(true))
2359         .addImm(0);
2360     break;
2361   }
2362   case MachineJumpTableInfo::EK_Custom32: {
2363     // Generate block address code using differences from the function pointer
2364     // for PIC model.
2365     //     sll %Tmp1, %IReg, 2
2366     //     ldl.zx %OReg, 0(%Tmp1, %BReg)
2367     //     Prepare function address in BReg2.
2368     //     adds.l %TReg, %BReg2, %OReg
2369     //     bcfla %TReg
2370 
2371     assert(isPositionIndependent());
2372     Register OReg = MRI.createVirtualRegister(RC);
2373     Register TReg = MRI.createVirtualRegister(RC);
2374     Register Tmp1 = MRI.createVirtualRegister(RC);
2375 
2376     BuildMI(DispContBB, DL, TII->get(VE::SLLri), Tmp1)
2377         .addReg(IReg, getKillRegState(true))
2378         .addImm(2);
2379     BuildMI(DispContBB, DL, TII->get(VE::LDLZXrri), OReg)
2380         .addReg(BReg, getKillRegState(true))
2381         .addReg(Tmp1, getKillRegState(true))
2382         .addImm(0);
2383     Register BReg2 =
2384         prepareSymbol(*DispContBB, DispContBB->end(),
2385                       DispContBB->getParent()->getName(), DL, /* Local */ true);
2386     BuildMI(DispContBB, DL, TII->get(VE::ADDSLrr), TReg)
2387         .addReg(OReg, getKillRegState(true))
2388         .addReg(BReg2, getKillRegState(true));
2389     BuildMI(DispContBB, DL, TII->get(VE::BCFLari_t))
2390         .addReg(TReg, getKillRegState(true))
2391         .addImm(0);
2392     break;
2393   }
2394   default:
2395     llvm_unreachable("Unexpected jump table encoding");
2396   }
2397 
2398   // Add the jump table entries as successors to the MBB.
2399   SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
2400   for (auto &LP : LPadList)
2401     if (SeenMBBs.insert(LP).second)
2402       DispContBB->addSuccessor(LP);
2403 
2404   // N.B. the order the invoke BBs are processed in doesn't matter here.
2405   SmallVector<MachineBasicBlock *, 64> MBBLPads;
2406   const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
2407   for (MachineBasicBlock *MBB : InvokeBBs) {
2408     // Remove the landing pad successor from the invoke block and replace it
2409     // with the new dispatch block.
2410     // Keep a copy of Successors since it's modified inside the loop.
2411     SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
2412                                                    MBB->succ_rend());
2413     // FIXME: Avoid quadratic complexity.
2414     for (auto MBBS : Successors) {
2415       if (MBBS->isEHPad()) {
2416         MBB->removeSuccessor(MBBS);
2417         MBBLPads.push_back(MBBS);
2418       }
2419     }
2420 
2421     MBB->addSuccessor(DispatchBB);
2422 
2423     // Find the invoke call and mark all of the callee-saved registers as
2424     // 'implicit defined' so that they're spilled.  This prevents code from
2425     // moving instructions to before the EH block, where they will never be
2426     // executed.
2427     for (auto &II : reverse(*MBB)) {
2428       if (!II.isCall())
2429         continue;
2430 
2431       DenseMap<Register, bool> DefRegs;
2432       for (auto &MOp : II.operands())
2433         if (MOp.isReg())
2434           DefRegs[MOp.getReg()] = true;
2435 
2436       MachineInstrBuilder MIB(*MF, &II);
2437       for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
2438         Register Reg = SavedRegs[RI];
2439         if (!DefRegs[Reg])
2440           MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
2441       }
2442 
2443       break;
2444     }
2445   }
2446 
2447   // Mark all former landing pads as non-landing pads.  The dispatch is the only
2448   // landing pad now.
2449   for (auto &LP : MBBLPads)
2450     LP->setIsEHPad(false);
2451 
2452   // The instruction is gone now.
2453   MI.eraseFromParent();
2454   return BB;
2455 }
2456 
2457 MachineBasicBlock *
2458 VETargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
2459                                               MachineBasicBlock *BB) const {
2460   switch (MI.getOpcode()) {
2461   default:
2462     llvm_unreachable("Unknown Custom Instruction!");
2463   case VE::EH_SjLj_LongJmp:
2464     return emitEHSjLjLongJmp(MI, BB);
2465   case VE::EH_SjLj_SetJmp:
2466     return emitEHSjLjSetJmp(MI, BB);
2467   case VE::EH_SjLj_Setup_Dispatch:
2468     return emitSjLjDispatchBlock(MI, BB);
2469   }
2470 }
2471 
2472 static bool isI32Insn(const SDNode *User, const SDNode *N) {
2473   switch (User->getOpcode()) {
2474   default:
2475     return false;
2476   case ISD::ADD:
2477   case ISD::SUB:
2478   case ISD::MUL:
2479   case ISD::SDIV:
2480   case ISD::UDIV:
2481   case ISD::SETCC:
2482   case ISD::SMIN:
2483   case ISD::SMAX:
2484   case ISD::SHL:
2485   case ISD::SRA:
2486   case ISD::BSWAP:
2487   case ISD::SINT_TO_FP:
2488   case ISD::UINT_TO_FP:
2489   case ISD::BR_CC:
2490   case ISD::BITCAST:
2491   case ISD::ATOMIC_CMP_SWAP:
2492   case ISD::ATOMIC_SWAP:
2493     return true;
2494   case ISD::SRL:
2495     if (N->getOperand(0).getOpcode() != ISD::SRL)
2496       return true;
2497     // (srl (trunc (srl ...))) may be optimized by combining srl, so
2498     // doesn't optimize trunc now.
2499     return false;
2500   case ISD::SELECT_CC:
2501     if (User->getOperand(2).getNode() != N &&
2502         User->getOperand(3).getNode() != N)
2503       return true;
2504     LLVM_FALLTHROUGH;
2505   case ISD::AND:
2506   case ISD::OR:
2507   case ISD::XOR:
2508   case ISD::SELECT:
2509   case ISD::CopyToReg:
2510     // Check all use of selections, bit operations, and copies.  If all of them
2511     // are safe, optimize truncate to extract_subreg.
2512     for (const SDNode *U : User->uses()) {
2513       switch (U->getOpcode()) {
2514       default:
2515         // If the use is an instruction which treats the source operand as i32,
2516         // it is safe to avoid truncate here.
2517         if (isI32Insn(U, N))
2518           continue;
2519         break;
2520       case ISD::ANY_EXTEND:
2521       case ISD::SIGN_EXTEND:
2522       case ISD::ZERO_EXTEND: {
2523         // Special optimizations to the combination of ext and trunc.
2524         // (ext ... (select ... (trunc ...))) is safe to avoid truncate here
2525         // since this truncate instruction clears higher 32 bits which is filled
2526         // by one of ext instructions later.
2527         assert(N->getValueType(0) == MVT::i32 &&
2528                "find truncate to not i32 integer");
2529         if (User->getOpcode() == ISD::SELECT_CC ||
2530             User->getOpcode() == ISD::SELECT)
2531           continue;
2532         break;
2533       }
2534       }
2535       return false;
2536     }
2537     return true;
2538   }
2539 }
2540 
2541 // Optimize TRUNCATE in DAG combining.  Optimizing it in CUSTOM lower is
2542 // sometime too early.  Optimizing it in DAG pattern matching in VEInstrInfo.td
2543 // is sometime too late.  So, doing it at here.
2544 SDValue VETargetLowering::combineTRUNCATE(SDNode *N,
2545                                           DAGCombinerInfo &DCI) const {
2546   assert(N->getOpcode() == ISD::TRUNCATE &&
2547          "Should be called with a TRUNCATE node");
2548 
2549   SelectionDAG &DAG = DCI.DAG;
2550   SDLoc DL(N);
2551   EVT VT = N->getValueType(0);
2552 
2553   // We prefer to do this when all types are legal.
2554   if (!DCI.isAfterLegalizeDAG())
2555     return SDValue();
2556 
2557   // Skip combine TRUNCATE atm if the operand of TRUNCATE might be a constant.
2558   if (N->getOperand(0)->getOpcode() == ISD::SELECT_CC &&
2559       isa<ConstantSDNode>(N->getOperand(0)->getOperand(0)) &&
2560       isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
2561     return SDValue();
2562 
2563   // Check all use of this TRUNCATE.
2564   for (const SDNode *User : N->uses()) {
2565     // Make sure that we're not going to replace TRUNCATE for non i32
2566     // instructions.
2567     //
2568     // FIXME: Although we could sometimes handle this, and it does occur in
2569     // practice that one of the condition inputs to the select is also one of
2570     // the outputs, we currently can't deal with this.
2571     if (isI32Insn(User, N))
2572       continue;
2573 
2574     return SDValue();
2575   }
2576 
2577   SDValue SubI32 = DAG.getTargetConstant(VE::sub_i32, DL, MVT::i32);
2578   return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT,
2579                                     N->getOperand(0), SubI32),
2580                  0);
2581 }
2582 
2583 SDValue VETargetLowering::PerformDAGCombine(SDNode *N,
2584                                             DAGCombinerInfo &DCI) const {
2585   switch (N->getOpcode()) {
2586   default:
2587     break;
2588   case ISD::TRUNCATE:
2589     return combineTRUNCATE(N, DCI);
2590   }
2591 
2592   return SDValue();
2593 }
2594 
2595 //===----------------------------------------------------------------------===//
2596 // VE Inline Assembly Support
2597 //===----------------------------------------------------------------------===//
2598 
2599 VETargetLowering::ConstraintType
2600 VETargetLowering::getConstraintType(StringRef Constraint) const {
2601   if (Constraint.size() == 1) {
2602     switch (Constraint[0]) {
2603     default:
2604       break;
2605     case 'v': // vector registers
2606       return C_RegisterClass;
2607     }
2608   }
2609   return TargetLowering::getConstraintType(Constraint);
2610 }
2611 
2612 std::pair<unsigned, const TargetRegisterClass *>
2613 VETargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
2614                                                StringRef Constraint,
2615                                                MVT VT) const {
2616   const TargetRegisterClass *RC = nullptr;
2617   if (Constraint.size() == 1) {
2618     switch (Constraint[0]) {
2619     default:
2620       return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
2621     case 'r':
2622       RC = &VE::I64RegClass;
2623       break;
2624     case 'v':
2625       RC = &VE::V64RegClass;
2626       break;
2627     }
2628     return std::make_pair(0U, RC);
2629   }
2630 
2631   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
2632 }
2633 
2634 //===----------------------------------------------------------------------===//
2635 // VE Target Optimization Support
2636 //===----------------------------------------------------------------------===//
2637 
2638 unsigned VETargetLowering::getMinimumJumpTableEntries() const {
2639   // Specify 8 for PIC model to relieve the impact of PIC load instructions.
2640   if (isJumpTableRelative())
2641     return 8;
2642 
2643   return TargetLowering::getMinimumJumpTableEntries();
2644 }
2645 
2646 bool VETargetLowering::hasAndNot(SDValue Y) const {
2647   EVT VT = Y.getValueType();
2648 
2649   // VE doesn't have vector and not instruction.
2650   if (VT.isVector())
2651     return false;
2652 
2653   // VE allows different immediate values for X and Y where ~X & Y.
2654   // Only simm7 works for X, and only mimm works for Y on VE.  However, this
2655   // function is used to check whether an immediate value is OK for and-not
2656   // instruction as both X and Y.  Generating additional instruction to
2657   // retrieve an immediate value is no good since the purpose of this
2658   // function is to convert a series of 3 instructions to another series of
2659   // 3 instructions with better parallelism.  Therefore, we return false
2660   // for all immediate values now.
2661   // FIXME: Change hasAndNot function to have two operands to make it work
2662   //        correctly with Aurora VE.
2663   if (isa<ConstantSDNode>(Y))
2664     return false;
2665 
2666   // It's ok for generic registers.
2667   return true;
2668 }
2669 
2670 SDValue VETargetLowering::lowerToVVP(SDValue Op, SelectionDAG &DAG) const {
2671   // Can we represent this as a VVP node.
2672   const unsigned Opcode = Op->getOpcode();
2673   auto VVPOpcodeOpt = getVVPOpcode(Opcode);
2674   if (!VVPOpcodeOpt.hasValue())
2675     return SDValue();
2676   unsigned VVPOpcode = VVPOpcodeOpt.getValue();
2677   const bool FromVP = ISD::isVPOpcode(Opcode);
2678 
2679   // The representative and legalized vector type of this operation.
2680   VECustomDAG CDAG(DAG, Op);
2681   MVT MaskVT = MVT::v256i1; // TODO: packed mode.
2682   EVT OpVecVT = Op.getValueType();
2683   EVT LegalVecVT = getTypeToTransformTo(*DAG.getContext(), OpVecVT);
2684 
2685   SDValue AVL;
2686   SDValue Mask;
2687 
2688   if (FromVP) {
2689     // All upstream VP SDNodes always have a mask and avl.
2690     auto MaskIdx = ISD::getVPMaskIdx(Opcode).getValue();
2691     auto AVLIdx = ISD::getVPExplicitVectorLengthIdx(Opcode).getValue();
2692     Mask = Op->getOperand(MaskIdx);
2693     AVL = Op->getOperand(AVLIdx);
2694 
2695   } else {
2696     // Materialize the VL parameter.
2697     AVL = CDAG.getConstant(OpVecVT.getVectorNumElements(), MVT::i32);
2698     SDValue ConstTrue = CDAG.getConstant(1, MVT::i32);
2699     Mask = CDAG.getBroadcast(MaskVT, ConstTrue, AVL);
2700   }
2701 
2702   if (isVVPBinaryOp(VVPOpcode)) {
2703     assert(LegalVecVT.isSimple());
2704     return CDAG.getNode(VVPOpcode, LegalVecVT,
2705                         {Op->getOperand(0), Op->getOperand(1), Mask, AVL});
2706   }
2707   if (VVPOpcode == VEISD::VVP_SELECT) {
2708     auto Mask = Op->getOperand(0);
2709     auto OnTrue = Op->getOperand(1);
2710     auto OnFalse = Op->getOperand(2);
2711     return CDAG.getNode(VVPOpcode, LegalVecVT, {OnTrue, OnFalse, Mask, AVL});
2712   }
2713   llvm_unreachable("lowerToVVP called for unexpected SDNode.");
2714 }
2715 
2716 SDValue VETargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
2717                                                   SelectionDAG &DAG) const {
2718   assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
2719   MVT VT = Op.getOperand(0).getSimpleValueType();
2720 
2721   // Special treatment for packed V64 types.
2722   assert(VT == MVT::v512i32 || VT == MVT::v512f32);
2723   (void)VT;
2724   // Example of codes:
2725   //   %packed_v = extractelt %vr, %idx / 2
2726   //   %v = %packed_v >> (%idx % 2 * 32)
2727   //   %res = %v & 0xffffffff
2728 
2729   SDValue Vec = Op.getOperand(0);
2730   SDValue Idx = Op.getOperand(1);
2731   SDLoc DL(Op);
2732   SDValue Result = Op;
2733   if (false /* Idx->isConstant() */) {
2734     // TODO: optimized implementation using constant values
2735   } else {
2736     SDValue Const1 = DAG.getConstant(1, DL, MVT::i64);
2737     SDValue HalfIdx = DAG.getNode(ISD::SRL, DL, MVT::i64, {Idx, Const1});
2738     SDValue PackedElt =
2739         SDValue(DAG.getMachineNode(VE::LVSvr, DL, MVT::i64, {Vec, HalfIdx}), 0);
2740     SDValue AndIdx = DAG.getNode(ISD::AND, DL, MVT::i64, {Idx, Const1});
2741     SDValue Shift = DAG.getNode(ISD::XOR, DL, MVT::i64, {AndIdx, Const1});
2742     SDValue Const5 = DAG.getConstant(5, DL, MVT::i64);
2743     Shift = DAG.getNode(ISD::SHL, DL, MVT::i64, {Shift, Const5});
2744     PackedElt = DAG.getNode(ISD::SRL, DL, MVT::i64, {PackedElt, Shift});
2745     SDValue Mask = DAG.getConstant(0xFFFFFFFFL, DL, MVT::i64);
2746     PackedElt = DAG.getNode(ISD::AND, DL, MVT::i64, {PackedElt, Mask});
2747     SDValue SubI32 = DAG.getTargetConstant(VE::sub_i32, DL, MVT::i32);
2748     Result = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
2749                                         MVT::i32, PackedElt, SubI32),
2750                      0);
2751 
2752     if (Op.getSimpleValueType() == MVT::f32) {
2753       Result = DAG.getBitcast(MVT::f32, Result);
2754     } else {
2755       assert(Op.getSimpleValueType() == MVT::i32);
2756     }
2757   }
2758   return Result;
2759 }
2760 
2761 SDValue VETargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
2762                                                  SelectionDAG &DAG) const {
2763   assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
2764   MVT VT = Op.getOperand(0).getSimpleValueType();
2765 
2766   // Special treatment for packed V64 types.
2767   assert(VT == MVT::v512i32 || VT == MVT::v512f32);
2768   (void)VT;
2769   // The v512i32 and v512f32 starts from upper bits (0..31).  This "upper
2770   // bits" required `val << 32` from C implementation's point of view.
2771   //
2772   // Example of codes:
2773   //   %packed_elt = extractelt %vr, (%idx >> 1)
2774   //   %shift = ((%idx & 1) ^ 1) << 5
2775   //   %packed_elt &= 0xffffffff00000000 >> shift
2776   //   %packed_elt |= (zext %val) << shift
2777   //   %vr = insertelt %vr, %packed_elt, (%idx >> 1)
2778 
2779   SDLoc DL(Op);
2780   SDValue Vec = Op.getOperand(0);
2781   SDValue Val = Op.getOperand(1);
2782   SDValue Idx = Op.getOperand(2);
2783   if (Idx.getSimpleValueType() == MVT::i32)
2784     Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx);
2785   if (Val.getSimpleValueType() == MVT::f32)
2786     Val = DAG.getBitcast(MVT::i32, Val);
2787   assert(Val.getSimpleValueType() == MVT::i32);
2788   Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
2789 
2790   SDValue Result = Op;
2791   if (false /* Idx->isConstant()*/) {
2792     // TODO: optimized implementation using constant values
2793   } else {
2794     SDValue Const1 = DAG.getConstant(1, DL, MVT::i64);
2795     SDValue HalfIdx = DAG.getNode(ISD::SRL, DL, MVT::i64, {Idx, Const1});
2796     SDValue PackedElt =
2797         SDValue(DAG.getMachineNode(VE::LVSvr, DL, MVT::i64, {Vec, HalfIdx}), 0);
2798     SDValue AndIdx = DAG.getNode(ISD::AND, DL, MVT::i64, {Idx, Const1});
2799     SDValue Shift = DAG.getNode(ISD::XOR, DL, MVT::i64, {AndIdx, Const1});
2800     SDValue Const5 = DAG.getConstant(5, DL, MVT::i64);
2801     Shift = DAG.getNode(ISD::SHL, DL, MVT::i64, {Shift, Const5});
2802     SDValue Mask = DAG.getConstant(0xFFFFFFFF00000000L, DL, MVT::i64);
2803     Mask = DAG.getNode(ISD::SRL, DL, MVT::i64, {Mask, Shift});
2804     PackedElt = DAG.getNode(ISD::AND, DL, MVT::i64, {PackedElt, Mask});
2805     Val = DAG.getNode(ISD::SHL, DL, MVT::i64, {Val, Shift});
2806     PackedElt = DAG.getNode(ISD::OR, DL, MVT::i64, {PackedElt, Val});
2807     Result =
2808         SDValue(DAG.getMachineNode(VE::LSVrr_v, DL, Vec.getSimpleValueType(),
2809                                    {HalfIdx, PackedElt, Vec}),
2810                 0);
2811   }
2812   return Result;
2813 }
2814