xref: /freebsd/contrib/llvm-project/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp (revision 78cd75393ec79565c63927bf200f06f839a1dc05)
1 //===-- SystemZISelLowering.cpp - SystemZ DAG lowering implementation -----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the SystemZTargetLowering class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "SystemZISelLowering.h"
14 #include "SystemZCallingConv.h"
15 #include "SystemZConstantPoolValue.h"
16 #include "SystemZMachineFunctionInfo.h"
17 #include "SystemZTargetMachine.h"
18 #include "llvm/CodeGen/CallingConvLower.h"
19 #include "llvm/CodeGen/MachineInstrBuilder.h"
20 #include "llvm/CodeGen/MachineRegisterInfo.h"
21 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
22 #include "llvm/IR/IntrinsicInst.h"
23 #include "llvm/IR/Intrinsics.h"
24 #include "llvm/IR/IntrinsicsS390.h"
25 #include "llvm/Support/CommandLine.h"
26 #include "llvm/Support/KnownBits.h"
27 #include <cctype>
28 #include <optional>
29 
30 using namespace llvm;
31 
32 #define DEBUG_TYPE "systemz-lower"
33 
34 namespace {
35 // Represents information about a comparison.
36 struct Comparison {
37   Comparison(SDValue Op0In, SDValue Op1In, SDValue ChainIn)
38     : Op0(Op0In), Op1(Op1In), Chain(ChainIn),
39       Opcode(0), ICmpType(0), CCValid(0), CCMask(0) {}
40 
41   // The operands to the comparison.
42   SDValue Op0, Op1;
43 
44   // Chain if this is a strict floating-point comparison.
45   SDValue Chain;
46 
47   // The opcode that should be used to compare Op0 and Op1.
48   unsigned Opcode;
49 
50   // A SystemZICMP value.  Only used for integer comparisons.
51   unsigned ICmpType;
52 
53   // The mask of CC values that Opcode can produce.
54   unsigned CCValid;
55 
56   // The mask of CC values for which the original condition is true.
57   unsigned CCMask;
58 };
59 } // end anonymous namespace
60 
61 // Classify VT as either 32 or 64 bit.
62 static bool is32Bit(EVT VT) {
63   switch (VT.getSimpleVT().SimpleTy) {
64   case MVT::i32:
65     return true;
66   case MVT::i64:
67     return false;
68   default:
69     llvm_unreachable("Unsupported type");
70   }
71 }
72 
73 // Return a version of MachineOperand that can be safely used before the
74 // final use.
75 static MachineOperand earlyUseOperand(MachineOperand Op) {
76   if (Op.isReg())
77     Op.setIsKill(false);
78   return Op;
79 }
80 
// Construct the SystemZ lowering rules for subtarget STI.  The body
// (1) registers a register class for every value type that is kept in
// registers for this subtarget, (2) records, per operation and value type,
// whether the node is Legal, Custom, Promote or Expand, and (3) sets
// target-wide knobs: scheduling preference, boolean contents, function
// alignment, DAG-combine opcodes and the memcpy/memset store limits.
// Several (opcode, type) pairs are set more than once below (e.g. CTPOP for
// i32/i64), so the order of the statements should be preserved.
81 SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
82                                              const SystemZSubtarget &STI)
83     : TargetLowering(TM), Subtarget(STI) {
     // Integer value type as wide as a pointer; used below for the symbolic
     // address nodes and for dynamic stack allocation.
84   MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
85 
     // Special-register description for this subtarget; only the stack
     // pointer is queried from it in this constructor.
86   auto *Regs = STI.getSpecialRegisters();
87 
88   // Set up the register classes.
89   if (Subtarget.hasHighWord())
90     addRegisterClass(MVT::i32, &SystemZ::GRX32BitRegClass);
91   else
92     addRegisterClass(MVT::i32, &SystemZ::GR32BitRegClass);
93   addRegisterClass(MVT::i64, &SystemZ::GR64BitRegClass);
     // Floating-point and vector register classes are only registered when
     // soft-float is not in use.
94   if (!useSoftFloat()) {
95     if (Subtarget.hasVector()) {
96       addRegisterClass(MVT::f32, &SystemZ::VR32BitRegClass);
97       addRegisterClass(MVT::f64, &SystemZ::VR64BitRegClass);
98     } else {
99       addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass);
100       addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass);
101     }
102     if (Subtarget.hasVectorEnhancements1())
103       addRegisterClass(MVT::f128, &SystemZ::VR128BitRegClass);
104     else
105       addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass);
106 
107     if (Subtarget.hasVector()) {
108       addRegisterClass(MVT::v16i8, &SystemZ::VR128BitRegClass);
109       addRegisterClass(MVT::v8i16, &SystemZ::VR128BitRegClass);
110       addRegisterClass(MVT::v4i32, &SystemZ::VR128BitRegClass);
111       addRegisterClass(MVT::v2i64, &SystemZ::VR128BitRegClass);
112       addRegisterClass(MVT::v4f32, &SystemZ::VR128BitRegClass);
113       addRegisterClass(MVT::v2f64, &SystemZ::VR128BitRegClass);
114     }
115   }
116 
117   // Compute derived properties from the register classes
118   computeRegisterProperties(Subtarget.getRegisterInfo());
119 
120   // Set up special registers.
121   setStackPointerRegisterToSaveRestore(Regs->getStackPointerRegister());
122 
123   // TODO: It may be better to default to latency-oriented scheduling, however
124   // LLVM's current latency-oriented scheduler can't handle physreg definitions
125   // such as SystemZ has with CC, so set this to the register-pressure
126   // scheduler, because it can.
127   setSchedulingPreference(Sched::RegPressure);
128 
      // Scalar booleans use the zero-or-one convention, vector booleans the
      // zero-or-negative-one convention.
129   setBooleanContents(ZeroOrOneBooleanContent);
130   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
131 
132   // Instructions are strings of 2-byte aligned 2-byte values.
133   setMinFunctionAlignment(Align(2));
134   // For performance reasons we prefer 16-byte alignment.
135   setPrefFunctionAlignment(Align(16));
136 
137   // Handle operations that are handled in a similar way for all types.
138   for (unsigned I = MVT::FIRST_INTEGER_VALUETYPE;
139        I <= MVT::LAST_FP_VALUETYPE;
140        ++I) {
141     MVT VT = MVT::SimpleValueType(I);
142     if (isTypeLegal(VT)) {
143       // Lower SET_CC into an IPM-based sequence.
144       setOperationAction(ISD::SETCC, VT, Custom);
145       setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
146       setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
147 
148       // Expand SELECT(C, A, B) into SELECT_CC(X, 0, A, B, NE).
149       setOperationAction(ISD::SELECT, VT, Expand);
150 
151       // Lower SELECT_CC and BR_CC into separate comparisons and branches.
152       setOperationAction(ISD::SELECT_CC, VT, Custom);
153       setOperationAction(ISD::BR_CC,     VT, Custom);
154     }
155   }
156 
157   // Expand jump table branches as address arithmetic followed by an
158   // indirect jump.
159   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
160 
161   // Expand BRCOND into a BR_CC (see above).
162   setOperationAction(ISD::BRCOND, MVT::Other, Expand);
163 
164   // Handle integer types.
165   for (unsigned I = MVT::FIRST_INTEGER_VALUETYPE;
166        I <= MVT::LAST_INTEGER_VALUETYPE;
167        ++I) {
168     MVT VT = MVT::SimpleValueType(I);
169     if (isTypeLegal(VT)) {
          // Integer absolute value is marked Legal for every legal integer
          // type.
170       setOperationAction(ISD::ABS, VT, Legal);
171 
172       // Expand individual DIV and REMs into DIVREMs.
173       setOperationAction(ISD::SDIV, VT, Expand);
174       setOperationAction(ISD::UDIV, VT, Expand);
175       setOperationAction(ISD::SREM, VT, Expand);
176       setOperationAction(ISD::UREM, VT, Expand);
177       setOperationAction(ISD::SDIVREM, VT, Custom);
178       setOperationAction(ISD::UDIVREM, VT, Custom);
179 
180       // Support addition/subtraction with overflow.
181       setOperationAction(ISD::SADDO, VT, Custom);
182       setOperationAction(ISD::SSUBO, VT, Custom);
183 
184       // Support addition/subtraction with carry.
185       setOperationAction(ISD::UADDO, VT, Custom);
186       setOperationAction(ISD::USUBO, VT, Custom);
187 
188       // Support carry in as value rather than glue.
189       setOperationAction(ISD::UADDO_CARRY, VT, Custom);
190       setOperationAction(ISD::USUBO_CARRY, VT, Custom);
191 
192       // Lower ATOMIC_LOAD and ATOMIC_STORE into normal volatile loads and
193       // stores, putting a serialization instruction after the stores.
194       setOperationAction(ISD::ATOMIC_LOAD,  VT, Custom);
195       setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
196 
197       // Lower ATOMIC_LOAD_SUB into ATOMIC_LOAD_ADD if LAA and LAAG are
198       // available, or if the operand is constant.
199       setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
200 
201       // Use POPCNT on z196 and above.
202       if (Subtarget.hasPopulationCount())
203         setOperationAction(ISD::CTPOP, VT, Custom);
204       else
205         setOperationAction(ISD::CTPOP, VT, Expand);
206 
207       // No special instructions for these.
208       setOperationAction(ISD::CTTZ,            VT, Expand);
209       setOperationAction(ISD::ROTR,            VT, Expand);
210 
211       // Use *MUL_LOHI where possible instead of MULH*.
212       setOperationAction(ISD::MULHS, VT, Expand);
213       setOperationAction(ISD::MULHU, VT, Expand);
214       setOperationAction(ISD::SMUL_LOHI, VT, Custom);
215       setOperationAction(ISD::UMUL_LOHI, VT, Custom);
216 
217       // Only z196 and above have native support for conversions to unsigned.
218       // On z10, promoting to i64 doesn't generate an inexact condition for
219       // values that are outside the i32 range but in the i64 range, so use
220       // the default expansion.
221       if (!Subtarget.hasFPExtension())
222         setOperationAction(ISD::FP_TO_UINT, VT, Expand);
223 
224       // Mirror those settings for STRICT_FP_TO_[SU]INT.  Note that these all
225       // default to Expand, so need to be modified to Legal where appropriate.
226       setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Legal);
227       if (Subtarget.hasFPExtension())
228         setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Legal);
229 
230       // And similarly for STRICT_[SU]INT_TO_FP.
231       setOperationAction(ISD::STRICT_SINT_TO_FP, VT, Legal);
232       if (Subtarget.hasFPExtension())
233         setOperationAction(ISD::STRICT_UINT_TO_FP, VT, Legal);
234     }
235   }
236 
237   // Type legalization will convert 8- and 16-bit atomic operations into
238   // forms that operate on i32s (but still keeping the original memory VT).
239   // Lower them into full i32 operations.
240   setOperationAction(ISD::ATOMIC_SWAP,      MVT::i32, Custom);
241   setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i32, Custom);
242   setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i32, Custom);
243   setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i32, Custom);
244   setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i32, Custom);
245   setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i32, Custom);
246   setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Custom);
247   setOperationAction(ISD::ATOMIC_LOAD_MIN,  MVT::i32, Custom);
248   setOperationAction(ISD::ATOMIC_LOAD_MAX,  MVT::i32, Custom);
249   setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Custom);
250   setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Custom);
251 
252   // Even though i128 is not a legal type, we still need to custom lower
253   // the atomic operations in order to exploit SystemZ instructions.
254   setOperationAction(ISD::ATOMIC_LOAD,     MVT::i128, Custom);
255   setOperationAction(ISD::ATOMIC_STORE,    MVT::i128, Custom);
256 
257   // We can use the CC result of compare-and-swap to implement
258   // the "success" result of ATOMIC_CMP_SWAP_WITH_SUCCESS.
259   setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Custom);
260   setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Custom);
261   setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
262 
      // Atomic fences are custom lowered.
263   setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
264 
265   // Traps are legal, as we will convert them to "j .+2".
266   setOperationAction(ISD::TRAP, MVT::Other, Legal);
267 
268   // z10 has instructions for signed but not unsigned FP conversion.
269   // Handle unsigned 32-bit types as signed 64-bit types.
270   if (!Subtarget.hasFPExtension()) {
271     setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
272     setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
273     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Promote);
274     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Expand);
275   }
276 
277   // We have native support for a 64-bit CTLZ, via FLOGR.
278   setOperationAction(ISD::CTLZ, MVT::i32, Promote);
279   setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Promote);
280   setOperationAction(ISD::CTLZ, MVT::i64, Legal);
281 
282   // On z15 we have native support for a 64-bit CTPOP.
      // (This comes after the per-type CTPOP setting in the integer loop
      // above and replaces it for i32 and i64.)
283   if (Subtarget.hasMiscellaneousExtensions3()) {
284     setOperationAction(ISD::CTPOP, MVT::i32, Promote);
285     setOperationAction(ISD::CTPOP, MVT::i64, Legal);
286   }
287 
288   // Give LowerOperation the chance to replace 64-bit ORs with subregs.
289   setOperationAction(ISD::OR, MVT::i64, Custom);
290 
291   // Expand 128 bit shifts without using a libcall.
292   setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
293   setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
294   setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
295   setLibcallName(RTLIB::SRL_I128, nullptr);
296   setLibcallName(RTLIB::SHL_I128, nullptr);
297   setLibcallName(RTLIB::SRA_I128, nullptr);
298 
299   // Handle bitcast from fp128 to i128.
300   setOperationAction(ISD::BITCAST, MVT::i128, Custom);
301 
302   // We have native instructions for i8, i16 and i32 extensions, but not i1.
303   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
304   for (MVT VT : MVT::integer_valuetypes()) {
305     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
306     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
307     setLoadExtAction(ISD::EXTLOAD,  VT, MVT::i1, Promote);
308   }
309 
310   // Handle the various types of symbolic address.
311   setOperationAction(ISD::ConstantPool,     PtrVT, Custom);
312   setOperationAction(ISD::GlobalAddress,    PtrVT, Custom);
313   setOperationAction(ISD::GlobalTLSAddress, PtrVT, Custom);
314   setOperationAction(ISD::BlockAddress,     PtrVT, Custom);
315   setOperationAction(ISD::JumpTable,        PtrVT, Custom);
316 
317   // We need to handle dynamic allocations specially because of the
318   // 160-byte area at the bottom of the stack.
319   setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
320   setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, PtrVT, Custom);
321 
      // Saving and restoring the stack pointer is custom lowered too.
322   setOperationAction(ISD::STACKSAVE,    MVT::Other, Custom);
323   setOperationAction(ISD::STACKRESTORE, MVT::Other, Custom);
324 
325   // Handle prefetches with PFD or PFDRL.
326   setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
327 
328   for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
329     // Assume by default that all vector operations need to be expanded.
        // This reset has to run before the per-type Legal/Custom settings
        // made further down for the vector types.
330     for (unsigned Opcode = 0; Opcode < ISD::BUILTIN_OP_END; ++Opcode)
331       if (getOperationAction(Opcode, VT) == Legal)
332         setOperationAction(Opcode, VT, Expand);
333 
334     // Likewise all truncating stores and extending loads.
335     for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
336       setTruncStoreAction(VT, InnerVT, Expand);
337       setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
338       setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
339       setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
340     }
341 
342     if (isTypeLegal(VT)) {
343       // These operations are legal for anything that can be stored in a
344       // vector register, even if there is no native support for the format
345       // as such.  In particular, we can do these for v4f32 even though there
346       // are no specific instructions for that format.
347       setOperationAction(ISD::LOAD, VT, Legal);
348       setOperationAction(ISD::STORE, VT, Legal);
349       setOperationAction(ISD::VSELECT, VT, Legal);
350       setOperationAction(ISD::BITCAST, VT, Legal);
351       setOperationAction(ISD::UNDEF, VT, Legal);
352 
353       // Likewise, except that we need to replace the nodes with something
354       // more specific.
355       setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
356       setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
357     }
358   }
359 
360   // Handle integer vector types.
361   for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
362     if (isTypeLegal(VT)) {
363       // These operations have direct equivalents.
364       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Legal);
365       setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Legal);
366       setOperationAction(ISD::ADD, VT, Legal);
367       setOperationAction(ISD::SUB, VT, Legal);
          // MUL is left at the Expand default for v2i64.
          // NOTE(review): presumably there is no native 64-bit element
          // multiply -- confirm against the instruction definitions.
368       if (VT != MVT::v2i64)
369         setOperationAction(ISD::MUL, VT, Legal);
370       setOperationAction(ISD::ABS, VT, Legal);
371       setOperationAction(ISD::AND, VT, Legal);
372       setOperationAction(ISD::OR, VT, Legal);
373       setOperationAction(ISD::XOR, VT, Legal);
374       if (Subtarget.hasVectorEnhancements1())
375         setOperationAction(ISD::CTPOP, VT, Legal);
376       else
377         setOperationAction(ISD::CTPOP, VT, Custom);
378       setOperationAction(ISD::CTTZ, VT, Legal);
379       setOperationAction(ISD::CTLZ, VT, Legal);
380 
381       // Convert a GPR scalar to a vector by inserting it into element 0.
382       setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
383 
384       // Use a series of unpacks for extensions.
385       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
386       setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
387 
388       // Detect shifts by a scalar amount and convert them into
389       // V*_BY_SCALAR.
390       setOperationAction(ISD::SHL, VT, Custom);
391       setOperationAction(ISD::SRA, VT, Custom);
392       setOperationAction(ISD::SRL, VT, Custom);
393 
394       // At present ROTL isn't matched by DAGCombiner.  ROTR should be
395       // converted into ROTL.
396       setOperationAction(ISD::ROTL, VT, Expand);
397       setOperationAction(ISD::ROTR, VT, Expand);
398 
399       // Map SETCCs onto one of VCE, VCH or VCHL, swapping the operands
400       // and inverting the result as necessary.
401       setOperationAction(ISD::SETCC, VT, Custom);
402     }
403   }
404 
405   if (Subtarget.hasVector()) {
406     // There should be no need to check for float types other than v2f64
407     // since <2 x f32> isn't a legal type.
408     setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
409     setOperationAction(ISD::FP_TO_SINT, MVT::v2f64, Legal);
410     setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
411     setOperationAction(ISD::FP_TO_UINT, MVT::v2f64, Legal);
412     setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
413     setOperationAction(ISD::SINT_TO_FP, MVT::v2f64, Legal);
414     setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
415     setOperationAction(ISD::UINT_TO_FP, MVT::v2f64, Legal);
416 
417     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i64, Legal);
418     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f64, Legal);
419     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i64, Legal);
420     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f64, Legal);
421     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i64, Legal);
422     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f64, Legal);
423     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i64, Legal);
424     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f64, Legal);
425   }
426 
      // With the vector enhancements facility 2 the same conversions are also
      // marked Legal for v4i32 and v4f32.
427   if (Subtarget.hasVectorEnhancements2()) {
428     setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
429     setOperationAction(ISD::FP_TO_SINT, MVT::v4f32, Legal);
430     setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
431     setOperationAction(ISD::FP_TO_UINT, MVT::v4f32, Legal);
432     setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
433     setOperationAction(ISD::SINT_TO_FP, MVT::v4f32, Legal);
434     setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
435     setOperationAction(ISD::UINT_TO_FP, MVT::v4f32, Legal);
436 
437     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);
438     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4f32, Legal);
439     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Legal);
440     setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4f32, Legal);
441     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal);
442     setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4f32, Legal);
443     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Legal);
444     setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4f32, Legal);
445   }
446 
447   // Handle floating-point types.
448   for (unsigned I = MVT::FIRST_FP_VALUETYPE;
449        I <= MVT::LAST_FP_VALUETYPE;
450        ++I) {
451     MVT VT = MVT::SimpleValueType(I);
452     if (isTypeLegal(VT)) {
453       // We can use FI for FRINT.
454       setOperationAction(ISD::FRINT, VT, Legal);
455 
456       // We can use the extended form of FI for other rounding operations.
457       if (Subtarget.hasFPExtension()) {
458         setOperationAction(ISD::FNEARBYINT, VT, Legal);
459         setOperationAction(ISD::FFLOOR, VT, Legal);
460         setOperationAction(ISD::FCEIL, VT, Legal);
461         setOperationAction(ISD::FTRUNC, VT, Legal);
462         setOperationAction(ISD::FROUND, VT, Legal);
463       }
464 
465       // No special instructions for these.
466       setOperationAction(ISD::FSIN, VT, Expand);
467       setOperationAction(ISD::FCOS, VT, Expand);
468       setOperationAction(ISD::FSINCOS, VT, Expand);
469       setOperationAction(ISD::FREM, VT, Expand);
470       setOperationAction(ISD::FPOW, VT, Expand);
471 
472       // Special treatment.
473       setOperationAction(ISD::IS_FPCLASS, VT, Custom);
474 
475       // Handle constrained floating-point operations.
476       setOperationAction(ISD::STRICT_FADD, VT, Legal);
477       setOperationAction(ISD::STRICT_FSUB, VT, Legal);
478       setOperationAction(ISD::STRICT_FMUL, VT, Legal);
479       setOperationAction(ISD::STRICT_FDIV, VT, Legal);
480       setOperationAction(ISD::STRICT_FMA, VT, Legal);
481       setOperationAction(ISD::STRICT_FSQRT, VT, Legal);
482       setOperationAction(ISD::STRICT_FRINT, VT, Legal);
483       setOperationAction(ISD::STRICT_FP_ROUND, VT, Legal);
484       setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);
485       if (Subtarget.hasFPExtension()) {
486         setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
487         setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
488         setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
489         setOperationAction(ISD::STRICT_FROUND, VT, Legal);
490         setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
491       }
492     }
493   }
494 
495   // Handle floating-point vector types.
496   if (Subtarget.hasVector()) {
497     // Scalar-to-vector conversion is just a subreg.
498     setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
499     setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
500 
501     // Some insertions and extractions can be done directly but others
502     // need to go via integers.
503     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
504     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
505     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
506     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
507 
508     // These operations have direct equivalents.
509     setOperationAction(ISD::FADD, MVT::v2f64, Legal);
510     setOperationAction(ISD::FNEG, MVT::v2f64, Legal);
511     setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
512     setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
513     setOperationAction(ISD::FMA, MVT::v2f64, Legal);
514     setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
515     setOperationAction(ISD::FABS, MVT::v2f64, Legal);
516     setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
517     setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
518     setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
519     setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
520     setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
521     setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
522     setOperationAction(ISD::FROUND, MVT::v2f64, Legal);
523 
524     // Handle constrained floating-point operations.
525     setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
526     setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
527     setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
528     setOperationAction(ISD::STRICT_FMA, MVT::v2f64, Legal);
529     setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
530     setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
531     setOperationAction(ISD::STRICT_FRINT, MVT::v2f64, Legal);
532     setOperationAction(ISD::STRICT_FNEARBYINT, MVT::v2f64, Legal);
533     setOperationAction(ISD::STRICT_FFLOOR, MVT::v2f64, Legal);
534     setOperationAction(ISD::STRICT_FCEIL, MVT::v2f64, Legal);
535     setOperationAction(ISD::STRICT_FTRUNC, MVT::v2f64, Legal);
536     setOperationAction(ISD::STRICT_FROUND, MVT::v2f64, Legal);
537 
        // Floating-point vector comparisons are custom lowered; the
        // STRICT_FSETCCS form only when the vector enhancements facility 1 is
        // available.
538     setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
539     setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
540     setOperationAction(ISD::STRICT_FSETCC, MVT::v2f64, Custom);
541     setOperationAction(ISD::STRICT_FSETCC, MVT::v4f32, Custom);
542     if (Subtarget.hasVectorEnhancements1()) {
543       setOperationAction(ISD::STRICT_FSETCCS, MVT::v2f64, Custom);
544       setOperationAction(ISD::STRICT_FSETCCS, MVT::v4f32, Custom);
545     }
546   }
547 
548   // The vector enhancements facility 1 has instructions for these.
549   if (Subtarget.hasVectorEnhancements1()) {
550     setOperationAction(ISD::FADD, MVT::v4f32, Legal);
551     setOperationAction(ISD::FNEG, MVT::v4f32, Legal);
552     setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
553     setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
554     setOperationAction(ISD::FMA, MVT::v4f32, Legal);
555     setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
556     setOperationAction(ISD::FABS, MVT::v4f32, Legal);
557     setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
558     setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
559     setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
560     setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
561     setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
562     setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
563     setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
564 
        // Minimum/maximum operations become Legal for the scalar types
        // f32/f64/f128 and the vector types v4f32/v2f64.
565     setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
566     setOperationAction(ISD::FMAXIMUM, MVT::f64, Legal);
567     setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
568     setOperationAction(ISD::FMINIMUM, MVT::f64, Legal);
569 
570     setOperationAction(ISD::FMAXNUM, MVT::v2f64, Legal);
571     setOperationAction(ISD::FMAXIMUM, MVT::v2f64, Legal);
572     setOperationAction(ISD::FMINNUM, MVT::v2f64, Legal);
573     setOperationAction(ISD::FMINIMUM, MVT::v2f64, Legal);
574 
575     setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
576     setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
577     setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
578     setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
579 
580     setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
581     setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal);
582     setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
583     setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);
584 
585     setOperationAction(ISD::FMAXNUM, MVT::f128, Legal);
586     setOperationAction(ISD::FMAXIMUM, MVT::f128, Legal);
587     setOperationAction(ISD::FMINNUM, MVT::f128, Legal);
588     setOperationAction(ISD::FMINIMUM, MVT::f128, Legal);
589 
590     // Handle constrained floating-point operations.
591     setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
592     setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
593     setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
594     setOperationAction(ISD::STRICT_FMA, MVT::v4f32, Legal);
595     setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
596     setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
597     setOperationAction(ISD::STRICT_FRINT, MVT::v4f32, Legal);
598     setOperationAction(ISD::STRICT_FNEARBYINT, MVT::v4f32, Legal);
599     setOperationAction(ISD::STRICT_FFLOOR, MVT::v4f32, Legal);
600     setOperationAction(ISD::STRICT_FCEIL, MVT::v4f32, Legal);
601     setOperationAction(ISD::STRICT_FROUND, MVT::v4f32, Legal);
602     setOperationAction(ISD::STRICT_FTRUNC, MVT::v4f32, Legal);
603     for (auto VT : { MVT::f32, MVT::f64, MVT::f128,
604                      MVT::v4f32, MVT::v2f64 }) {
605       setOperationAction(ISD::STRICT_FMAXNUM, VT, Legal);
606       setOperationAction(ISD::STRICT_FMINNUM, VT, Legal);
607       setOperationAction(ISD::STRICT_FMAXIMUM, VT, Legal);
608       setOperationAction(ISD::STRICT_FMINIMUM, VT, Legal);
609     }
610   }
611 
612   // We only have fused f128 multiply-addition on vector registers.
613   if (!Subtarget.hasVectorEnhancements1()) {
614     setOperationAction(ISD::FMA, MVT::f128, Expand);
615     setOperationAction(ISD::STRICT_FMA, MVT::f128, Expand);
616   }
617 
618   // We don't have a copysign instruction on vector registers.
619   if (Subtarget.hasVectorEnhancements1())
620     setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
621 
622   // Needed so that we don't try to implement f128 constant loads using
623   // a load-and-extend of a f80 constant (in cases where the constant
624   // would fit in an f80).
625   for (MVT VT : MVT::fp_valuetypes())
626     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
627 
628   // We don't have extending load instruction on vector registers.
629   if (Subtarget.hasVectorEnhancements1()) {
630     setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
631     setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
632   }
633 
634   // Floating-point truncation and stores need to be done separately.
635   setTruncStoreAction(MVT::f64,  MVT::f32, Expand);
636   setTruncStoreAction(MVT::f128, MVT::f32, Expand);
637   setTruncStoreAction(MVT::f128, MVT::f64, Expand);
638 
639   // We have 64-bit FPR<->GPR moves, but need special handling for
640   // 32-bit forms.
641   if (!Subtarget.hasVector()) {
642     setOperationAction(ISD::BITCAST, MVT::i32, Custom);
643     setOperationAction(ISD::BITCAST, MVT::f32, Custom);
644   }
645 
646   // VASTART and VACOPY need to deal with the SystemZ-specific varargs
647   // structure, but VAEND is a no-op.
648   setOperationAction(ISD::VASTART, MVT::Other, Custom);
649   setOperationAction(ISD::VACOPY,  MVT::Other, Custom);
650   setOperationAction(ISD::VAEND,   MVT::Other, Expand);
651 
      // GET_ROUNDING with an i32 result is custom lowered.
652   setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
653 
654   // Codes for which we want to perform some z-specific combinations.
655   setTargetDAGCombine({ISD::ZERO_EXTEND,
656                        ISD::SIGN_EXTEND,
657                        ISD::SIGN_EXTEND_INREG,
658                        ISD::LOAD,
659                        ISD::STORE,
660                        ISD::VECTOR_SHUFFLE,
661                        ISD::EXTRACT_VECTOR_ELT,
662                        ISD::FP_ROUND,
663                        ISD::STRICT_FP_ROUND,
664                        ISD::FP_EXTEND,
665                        ISD::SINT_TO_FP,
666                        ISD::UINT_TO_FP,
667                        ISD::STRICT_FP_EXTEND,
668                        ISD::BSWAP,
669                        ISD::SDIV,
670                        ISD::UDIV,
671                        ISD::SREM,
672                        ISD::UREM,
673                        ISD::INTRINSIC_VOID,
674                        ISD::INTRINSIC_W_CHAIN});
675 
676   // Handle intrinsics.
677   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
678   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
679 
680   // We want to use MVC in preference to even a single load/store pair.
      // The limit is 2 stores when the subtarget has the vector facility and
      // 0 otherwise; when optimizing for size it is always 0.
681   MaxStoresPerMemcpy = Subtarget.hasVector() ? 2 : 0;
682   MaxStoresPerMemcpyOptSize = 0;
683 
684   // The main memset sequence is a byte store followed by an MVC.
685   // Two STC or MV..I stores win over that, but the kind of fused stores
686   // generated by target-independent code don't when the byte value is
687   // variable.  E.g.  "STC <reg>;MHI <reg>,257;STH <reg>" is not better
688   // than "STC;MVC".  Handle the choice in target-specific code instead.
689   MaxStoresPerMemset = Subtarget.hasVector() ? 2 : 0;
690   MaxStoresPerMemsetOptSize = 0;
691 
692   // Default to having -disable-strictnode-mutation on
693   IsStrictFPEnabled = true;
694 }
695 
696 bool SystemZTargetLowering::useSoftFloat() const {
697   return Subtarget.hasSoftFloat();
698 }
699 
700 EVT SystemZTargetLowering::getSetCCResultType(const DataLayout &DL,
701                                               LLVMContext &, EVT VT) const {
702   if (!VT.isVector())
703     return MVT::i32;
704   return VT.changeVectorElementTypeToInteger();
705 }
706 
707 bool SystemZTargetLowering::isFMAFasterThanFMulAndFAdd(
708     const MachineFunction &MF, EVT VT) const {
709   VT = VT.getScalarType();
710 
711   if (!VT.isSimple())
712     return false;
713 
714   switch (VT.getSimpleVT().SimpleTy) {
715   case MVT::f32:
716   case MVT::f64:
717     return true;
718   case MVT::f128:
719     return Subtarget.hasVectorEnhancements1();
720   default:
721     break;
722   }
723 
724   return false;
725 }
726 
727 // Return true if the constant can be generated with a vector instruction,
728 // such as VGM, VGMB or VREPI.
729 bool SystemZVectorConstantInfo::isVectorConstantLegal(
730     const SystemZSubtarget &Subtarget) {
731   const SystemZInstrInfo *TII = Subtarget.getInstrInfo();
732   if (!Subtarget.hasVector() ||
733       (isFP128 && !Subtarget.hasVectorEnhancements1()))
734     return false;
735 
736   // Try using VECTOR GENERATE BYTE MASK.  This is the architecturally-
737   // preferred way of creating all-zero and all-one vectors so give it
738   // priority over other methods below.
739   unsigned Mask = 0;
740   unsigned I = 0;
741   for (; I < SystemZ::VectorBytes; ++I) {
742     uint64_t Byte = IntBits.lshr(I * 8).trunc(8).getZExtValue();
743     if (Byte == 0xff)
744       Mask |= 1ULL << I;
745     else if (Byte != 0)
746       break;
747   }
748   if (I == SystemZ::VectorBytes) {
749     Opcode = SystemZISD::BYTE_MASK;
750     OpVals.push_back(Mask);
751     VecVT = MVT::getVectorVT(MVT::getIntegerVT(8), 16);
752     return true;
753   }
754 
755   if (SplatBitSize > 64)
756     return false;
757 
758   auto tryValue = [&](uint64_t Value) -> bool {
759     // Try VECTOR REPLICATE IMMEDIATE
760     int64_t SignedValue = SignExtend64(Value, SplatBitSize);
761     if (isInt<16>(SignedValue)) {
762       OpVals.push_back(((unsigned) SignedValue));
763       Opcode = SystemZISD::REPLICATE;
764       VecVT = MVT::getVectorVT(MVT::getIntegerVT(SplatBitSize),
765                                SystemZ::VectorBits / SplatBitSize);
766       return true;
767     }
768     // Try VECTOR GENERATE MASK
769     unsigned Start, End;
770     if (TII->isRxSBGMask(Value, SplatBitSize, Start, End)) {
771       // isRxSBGMask returns the bit numbers for a full 64-bit value, with 0
772       // denoting 1 << 63 and 63 denoting 1.  Convert them to bit numbers for
773       // an SplatBitSize value, so that 0 denotes 1 << (SplatBitSize-1).
774       OpVals.push_back(Start - (64 - SplatBitSize));
775       OpVals.push_back(End - (64 - SplatBitSize));
776       Opcode = SystemZISD::ROTATE_MASK;
777       VecVT = MVT::getVectorVT(MVT::getIntegerVT(SplatBitSize),
778                                SystemZ::VectorBits / SplatBitSize);
779       return true;
780     }
781     return false;
782   };
783 
784   // First try assuming that any undefined bits above the highest set bit
785   // and below the lowest set bit are 1s.  This increases the likelihood of
786   // being able to use a sign-extended element value in VECTOR REPLICATE
787   // IMMEDIATE or a wraparound mask in VECTOR GENERATE MASK.
788   uint64_t SplatBitsZ = SplatBits.getZExtValue();
789   uint64_t SplatUndefZ = SplatUndef.getZExtValue();
790   unsigned LowerBits = llvm::countr_zero(SplatBitsZ);
791   unsigned UpperBits = llvm::countl_zero(SplatBitsZ);
792   uint64_t Lower = SplatUndefZ & maskTrailingOnes<uint64_t>(LowerBits);
793   uint64_t Upper = SplatUndefZ & maskLeadingOnes<uint64_t>(UpperBits);
794   if (tryValue(SplatBitsZ | Upper | Lower))
795     return true;
796 
797   // Now try assuming that any undefined bits between the first and
798   // last defined set bits are set.  This increases the chances of
799   // using a non-wraparound mask.
800   uint64_t Middle = SplatUndefZ & ~Upper & ~Lower;
801   return tryValue(SplatBitsZ | Middle);
802 }
803 
804 SystemZVectorConstantInfo::SystemZVectorConstantInfo(APInt IntImm) {
805   if (IntImm.isSingleWord()) {
806     IntBits = APInt(128, IntImm.getZExtValue());
807     IntBits <<= (SystemZ::VectorBits - IntImm.getBitWidth());
808   } else
809     IntBits = IntImm;
810   assert(IntBits.getBitWidth() == 128 && "Unsupported APInt.");
811 
812   // Find the smallest splat.
813   SplatBits = IntImm;
814   unsigned Width = SplatBits.getBitWidth();
815   while (Width > 8) {
816     unsigned HalfSize = Width / 2;
817     APInt HighValue = SplatBits.lshr(HalfSize).trunc(HalfSize);
818     APInt LowValue = SplatBits.trunc(HalfSize);
819 
820     // If the two halves do not match, stop here.
821     if (HighValue != LowValue || 8 > HalfSize)
822       break;
823 
824     SplatBits = HighValue;
825     Width = HalfSize;
826   }
827   SplatUndef = 0;
828   SplatBitSize = Width;
829 }
830 
831 SystemZVectorConstantInfo::SystemZVectorConstantInfo(BuildVectorSDNode *BVN) {
832   assert(BVN->isConstant() && "Expected a constant BUILD_VECTOR");
833   bool HasAnyUndefs;
834 
835   // Get IntBits by finding the 128 bit splat.
836   BVN->isConstantSplat(IntBits, SplatUndef, SplatBitSize, HasAnyUndefs, 128,
837                        true);
838 
839   // Get SplatBits by finding the 8 bit or greater splat.
840   BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs, 8,
841                        true);
842 }
843 
844 bool SystemZTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
845                                          bool ForCodeSize) const {
846   // We can load zero using LZ?R and negative zero using LZ?R;LC?BR.
847   if (Imm.isZero() || Imm.isNegZero())
848     return true;
849 
850   return SystemZVectorConstantInfo(Imm).isVectorConstantLegal(Subtarget);
851 }
852 
853 /// Returns true if stack probing through inline assembly is requested.
854 bool SystemZTargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {
855   // If the function specifically requests inline stack probes, emit them.
856   if (MF.getFunction().hasFnAttribute("probe-stack"))
857     return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
858            "inline-asm";
859   return false;
860 }
861 
862 bool SystemZTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
863   // We can use CGFI or CLGFI.
864   return isInt<32>(Imm) || isUInt<32>(Imm);
865 }
866 
867 bool SystemZTargetLowering::isLegalAddImmediate(int64_t Imm) const {
868   // We can use ALGFI or SLGFI.
869   return isUInt<32>(Imm) || isUInt<32>(-Imm);
870 }
871 
872 bool SystemZTargetLowering::allowsMisalignedMemoryAccesses(
873     EVT VT, unsigned, Align, MachineMemOperand::Flags, unsigned *Fast) const {
874   // Unaligned accesses should never be slower than the expanded version.
875   // We check specifically for aligned accesses in the few cases where
876   // they are required.
877   if (Fast)
878     *Fast = 1;
879   return true;
880 }
881 
// Describes which addressing features a memory access may use.
struct AddressingMode {
  // Whether a long displacement may be used.
  bool LongDisplacement;

  // Whether an index register may be used.
  bool IndexReg;

  AddressingMode(bool LongDispl, bool IdxReg)
      : LongDisplacement(LongDispl), IndexReg(IdxReg) {}
};
893 
894 // Return the desired addressing mode for a Load which has only one use (in
895 // the same block) which is a Store.
896 static AddressingMode getLoadStoreAddrMode(bool HasVector,
897                                           Type *Ty) {
898   // With vector support a Load->Store combination may be combined to either
899   // an MVC or vector operations and it seems to work best to allow the
900   // vector addressing mode.
901   if (HasVector)
902     return AddressingMode(false/*LongDispl*/, true/*IdxReg*/);
903 
904   // Otherwise only the MVC case is special.
905   bool MVC = Ty->isIntegerTy(8);
906   return AddressingMode(!MVC/*LongDispl*/, !MVC/*IdxReg*/);
907 }
908 
909 // Return the addressing mode which seems most desirable given an LLVM
910 // Instruction pointer.
911 static AddressingMode
912 supportedAddressingMode(Instruction *I, bool HasVector) {
913   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
914     switch (II->getIntrinsicID()) {
915     default: break;
916     case Intrinsic::memset:
917     case Intrinsic::memmove:
918     case Intrinsic::memcpy:
919       return AddressingMode(false/*LongDispl*/, false/*IdxReg*/);
920     }
921   }
922 
923   if (isa<LoadInst>(I) && I->hasOneUse()) {
924     auto *SingleUser = cast<Instruction>(*I->user_begin());
925     if (SingleUser->getParent() == I->getParent()) {
926       if (isa<ICmpInst>(SingleUser)) {
927         if (auto *C = dyn_cast<ConstantInt>(SingleUser->getOperand(1)))
928           if (C->getBitWidth() <= 64 &&
929               (isInt<16>(C->getSExtValue()) || isUInt<16>(C->getZExtValue())))
930             // Comparison of memory with 16 bit signed / unsigned immediate
931             return AddressingMode(false/*LongDispl*/, false/*IdxReg*/);
932       } else if (isa<StoreInst>(SingleUser))
933         // Load->Store
934         return getLoadStoreAddrMode(HasVector, I->getType());
935     }
936   } else if (auto *StoreI = dyn_cast<StoreInst>(I)) {
937     if (auto *LoadI = dyn_cast<LoadInst>(StoreI->getValueOperand()))
938       if (LoadI->hasOneUse() && LoadI->getParent() == I->getParent())
939         // Load->Store
940         return getLoadStoreAddrMode(HasVector, LoadI->getType());
941   }
942 
943   if (HasVector && (isa<LoadInst>(I) || isa<StoreInst>(I))) {
944 
945     // * Use LDE instead of LE/LEY for z13 to avoid partial register
946     //   dependencies (LDE only supports small offsets).
947     // * Utilize the vector registers to hold floating point
948     //   values (vector load / store instructions only support small
949     //   offsets).
950 
951     Type *MemAccessTy = (isa<LoadInst>(I) ? I->getType() :
952                          I->getOperand(0)->getType());
953     bool IsFPAccess = MemAccessTy->isFloatingPointTy();
954     bool IsVectorAccess = MemAccessTy->isVectorTy();
955 
956     // A store of an extracted vector element will be combined into a VSTE type
957     // instruction.
958     if (!IsVectorAccess && isa<StoreInst>(I)) {
959       Value *DataOp = I->getOperand(0);
960       if (isa<ExtractElementInst>(DataOp))
961         IsVectorAccess = true;
962     }
963 
964     // A load which gets inserted into a vector element will be combined into a
965     // VLE type instruction.
966     if (!IsVectorAccess && isa<LoadInst>(I) && I->hasOneUse()) {
967       User *LoadUser = *I->user_begin();
968       if (isa<InsertElementInst>(LoadUser))
969         IsVectorAccess = true;
970     }
971 
972     if (IsFPAccess || IsVectorAccess)
973       return AddressingMode(false/*LongDispl*/, true/*IdxReg*/);
974   }
975 
976   return AddressingMode(true/*LongDispl*/, true/*IdxReg*/);
977 }
978 
979 bool SystemZTargetLowering::isLegalAddressingMode(const DataLayout &DL,
980        const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I) const {
981   // Punt on globals for now, although they can be used in limited
982   // RELATIVE LONG cases.
983   if (AM.BaseGV)
984     return false;
985 
986   // Require a 20-bit signed offset.
987   if (!isInt<20>(AM.BaseOffs))
988     return false;
989 
990   bool RequireD12 = Subtarget.hasVector() && Ty->isVectorTy();
991   AddressingMode SupportedAM(!RequireD12, true);
992   if (I != nullptr)
993     SupportedAM = supportedAddressingMode(I, Subtarget.hasVector());
994 
995   if (!SupportedAM.LongDisplacement && !isUInt<12>(AM.BaseOffs))
996     return false;
997 
998   if (!SupportedAM.IndexReg)
999     // No indexing allowed.
1000     return AM.Scale == 0;
1001   else
1002     // Indexing is OK but no scale factor can be applied.
1003     return AM.Scale == 0 || AM.Scale == 1;
1004 }
1005 
1006 bool SystemZTargetLowering::findOptimalMemOpLowering(
1007     std::vector<EVT> &MemOps, unsigned Limit, const MemOp &Op, unsigned DstAS,
1008     unsigned SrcAS, const AttributeList &FuncAttributes) const {
1009   const int MVCFastLen = 16;
1010 
1011   if (Limit != ~unsigned(0)) {
1012     // Don't expand Op into scalar loads/stores in these cases:
1013     if (Op.isMemcpy() && Op.allowOverlap() && Op.size() <= MVCFastLen)
1014       return false; // Small memcpy: Use MVC
1015     if (Op.isMemset() && Op.size() - 1 <= MVCFastLen)
1016       return false; // Small memset (first byte with STC/MVI): Use MVC
1017     if (Op.isZeroMemset())
1018       return false; // Memset zero: Use XC
1019   }
1020 
1021   return TargetLowering::findOptimalMemOpLowering(MemOps, Limit, Op, DstAS,
1022                                                   SrcAS, FuncAttributes);
1023 }
1024 
1025 EVT SystemZTargetLowering::getOptimalMemOpType(const MemOp &Op,
1026                                    const AttributeList &FuncAttributes) const {
1027   return Subtarget.hasVector() ? MVT::v2i64 : MVT::Other;
1028 }
1029 
1030 bool SystemZTargetLowering::isTruncateFree(Type *FromType, Type *ToType) const {
1031   if (!FromType->isIntegerTy() || !ToType->isIntegerTy())
1032     return false;
1033   unsigned FromBits = FromType->getPrimitiveSizeInBits().getFixedValue();
1034   unsigned ToBits = ToType->getPrimitiveSizeInBits().getFixedValue();
1035   return FromBits > ToBits;
1036 }
1037 
1038 bool SystemZTargetLowering::isTruncateFree(EVT FromVT, EVT ToVT) const {
1039   if (!FromVT.isInteger() || !ToVT.isInteger())
1040     return false;
1041   unsigned FromBits = FromVT.getFixedSizeInBits();
1042   unsigned ToBits = ToVT.getFixedSizeInBits();
1043   return FromBits > ToBits;
1044 }
1045 
1046 //===----------------------------------------------------------------------===//
1047 // Inline asm support
1048 //===----------------------------------------------------------------------===//
1049 
1050 TargetLowering::ConstraintType
1051 SystemZTargetLowering::getConstraintType(StringRef Constraint) const {
1052   if (Constraint.size() == 1) {
1053     switch (Constraint[0]) {
1054     case 'a': // Address register
1055     case 'd': // Data register (equivalent to 'r')
1056     case 'f': // Floating-point register
1057     case 'h': // High-part register
1058     case 'r': // General-purpose register
1059     case 'v': // Vector register
1060       return C_RegisterClass;
1061 
1062     case 'Q': // Memory with base and unsigned 12-bit displacement
1063     case 'R': // Likewise, plus an index
1064     case 'S': // Memory with base and signed 20-bit displacement
1065     case 'T': // Likewise, plus an index
1066     case 'm': // Equivalent to 'T'.
1067       return C_Memory;
1068 
1069     case 'I': // Unsigned 8-bit constant
1070     case 'J': // Unsigned 12-bit constant
1071     case 'K': // Signed 16-bit constant
1072     case 'L': // Signed 20-bit displacement (on all targets we support)
1073     case 'M': // 0x7fffffff
1074       return C_Immediate;
1075 
1076     default:
1077       break;
1078     }
1079   } else if (Constraint.size() == 2 && Constraint[0] == 'Z') {
1080     switch (Constraint[1]) {
1081     case 'Q': // Address with base and unsigned 12-bit displacement
1082     case 'R': // Likewise, plus an index
1083     case 'S': // Address with base and signed 20-bit displacement
1084     case 'T': // Likewise, plus an index
1085       return C_Address;
1086 
1087     default:
1088       break;
1089     }
1090   }
1091   return TargetLowering::getConstraintType(Constraint);
1092 }
1093 
1094 TargetLowering::ConstraintWeight SystemZTargetLowering::
1095 getSingleConstraintMatchWeight(AsmOperandInfo &info,
1096                                const char *constraint) const {
1097   ConstraintWeight weight = CW_Invalid;
1098   Value *CallOperandVal = info.CallOperandVal;
1099   // If we don't have a value, we can't do a match,
1100   // but allow it at the lowest weight.
1101   if (!CallOperandVal)
1102     return CW_Default;
1103   Type *type = CallOperandVal->getType();
1104   // Look at the constraint type.
1105   switch (*constraint) {
1106   default:
1107     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
1108     break;
1109 
1110   case 'a': // Address register
1111   case 'd': // Data register (equivalent to 'r')
1112   case 'h': // High-part register
1113   case 'r': // General-purpose register
1114     weight = CallOperandVal->getType()->isIntegerTy() ? CW_Register : CW_Default;
1115     break;
1116 
1117   case 'f': // Floating-point register
1118     if (!useSoftFloat())
1119       weight = type->isFloatingPointTy() ? CW_Register : CW_Default;
1120     break;
1121 
1122   case 'v': // Vector register
1123     if (Subtarget.hasVector())
1124       weight = (type->isVectorTy() || type->isFloatingPointTy()) ? CW_Register
1125                                                                  : CW_Default;
1126     break;
1127 
1128   case 'I': // Unsigned 8-bit constant
1129     if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
1130       if (isUInt<8>(C->getZExtValue()))
1131         weight = CW_Constant;
1132     break;
1133 
1134   case 'J': // Unsigned 12-bit constant
1135     if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
1136       if (isUInt<12>(C->getZExtValue()))
1137         weight = CW_Constant;
1138     break;
1139 
1140   case 'K': // Signed 16-bit constant
1141     if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
1142       if (isInt<16>(C->getSExtValue()))
1143         weight = CW_Constant;
1144     break;
1145 
1146   case 'L': // Signed 20-bit displacement (on all targets we support)
1147     if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
1148       if (isInt<20>(C->getSExtValue()))
1149         weight = CW_Constant;
1150     break;
1151 
1152   case 'M': // 0x7fffffff
1153     if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
1154       if (C->getZExtValue() == 0x7fffffff)
1155         weight = CW_Constant;
1156     break;
1157   }
1158   return weight;
1159 }
1160 
1161 // Parse a "{tNNN}" register constraint for which the register type "t"
1162 // has already been verified.  MC is the class associated with "t" and
1163 // Map maps 0-based register numbers to LLVM register numbers.
1164 static std::pair<unsigned, const TargetRegisterClass *>
1165 parseRegisterNumber(StringRef Constraint, const TargetRegisterClass *RC,
1166                     const unsigned *Map, unsigned Size) {
1167   assert(*(Constraint.end()-1) == '}' && "Missing '}'");
1168   if (isdigit(Constraint[2])) {
1169     unsigned Index;
1170     bool Failed =
1171         Constraint.slice(2, Constraint.size() - 1).getAsInteger(10, Index);
1172     if (!Failed && Index < Size && Map[Index])
1173       return std::make_pair(Map[Index], RC);
1174   }
1175   return std::make_pair(0U, nullptr);
1176 }
1177 
1178 std::pair<unsigned, const TargetRegisterClass *>
1179 SystemZTargetLowering::getRegForInlineAsmConstraint(
1180     const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
1181   if (Constraint.size() == 1) {
1182     // GCC Constraint Letters
1183     switch (Constraint[0]) {
1184     default: break;
1185     case 'd': // Data register (equivalent to 'r')
1186     case 'r': // General-purpose register
1187       if (VT.getSizeInBits() == 64)
1188         return std::make_pair(0U, &SystemZ::GR64BitRegClass);
1189       else if (VT.getSizeInBits() == 128)
1190         return std::make_pair(0U, &SystemZ::GR128BitRegClass);
1191       return std::make_pair(0U, &SystemZ::GR32BitRegClass);
1192 
1193     case 'a': // Address register
1194       if (VT == MVT::i64)
1195         return std::make_pair(0U, &SystemZ::ADDR64BitRegClass);
1196       else if (VT == MVT::i128)
1197         return std::make_pair(0U, &SystemZ::ADDR128BitRegClass);
1198       return std::make_pair(0U, &SystemZ::ADDR32BitRegClass);
1199 
1200     case 'h': // High-part register (an LLVM extension)
1201       return std::make_pair(0U, &SystemZ::GRH32BitRegClass);
1202 
1203     case 'f': // Floating-point register
1204       if (!useSoftFloat()) {
1205         if (VT.getSizeInBits() == 64)
1206           return std::make_pair(0U, &SystemZ::FP64BitRegClass);
1207         else if (VT.getSizeInBits() == 128)
1208           return std::make_pair(0U, &SystemZ::FP128BitRegClass);
1209         return std::make_pair(0U, &SystemZ::FP32BitRegClass);
1210       }
1211       break;
1212 
1213     case 'v': // Vector register
1214       if (Subtarget.hasVector()) {
1215         if (VT.getSizeInBits() == 32)
1216           return std::make_pair(0U, &SystemZ::VR32BitRegClass);
1217         if (VT.getSizeInBits() == 64)
1218           return std::make_pair(0U, &SystemZ::VR64BitRegClass);
1219         return std::make_pair(0U, &SystemZ::VR128BitRegClass);
1220       }
1221       break;
1222     }
1223   }
1224   if (Constraint.size() > 0 && Constraint[0] == '{') {
1225 
1226     // A clobber constraint (e.g. ~{f0}) will have MVT::Other which is illegal
1227     // to check the size on.
1228     auto getVTSizeInBits = [&VT]() {
1229       return VT == MVT::Other ? 0 : VT.getSizeInBits();
1230     };
1231 
1232     // We need to override the default register parsing for GPRs and FPRs
1233     // because the interpretation depends on VT.  The internal names of
1234     // the registers are also different from the external names
1235     // (F0D and F0S instead of F0, etc.).
1236     if (Constraint[1] == 'r') {
1237       if (getVTSizeInBits() == 32)
1238         return parseRegisterNumber(Constraint, &SystemZ::GR32BitRegClass,
1239                                    SystemZMC::GR32Regs, 16);
1240       if (getVTSizeInBits() == 128)
1241         return parseRegisterNumber(Constraint, &SystemZ::GR128BitRegClass,
1242                                    SystemZMC::GR128Regs, 16);
1243       return parseRegisterNumber(Constraint, &SystemZ::GR64BitRegClass,
1244                                  SystemZMC::GR64Regs, 16);
1245     }
1246     if (Constraint[1] == 'f') {
1247       if (useSoftFloat())
1248         return std::make_pair(
1249             0u, static_cast<const TargetRegisterClass *>(nullptr));
1250       if (getVTSizeInBits() == 32)
1251         return parseRegisterNumber(Constraint, &SystemZ::FP32BitRegClass,
1252                                    SystemZMC::FP32Regs, 16);
1253       if (getVTSizeInBits() == 128)
1254         return parseRegisterNumber(Constraint, &SystemZ::FP128BitRegClass,
1255                                    SystemZMC::FP128Regs, 16);
1256       return parseRegisterNumber(Constraint, &SystemZ::FP64BitRegClass,
1257                                  SystemZMC::FP64Regs, 16);
1258     }
1259     if (Constraint[1] == 'v') {
1260       if (!Subtarget.hasVector())
1261         return std::make_pair(
1262             0u, static_cast<const TargetRegisterClass *>(nullptr));
1263       if (getVTSizeInBits() == 32)
1264         return parseRegisterNumber(Constraint, &SystemZ::VR32BitRegClass,
1265                                    SystemZMC::VR32Regs, 32);
1266       if (getVTSizeInBits() == 64)
1267         return parseRegisterNumber(Constraint, &SystemZ::VR64BitRegClass,
1268                                    SystemZMC::VR64Regs, 32);
1269       return parseRegisterNumber(Constraint, &SystemZ::VR128BitRegClass,
1270                                  SystemZMC::VR128Regs, 32);
1271     }
1272   }
1273   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
1274 }
1275 
1276 // FIXME? Maybe this could be a TableGen attribute on some registers and
1277 // this table could be generated automatically from RegInfo.
1278 Register
1279 SystemZTargetLowering::getRegisterByName(const char *RegName, LLT VT,
1280                                          const MachineFunction &MF) const {
1281   Register Reg =
1282       StringSwitch<Register>(RegName)
1283           .Case("r4", Subtarget.isTargetXPLINK64() ? SystemZ::R4D : 0)
1284           .Case("r15", Subtarget.isTargetELF() ? SystemZ::R15D : 0)
1285           .Default(0);
1286 
1287   if (Reg)
1288     return Reg;
1289   report_fatal_error("Invalid register name global variable");
1290 }
1291 
1292 void SystemZTargetLowering::
1293 LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
1294                              std::vector<SDValue> &Ops,
1295                              SelectionDAG &DAG) const {
1296   // Only support length 1 constraints for now.
1297   if (Constraint.length() == 1) {
1298     switch (Constraint[0]) {
1299     case 'I': // Unsigned 8-bit constant
1300       if (auto *C = dyn_cast<ConstantSDNode>(Op))
1301         if (isUInt<8>(C->getZExtValue()))
1302           Ops.push_back(DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
1303                                               Op.getValueType()));
1304       return;
1305 
1306     case 'J': // Unsigned 12-bit constant
1307       if (auto *C = dyn_cast<ConstantSDNode>(Op))
1308         if (isUInt<12>(C->getZExtValue()))
1309           Ops.push_back(DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
1310                                               Op.getValueType()));
1311       return;
1312 
1313     case 'K': // Signed 16-bit constant
1314       if (auto *C = dyn_cast<ConstantSDNode>(Op))
1315         if (isInt<16>(C->getSExtValue()))
1316           Ops.push_back(DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
1317                                               Op.getValueType()));
1318       return;
1319 
1320     case 'L': // Signed 20-bit displacement (on all targets we support)
1321       if (auto *C = dyn_cast<ConstantSDNode>(Op))
1322         if (isInt<20>(C->getSExtValue()))
1323           Ops.push_back(DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
1324                                               Op.getValueType()));
1325       return;
1326 
1327     case 'M': // 0x7fffffff
1328       if (auto *C = dyn_cast<ConstantSDNode>(Op))
1329         if (C->getZExtValue() == 0x7fffffff)
1330           Ops.push_back(DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
1331                                               Op.getValueType()));
1332       return;
1333     }
1334   }
1335   TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
1336 }
1337 
1338 //===----------------------------------------------------------------------===//
1339 // Calling conventions
1340 //===----------------------------------------------------------------------===//
1341 
1342 #include "SystemZGenCallingConv.inc"
1343 
1344 const MCPhysReg *SystemZTargetLowering::getScratchRegisters(
1345   CallingConv::ID) const {
1346   static const MCPhysReg ScratchRegs[] = { SystemZ::R0D, SystemZ::R1D,
1347                                            SystemZ::R14D, 0 };
1348   return ScratchRegs;
1349 }
1350 
1351 bool SystemZTargetLowering::allowTruncateForTailCall(Type *FromType,
1352                                                      Type *ToType) const {
1353   return isTruncateFree(FromType, ToType);
1354 }
1355 
1356 bool SystemZTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
1357   return CI->isTailCall();
1358 }
1359 
1360 // We do not yet support 128-bit single-element vector types.  If the user
1361 // attempts to use such types as function argument or return type, prefer
1362 // to error out instead of emitting code violating the ABI.
1363 static void VerifyVectorType(MVT VT, EVT ArgVT) {
1364   if (ArgVT.isVector() && !VT.isVector())
1365     report_fatal_error("Unsupported vector argument or return type");
1366 }
1367 
1368 static void VerifyVectorTypes(const SmallVectorImpl<ISD::InputArg> &Ins) {
1369   for (unsigned i = 0; i < Ins.size(); ++i)
1370     VerifyVectorType(Ins[i].VT, Ins[i].ArgVT);
1371 }
1372 
1373 static void VerifyVectorTypes(const SmallVectorImpl<ISD::OutputArg> &Outs) {
1374   for (unsigned i = 0; i < Outs.size(); ++i)
1375     VerifyVectorType(Outs[i].VT, Outs[i].ArgVT);
1376 }
1377 
1378 // Value is a value that has been passed to us in the location described by VA
1379 // (and so has type VA.getLocVT()).  Convert Value to VA.getValVT(), chaining
1380 // any loads onto Chain.
1381 static SDValue convertLocVTToValVT(SelectionDAG &DAG, const SDLoc &DL,
1382                                    CCValAssign &VA, SDValue Chain,
1383                                    SDValue Value) {
1384   // If the argument has been promoted from a smaller type, insert an
1385   // assertion to capture this.
1386   if (VA.getLocInfo() == CCValAssign::SExt)
1387     Value = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Value,
1388                         DAG.getValueType(VA.getValVT()));
1389   else if (VA.getLocInfo() == CCValAssign::ZExt)
1390     Value = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Value,
1391                         DAG.getValueType(VA.getValVT()));
1392 
1393   if (VA.isExtInLoc())
1394     Value = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Value);
1395   else if (VA.getLocInfo() == CCValAssign::BCvt) {
1396     // If this is a short vector argument loaded from the stack,
1397     // extend from i64 to full vector size and then bitcast.
1398     assert(VA.getLocVT() == MVT::i64);
1399     assert(VA.getValVT().isVector());
1400     Value = DAG.getBuildVector(MVT::v2i64, DL, {Value, DAG.getUNDEF(MVT::i64)});
1401     Value = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Value);
1402   } else
1403     assert(VA.getLocInfo() == CCValAssign::Full && "Unsupported getLocInfo");
1404   return Value;
1405 }
1406 
// Value is a value of type VA.getValVT() that we need to copy into
// the location described by VA.  Return a copy of Value converted to
// VA.getLocVT().  The caller is responsible for handling indirect values.
1410 static SDValue convertValVTToLocVT(SelectionDAG &DAG, const SDLoc &DL,
1411                                    CCValAssign &VA, SDValue Value) {
1412   switch (VA.getLocInfo()) {
1413   case CCValAssign::SExt:
1414     return DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Value);
1415   case CCValAssign::ZExt:
1416     return DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Value);
1417   case CCValAssign::AExt:
1418     return DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Value);
1419   case CCValAssign::BCvt: {
1420     assert(VA.getLocVT() == MVT::i64 || VA.getLocVT() == MVT::i128);
1421     assert(VA.getValVT().isVector() || VA.getValVT() == MVT::f32 ||
1422            VA.getValVT() == MVT::f64 || VA.getValVT() == MVT::f128);
1423     // For an f32 vararg we need to first promote it to an f64 and then
1424     // bitcast it to an i64.
1425     if (VA.getValVT() == MVT::f32 && VA.getLocVT() == MVT::i64)
1426       Value = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f64, Value);
1427     MVT BitCastToType = VA.getValVT().isVector() && VA.getLocVT() == MVT::i64
1428                             ? MVT::v2i64
1429                             : VA.getLocVT();
1430     Value = DAG.getNode(ISD::BITCAST, DL, BitCastToType, Value);
1431     // For ELF, this is a short vector argument to be stored to the stack,
1432     // bitcast to v2i64 and then extract first element.
1433     if (BitCastToType == MVT::v2i64)
1434       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VA.getLocVT(), Value,
1435                          DAG.getConstant(0, DL, MVT::i32));
1436     return Value;
1437   }
1438   case CCValAssign::Full:
1439     return Value;
1440   default:
1441     llvm_unreachable("Unhandled getLocInfo()");
1442   }
1443 }
1444 
1445 static SDValue lowerI128ToGR128(SelectionDAG &DAG, SDValue In) {
1446   SDLoc DL(In);
1447   SDValue Lo, Hi;
1448   std::tie(Lo, Hi) = DAG.SplitScalar(In, DL, MVT::i64, MVT::i64);
1449   SDNode *Pair = DAG.getMachineNode(SystemZ::PAIR128, DL,
1450                                     MVT::Untyped, Hi, Lo);
1451   return SDValue(Pair, 0);
1452 }
1453 
1454 static SDValue lowerGR128ToI128(SelectionDAG &DAG, SDValue In) {
1455   SDLoc DL(In);
1456   SDValue Hi = DAG.getTargetExtractSubreg(SystemZ::subreg_h64,
1457                                           DL, MVT::i64, In);
1458   SDValue Lo = DAG.getTargetExtractSubreg(SystemZ::subreg_l64,
1459                                           DL, MVT::i64, In);
1460   return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, Lo, Hi);
1461 }
1462 
1463 bool SystemZTargetLowering::splitValueIntoRegisterParts(
1464     SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
1465     unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
1466   EVT ValueVT = Val.getValueType();
1467   if (ValueVT.getSizeInBits() == 128 && NumParts == 1 && PartVT == MVT::Untyped) {
1468     // Inline assembly operand.
1469     Parts[0] = lowerI128ToGR128(DAG, DAG.getBitcast(MVT::i128, Val));
1470     return true;
1471   }
1472 
1473   return false;
1474 }
1475 
1476 SDValue SystemZTargetLowering::joinRegisterPartsIntoValue(
1477     SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
1478     MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
1479   if (ValueVT.getSizeInBits() == 128 && NumParts == 1 && PartVT == MVT::Untyped) {
1480     // Inline assembly operand.
1481     SDValue Res = lowerGR128ToI128(DAG, Parts[0]);
1482     return DAG.getBitcast(ValueVT, Res);
1483   }
1484 
1485   return SDValue();
1486 }
1487 
1488 SDValue SystemZTargetLowering::LowerFormalArguments(
1489     SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
1490     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1491     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
1492   MachineFunction &MF = DAG.getMachineFunction();
1493   MachineFrameInfo &MFI = MF.getFrameInfo();
1494   MachineRegisterInfo &MRI = MF.getRegInfo();
1495   SystemZMachineFunctionInfo *FuncInfo =
1496       MF.getInfo<SystemZMachineFunctionInfo>();
1497   auto *TFL = Subtarget.getFrameLowering<SystemZELFFrameLowering>();
1498   EVT PtrVT = getPointerTy(DAG.getDataLayout());
1499 
1500   // Detect unsupported vector argument types.
1501   if (Subtarget.hasVector())
1502     VerifyVectorTypes(Ins);
1503 
1504   // Assign locations to all of the incoming arguments.
1505   SmallVector<CCValAssign, 16> ArgLocs;
1506   SystemZCCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
1507   CCInfo.AnalyzeFormalArguments(Ins, CC_SystemZ);
1508   FuncInfo->setSizeOfFnParams(CCInfo.getStackSize());
1509 
1510   unsigned NumFixedGPRs = 0;
1511   unsigned NumFixedFPRs = 0;
1512   for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
1513     SDValue ArgValue;
1514     CCValAssign &VA = ArgLocs[I];
1515     EVT LocVT = VA.getLocVT();
1516     if (VA.isRegLoc()) {
1517       // Arguments passed in registers
1518       const TargetRegisterClass *RC;
1519       switch (LocVT.getSimpleVT().SimpleTy) {
1520       default:
1521         // Integers smaller than i64 should be promoted to i64.
1522         llvm_unreachable("Unexpected argument type");
1523       case MVT::i32:
1524         NumFixedGPRs += 1;
1525         RC = &SystemZ::GR32BitRegClass;
1526         break;
1527       case MVT::i64:
1528         NumFixedGPRs += 1;
1529         RC = &SystemZ::GR64BitRegClass;
1530         break;
1531       case MVT::f32:
1532         NumFixedFPRs += 1;
1533         RC = &SystemZ::FP32BitRegClass;
1534         break;
1535       case MVT::f64:
1536         NumFixedFPRs += 1;
1537         RC = &SystemZ::FP64BitRegClass;
1538         break;
1539       case MVT::f128:
1540         NumFixedFPRs += 2;
1541         RC = &SystemZ::FP128BitRegClass;
1542         break;
1543       case MVT::v16i8:
1544       case MVT::v8i16:
1545       case MVT::v4i32:
1546       case MVT::v2i64:
1547       case MVT::v4f32:
1548       case MVT::v2f64:
1549         RC = &SystemZ::VR128BitRegClass;
1550         break;
1551       }
1552 
1553       Register VReg = MRI.createVirtualRegister(RC);
1554       MRI.addLiveIn(VA.getLocReg(), VReg);
1555       ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, LocVT);
1556     } else {
1557       assert(VA.isMemLoc() && "Argument not register or memory");
1558 
1559       // Create the frame index object for this incoming parameter.
1560       // FIXME: Pre-include call frame size in the offset, should not
1561       // need to manually add it here.
1562       int64_t ArgSPOffset = VA.getLocMemOffset();
1563       if (Subtarget.isTargetXPLINK64()) {
1564         auto &XPRegs =
1565             Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>();
1566         ArgSPOffset += XPRegs.getCallFrameSize();
1567       }
1568       int FI =
1569           MFI.CreateFixedObject(LocVT.getSizeInBits() / 8, ArgSPOffset, true);
1570 
1571       // Create the SelectionDAG nodes corresponding to a load
1572       // from this parameter.  Unpromoted ints and floats are
1573       // passed as right-justified 8-byte values.
1574       SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
1575       if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32)
1576         FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
1577                           DAG.getIntPtrConstant(4, DL));
1578       ArgValue = DAG.getLoad(LocVT, DL, Chain, FIN,
1579                              MachinePointerInfo::getFixedStack(MF, FI));
1580     }
1581 
1582     // Convert the value of the argument register into the value that's
1583     // being passed.
1584     if (VA.getLocInfo() == CCValAssign::Indirect) {
1585       InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue,
1586                                    MachinePointerInfo()));
1587       // If the original argument was split (e.g. i128), we need
1588       // to load all parts of it here (using the same address).
1589       unsigned ArgIndex = Ins[I].OrigArgIndex;
1590       assert (Ins[I].PartOffset == 0);
1591       while (I + 1 != E && Ins[I + 1].OrigArgIndex == ArgIndex) {
1592         CCValAssign &PartVA = ArgLocs[I + 1];
1593         unsigned PartOffset = Ins[I + 1].PartOffset;
1594         SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, ArgValue,
1595                                       DAG.getIntPtrConstant(PartOffset, DL));
1596         InVals.push_back(DAG.getLoad(PartVA.getValVT(), DL, Chain, Address,
1597                                      MachinePointerInfo()));
1598         ++I;
1599       }
1600     } else
1601       InVals.push_back(convertLocVTToValVT(DAG, DL, VA, Chain, ArgValue));
1602   }
1603 
1604   // FIXME: Add support for lowering varargs for XPLINK64 in a later patch.
1605   if (IsVarArg && Subtarget.isTargetELF()) {
1606     // Save the number of non-varargs registers for later use by va_start, etc.
1607     FuncInfo->setVarArgsFirstGPR(NumFixedGPRs);
1608     FuncInfo->setVarArgsFirstFPR(NumFixedFPRs);
1609 
1610     // Likewise the address (in the form of a frame index) of where the
1611     // first stack vararg would be.  The 1-byte size here is arbitrary.
1612     int64_t VarArgsOffset = CCInfo.getStackSize();
1613     FuncInfo->setVarArgsFrameIndex(
1614         MFI.CreateFixedObject(1, VarArgsOffset, true));
1615 
1616     // ...and a similar frame index for the caller-allocated save area
1617     // that will be used to store the incoming registers.
1618     int64_t RegSaveOffset =
1619       -SystemZMC::ELFCallFrameSize + TFL->getRegSpillOffset(MF, SystemZ::R2D) - 16;
1620     unsigned RegSaveIndex = MFI.CreateFixedObject(1, RegSaveOffset, true);
1621     FuncInfo->setRegSaveFrameIndex(RegSaveIndex);
1622 
1623     // Store the FPR varargs in the reserved frame slots.  (We store the
1624     // GPRs as part of the prologue.)
1625     if (NumFixedFPRs < SystemZ::ELFNumArgFPRs && !useSoftFloat()) {
1626       SDValue MemOps[SystemZ::ELFNumArgFPRs];
1627       for (unsigned I = NumFixedFPRs; I < SystemZ::ELFNumArgFPRs; ++I) {
1628         unsigned Offset = TFL->getRegSpillOffset(MF, SystemZ::ELFArgFPRs[I]);
1629         int FI =
1630           MFI.CreateFixedObject(8, -SystemZMC::ELFCallFrameSize + Offset, true);
1631         SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
1632         Register VReg = MF.addLiveIn(SystemZ::ELFArgFPRs[I],
1633                                      &SystemZ::FP64BitRegClass);
1634         SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f64);
1635         MemOps[I] = DAG.getStore(ArgValue.getValue(1), DL, ArgValue, FIN,
1636                                  MachinePointerInfo::getFixedStack(MF, FI));
1637       }
1638       // Join the stores, which are independent of one another.
1639       Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1640                           ArrayRef(&MemOps[NumFixedFPRs],
1641                                    SystemZ::ELFNumArgFPRs - NumFixedFPRs));
1642     }
1643   }
1644 
1645   if (Subtarget.isTargetXPLINK64()) {
1646     // Create virual register  for handling incoming "ADA" special register (R5)
1647     const TargetRegisterClass *RC = &SystemZ::ADDR64BitRegClass;
1648     Register ADAvReg = MRI.createVirtualRegister(RC);
1649     auto *Regs = static_cast<SystemZXPLINK64Registers *>(
1650         Subtarget.getSpecialRegisters());
1651     MRI.addLiveIn(Regs->getADARegister(), ADAvReg);
1652     FuncInfo->setADAVirtualRegister(ADAvReg);
1653   }
1654   return Chain;
1655 }
1656 
1657 static bool canUseSiblingCall(const CCState &ArgCCInfo,
1658                               SmallVectorImpl<CCValAssign> &ArgLocs,
1659                               SmallVectorImpl<ISD::OutputArg> &Outs) {
1660   // Punt if there are any indirect or stack arguments, or if the call
1661   // needs the callee-saved argument register R6, or if the call uses
1662   // the callee-saved register arguments SwiftSelf and SwiftError.
1663   for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
1664     CCValAssign &VA = ArgLocs[I];
1665     if (VA.getLocInfo() == CCValAssign::Indirect)
1666       return false;
1667     if (!VA.isRegLoc())
1668       return false;
1669     Register Reg = VA.getLocReg();
1670     if (Reg == SystemZ::R6H || Reg == SystemZ::R6L || Reg == SystemZ::R6D)
1671       return false;
1672     if (Outs[I].Flags.isSwiftSelf() || Outs[I].Flags.isSwiftError())
1673       return false;
1674   }
1675   return true;
1676 }
1677 
1678 static SDValue getADAEntry(SelectionDAG &DAG, SDValue Val, SDLoc DL,
1679                            unsigned Offset, bool LoadAdr = false) {
1680   MachineFunction &MF = DAG.getMachineFunction();
1681   SystemZMachineFunctionInfo *MFI = MF.getInfo<SystemZMachineFunctionInfo>();
1682   unsigned ADAvReg = MFI->getADAVirtualRegister();
1683   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
1684 
1685   SDValue Reg = DAG.getRegister(ADAvReg, PtrVT);
1686   SDValue Ofs = DAG.getTargetConstant(Offset, DL, PtrVT);
1687 
1688   SDValue Result = DAG.getNode(SystemZISD::ADA_ENTRY, DL, PtrVT, Val, Reg, Ofs);
1689   if (!LoadAdr)
1690     Result = DAG.getLoad(
1691         PtrVT, DL, DAG.getEntryNode(), Result, MachinePointerInfo(), Align(8),
1692         MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant);
1693 
1694   return Result;
1695 }
1696 
1697 // ADA access using Global value
1698 // Note: for functions, address of descriptor is returned
1699 static SDValue getADAEntry(SelectionDAG &DAG, const GlobalValue *GV, SDLoc DL,
1700                            EVT PtrVT) {
1701   unsigned ADAtype;
1702   bool LoadAddr = false;
1703   const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV);
1704   bool IsFunction =
1705       (isa<Function>(GV)) || (GA && isa<Function>(GA->getAliaseeObject()));
1706   bool IsInternal = (GV->hasInternalLinkage() || GV->hasPrivateLinkage());
1707 
1708   if (IsFunction) {
1709     if (IsInternal) {
1710       ADAtype = SystemZII::MO_ADA_DIRECT_FUNC_DESC;
1711       LoadAddr = true;
1712     } else
1713       ADAtype = SystemZII::MO_ADA_INDIRECT_FUNC_DESC;
1714   } else {
1715     ADAtype = SystemZII::MO_ADA_DATA_SYMBOL_ADDR;
1716   }
1717   SDValue Val = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ADAtype);
1718 
1719   return getADAEntry(DAG, Val, DL, 0, LoadAddr);
1720 }
1721 
1722 static bool getzOSCalleeAndADA(SelectionDAG &DAG, SDValue &Callee, SDValue &ADA,
1723                                SDLoc &DL, SDValue &Chain) {
1724   unsigned ADADelta = 0; // ADA offset in desc.
1725   unsigned EPADelta = 8; // EPA offset in desc.
1726   MachineFunction &MF = DAG.getMachineFunction();
1727   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
1728 
1729   // XPLink calling convention.
1730   if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
1731     bool IsInternal = (G->getGlobal()->hasInternalLinkage() ||
1732                        G->getGlobal()->hasPrivateLinkage());
1733     if (IsInternal) {
1734       SystemZMachineFunctionInfo *MFI =
1735           MF.getInfo<SystemZMachineFunctionInfo>();
1736       unsigned ADAvReg = MFI->getADAVirtualRegister();
1737       ADA = DAG.getCopyFromReg(Chain, DL, ADAvReg, PtrVT);
1738       Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, PtrVT);
1739       Callee = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Callee);
1740       return true;
1741     } else {
1742       SDValue GA = DAG.getTargetGlobalAddress(
1743           G->getGlobal(), DL, PtrVT, 0, SystemZII::MO_ADA_DIRECT_FUNC_DESC);
1744       ADA = getADAEntry(DAG, GA, DL, ADADelta);
1745       Callee = getADAEntry(DAG, GA, DL, EPADelta);
1746     }
1747   } else if (auto *E = dyn_cast<ExternalSymbolSDNode>(Callee)) {
1748     SDValue ES = DAG.getTargetExternalSymbol(
1749         E->getSymbol(), PtrVT, SystemZII::MO_ADA_DIRECT_FUNC_DESC);
1750     ADA = getADAEntry(DAG, ES, DL, ADADelta);
1751     Callee = getADAEntry(DAG, ES, DL, EPADelta);
1752   } else {
1753     // Function pointer case
1754     ADA = DAG.getNode(ISD::ADD, DL, PtrVT, Callee,
1755                       DAG.getConstant(ADADelta, DL, PtrVT));
1756     ADA = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), ADA,
1757                       MachinePointerInfo::getGOT(DAG.getMachineFunction()));
1758     Callee = DAG.getNode(ISD::ADD, DL, PtrVT, Callee,
1759                          DAG.getConstant(EPADelta, DL, PtrVT));
1760     Callee = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Callee,
1761                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
1762   }
1763   return false;
1764 }
1765 
1766 SDValue
1767 SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
1768                                  SmallVectorImpl<SDValue> &InVals) const {
1769   SelectionDAG &DAG = CLI.DAG;
1770   SDLoc &DL = CLI.DL;
1771   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1772   SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
1773   SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
1774   SDValue Chain = CLI.Chain;
1775   SDValue Callee = CLI.Callee;
1776   bool &IsTailCall = CLI.IsTailCall;
1777   CallingConv::ID CallConv = CLI.CallConv;
1778   bool IsVarArg = CLI.IsVarArg;
1779   MachineFunction &MF = DAG.getMachineFunction();
1780   EVT PtrVT = getPointerTy(MF.getDataLayout());
1781   LLVMContext &Ctx = *DAG.getContext();
1782   SystemZCallingConventionRegisters *Regs = Subtarget.getSpecialRegisters();
1783 
1784   // FIXME: z/OS support to be added in later.
1785   if (Subtarget.isTargetXPLINK64())
1786     IsTailCall = false;
1787 
1788   // Detect unsupported vector argument and return types.
1789   if (Subtarget.hasVector()) {
1790     VerifyVectorTypes(Outs);
1791     VerifyVectorTypes(Ins);
1792   }
1793 
1794   // Analyze the operands of the call, assigning locations to each operand.
1795   SmallVector<CCValAssign, 16> ArgLocs;
1796   SystemZCCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, Ctx);
1797   ArgCCInfo.AnalyzeCallOperands(Outs, CC_SystemZ);
1798 
1799   // We don't support GuaranteedTailCallOpt, only automatically-detected
1800   // sibling calls.
1801   if (IsTailCall && !canUseSiblingCall(ArgCCInfo, ArgLocs, Outs))
1802     IsTailCall = false;
1803 
1804   // Get a count of how many bytes are to be pushed on the stack.
1805   unsigned NumBytes = ArgCCInfo.getStackSize();
1806 
1807   if (Subtarget.isTargetXPLINK64())
1808     // Although the XPLINK specifications for AMODE64 state that minimum size
1809     // of the param area is minimum 32 bytes and no rounding is otherwise
1810     // specified, we round this area in 64 bytes increments to be compatible
1811     // with existing compilers.
1812     NumBytes = std::max(64U, (unsigned)alignTo(NumBytes, 64));
1813 
1814   // Mark the start of the call.
1815   if (!IsTailCall)
1816     Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
1817 
1818   // Copy argument values to their designated locations.
1819   SmallVector<std::pair<unsigned, SDValue>, 9> RegsToPass;
1820   SmallVector<SDValue, 8> MemOpChains;
1821   SDValue StackPtr;
1822   for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
1823     CCValAssign &VA = ArgLocs[I];
1824     SDValue ArgValue = OutVals[I];
1825 
1826     if (VA.getLocInfo() == CCValAssign::Indirect) {
1827       // Store the argument in a stack slot and pass its address.
1828       unsigned ArgIndex = Outs[I].OrigArgIndex;
1829       EVT SlotVT;
1830       if (I + 1 != E && Outs[I + 1].OrigArgIndex == ArgIndex) {
1831         // Allocate the full stack space for a promoted (and split) argument.
1832         Type *OrigArgType = CLI.Args[Outs[I].OrigArgIndex].Ty;
1833         EVT OrigArgVT = getValueType(MF.getDataLayout(), OrigArgType);
1834         MVT PartVT = getRegisterTypeForCallingConv(Ctx, CLI.CallConv, OrigArgVT);
1835         unsigned N = getNumRegistersForCallingConv(Ctx, CLI.CallConv, OrigArgVT);
1836         SlotVT = EVT::getIntegerVT(Ctx, PartVT.getSizeInBits() * N);
1837       } else {
1838         SlotVT = Outs[I].ArgVT;
1839       }
1840       SDValue SpillSlot = DAG.CreateStackTemporary(SlotVT);
1841       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
1842       MemOpChains.push_back(
1843           DAG.getStore(Chain, DL, ArgValue, SpillSlot,
1844                        MachinePointerInfo::getFixedStack(MF, FI)));
1845       // If the original argument was split (e.g. i128), we need
1846       // to store all parts of it here (and pass just one address).
1847       assert (Outs[I].PartOffset == 0);
1848       while (I + 1 != E && Outs[I + 1].OrigArgIndex == ArgIndex) {
1849         SDValue PartValue = OutVals[I + 1];
1850         unsigned PartOffset = Outs[I + 1].PartOffset;
1851         SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, SpillSlot,
1852                                       DAG.getIntPtrConstant(PartOffset, DL));
1853         MemOpChains.push_back(
1854             DAG.getStore(Chain, DL, PartValue, Address,
1855                          MachinePointerInfo::getFixedStack(MF, FI)));
1856         assert((PartOffset + PartValue.getValueType().getStoreSize() <=
1857                 SlotVT.getStoreSize()) && "Not enough space for argument part!");
1858         ++I;
1859       }
1860       ArgValue = SpillSlot;
1861     } else
1862       ArgValue = convertValVTToLocVT(DAG, DL, VA, ArgValue);
1863 
1864     if (VA.isRegLoc()) {
1865       // In XPLINK64, for the 128-bit vararg case, ArgValue is bitcasted to a
1866       // MVT::i128 type. We decompose the 128-bit type to a pair of its high
1867       // and low values.
1868       if (VA.getLocVT() == MVT::i128)
1869         ArgValue = lowerI128ToGR128(DAG, ArgValue);
1870       // Queue up the argument copies and emit them at the end.
1871       RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue));
1872     } else {
1873       assert(VA.isMemLoc() && "Argument not register or memory");
1874 
1875       // Work out the address of the stack slot.  Unpromoted ints and
1876       // floats are passed as right-justified 8-byte values.
1877       if (!StackPtr.getNode())
1878         StackPtr = DAG.getCopyFromReg(Chain, DL,
1879                                       Regs->getStackPointerRegister(), PtrVT);
1880       unsigned Offset = Regs->getStackPointerBias() + Regs->getCallFrameSize() +
1881                         VA.getLocMemOffset();
1882       if (VA.getLocVT() == MVT::i32 || VA.getLocVT() == MVT::f32)
1883         Offset += 4;
1884       SDValue Address = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr,
1885                                     DAG.getIntPtrConstant(Offset, DL));
1886 
1887       // Emit the store.
1888       MemOpChains.push_back(
1889           DAG.getStore(Chain, DL, ArgValue, Address, MachinePointerInfo()));
1890 
1891       // Although long doubles or vectors are passed through the stack when
1892       // they are vararg (non-fixed arguments), if a long double or vector
1893       // occupies the third and fourth slot of the argument list GPR3 should
1894       // still shadow the third slot of the argument list.
1895       if (Subtarget.isTargetXPLINK64() && VA.needsCustom()) {
1896         SDValue ShadowArgValue =
1897             DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, ArgValue,
1898                         DAG.getIntPtrConstant(1, DL));
1899         RegsToPass.push_back(std::make_pair(SystemZ::R3D, ShadowArgValue));
1900       }
1901     }
1902   }
1903 
1904   // Join the stores, which are independent of one another.
1905   if (!MemOpChains.empty())
1906     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
1907 
1908   // Accept direct calls by converting symbolic call addresses to the
1909   // associated Target* opcodes.  Force %r1 to be used for indirect
1910   // tail calls.
1911   SDValue Glue;
1912 
1913   if (Subtarget.isTargetXPLINK64()) {
1914     SDValue ADA;
1915     bool IsBRASL = getzOSCalleeAndADA(DAG, Callee, ADA, DL, Chain);
1916     if (!IsBRASL) {
1917       unsigned CalleeReg = static_cast<SystemZXPLINK64Registers *>(Regs)
1918                                ->getAddressOfCalleeRegister();
1919       Chain = DAG.getCopyToReg(Chain, DL, CalleeReg, Callee, Glue);
1920       Glue = Chain.getValue(1);
1921       Callee = DAG.getRegister(CalleeReg, Callee.getValueType());
1922     }
1923     RegsToPass.push_back(std::make_pair(
1924         static_cast<SystemZXPLINK64Registers *>(Regs)->getADARegister(), ADA));
1925   } else {
1926     if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
1927       Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, PtrVT);
1928       Callee = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Callee);
1929     } else if (auto *E = dyn_cast<ExternalSymbolSDNode>(Callee)) {
1930       Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT);
1931       Callee = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Callee);
1932     } else if (IsTailCall) {
1933       Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R1D, Callee, Glue);
1934       Glue = Chain.getValue(1);
1935       Callee = DAG.getRegister(SystemZ::R1D, Callee.getValueType());
1936     }
1937   }
1938 
1939   // Build a sequence of copy-to-reg nodes, chained and glued together.
1940   for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I) {
1941     Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[I].first,
1942                              RegsToPass[I].second, Glue);
1943     Glue = Chain.getValue(1);
1944   }
1945 
1946   // The first call operand is the chain and the second is the target address.
1947   SmallVector<SDValue, 8> Ops;
1948   Ops.push_back(Chain);
1949   Ops.push_back(Callee);
1950 
1951   // Add argument registers to the end of the list so that they are
1952   // known live into the call.
1953   for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I)
1954     Ops.push_back(DAG.getRegister(RegsToPass[I].first,
1955                                   RegsToPass[I].second.getValueType()));
1956 
1957   // Add a register mask operand representing the call-preserved registers.
1958   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
1959   const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
1960   assert(Mask && "Missing call preserved mask for calling convention");
1961   Ops.push_back(DAG.getRegisterMask(Mask));
1962 
1963   // Glue the call to the argument copies, if any.
1964   if (Glue.getNode())
1965     Ops.push_back(Glue);
1966 
1967   // Emit the call.
1968   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
1969   if (IsTailCall) {
1970     SDValue Ret = DAG.getNode(SystemZISD::SIBCALL, DL, NodeTys, Ops);
1971     DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
1972     return Ret;
1973   }
1974   Chain = DAG.getNode(SystemZISD::CALL, DL, NodeTys, Ops);
1975   DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
1976   Glue = Chain.getValue(1);
1977 
1978   // Mark the end of the call, which is glued to the call itself.
1979   Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, Glue, DL);
1980   Glue = Chain.getValue(1);
1981 
1982   // Assign locations to each value returned by this call.
1983   SmallVector<CCValAssign, 16> RetLocs;
1984   CCState RetCCInfo(CallConv, IsVarArg, MF, RetLocs, Ctx);
1985   RetCCInfo.AnalyzeCallResult(Ins, RetCC_SystemZ);
1986 
1987   // Copy all of the result registers out of their specified physreg.
1988   for (unsigned I = 0, E = RetLocs.size(); I != E; ++I) {
1989     CCValAssign &VA = RetLocs[I];
1990 
1991     // Copy the value out, gluing the copy to the end of the call sequence.
1992     SDValue RetValue = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(),
1993                                           VA.getLocVT(), Glue);
1994     Chain = RetValue.getValue(1);
1995     Glue = RetValue.getValue(2);
1996 
1997     // Convert the value of the return register into the value that's
1998     // being returned.
1999     InVals.push_back(convertLocVTToValVT(DAG, DL, VA, Chain, RetValue));
2000   }
2001 
2002   return Chain;
2003 }
2004 
2005 // Generate a call taking the given operands as arguments and returning a
2006 // result of type RetVT.
2007 std::pair<SDValue, SDValue> SystemZTargetLowering::makeExternalCall(
2008     SDValue Chain, SelectionDAG &DAG, const char *CalleeName, EVT RetVT,
2009     ArrayRef<SDValue> Ops, CallingConv::ID CallConv, bool IsSigned, SDLoc DL,
2010     bool DoesNotReturn, bool IsReturnValueUsed) const {
2011   TargetLowering::ArgListTy Args;
2012   Args.reserve(Ops.size());
2013 
2014   TargetLowering::ArgListEntry Entry;
2015   for (SDValue Op : Ops) {
2016     Entry.Node = Op;
2017     Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
2018     Entry.IsSExt = shouldSignExtendTypeInLibCall(Op.getValueType(), IsSigned);
2019     Entry.IsZExt = !shouldSignExtendTypeInLibCall(Op.getValueType(), IsSigned);
2020     Args.push_back(Entry);
2021   }
2022 
2023   SDValue Callee =
2024       DAG.getExternalSymbol(CalleeName, getPointerTy(DAG.getDataLayout()));
2025 
2026   Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
2027   TargetLowering::CallLoweringInfo CLI(DAG);
2028   bool SignExtend = shouldSignExtendTypeInLibCall(RetVT, IsSigned);
2029   CLI.setDebugLoc(DL)
2030       .setChain(Chain)
2031       .setCallee(CallConv, RetTy, Callee, std::move(Args))
2032       .setNoReturn(DoesNotReturn)
2033       .setDiscardResult(!IsReturnValueUsed)
2034       .setSExtResult(SignExtend)
2035       .setZExtResult(!SignExtend);
2036   return LowerCallTo(CLI);
2037 }
2038 
2039 bool SystemZTargetLowering::
2040 CanLowerReturn(CallingConv::ID CallConv,
2041                MachineFunction &MF, bool isVarArg,
2042                const SmallVectorImpl<ISD::OutputArg> &Outs,
2043                LLVMContext &Context) const {
2044   // Detect unsupported vector return types.
2045   if (Subtarget.hasVector())
2046     VerifyVectorTypes(Outs);
2047 
2048   // Special case that we cannot easily detect in RetCC_SystemZ since
2049   // i128 is not a legal type.
2050   for (auto &Out : Outs)
2051     if (Out.ArgVT == MVT::i128)
2052       return false;
2053 
2054   SmallVector<CCValAssign, 16> RetLocs;
2055   CCState RetCCInfo(CallConv, isVarArg, MF, RetLocs, Context);
2056   return RetCCInfo.CheckReturn(Outs, RetCC_SystemZ);
2057 }
2058 
2059 SDValue
2060 SystemZTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2061                                    bool IsVarArg,
2062                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
2063                                    const SmallVectorImpl<SDValue> &OutVals,
2064                                    const SDLoc &DL, SelectionDAG &DAG) const {
2065   MachineFunction &MF = DAG.getMachineFunction();
2066 
2067   // Detect unsupported vector return types.
2068   if (Subtarget.hasVector())
2069     VerifyVectorTypes(Outs);
2070 
2071   // Assign locations to each returned value.
2072   SmallVector<CCValAssign, 16> RetLocs;
2073   CCState RetCCInfo(CallConv, IsVarArg, MF, RetLocs, *DAG.getContext());
2074   RetCCInfo.AnalyzeReturn(Outs, RetCC_SystemZ);
2075 
2076   // Quick exit for void returns
2077   if (RetLocs.empty())
2078     return DAG.getNode(SystemZISD::RET_GLUE, DL, MVT::Other, Chain);
2079 
2080   if (CallConv == CallingConv::GHC)
2081     report_fatal_error("GHC functions return void only");
2082 
2083   // Copy the result values into the output registers.
2084   SDValue Glue;
2085   SmallVector<SDValue, 4> RetOps;
2086   RetOps.push_back(Chain);
2087   for (unsigned I = 0, E = RetLocs.size(); I != E; ++I) {
2088     CCValAssign &VA = RetLocs[I];
2089     SDValue RetValue = OutVals[I];
2090 
2091     // Make the return register live on exit.
2092     assert(VA.isRegLoc() && "Can only return in registers!");
2093 
2094     // Promote the value as required.
2095     RetValue = convertValVTToLocVT(DAG, DL, VA, RetValue);
2096 
2097     // Chain and glue the copies together.
2098     Register Reg = VA.getLocReg();
2099     Chain = DAG.getCopyToReg(Chain, DL, Reg, RetValue, Glue);
2100     Glue = Chain.getValue(1);
2101     RetOps.push_back(DAG.getRegister(Reg, VA.getLocVT()));
2102   }
2103 
2104   // Update chain and glue.
2105   RetOps[0] = Chain;
2106   if (Glue.getNode())
2107     RetOps.push_back(Glue);
2108 
2109   return DAG.getNode(SystemZISD::RET_GLUE, DL, MVT::Other, RetOps);
2110 }
2111 
2112 // Return true if Op is an intrinsic node with chain that returns the CC value
2113 // as its only (other) argument.  Provide the associated SystemZISD opcode and
2114 // the mask of valid CC values if so.
2115 static bool isIntrinsicWithCCAndChain(SDValue Op, unsigned &Opcode,
2116                                       unsigned &CCValid) {
2117   unsigned Id = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
2118   switch (Id) {
2119   case Intrinsic::s390_tbegin:
2120     Opcode = SystemZISD::TBEGIN;
2121     CCValid = SystemZ::CCMASK_TBEGIN;
2122     return true;
2123 
2124   case Intrinsic::s390_tbegin_nofloat:
2125     Opcode = SystemZISD::TBEGIN_NOFLOAT;
2126     CCValid = SystemZ::CCMASK_TBEGIN;
2127     return true;
2128 
2129   case Intrinsic::s390_tend:
2130     Opcode = SystemZISD::TEND;
2131     CCValid = SystemZ::CCMASK_TEND;
2132     return true;
2133 
2134   default:
2135     return false;
2136   }
2137 }
2138 
2139 // Return true if Op is an intrinsic node without chain that returns the
2140 // CC value as its final argument.  Provide the associated SystemZISD
2141 // opcode and the mask of valid CC values if so.
2142 static bool isIntrinsicWithCC(SDValue Op, unsigned &Opcode, unsigned &CCValid) {
2143   unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
2144   switch (Id) {
2145   case Intrinsic::s390_vpkshs:
2146   case Intrinsic::s390_vpksfs:
2147   case Intrinsic::s390_vpksgs:
2148     Opcode = SystemZISD::PACKS_CC;
2149     CCValid = SystemZ::CCMASK_VCMP;
2150     return true;
2151 
2152   case Intrinsic::s390_vpklshs:
2153   case Intrinsic::s390_vpklsfs:
2154   case Intrinsic::s390_vpklsgs:
2155     Opcode = SystemZISD::PACKLS_CC;
2156     CCValid = SystemZ::CCMASK_VCMP;
2157     return true;
2158 
2159   case Intrinsic::s390_vceqbs:
2160   case Intrinsic::s390_vceqhs:
2161   case Intrinsic::s390_vceqfs:
2162   case Intrinsic::s390_vceqgs:
2163     Opcode = SystemZISD::VICMPES;
2164     CCValid = SystemZ::CCMASK_VCMP;
2165     return true;
2166 
2167   case Intrinsic::s390_vchbs:
2168   case Intrinsic::s390_vchhs:
2169   case Intrinsic::s390_vchfs:
2170   case Intrinsic::s390_vchgs:
2171     Opcode = SystemZISD::VICMPHS;
2172     CCValid = SystemZ::CCMASK_VCMP;
2173     return true;
2174 
2175   case Intrinsic::s390_vchlbs:
2176   case Intrinsic::s390_vchlhs:
2177   case Intrinsic::s390_vchlfs:
2178   case Intrinsic::s390_vchlgs:
2179     Opcode = SystemZISD::VICMPHLS;
2180     CCValid = SystemZ::CCMASK_VCMP;
2181     return true;
2182 
2183   case Intrinsic::s390_vtm:
2184     Opcode = SystemZISD::VTM;
2185     CCValid = SystemZ::CCMASK_VCMP;
2186     return true;
2187 
2188   case Intrinsic::s390_vfaebs:
2189   case Intrinsic::s390_vfaehs:
2190   case Intrinsic::s390_vfaefs:
2191     Opcode = SystemZISD::VFAE_CC;
2192     CCValid = SystemZ::CCMASK_ANY;
2193     return true;
2194 
2195   case Intrinsic::s390_vfaezbs:
2196   case Intrinsic::s390_vfaezhs:
2197   case Intrinsic::s390_vfaezfs:
2198     Opcode = SystemZISD::VFAEZ_CC;
2199     CCValid = SystemZ::CCMASK_ANY;
2200     return true;
2201 
2202   case Intrinsic::s390_vfeebs:
2203   case Intrinsic::s390_vfeehs:
2204   case Intrinsic::s390_vfeefs:
2205     Opcode = SystemZISD::VFEE_CC;
2206     CCValid = SystemZ::CCMASK_ANY;
2207     return true;
2208 
2209   case Intrinsic::s390_vfeezbs:
2210   case Intrinsic::s390_vfeezhs:
2211   case Intrinsic::s390_vfeezfs:
2212     Opcode = SystemZISD::VFEEZ_CC;
2213     CCValid = SystemZ::CCMASK_ANY;
2214     return true;
2215 
2216   case Intrinsic::s390_vfenebs:
2217   case Intrinsic::s390_vfenehs:
2218   case Intrinsic::s390_vfenefs:
2219     Opcode = SystemZISD::VFENE_CC;
2220     CCValid = SystemZ::CCMASK_ANY;
2221     return true;
2222 
2223   case Intrinsic::s390_vfenezbs:
2224   case Intrinsic::s390_vfenezhs:
2225   case Intrinsic::s390_vfenezfs:
2226     Opcode = SystemZISD::VFENEZ_CC;
2227     CCValid = SystemZ::CCMASK_ANY;
2228     return true;
2229 
2230   case Intrinsic::s390_vistrbs:
2231   case Intrinsic::s390_vistrhs:
2232   case Intrinsic::s390_vistrfs:
2233     Opcode = SystemZISD::VISTR_CC;
2234     CCValid = SystemZ::CCMASK_0 | SystemZ::CCMASK_3;
2235     return true;
2236 
2237   case Intrinsic::s390_vstrcbs:
2238   case Intrinsic::s390_vstrchs:
2239   case Intrinsic::s390_vstrcfs:
2240     Opcode = SystemZISD::VSTRC_CC;
2241     CCValid = SystemZ::CCMASK_ANY;
2242     return true;
2243 
2244   case Intrinsic::s390_vstrczbs:
2245   case Intrinsic::s390_vstrczhs:
2246   case Intrinsic::s390_vstrczfs:
2247     Opcode = SystemZISD::VSTRCZ_CC;
2248     CCValid = SystemZ::CCMASK_ANY;
2249     return true;
2250 
2251   case Intrinsic::s390_vstrsb:
2252   case Intrinsic::s390_vstrsh:
2253   case Intrinsic::s390_vstrsf:
2254     Opcode = SystemZISD::VSTRS_CC;
2255     CCValid = SystemZ::CCMASK_ANY;
2256     return true;
2257 
2258   case Intrinsic::s390_vstrszb:
2259   case Intrinsic::s390_vstrszh:
2260   case Intrinsic::s390_vstrszf:
2261     Opcode = SystemZISD::VSTRSZ_CC;
2262     CCValid = SystemZ::CCMASK_ANY;
2263     return true;
2264 
2265   case Intrinsic::s390_vfcedbs:
2266   case Intrinsic::s390_vfcesbs:
2267     Opcode = SystemZISD::VFCMPES;
2268     CCValid = SystemZ::CCMASK_VCMP;
2269     return true;
2270 
2271   case Intrinsic::s390_vfchdbs:
2272   case Intrinsic::s390_vfchsbs:
2273     Opcode = SystemZISD::VFCMPHS;
2274     CCValid = SystemZ::CCMASK_VCMP;
2275     return true;
2276 
2277   case Intrinsic::s390_vfchedbs:
2278   case Intrinsic::s390_vfchesbs:
2279     Opcode = SystemZISD::VFCMPHES;
2280     CCValid = SystemZ::CCMASK_VCMP;
2281     return true;
2282 
2283   case Intrinsic::s390_vftcidb:
2284   case Intrinsic::s390_vftcisb:
2285     Opcode = SystemZISD::VFTCI;
2286     CCValid = SystemZ::CCMASK_VCMP;
2287     return true;
2288 
2289   case Intrinsic::s390_tdc:
2290     Opcode = SystemZISD::TDC;
2291     CCValid = SystemZ::CCMASK_TDC;
2292     return true;
2293 
2294   default:
2295     return false;
2296   }
2297 }
2298 
2299 // Emit an intrinsic with chain and an explicit CC register result.
2300 static SDNode *emitIntrinsicWithCCAndChain(SelectionDAG &DAG, SDValue Op,
2301                                            unsigned Opcode) {
2302   // Copy all operands except the intrinsic ID.
2303   unsigned NumOps = Op.getNumOperands();
2304   SmallVector<SDValue, 6> Ops;
2305   Ops.reserve(NumOps - 1);
2306   Ops.push_back(Op.getOperand(0));
2307   for (unsigned I = 2; I < NumOps; ++I)
2308     Ops.push_back(Op.getOperand(I));
2309 
2310   assert(Op->getNumValues() == 2 && "Expected only CC result and chain");
2311   SDVTList RawVTs = DAG.getVTList(MVT::i32, MVT::Other);
2312   SDValue Intr = DAG.getNode(Opcode, SDLoc(Op), RawVTs, Ops);
2313   SDValue OldChain = SDValue(Op.getNode(), 1);
2314   SDValue NewChain = SDValue(Intr.getNode(), 1);
2315   DAG.ReplaceAllUsesOfValueWith(OldChain, NewChain);
2316   return Intr.getNode();
2317 }
2318 
2319 // Emit an intrinsic with an explicit CC register result.
2320 static SDNode *emitIntrinsicWithCC(SelectionDAG &DAG, SDValue Op,
2321                                    unsigned Opcode) {
2322   // Copy all operands except the intrinsic ID.
2323   unsigned NumOps = Op.getNumOperands();
2324   SmallVector<SDValue, 6> Ops;
2325   Ops.reserve(NumOps - 1);
2326   for (unsigned I = 1; I < NumOps; ++I)
2327     Ops.push_back(Op.getOperand(I));
2328 
2329   SDValue Intr = DAG.getNode(Opcode, SDLoc(Op), Op->getVTList(), Ops);
2330   return Intr.getNode();
2331 }
2332 
2333 // CC is a comparison that will be implemented using an integer or
2334 // floating-point comparison.  Return the condition code mask for
2335 // a branch on true.  In the integer case, CCMASK_CMP_UO is set for
2336 // unsigned comparisons and clear for signed ones.  In the floating-point
2337 // case, CCMASK_CMP_UO has its normal mask meaning (unordered).
2338 static unsigned CCMaskForCondCode(ISD::CondCode CC) {
2339 #define CONV(X) \
2340   case ISD::SET##X: return SystemZ::CCMASK_CMP_##X; \
2341   case ISD::SETO##X: return SystemZ::CCMASK_CMP_##X; \
2342   case ISD::SETU##X: return SystemZ::CCMASK_CMP_UO | SystemZ::CCMASK_CMP_##X
2343 
2344   switch (CC) {
2345   default:
2346     llvm_unreachable("Invalid integer condition!");
2347 
2348   CONV(EQ);
2349   CONV(NE);
2350   CONV(GT);
2351   CONV(GE);
2352   CONV(LT);
2353   CONV(LE);
2354 
2355   case ISD::SETO:  return SystemZ::CCMASK_CMP_O;
2356   case ISD::SETUO: return SystemZ::CCMASK_CMP_UO;
2357   }
2358 #undef CONV
2359 }
2360 
2361 // If C can be converted to a comparison against zero, adjust the operands
2362 // as necessary.
2363 static void adjustZeroCmp(SelectionDAG &DAG, const SDLoc &DL, Comparison &C) {
2364   if (C.ICmpType == SystemZICMP::UnsignedOnly)
2365     return;
2366 
2367   auto *ConstOp1 = dyn_cast<ConstantSDNode>(C.Op1.getNode());
2368   if (!ConstOp1)
2369     return;
2370 
2371   int64_t Value = ConstOp1->getSExtValue();
2372   if ((Value == -1 && C.CCMask == SystemZ::CCMASK_CMP_GT) ||
2373       (Value == -1 && C.CCMask == SystemZ::CCMASK_CMP_LE) ||
2374       (Value == 1 && C.CCMask == SystemZ::CCMASK_CMP_LT) ||
2375       (Value == 1 && C.CCMask == SystemZ::CCMASK_CMP_GE)) {
2376     C.CCMask ^= SystemZ::CCMASK_CMP_EQ;
2377     C.Op1 = DAG.getConstant(0, DL, C.Op1.getValueType());
2378   }
2379 }
2380 
2381 // If a comparison described by C is suitable for CLI(Y), CHHSI or CLHHSI,
2382 // adjust the operands as necessary.
2383 static void adjustSubwordCmp(SelectionDAG &DAG, const SDLoc &DL,
2384                              Comparison &C) {
2385   // For us to make any changes, it must a comparison between a single-use
2386   // load and a constant.
2387   if (!C.Op0.hasOneUse() ||
2388       C.Op0.getOpcode() != ISD::LOAD ||
2389       C.Op1.getOpcode() != ISD::Constant)
2390     return;
2391 
2392   // We must have an 8- or 16-bit load.
2393   auto *Load = cast<LoadSDNode>(C.Op0);
2394   unsigned NumBits = Load->getMemoryVT().getSizeInBits();
2395   if ((NumBits != 8 && NumBits != 16) ||
2396       NumBits != Load->getMemoryVT().getStoreSizeInBits())
2397     return;
2398 
2399   // The load must be an extending one and the constant must be within the
2400   // range of the unextended value.
2401   auto *ConstOp1 = cast<ConstantSDNode>(C.Op1);
2402   uint64_t Value = ConstOp1->getZExtValue();
2403   uint64_t Mask = (1 << NumBits) - 1;
2404   if (Load->getExtensionType() == ISD::SEXTLOAD) {
2405     // Make sure that ConstOp1 is in range of C.Op0.
2406     int64_t SignedValue = ConstOp1->getSExtValue();
2407     if (uint64_t(SignedValue) + (uint64_t(1) << (NumBits - 1)) > Mask)
2408       return;
2409     if (C.ICmpType != SystemZICMP::SignedOnly) {
2410       // Unsigned comparison between two sign-extended values is equivalent
2411       // to unsigned comparison between two zero-extended values.
2412       Value &= Mask;
2413     } else if (NumBits == 8) {
2414       // Try to treat the comparison as unsigned, so that we can use CLI.
2415       // Adjust CCMask and Value as necessary.
2416       if (Value == 0 && C.CCMask == SystemZ::CCMASK_CMP_LT)
2417         // Test whether the high bit of the byte is set.
2418         Value = 127, C.CCMask = SystemZ::CCMASK_CMP_GT;
2419       else if (Value == 0 && C.CCMask == SystemZ::CCMASK_CMP_GE)
2420         // Test whether the high bit of the byte is clear.
2421         Value = 128, C.CCMask = SystemZ::CCMASK_CMP_LT;
2422       else
2423         // No instruction exists for this combination.
2424         return;
2425       C.ICmpType = SystemZICMP::UnsignedOnly;
2426     }
2427   } else if (Load->getExtensionType() == ISD::ZEXTLOAD) {
2428     if (Value > Mask)
2429       return;
2430     // If the constant is in range, we can use any comparison.
2431     C.ICmpType = SystemZICMP::Any;
2432   } else
2433     return;
2434 
2435   // Make sure that the first operand is an i32 of the right extension type.
2436   ISD::LoadExtType ExtType = (C.ICmpType == SystemZICMP::SignedOnly ?
2437                               ISD::SEXTLOAD :
2438                               ISD::ZEXTLOAD);
2439   if (C.Op0.getValueType() != MVT::i32 ||
2440       Load->getExtensionType() != ExtType) {
2441     C.Op0 = DAG.getExtLoad(ExtType, SDLoc(Load), MVT::i32, Load->getChain(),
2442                            Load->getBasePtr(), Load->getPointerInfo(),
2443                            Load->getMemoryVT(), Load->getAlign(),
2444                            Load->getMemOperand()->getFlags());
2445     // Update the chain uses.
2446     DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), C.Op0.getValue(1));
2447   }
2448 
2449   // Make sure that the second operand is an i32 with the right value.
2450   if (C.Op1.getValueType() != MVT::i32 ||
2451       Value != ConstOp1->getZExtValue())
2452     C.Op1 = DAG.getConstant(Value, DL, MVT::i32);
2453 }
2454 
2455 // Return true if Op is either an unextended load, or a load suitable
2456 // for integer register-memory comparisons of type ICmpType.
2457 static bool isNaturalMemoryOperand(SDValue Op, unsigned ICmpType) {
2458   auto *Load = dyn_cast<LoadSDNode>(Op.getNode());
2459   if (Load) {
2460     // There are no instructions to compare a register with a memory byte.
2461     if (Load->getMemoryVT() == MVT::i8)
2462       return false;
2463     // Otherwise decide on extension type.
2464     switch (Load->getExtensionType()) {
2465     case ISD::NON_EXTLOAD:
2466       return true;
2467     case ISD::SEXTLOAD:
2468       return ICmpType != SystemZICMP::UnsignedOnly;
2469     case ISD::ZEXTLOAD:
2470       return ICmpType != SystemZICMP::SignedOnly;
2471     default:
2472       break;
2473     }
2474   }
2475   return false;
2476 }
2477 
2478 // Return true if it is better to swap the operands of C.
2479 static bool shouldSwapCmpOperands(const Comparison &C) {
2480   // Leave f128 comparisons alone, since they have no memory forms.
2481   if (C.Op0.getValueType() == MVT::f128)
2482     return false;
2483 
2484   // Always keep a floating-point constant second, since comparisons with
2485   // zero can use LOAD TEST and comparisons with other constants make a
2486   // natural memory operand.
2487   if (isa<ConstantFPSDNode>(C.Op1))
2488     return false;
2489 
2490   // Never swap comparisons with zero since there are many ways to optimize
2491   // those later.
2492   auto *ConstOp1 = dyn_cast<ConstantSDNode>(C.Op1);
2493   if (ConstOp1 && ConstOp1->getZExtValue() == 0)
2494     return false;
2495 
2496   // Also keep natural memory operands second if the loaded value is
2497   // only used here.  Several comparisons have memory forms.
2498   if (isNaturalMemoryOperand(C.Op1, C.ICmpType) && C.Op1.hasOneUse())
2499     return false;
2500 
2501   // Look for cases where Cmp0 is a single-use load and Cmp1 isn't.
2502   // In that case we generally prefer the memory to be second.
2503   if (isNaturalMemoryOperand(C.Op0, C.ICmpType) && C.Op0.hasOneUse()) {
2504     // The only exceptions are when the second operand is a constant and
2505     // we can use things like CHHSI.
2506     if (!ConstOp1)
2507       return true;
2508     // The unsigned memory-immediate instructions can handle 16-bit
2509     // unsigned integers.
2510     if (C.ICmpType != SystemZICMP::SignedOnly &&
2511         isUInt<16>(ConstOp1->getZExtValue()))
2512       return false;
2513     // The signed memory-immediate instructions can handle 16-bit
2514     // signed integers.
2515     if (C.ICmpType != SystemZICMP::UnsignedOnly &&
2516         isInt<16>(ConstOp1->getSExtValue()))
2517       return false;
2518     return true;
2519   }
2520 
2521   // Try to promote the use of CGFR and CLGFR.
2522   unsigned Opcode0 = C.Op0.getOpcode();
2523   if (C.ICmpType != SystemZICMP::UnsignedOnly && Opcode0 == ISD::SIGN_EXTEND)
2524     return true;
2525   if (C.ICmpType != SystemZICMP::SignedOnly && Opcode0 == ISD::ZERO_EXTEND)
2526     return true;
2527   if (C.ICmpType != SystemZICMP::SignedOnly &&
2528       Opcode0 == ISD::AND &&
2529       C.Op0.getOperand(1).getOpcode() == ISD::Constant &&
2530       cast<ConstantSDNode>(C.Op0.getOperand(1))->getZExtValue() == 0xffffffff)
2531     return true;
2532 
2533   return false;
2534 }
2535 
2536 // Check whether C tests for equality between X and Y and whether X - Y
2537 // or Y - X is also computed.  In that case it's better to compare the
2538 // result of the subtraction against zero.
2539 static void adjustForSubtraction(SelectionDAG &DAG, const SDLoc &DL,
2540                                  Comparison &C) {
2541   if (C.CCMask == SystemZ::CCMASK_CMP_EQ ||
2542       C.CCMask == SystemZ::CCMASK_CMP_NE) {
2543     for (SDNode *N : C.Op0->uses()) {
2544       if (N->getOpcode() == ISD::SUB &&
2545           ((N->getOperand(0) == C.Op0 && N->getOperand(1) == C.Op1) ||
2546            (N->getOperand(0) == C.Op1 && N->getOperand(1) == C.Op0))) {
2547         // Disable the nsw and nuw flags: the backend needs to handle
2548         // overflow as well during comparison elimination.
2549         SDNodeFlags Flags = N->getFlags();
2550         Flags.setNoSignedWrap(false);
2551         Flags.setNoUnsignedWrap(false);
2552         N->setFlags(Flags);
2553         C.Op0 = SDValue(N, 0);
2554         C.Op1 = DAG.getConstant(0, DL, N->getValueType(0));
2555         return;
2556       }
2557     }
2558   }
2559 }
2560 
2561 // Check whether C compares a floating-point value with zero and if that
2562 // floating-point value is also negated.  In this case we can use the
2563 // negation to set CC, so avoiding separate LOAD AND TEST and
2564 // LOAD (NEGATIVE/COMPLEMENT) instructions.
2565 static void adjustForFNeg(Comparison &C) {
2566   // This optimization is invalid for strict comparisons, since FNEG
2567   // does not raise any exceptions.
2568   if (C.Chain)
2569     return;
2570   auto *C1 = dyn_cast<ConstantFPSDNode>(C.Op1);
2571   if (C1 && C1->isZero()) {
2572     for (SDNode *N : C.Op0->uses()) {
2573       if (N->getOpcode() == ISD::FNEG) {
2574         C.Op0 = SDValue(N, 0);
2575         C.CCMask = SystemZ::reverseCCMask(C.CCMask);
2576         return;
2577       }
2578     }
2579   }
2580 }
2581 
2582 // Check whether C compares (shl X, 32) with 0 and whether X is
2583 // also sign-extended.  In that case it is better to test the result
2584 // of the sign extension using LTGFR.
2585 //
2586 // This case is important because InstCombine transforms a comparison
2587 // with (sext (trunc X)) into a comparison with (shl X, 32).
2588 static void adjustForLTGFR(Comparison &C) {
2589   // Check for a comparison between (shl X, 32) and 0.
2590   if (C.Op0.getOpcode() == ISD::SHL &&
2591       C.Op0.getValueType() == MVT::i64 &&
2592       C.Op1.getOpcode() == ISD::Constant &&
2593       cast<ConstantSDNode>(C.Op1)->getZExtValue() == 0) {
2594     auto *C1 = dyn_cast<ConstantSDNode>(C.Op0.getOperand(1));
2595     if (C1 && C1->getZExtValue() == 32) {
2596       SDValue ShlOp0 = C.Op0.getOperand(0);
2597       // See whether X has any SIGN_EXTEND_INREG uses.
2598       for (SDNode *N : ShlOp0->uses()) {
2599         if (N->getOpcode() == ISD::SIGN_EXTEND_INREG &&
2600             cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i32) {
2601           C.Op0 = SDValue(N, 0);
2602           return;
2603         }
2604       }
2605     }
2606   }
2607 }
2608 
2609 // If C compares the truncation of an extending load, try to compare
2610 // the untruncated value instead.  This exposes more opportunities to
2611 // reuse CC.
2612 static void adjustICmpTruncate(SelectionDAG &DAG, const SDLoc &DL,
2613                                Comparison &C) {
2614   if (C.Op0.getOpcode() == ISD::TRUNCATE &&
2615       C.Op0.getOperand(0).getOpcode() == ISD::LOAD &&
2616       C.Op1.getOpcode() == ISD::Constant &&
2617       cast<ConstantSDNode>(C.Op1)->getZExtValue() == 0) {
2618     auto *L = cast<LoadSDNode>(C.Op0.getOperand(0));
2619     if (L->getMemoryVT().getStoreSizeInBits().getFixedValue() <=
2620         C.Op0.getValueSizeInBits().getFixedValue()) {
2621       unsigned Type = L->getExtensionType();
2622       if ((Type == ISD::ZEXTLOAD && C.ICmpType != SystemZICMP::SignedOnly) ||
2623           (Type == ISD::SEXTLOAD && C.ICmpType != SystemZICMP::UnsignedOnly)) {
2624         C.Op0 = C.Op0.getOperand(0);
2625         C.Op1 = DAG.getConstant(0, DL, C.Op0.getValueType());
2626       }
2627     }
2628   }
2629 }
2630 
2631 // Return true if shift operation N has an in-range constant shift value.
2632 // Store it in ShiftVal if so.
2633 static bool isSimpleShift(SDValue N, unsigned &ShiftVal) {
2634   auto *Shift = dyn_cast<ConstantSDNode>(N.getOperand(1));
2635   if (!Shift)
2636     return false;
2637 
2638   uint64_t Amount = Shift->getZExtValue();
2639   if (Amount >= N.getValueSizeInBits())
2640     return false;
2641 
2642   ShiftVal = Amount;
2643   return true;
2644 }
2645 
// Check whether an AND with Mask is suitable for a TEST UNDER MASK
// instruction and whether the CC value is descriptive enough to handle
// a comparison of type ICmpType between the AND result and CmpVal.
// CCMask says which comparison result is being tested and BitSize is
// the number of bits in the operands.  If TEST UNDER MASK can be used,
// return the corresponding CC mask, otherwise return 0.
//
// Mask must be nonzero.  BitSize is accepted for the callers' benefit but
// is not referenced by the body.  The checks below are tried in order and
// the first one that matches both CmpVal and CCMask determines the result.
static unsigned getTestUnderMaskCond(unsigned BitSize, unsigned CCMask,
                                     uint64_t Mask, uint64_t CmpVal,
                                     unsigned ICmpType) {
  assert(Mask != 0 && "ANDs with zero should have been removed by now");

  // Check whether the mask is suitable for TMHH, TMHL, TMLH or TMLL.
  if (!SystemZ::isImmLL(Mask) && !SystemZ::isImmLH(Mask) &&
      !SystemZ::isImmHL(Mask) && !SystemZ::isImmHH(Mask))
    return 0;

  // Work out the masks for the lowest and highest bits: High is the most
  // significant set bit of Mask and Low is the least significant set bit.
  uint64_t High = llvm::bit_floor(Mask);
  uint64_t Low = uint64_t(1) << llvm::countr_zero(Mask);

  // Signed ordered comparisons are effectively unsigned if the sign
  // bit is dropped.  Every ordered (LT/LE/GT/GE) check below is skipped
  // when the comparison type is SignedOnly.
  bool EffectivelyUnsigned = (ICmpType != SystemZICMP::SignedOnly);

  // Check for equality comparisons with 0, or the equivalent.
  if (CmpVal == 0) {
    if (CCMask == SystemZ::CCMASK_CMP_EQ)
      return SystemZ::CCMASK_TM_ALL_0;
    if (CCMask == SystemZ::CCMASK_CMP_NE)
      return SystemZ::CCMASK_TM_SOME_1;
  }
  // A masked value is either 0 or at least Low, so "< CmpVal" with
  // 0 < CmpVal <= Low is the same as "== 0".
  if (EffectivelyUnsigned && CmpVal > 0 && CmpVal <= Low) {
    if (CCMask == SystemZ::CCMASK_CMP_LT)
      return SystemZ::CCMASK_TM_ALL_0;
    if (CCMask == SystemZ::CCMASK_CMP_GE)
      return SystemZ::CCMASK_TM_SOME_1;
  }
  // Likewise "<= CmpVal" with CmpVal < Low is the same as "== 0".
  if (EffectivelyUnsigned && CmpVal < Low) {
    if (CCMask == SystemZ::CCMASK_CMP_LE)
      return SystemZ::CCMASK_TM_ALL_0;
    if (CCMask == SystemZ::CCMASK_CMP_GT)
      return SystemZ::CCMASK_TM_SOME_1;
  }

  // Check for equality comparisons with the mask, or the equivalent.
  if (CmpVal == Mask) {
    if (CCMask == SystemZ::CCMASK_CMP_EQ)
      return SystemZ::CCMASK_TM_ALL_1;
    if (CCMask == SystemZ::CCMASK_CMP_NE)
      return SystemZ::CCMASK_TM_SOME_0;
  }
  // The largest masked value below Mask is Mask - Low, so "> CmpVal" with
  // Mask - Low <= CmpVal < Mask is the same as "== Mask".
  if (EffectivelyUnsigned && CmpVal >= Mask - Low && CmpVal < Mask) {
    if (CCMask == SystemZ::CCMASK_CMP_GT)
      return SystemZ::CCMASK_TM_ALL_1;
    if (CCMask == SystemZ::CCMASK_CMP_LE)
      return SystemZ::CCMASK_TM_SOME_0;
  }
  // Likewise ">= CmpVal" with Mask - Low < CmpVal <= Mask is "== Mask".
  if (EffectivelyUnsigned && CmpVal > Mask - Low && CmpVal <= Mask) {
    if (CCMask == SystemZ::CCMASK_CMP_GE)
      return SystemZ::CCMASK_TM_ALL_1;
    if (CCMask == SystemZ::CCMASK_CMP_LT)
      return SystemZ::CCMASK_TM_SOME_0;
  }

  // Check for ordered comparisons with the top bit.  Masked values with the
  // top bit clear are at most Mask - High and those with it set are at
  // least High, so a CmpVal between the two only tests the top bit.
  if (EffectivelyUnsigned && CmpVal >= Mask - High && CmpVal < High) {
    if (CCMask == SystemZ::CCMASK_CMP_LE)
      return SystemZ::CCMASK_TM_MSB_0;
    if (CCMask == SystemZ::CCMASK_CMP_GT)
      return SystemZ::CCMASK_TM_MSB_1;
  }
  if (EffectivelyUnsigned && CmpVal > Mask - High && CmpVal <= High) {
    if (CCMask == SystemZ::CCMASK_CMP_LT)
      return SystemZ::CCMASK_TM_MSB_0;
    if (CCMask == SystemZ::CCMASK_CMP_GE)
      return SystemZ::CCMASK_TM_MSB_1;
  }

  // If there are just two bits, we can do equality checks for Low and High
  // as well.
  if (Mask == Low + High) {
    if (CCMask == SystemZ::CCMASK_CMP_EQ && CmpVal == Low)
      return SystemZ::CCMASK_TM_MIXED_MSB_0;
    if (CCMask == SystemZ::CCMASK_CMP_NE && CmpVal == Low)
      return SystemZ::CCMASK_TM_MIXED_MSB_0 ^ SystemZ::CCMASK_ANY;
    if (CCMask == SystemZ::CCMASK_CMP_EQ && CmpVal == High)
      return SystemZ::CCMASK_TM_MIXED_MSB_1;
    if (CCMask == SystemZ::CCMASK_CMP_NE && CmpVal == High)
      return SystemZ::CCMASK_TM_MIXED_MSB_1 ^ SystemZ::CCMASK_ANY;
  }

  // Looks like we've exhausted our options.
  return 0;
}
2740 
// See whether C can be implemented as a TEST UNDER MASK instruction.
// Update the arguments with the TM version if so: C.Opcode becomes
// SystemZISD::TM, C.Op0 the value being tested, C.Op1 the mask constant,
// and C.CCValid / C.CCMask the TM condition masks.  C is left untouched
// if no suitable form is found.
static void adjustForTestUnderMask(SelectionDAG &DAG, const SDLoc &DL,
                                   Comparison &C) {
  // Check that we have a comparison with a constant.
  auto *ConstOp1 = dyn_cast<ConstantSDNode>(C.Op1);
  if (!ConstOp1)
    return;
  uint64_t CmpVal = ConstOp1->getZExtValue();

  // Check whether the nonconstant input is an AND with a constant mask.
  // NewC is a working copy; C itself is only modified once a TM form has
  // been found.
  Comparison NewC(C);
  uint64_t MaskVal;
  ConstantSDNode *Mask = nullptr;
  if (C.Op0.getOpcode() == ISD::AND) {
    NewC.Op0 = C.Op0.getOperand(0);
    NewC.Op1 = C.Op0.getOperand(1);
    Mask = dyn_cast<ConstantSDNode>(NewC.Op1);
    if (!Mask)
      return;
    MaskVal = Mask->getZExtValue();
  } else {
    // There is no instruction to compare with a 64-bit immediate
    // so use TMHH instead if possible.  We need an unsigned ordered
    // comparison with an i64 immediate.
    if (NewC.Op0.getValueType() != MVT::i64 ||
        NewC.CCMask == SystemZ::CCMASK_CMP_EQ ||
        NewC.CCMask == SystemZ::CCMASK_CMP_NE ||
        NewC.ICmpType == SystemZICMP::SignedOnly)
      return;
    // Convert LE and GT comparisons into LT and GE.
    if (NewC.CCMask == SystemZ::CCMASK_CMP_LE ||
        NewC.CCMask == SystemZ::CCMASK_CMP_GT) {
      // Adding one below would wrap around for the all-ones value.
      if (CmpVal == uint64_t(-1))
        return;
      CmpVal += 1;
      NewC.CCMask ^= SystemZ::CCMASK_CMP_EQ;
    }
    // If the low N bits of Op1 are zero then the low N bits of Op0 can
    // be masked off without changing the result.  The mask keeps the lowest
    // set bit of CmpVal and everything above it; it is zero if CmpVal is.
    MaskVal = -(CmpVal & -CmpVal);
    NewC.ICmpType = SystemZICMP::UnsignedOnly;
  }
  if (!MaskVal)
    return;

  // Check whether the combination of mask, comparison value and comparison
  // type are suitable.  Three forms are tried in turn: looking through a
  // constant left shift of the tested value, looking through a constant
  // logical right shift, and finally the value as it stands.  For the shift
  // forms the mask and comparison value are shifted the opposite way, which
  // is only done when no set bits of either would be lost.
  unsigned BitSize = NewC.Op0.getValueSizeInBits();
  unsigned NewCCMask, ShiftVal;
  if (NewC.ICmpType != SystemZICMP::SignedOnly &&
      NewC.Op0.getOpcode() == ISD::SHL &&
      isSimpleShift(NewC.Op0, ShiftVal) &&
      (MaskVal >> ShiftVal != 0) &&
      ((CmpVal >> ShiftVal) << ShiftVal) == CmpVal &&
      (NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask,
                                        MaskVal >> ShiftVal,
                                        CmpVal >> ShiftVal,
                                        SystemZICMP::Any))) {
    NewC.Op0 = NewC.Op0.getOperand(0);
    MaskVal >>= ShiftVal;
  } else if (NewC.ICmpType != SystemZICMP::SignedOnly &&
             NewC.Op0.getOpcode() == ISD::SRL &&
             isSimpleShift(NewC.Op0, ShiftVal) &&
             (MaskVal << ShiftVal != 0) &&
             ((CmpVal << ShiftVal) >> ShiftVal) == CmpVal &&
             (NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask,
                                               MaskVal << ShiftVal,
                                               CmpVal << ShiftVal,
                                               SystemZICMP::UnsignedOnly))) {
    NewC.Op0 = NewC.Op0.getOperand(0);
    MaskVal <<= ShiftVal;
  } else {
    NewCCMask = getTestUnderMaskCond(BitSize, NewC.CCMask, MaskVal, CmpVal,
                                     NewC.ICmpType);
    if (!NewCCMask)
      return;
  }

  // Go ahead and make the change.  Reuse the original mask node when its
  // value is still the one we need; otherwise create a new constant.
  C.Opcode = SystemZISD::TM;
  C.Op0 = NewC.Op0;
  if (Mask && Mask->getZExtValue() == MaskVal)
    C.Op1 = SDValue(Mask, 0);
  else
    C.Op1 = DAG.getConstant(MaskVal, DL, C.Op0.getValueType());
  C.CCValid = SystemZ::CCMASK_TM;
  C.CCMask = NewCCMask;
}
2830 
2831 // See whether the comparison argument contains a redundant AND
2832 // and remove it if so.  This sometimes happens due to the generic
2833 // BRCOND expansion.
2834 static void adjustForRedundantAnd(SelectionDAG &DAG, const SDLoc &DL,
2835                                   Comparison &C) {
2836   if (C.Op0.getOpcode() != ISD::AND)
2837     return;
2838   auto *Mask = dyn_cast<ConstantSDNode>(C.Op0.getOperand(1));
2839   if (!Mask)
2840     return;
2841   KnownBits Known = DAG.computeKnownBits(C.Op0.getOperand(0));
2842   if ((~Known.Zero).getZExtValue() & ~Mask->getZExtValue())
2843     return;
2844 
2845   C.Op0 = C.Op0.getOperand(0);
2846 }
2847 
2848 // Return a Comparison that tests the condition-code result of intrinsic
2849 // node Call against constant integer CC using comparison code Cond.
2850 // Opcode is the opcode of the SystemZISD operation for the intrinsic
2851 // and CCValid is the set of possible condition-code results.
2852 static Comparison getIntrinsicCmp(SelectionDAG &DAG, unsigned Opcode,
2853                                   SDValue Call, unsigned CCValid, uint64_t CC,
2854                                   ISD::CondCode Cond) {
2855   Comparison C(Call, SDValue(), SDValue());
2856   C.Opcode = Opcode;
2857   C.CCValid = CCValid;
2858   if (Cond == ISD::SETEQ)
2859     // bit 3 for CC==0, bit 0 for CC==3, always false for CC>3.
2860     C.CCMask = CC < 4 ? 1 << (3 - CC) : 0;
2861   else if (Cond == ISD::SETNE)
2862     // ...and the inverse of that.
2863     C.CCMask = CC < 4 ? ~(1 << (3 - CC)) : -1;
2864   else if (Cond == ISD::SETLT || Cond == ISD::SETULT)
2865     // bits above bit 3 for CC==0 (always false), bits above bit 0 for CC==3,
2866     // always true for CC>3.
2867     C.CCMask = CC < 4 ? ~0U << (4 - CC) : -1;
2868   else if (Cond == ISD::SETGE || Cond == ISD::SETUGE)
2869     // ...and the inverse of that.
2870     C.CCMask = CC < 4 ? ~(~0U << (4 - CC)) : 0;
2871   else if (Cond == ISD::SETLE || Cond == ISD::SETULE)
2872     // bit 3 and above for CC==0, bit 0 and above for CC==3 (always true),
2873     // always true for CC>3.
2874     C.CCMask = CC < 4 ? ~0U << (3 - CC) : -1;
2875   else if (Cond == ISD::SETGT || Cond == ISD::SETUGT)
2876     // ...and the inverse of that.
2877     C.CCMask = CC < 4 ? ~(~0U << (3 - CC)) : 0;
2878   else
2879     llvm_unreachable("Unexpected integer comparison type");
2880   C.CCMask &= CCValid;
2881   return C;
2882 }
2883 
2884 // Decide how to implement a comparison of type Cond between CmpOp0 with CmpOp1.
2885 static Comparison getCmp(SelectionDAG &DAG, SDValue CmpOp0, SDValue CmpOp1,
2886                          ISD::CondCode Cond, const SDLoc &DL,
2887                          SDValue Chain = SDValue(),
2888                          bool IsSignaling = false) {
2889   if (CmpOp1.getOpcode() == ISD::Constant) {
2890     assert(!Chain);
2891     uint64_t Constant = cast<ConstantSDNode>(CmpOp1)->getZExtValue();
2892     unsigned Opcode, CCValid;
2893     if (CmpOp0.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
2894         CmpOp0.getResNo() == 0 && CmpOp0->hasNUsesOfValue(1, 0) &&
2895         isIntrinsicWithCCAndChain(CmpOp0, Opcode, CCValid))
2896       return getIntrinsicCmp(DAG, Opcode, CmpOp0, CCValid, Constant, Cond);
2897     if (CmpOp0.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
2898         CmpOp0.getResNo() == CmpOp0->getNumValues() - 1 &&
2899         isIntrinsicWithCC(CmpOp0, Opcode, CCValid))
2900       return getIntrinsicCmp(DAG, Opcode, CmpOp0, CCValid, Constant, Cond);
2901   }
2902   Comparison C(CmpOp0, CmpOp1, Chain);
2903   C.CCMask = CCMaskForCondCode(Cond);
2904   if (C.Op0.getValueType().isFloatingPoint()) {
2905     C.CCValid = SystemZ::CCMASK_FCMP;
2906     if (!C.Chain)
2907       C.Opcode = SystemZISD::FCMP;
2908     else if (!IsSignaling)
2909       C.Opcode = SystemZISD::STRICT_FCMP;
2910     else
2911       C.Opcode = SystemZISD::STRICT_FCMPS;
2912     adjustForFNeg(C);
2913   } else {
2914     assert(!C.Chain);
2915     C.CCValid = SystemZ::CCMASK_ICMP;
2916     C.Opcode = SystemZISD::ICMP;
2917     // Choose the type of comparison.  Equality and inequality tests can
2918     // use either signed or unsigned comparisons.  The choice also doesn't
2919     // matter if both sign bits are known to be clear.  In those cases we
2920     // want to give the main isel code the freedom to choose whichever
2921     // form fits best.
2922     if (C.CCMask == SystemZ::CCMASK_CMP_EQ ||
2923         C.CCMask == SystemZ::CCMASK_CMP_NE ||
2924         (DAG.SignBitIsZero(C.Op0) && DAG.SignBitIsZero(C.Op1)))
2925       C.ICmpType = SystemZICMP::Any;
2926     else if (C.CCMask & SystemZ::CCMASK_CMP_UO)
2927       C.ICmpType = SystemZICMP::UnsignedOnly;
2928     else
2929       C.ICmpType = SystemZICMP::SignedOnly;
2930     C.CCMask &= ~SystemZ::CCMASK_CMP_UO;
2931     adjustForRedundantAnd(DAG, DL, C);
2932     adjustZeroCmp(DAG, DL, C);
2933     adjustSubwordCmp(DAG, DL, C);
2934     adjustForSubtraction(DAG, DL, C);
2935     adjustForLTGFR(C);
2936     adjustICmpTruncate(DAG, DL, C);
2937   }
2938 
2939   if (shouldSwapCmpOperands(C)) {
2940     std::swap(C.Op0, C.Op1);
2941     C.CCMask = SystemZ::reverseCCMask(C.CCMask);
2942   }
2943 
2944   adjustForTestUnderMask(DAG, DL, C);
2945   return C;
2946 }
2947 
2948 // Emit the comparison instruction described by C.
2949 static SDValue emitCmp(SelectionDAG &DAG, const SDLoc &DL, Comparison &C) {
2950   if (!C.Op1.getNode()) {
2951     SDNode *Node;
2952     switch (C.Op0.getOpcode()) {
2953     case ISD::INTRINSIC_W_CHAIN:
2954       Node = emitIntrinsicWithCCAndChain(DAG, C.Op0, C.Opcode);
2955       return SDValue(Node, 0);
2956     case ISD::INTRINSIC_WO_CHAIN:
2957       Node = emitIntrinsicWithCC(DAG, C.Op0, C.Opcode);
2958       return SDValue(Node, Node->getNumValues() - 1);
2959     default:
2960       llvm_unreachable("Invalid comparison operands");
2961     }
2962   }
2963   if (C.Opcode == SystemZISD::ICMP)
2964     return DAG.getNode(SystemZISD::ICMP, DL, MVT::i32, C.Op0, C.Op1,
2965                        DAG.getTargetConstant(C.ICmpType, DL, MVT::i32));
2966   if (C.Opcode == SystemZISD::TM) {
2967     bool RegisterOnly = (bool(C.CCMask & SystemZ::CCMASK_TM_MIXED_MSB_0) !=
2968                          bool(C.CCMask & SystemZ::CCMASK_TM_MIXED_MSB_1));
2969     return DAG.getNode(SystemZISD::TM, DL, MVT::i32, C.Op0, C.Op1,
2970                        DAG.getTargetConstant(RegisterOnly, DL, MVT::i32));
2971   }
2972   if (C.Chain) {
2973     SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
2974     return DAG.getNode(C.Opcode, DL, VTs, C.Chain, C.Op0, C.Op1);
2975   }
2976   return DAG.getNode(C.Opcode, DL, MVT::i32, C.Op0, C.Op1);
2977 }
2978 
2979 // Implement a 32-bit *MUL_LOHI operation by extending both operands to
2980 // 64 bits.  Extend is the extension type to use.  Store the high part
2981 // in Hi and the low part in Lo.
2982 static void lowerMUL_LOHI32(SelectionDAG &DAG, const SDLoc &DL, unsigned Extend,
2983                             SDValue Op0, SDValue Op1, SDValue &Hi,
2984                             SDValue &Lo) {
2985   Op0 = DAG.getNode(Extend, DL, MVT::i64, Op0);
2986   Op1 = DAG.getNode(Extend, DL, MVT::i64, Op1);
2987   SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, Op0, Op1);
2988   Hi = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
2989                    DAG.getConstant(32, DL, MVT::i64));
2990   Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Hi);
2991   Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
2992 }
2993 
2994 // Lower a binary operation that produces two VT results, one in each
2995 // half of a GR128 pair.  Op0 and Op1 are the VT operands to the operation,
2996 // and Opcode performs the GR128 operation.  Store the even register result
2997 // in Even and the odd register result in Odd.
2998 static void lowerGR128Binary(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
2999                              unsigned Opcode, SDValue Op0, SDValue Op1,
3000                              SDValue &Even, SDValue &Odd) {
3001   SDValue Result = DAG.getNode(Opcode, DL, MVT::Untyped, Op0, Op1);
3002   bool Is32Bit = is32Bit(VT);
3003   Even = DAG.getTargetExtractSubreg(SystemZ::even128(Is32Bit), DL, VT, Result);
3004   Odd = DAG.getTargetExtractSubreg(SystemZ::odd128(Is32Bit), DL, VT, Result);
3005 }
3006 
3007 // Return an i32 value that is 1 if the CC value produced by CCReg is
3008 // in the mask CCMask and 0 otherwise.  CC is known to have a value
3009 // in CCValid, so other values can be ignored.
3010 static SDValue emitSETCC(SelectionDAG &DAG, const SDLoc &DL, SDValue CCReg,
3011                          unsigned CCValid, unsigned CCMask) {
3012   SDValue Ops[] = {DAG.getConstant(1, DL, MVT::i32),
3013                    DAG.getConstant(0, DL, MVT::i32),
3014                    DAG.getTargetConstant(CCValid, DL, MVT::i32),
3015                    DAG.getTargetConstant(CCMask, DL, MVT::i32), CCReg};
3016   return DAG.getNode(SystemZISD::SELECT_CCMASK, DL, MVT::i32, Ops);
3017 }
3018 
3019 // Return the SystemISD vector comparison operation for CC, or 0 if it cannot
3020 // be done directly.  Mode is CmpMode::Int for integer comparisons, CmpMode::FP
3021 // for regular floating-point comparisons, CmpMode::StrictFP for strict (quiet)
3022 // floating-point comparisons, and CmpMode::SignalingFP for strict signaling
3023 // floating-point comparisons.
3024 enum class CmpMode { Int, FP, StrictFP, SignalingFP };
3025 static unsigned getVectorComparison(ISD::CondCode CC, CmpMode Mode) {
3026   switch (CC) {
3027   case ISD::SETOEQ:
3028   case ISD::SETEQ:
3029     switch (Mode) {
3030     case CmpMode::Int:         return SystemZISD::VICMPE;
3031     case CmpMode::FP:          return SystemZISD::VFCMPE;
3032     case CmpMode::StrictFP:    return SystemZISD::STRICT_VFCMPE;
3033     case CmpMode::SignalingFP: return SystemZISD::STRICT_VFCMPES;
3034     }
3035     llvm_unreachable("Bad mode");
3036 
3037   case ISD::SETOGE:
3038   case ISD::SETGE:
3039     switch (Mode) {
3040     case CmpMode::Int:         return 0;
3041     case CmpMode::FP:          return SystemZISD::VFCMPHE;
3042     case CmpMode::StrictFP:    return SystemZISD::STRICT_VFCMPHE;
3043     case CmpMode::SignalingFP: return SystemZISD::STRICT_VFCMPHES;
3044     }
3045     llvm_unreachable("Bad mode");
3046 
3047   case ISD::SETOGT:
3048   case ISD::SETGT:
3049     switch (Mode) {
3050     case CmpMode::Int:         return SystemZISD::VICMPH;
3051     case CmpMode::FP:          return SystemZISD::VFCMPH;
3052     case CmpMode::StrictFP:    return SystemZISD::STRICT_VFCMPH;
3053     case CmpMode::SignalingFP: return SystemZISD::STRICT_VFCMPHS;
3054     }
3055     llvm_unreachable("Bad mode");
3056 
3057   case ISD::SETUGT:
3058     switch (Mode) {
3059     case CmpMode::Int:         return SystemZISD::VICMPHL;
3060     case CmpMode::FP:          return 0;
3061     case CmpMode::StrictFP:    return 0;
3062     case CmpMode::SignalingFP: return 0;
3063     }
3064     llvm_unreachable("Bad mode");
3065 
3066   default:
3067     return 0;
3068   }
3069 }
3070 
3071 // Return the SystemZISD vector comparison operation for CC or its inverse,
3072 // or 0 if neither can be done directly.  Indicate in Invert whether the
3073 // result is for the inverse of CC.  Mode is as above.
3074 static unsigned getVectorComparisonOrInvert(ISD::CondCode CC, CmpMode Mode,
3075                                             bool &Invert) {
3076   if (unsigned Opcode = getVectorComparison(CC, Mode)) {
3077     Invert = false;
3078     return Opcode;
3079   }
3080 
3081   CC = ISD::getSetCCInverse(CC, Mode == CmpMode::Int ? MVT::i32 : MVT::f32);
3082   if (unsigned Opcode = getVectorComparison(CC, Mode)) {
3083     Invert = true;
3084     return Opcode;
3085   }
3086 
3087   return 0;
3088 }
3089 
3090 // Return a v2f64 that contains the extended form of elements Start and Start+1
3091 // of v4f32 value Op.  If Chain is nonnull, return the strict form.
3092 static SDValue expandV4F32ToV2F64(SelectionDAG &DAG, int Start, const SDLoc &DL,
3093                                   SDValue Op, SDValue Chain) {
3094   int Mask[] = { Start, -1, Start + 1, -1 };
3095   Op = DAG.getVectorShuffle(MVT::v4f32, DL, Op, DAG.getUNDEF(MVT::v4f32), Mask);
3096   if (Chain) {
3097     SDVTList VTs = DAG.getVTList(MVT::v2f64, MVT::Other);
3098     return DAG.getNode(SystemZISD::STRICT_VEXTEND, DL, VTs, Chain, Op);
3099   }
3100   return DAG.getNode(SystemZISD::VEXTEND, DL, MVT::v2f64, Op);
3101 }
3102 
3103 // Build a comparison of vectors CmpOp0 and CmpOp1 using opcode Opcode,
3104 // producing a result of type VT.  If Chain is nonnull, return the strict form.
3105 SDValue SystemZTargetLowering::getVectorCmp(SelectionDAG &DAG, unsigned Opcode,
3106                                             const SDLoc &DL, EVT VT,
3107                                             SDValue CmpOp0,
3108                                             SDValue CmpOp1,
3109                                             SDValue Chain) const {
3110   // There is no hardware support for v4f32 (unless we have the vector
3111   // enhancements facility 1), so extend the vector into two v2f64s
3112   // and compare those.
3113   if (CmpOp0.getValueType() == MVT::v4f32 &&
3114       !Subtarget.hasVectorEnhancements1()) {
3115     SDValue H0 = expandV4F32ToV2F64(DAG, 0, DL, CmpOp0, Chain);
3116     SDValue L0 = expandV4F32ToV2F64(DAG, 2, DL, CmpOp0, Chain);
3117     SDValue H1 = expandV4F32ToV2F64(DAG, 0, DL, CmpOp1, Chain);
3118     SDValue L1 = expandV4F32ToV2F64(DAG, 2, DL, CmpOp1, Chain);
3119     if (Chain) {
3120       SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::Other);
3121       SDValue HRes = DAG.getNode(Opcode, DL, VTs, Chain, H0, H1);
3122       SDValue LRes = DAG.getNode(Opcode, DL, VTs, Chain, L0, L1);
3123       SDValue Res = DAG.getNode(SystemZISD::PACK, DL, VT, HRes, LRes);
3124       SDValue Chains[6] = { H0.getValue(1), L0.getValue(1),
3125                             H1.getValue(1), L1.getValue(1),
3126                             HRes.getValue(1), LRes.getValue(1) };
3127       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3128       SDValue Ops[2] = { Res, NewChain };
3129       return DAG.getMergeValues(Ops, DL);
3130     }
3131     SDValue HRes = DAG.getNode(Opcode, DL, MVT::v2i64, H0, H1);
3132     SDValue LRes = DAG.getNode(Opcode, DL, MVT::v2i64, L0, L1);
3133     return DAG.getNode(SystemZISD::PACK, DL, VT, HRes, LRes);
3134   }
3135   if (Chain) {
3136     SDVTList VTs = DAG.getVTList(VT, MVT::Other);
3137     return DAG.getNode(Opcode, DL, VTs, Chain, CmpOp0, CmpOp1);
3138   }
3139   return DAG.getNode(Opcode, DL, VT, CmpOp0, CmpOp1);
3140 }
3141 
3142 // Lower a vector comparison of type CC between CmpOp0 and CmpOp1, producing
3143 // an integer mask of type VT.  If Chain is nonnull, we have a strict
3144 // floating-point comparison.  If in addition IsSignaling is true, we have
3145 // a strict signaling floating-point comparison.
3146 SDValue SystemZTargetLowering::lowerVectorSETCC(SelectionDAG &DAG,
3147                                                 const SDLoc &DL, EVT VT,
3148                                                 ISD::CondCode CC,
3149                                                 SDValue CmpOp0,
3150                                                 SDValue CmpOp1,
3151                                                 SDValue Chain,
3152                                                 bool IsSignaling) const {
3153   bool IsFP = CmpOp0.getValueType().isFloatingPoint();
3154   assert (!Chain || IsFP);
3155   assert (!IsSignaling || Chain);
3156   CmpMode Mode = IsSignaling ? CmpMode::SignalingFP :
3157                  Chain ? CmpMode::StrictFP : IsFP ? CmpMode::FP : CmpMode::Int;
3158   bool Invert = false;
3159   SDValue Cmp;
3160   switch (CC) {
3161     // Handle tests for order using (or (ogt y x) (oge x y)).
3162   case ISD::SETUO:
3163     Invert = true;
3164     [[fallthrough]];
3165   case ISD::SETO: {
3166     assert(IsFP && "Unexpected integer comparison");
3167     SDValue LT = getVectorCmp(DAG, getVectorComparison(ISD::SETOGT, Mode),
3168                               DL, VT, CmpOp1, CmpOp0, Chain);
3169     SDValue GE = getVectorCmp(DAG, getVectorComparison(ISD::SETOGE, Mode),
3170                               DL, VT, CmpOp0, CmpOp1, Chain);
3171     Cmp = DAG.getNode(ISD::OR, DL, VT, LT, GE);
3172     if (Chain)
3173       Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
3174                           LT.getValue(1), GE.getValue(1));
3175     break;
3176   }
3177 
3178     // Handle <> tests using (or (ogt y x) (ogt x y)).
3179   case ISD::SETUEQ:
3180     Invert = true;
3181     [[fallthrough]];
3182   case ISD::SETONE: {
3183     assert(IsFP && "Unexpected integer comparison");
3184     SDValue LT = getVectorCmp(DAG, getVectorComparison(ISD::SETOGT, Mode),
3185                               DL, VT, CmpOp1, CmpOp0, Chain);
3186     SDValue GT = getVectorCmp(DAG, getVectorComparison(ISD::SETOGT, Mode),
3187                               DL, VT, CmpOp0, CmpOp1, Chain);
3188     Cmp = DAG.getNode(ISD::OR, DL, VT, LT, GT);
3189     if (Chain)
3190       Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
3191                           LT.getValue(1), GT.getValue(1));
3192     break;
3193   }
3194 
3195     // Otherwise a single comparison is enough.  It doesn't really
3196     // matter whether we try the inversion or the swap first, since
3197     // there are no cases where both work.
3198   default:
3199     if (unsigned Opcode = getVectorComparisonOrInvert(CC, Mode, Invert))
3200       Cmp = getVectorCmp(DAG, Opcode, DL, VT, CmpOp0, CmpOp1, Chain);
3201     else {
3202       CC = ISD::getSetCCSwappedOperands(CC);
3203       if (unsigned Opcode = getVectorComparisonOrInvert(CC, Mode, Invert))
3204         Cmp = getVectorCmp(DAG, Opcode, DL, VT, CmpOp1, CmpOp0, Chain);
3205       else
3206         llvm_unreachable("Unhandled comparison");
3207     }
3208     if (Chain)
3209       Chain = Cmp.getValue(1);
3210     break;
3211   }
3212   if (Invert) {
3213     SDValue Mask =
3214       DAG.getSplatBuildVector(VT, DL, DAG.getConstant(-1, DL, MVT::i64));
3215     Cmp = DAG.getNode(ISD::XOR, DL, VT, Cmp, Mask);
3216   }
3217   if (Chain && Chain.getNode() != Cmp.getNode()) {
3218     SDValue Ops[2] = { Cmp, Chain };
3219     Cmp = DAG.getMergeValues(Ops, DL);
3220   }
3221   return Cmp;
3222 }
3223 
3224 SDValue SystemZTargetLowering::lowerSETCC(SDValue Op,
3225                                           SelectionDAG &DAG) const {
3226   SDValue CmpOp0   = Op.getOperand(0);
3227   SDValue CmpOp1   = Op.getOperand(1);
3228   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
3229   SDLoc DL(Op);
3230   EVT VT = Op.getValueType();
3231   if (VT.isVector())
3232     return lowerVectorSETCC(DAG, DL, VT, CC, CmpOp0, CmpOp1);
3233 
3234   Comparison C(getCmp(DAG, CmpOp0, CmpOp1, CC, DL));
3235   SDValue CCReg = emitCmp(DAG, DL, C);
3236   return emitSETCC(DAG, DL, CCReg, C.CCValid, C.CCMask);
3237 }
3238 
3239 SDValue SystemZTargetLowering::lowerSTRICT_FSETCC(SDValue Op,
3240                                                   SelectionDAG &DAG,
3241                                                   bool IsSignaling) const {
3242   SDValue Chain    = Op.getOperand(0);
3243   SDValue CmpOp0   = Op.getOperand(1);
3244   SDValue CmpOp1   = Op.getOperand(2);
3245   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get();
3246   SDLoc DL(Op);
3247   EVT VT = Op.getNode()->getValueType(0);
3248   if (VT.isVector()) {
3249     SDValue Res = lowerVectorSETCC(DAG, DL, VT, CC, CmpOp0, CmpOp1,
3250                                    Chain, IsSignaling);
3251     return Res.getValue(Op.getResNo());
3252   }
3253 
3254   Comparison C(getCmp(DAG, CmpOp0, CmpOp1, CC, DL, Chain, IsSignaling));
3255   SDValue CCReg = emitCmp(DAG, DL, C);
3256   CCReg->setFlags(Op->getFlags());
3257   SDValue Result = emitSETCC(DAG, DL, CCReg, C.CCValid, C.CCMask);
3258   SDValue Ops[2] = { Result, CCReg.getValue(1) };
3259   return DAG.getMergeValues(Ops, DL);
3260 }
3261 
3262 SDValue SystemZTargetLowering::lowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
3263   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
3264   SDValue CmpOp0   = Op.getOperand(2);
3265   SDValue CmpOp1   = Op.getOperand(3);
3266   SDValue Dest     = Op.getOperand(4);
3267   SDLoc DL(Op);
3268 
3269   Comparison C(getCmp(DAG, CmpOp0, CmpOp1, CC, DL));
3270   SDValue CCReg = emitCmp(DAG, DL, C);
3271   return DAG.getNode(
3272       SystemZISD::BR_CCMASK, DL, Op.getValueType(), Op.getOperand(0),
3273       DAG.getTargetConstant(C.CCValid, DL, MVT::i32),
3274       DAG.getTargetConstant(C.CCMask, DL, MVT::i32), Dest, CCReg);
3275 }
3276 
3277 // Return true if Pos is CmpOp and Neg is the negative of CmpOp,
3278 // allowing Pos and Neg to be wider than CmpOp.
3279 static bool isAbsolute(SDValue CmpOp, SDValue Pos, SDValue Neg) {
3280   return (Neg.getOpcode() == ISD::SUB &&
3281           Neg.getOperand(0).getOpcode() == ISD::Constant &&
3282           cast<ConstantSDNode>(Neg.getOperand(0))->getZExtValue() == 0 &&
3283           Neg.getOperand(1) == Pos &&
3284           (Pos == CmpOp ||
3285            (Pos.getOpcode() == ISD::SIGN_EXTEND &&
3286             Pos.getOperand(0) == CmpOp)));
3287 }
3288 
3289 // Return the absolute or negative absolute of Op; IsNegative decides which.
3290 static SDValue getAbsolute(SelectionDAG &DAG, const SDLoc &DL, SDValue Op,
3291                            bool IsNegative) {
3292   Op = DAG.getNode(ISD::ABS, DL, Op.getValueType(), Op);
3293   if (IsNegative)
3294     Op = DAG.getNode(ISD::SUB, DL, Op.getValueType(),
3295                      DAG.getConstant(0, DL, Op.getValueType()), Op);
3296   return Op;
3297 }
3298 
3299 SDValue SystemZTargetLowering::lowerSELECT_CC(SDValue Op,
3300                                               SelectionDAG &DAG) const {
3301   SDValue CmpOp0   = Op.getOperand(0);
3302   SDValue CmpOp1   = Op.getOperand(1);
3303   SDValue TrueOp   = Op.getOperand(2);
3304   SDValue FalseOp  = Op.getOperand(3);
3305   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
3306   SDLoc DL(Op);
3307 
3308   Comparison C(getCmp(DAG, CmpOp0, CmpOp1, CC, DL));
3309 
3310   // Check for absolute and negative-absolute selections, including those
3311   // where the comparison value is sign-extended (for LPGFR and LNGFR).
3312   // This check supplements the one in DAGCombiner.
3313   if (C.Opcode == SystemZISD::ICMP &&
3314       C.CCMask != SystemZ::CCMASK_CMP_EQ &&
3315       C.CCMask != SystemZ::CCMASK_CMP_NE &&
3316       C.Op1.getOpcode() == ISD::Constant &&
3317       cast<ConstantSDNode>(C.Op1)->getZExtValue() == 0) {
3318     if (isAbsolute(C.Op0, TrueOp, FalseOp))
3319       return getAbsolute(DAG, DL, TrueOp, C.CCMask & SystemZ::CCMASK_CMP_LT);
3320     if (isAbsolute(C.Op0, FalseOp, TrueOp))
3321       return getAbsolute(DAG, DL, FalseOp, C.CCMask & SystemZ::CCMASK_CMP_GT);
3322   }
3323 
3324   SDValue CCReg = emitCmp(DAG, DL, C);
3325   SDValue Ops[] = {TrueOp, FalseOp,
3326                    DAG.getTargetConstant(C.CCValid, DL, MVT::i32),
3327                    DAG.getTargetConstant(C.CCMask, DL, MVT::i32), CCReg};
3328 
3329   return DAG.getNode(SystemZISD::SELECT_CCMASK, DL, Op.getValueType(), Ops);
3330 }
3331 
3332 SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node,
3333                                                   SelectionDAG &DAG) const {
3334   SDLoc DL(Node);
3335   const GlobalValue *GV = Node->getGlobal();
3336   int64_t Offset = Node->getOffset();
3337   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3338   CodeModel::Model CM = DAG.getTarget().getCodeModel();
3339 
3340   SDValue Result;
3341   if (Subtarget.isPC32DBLSymbol(GV, CM)) {
3342     if (isInt<32>(Offset)) {
3343       // Assign anchors at 1<<12 byte boundaries.
3344       uint64_t Anchor = Offset & ~uint64_t(0xfff);
3345       Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT, Anchor);
3346       Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
3347 
3348       // The offset can be folded into the address if it is aligned to a
3349       // halfword.
3350       Offset -= Anchor;
3351       if (Offset != 0 && (Offset & 1) == 0) {
3352         SDValue Full =
3353           DAG.getTargetGlobalAddress(GV, DL, PtrVT, Anchor + Offset);
3354         Result = DAG.getNode(SystemZISD::PCREL_OFFSET, DL, PtrVT, Full, Result);
3355         Offset = 0;
3356       }
3357     } else {
3358       // Conservatively load a constant offset greater than 32 bits into a
3359       // register below.
3360       Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT);
3361       Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
3362     }
3363   } else if (Subtarget.isTargetELF()) {
3364     Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, SystemZII::MO_GOT);
3365     Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
3366     Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
3367                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3368   } else if (Subtarget.isTargetzOS()) {
3369     Result = getADAEntry(DAG, GV, DL, PtrVT);
3370   } else
3371     llvm_unreachable("Unexpected Subtarget");
3372 
3373   // If there was a non-zero offset that we didn't fold, create an explicit
3374   // addition for it.
3375   if (Offset != 0)
3376     Result = DAG.getNode(ISD::ADD, DL, PtrVT, Result,
3377                          DAG.getConstant(Offset, DL, PtrVT));
3378 
3379   return Result;
3380 }
3381 
3382 SDValue SystemZTargetLowering::lowerTLSGetOffset(GlobalAddressSDNode *Node,
3383                                                  SelectionDAG &DAG,
3384                                                  unsigned Opcode,
3385                                                  SDValue GOTOffset) const {
3386   SDLoc DL(Node);
3387   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3388   SDValue Chain = DAG.getEntryNode();
3389   SDValue Glue;
3390 
3391   if (DAG.getMachineFunction().getFunction().getCallingConv() ==
3392       CallingConv::GHC)
3393     report_fatal_error("In GHC calling convention TLS is not supported");
3394 
3395   // __tls_get_offset takes the GOT offset in %r2 and the GOT in %r12.
3396   SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT);
3397   Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R12D, GOT, Glue);
3398   Glue = Chain.getValue(1);
3399   Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R2D, GOTOffset, Glue);
3400   Glue = Chain.getValue(1);
3401 
3402   // The first call operand is the chain and the second is the TLS symbol.
3403   SmallVector<SDValue, 8> Ops;
3404   Ops.push_back(Chain);
3405   Ops.push_back(DAG.getTargetGlobalAddress(Node->getGlobal(), DL,
3406                                            Node->getValueType(0),
3407                                            0, 0));
3408 
3409   // Add argument registers to the end of the list so that they are
3410   // known live into the call.
3411   Ops.push_back(DAG.getRegister(SystemZ::R2D, PtrVT));
3412   Ops.push_back(DAG.getRegister(SystemZ::R12D, PtrVT));
3413 
3414   // Add a register mask operand representing the call-preserved registers.
3415   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3416   const uint32_t *Mask =
3417       TRI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C);
3418   assert(Mask && "Missing call preserved mask for calling convention");
3419   Ops.push_back(DAG.getRegisterMask(Mask));
3420 
3421   // Glue the call to the argument copies.
3422   Ops.push_back(Glue);
3423 
3424   // Emit the call.
3425   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3426   Chain = DAG.getNode(Opcode, DL, NodeTys, Ops);
3427   Glue = Chain.getValue(1);
3428 
3429   // Copy the return value from %r2.
3430   return DAG.getCopyFromReg(Chain, DL, SystemZ::R2D, PtrVT, Glue);
3431 }
3432 
3433 SDValue SystemZTargetLowering::lowerThreadPointer(const SDLoc &DL,
3434                                                   SelectionDAG &DAG) const {
3435   SDValue Chain = DAG.getEntryNode();
3436   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3437 
3438   // The high part of the thread pointer is in access register 0.
3439   SDValue TPHi = DAG.getCopyFromReg(Chain, DL, SystemZ::A0, MVT::i32);
3440   TPHi = DAG.getNode(ISD::ANY_EXTEND, DL, PtrVT, TPHi);
3441 
3442   // The low part of the thread pointer is in access register 1.
3443   SDValue TPLo = DAG.getCopyFromReg(Chain, DL, SystemZ::A1, MVT::i32);
3444   TPLo = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TPLo);
3445 
3446   // Merge them into a single 64-bit address.
3447   SDValue TPHiShifted = DAG.getNode(ISD::SHL, DL, PtrVT, TPHi,
3448                                     DAG.getConstant(32, DL, PtrVT));
3449   return DAG.getNode(ISD::OR, DL, PtrVT, TPHiShifted, TPLo);
3450 }
3451 
3452 SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
3453                                                      SelectionDAG &DAG) const {
3454   if (DAG.getTarget().useEmulatedTLS())
3455     return LowerToTLSEmulatedModel(Node, DAG);
3456   SDLoc DL(Node);
3457   const GlobalValue *GV = Node->getGlobal();
3458   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3459   TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
3460 
3461   if (DAG.getMachineFunction().getFunction().getCallingConv() ==
3462       CallingConv::GHC)
3463     report_fatal_error("In GHC calling convention TLS is not supported");
3464 
3465   SDValue TP = lowerThreadPointer(DL, DAG);
3466 
3467   // Get the offset of GA from the thread pointer, based on the TLS model.
3468   SDValue Offset;
3469   switch (model) {
3470     case TLSModel::GeneralDynamic: {
3471       // Load the GOT offset of the tls_index (module ID / per-symbol offset).
3472       SystemZConstantPoolValue *CPV =
3473         SystemZConstantPoolValue::Create(GV, SystemZCP::TLSGD);
3474 
3475       Offset = DAG.getConstantPool(CPV, PtrVT, Align(8));
3476       Offset = DAG.getLoad(
3477           PtrVT, DL, DAG.getEntryNode(), Offset,
3478           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3479 
3480       // Call __tls_get_offset to retrieve the offset.
3481       Offset = lowerTLSGetOffset(Node, DAG, SystemZISD::TLS_GDCALL, Offset);
3482       break;
3483     }
3484 
3485     case TLSModel::LocalDynamic: {
3486       // Load the GOT offset of the module ID.
3487       SystemZConstantPoolValue *CPV =
3488         SystemZConstantPoolValue::Create(GV, SystemZCP::TLSLDM);
3489 
3490       Offset = DAG.getConstantPool(CPV, PtrVT, Align(8));
3491       Offset = DAG.getLoad(
3492           PtrVT, DL, DAG.getEntryNode(), Offset,
3493           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3494 
3495       // Call __tls_get_offset to retrieve the module base offset.
3496       Offset = lowerTLSGetOffset(Node, DAG, SystemZISD::TLS_LDCALL, Offset);
3497 
3498       // Note: The SystemZLDCleanupPass will remove redundant computations
3499       // of the module base offset.  Count total number of local-dynamic
3500       // accesses to trigger execution of that pass.
3501       SystemZMachineFunctionInfo* MFI =
3502         DAG.getMachineFunction().getInfo<SystemZMachineFunctionInfo>();
3503       MFI->incNumLocalDynamicTLSAccesses();
3504 
3505       // Add the per-symbol offset.
3506       CPV = SystemZConstantPoolValue::Create(GV, SystemZCP::DTPOFF);
3507 
3508       SDValue DTPOffset = DAG.getConstantPool(CPV, PtrVT, Align(8));
3509       DTPOffset = DAG.getLoad(
3510           PtrVT, DL, DAG.getEntryNode(), DTPOffset,
3511           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3512 
3513       Offset = DAG.getNode(ISD::ADD, DL, PtrVT, Offset, DTPOffset);
3514       break;
3515     }
3516 
3517     case TLSModel::InitialExec: {
3518       // Load the offset from the GOT.
3519       Offset = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
3520                                           SystemZII::MO_INDNTPOFF);
3521       Offset = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Offset);
3522       Offset =
3523           DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Offset,
3524                       MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3525       break;
3526     }
3527 
3528     case TLSModel::LocalExec: {
3529       // Force the offset into the constant pool and load it from there.
3530       SystemZConstantPoolValue *CPV =
3531         SystemZConstantPoolValue::Create(GV, SystemZCP::NTPOFF);
3532 
3533       Offset = DAG.getConstantPool(CPV, PtrVT, Align(8));
3534       Offset = DAG.getLoad(
3535           PtrVT, DL, DAG.getEntryNode(), Offset,
3536           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3537       break;
3538     }
3539   }
3540 
3541   // Add the base and offset together.
3542   return DAG.getNode(ISD::ADD, DL, PtrVT, TP, Offset);
3543 }
3544 
3545 SDValue SystemZTargetLowering::lowerBlockAddress(BlockAddressSDNode *Node,
3546                                                  SelectionDAG &DAG) const {
3547   SDLoc DL(Node);
3548   const BlockAddress *BA = Node->getBlockAddress();
3549   int64_t Offset = Node->getOffset();
3550   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3551 
3552   SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset);
3553   Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
3554   return Result;
3555 }
3556 
3557 SDValue SystemZTargetLowering::lowerJumpTable(JumpTableSDNode *JT,
3558                                               SelectionDAG &DAG) const {
3559   SDLoc DL(JT);
3560   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3561   SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
3562 
3563   // Use LARL to load the address of the table.
3564   return DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
3565 }
3566 
3567 SDValue SystemZTargetLowering::lowerConstantPool(ConstantPoolSDNode *CP,
3568                                                  SelectionDAG &DAG) const {
3569   SDLoc DL(CP);
3570   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3571 
3572   SDValue Result;
3573   if (CP->isMachineConstantPoolEntry())
3574     Result =
3575         DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CP->getAlign());
3576   else
3577     Result = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlign(),
3578                                        CP->getOffset());
3579 
3580   // Use LARL to load the address of the constant pool entry.
3581   return DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
3582 }
3583 
3584 SDValue SystemZTargetLowering::lowerFRAMEADDR(SDValue Op,
3585                                               SelectionDAG &DAG) const {
3586   auto *TFL = Subtarget.getFrameLowering<SystemZELFFrameLowering>();
3587   MachineFunction &MF = DAG.getMachineFunction();
3588   MachineFrameInfo &MFI = MF.getFrameInfo();
3589   MFI.setFrameAddressIsTaken(true);
3590 
3591   SDLoc DL(Op);
3592   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
3593   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3594 
3595   // By definition, the frame address is the address of the back chain.  (In
3596   // the case of packed stack without backchain, return the address where the
3597   // backchain would have been stored. This will either be an unused space or
3598   // contain a saved register).
3599   int BackChainIdx = TFL->getOrCreateFramePointerSaveIndex(MF);
3600   SDValue BackChain = DAG.getFrameIndex(BackChainIdx, PtrVT);
3601 
3602   // FIXME The frontend should detect this case.
3603   if (Depth > 0) {
3604     report_fatal_error("Unsupported stack frame traversal count");
3605   }
3606 
3607   return BackChain;
3608 }
3609 
3610 SDValue SystemZTargetLowering::lowerRETURNADDR(SDValue Op,
3611                                                SelectionDAG &DAG) const {
3612   MachineFunction &MF = DAG.getMachineFunction();
3613   MachineFrameInfo &MFI = MF.getFrameInfo();
3614   MFI.setReturnAddressIsTaken(true);
3615 
3616   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
3617     return SDValue();
3618 
3619   SDLoc DL(Op);
3620   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
3621   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3622 
3623   // FIXME The frontend should detect this case.
3624   if (Depth > 0) {
3625     report_fatal_error("Unsupported stack frame traversal count");
3626   }
3627 
3628   // Return R14D, which has the return address. Mark it an implicit live-in.
3629   Register LinkReg = MF.addLiveIn(SystemZ::R14D, &SystemZ::GR64BitRegClass);
3630   return DAG.getCopyFromReg(DAG.getEntryNode(), DL, LinkReg, PtrVT);
3631 }
3632 
3633 SDValue SystemZTargetLowering::lowerBITCAST(SDValue Op,
3634                                             SelectionDAG &DAG) const {
3635   SDLoc DL(Op);
3636   SDValue In = Op.getOperand(0);
3637   EVT InVT = In.getValueType();
3638   EVT ResVT = Op.getValueType();
3639 
3640   // Convert loads directly.  This is normally done by DAGCombiner,
3641   // but we need this case for bitcasts that are created during lowering
3642   // and which are then lowered themselves.
3643   if (auto *LoadN = dyn_cast<LoadSDNode>(In))
3644     if (ISD::isNormalLoad(LoadN)) {
3645       SDValue NewLoad = DAG.getLoad(ResVT, DL, LoadN->getChain(),
3646                                     LoadN->getBasePtr(), LoadN->getMemOperand());
3647       // Update the chain uses.
3648       DAG.ReplaceAllUsesOfValueWith(SDValue(LoadN, 1), NewLoad.getValue(1));
3649       return NewLoad;
3650     }
3651 
3652   if (InVT == MVT::i32 && ResVT == MVT::f32) {
3653     SDValue In64;
3654     if (Subtarget.hasHighWord()) {
3655       SDNode *U64 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL,
3656                                        MVT::i64);
3657       In64 = DAG.getTargetInsertSubreg(SystemZ::subreg_h32, DL,
3658                                        MVT::i64, SDValue(U64, 0), In);
3659     } else {
3660       In64 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, In);
3661       In64 = DAG.getNode(ISD::SHL, DL, MVT::i64, In64,
3662                          DAG.getConstant(32, DL, MVT::i64));
3663     }
3664     SDValue Out64 = DAG.getNode(ISD::BITCAST, DL, MVT::f64, In64);
3665     return DAG.getTargetExtractSubreg(SystemZ::subreg_h32,
3666                                       DL, MVT::f32, Out64);
3667   }
3668   if (InVT == MVT::f32 && ResVT == MVT::i32) {
3669     SDNode *U64 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::f64);
3670     SDValue In64 = DAG.getTargetInsertSubreg(SystemZ::subreg_h32, DL,
3671                                              MVT::f64, SDValue(U64, 0), In);
3672     SDValue Out64 = DAG.getNode(ISD::BITCAST, DL, MVT::i64, In64);
3673     if (Subtarget.hasHighWord())
3674       return DAG.getTargetExtractSubreg(SystemZ::subreg_h32, DL,
3675                                         MVT::i32, Out64);
3676     SDValue Shift = DAG.getNode(ISD::SRL, DL, MVT::i64, Out64,
3677                                 DAG.getConstant(32, DL, MVT::i64));
3678     return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Shift);
3679   }
3680   llvm_unreachable("Unexpected bitcast combination");
3681 }
3682 
3683 SDValue SystemZTargetLowering::lowerVASTART(SDValue Op,
3684                                             SelectionDAG &DAG) const {
3685 
3686   if (Subtarget.isTargetXPLINK64())
3687     return lowerVASTART_XPLINK(Op, DAG);
3688   else
3689     return lowerVASTART_ELF(Op, DAG);
3690 }
3691 
3692 SDValue SystemZTargetLowering::lowerVASTART_XPLINK(SDValue Op,
3693                                                    SelectionDAG &DAG) const {
3694   MachineFunction &MF = DAG.getMachineFunction();
3695   SystemZMachineFunctionInfo *FuncInfo =
3696       MF.getInfo<SystemZMachineFunctionInfo>();
3697 
3698   SDLoc DL(Op);
3699 
3700   // vastart just stores the address of the VarArgsFrameIndex slot into the
3701   // memory location argument.
3702   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3703   SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
3704   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
3705   return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
3706                       MachinePointerInfo(SV));
3707 }
3708 
3709 SDValue SystemZTargetLowering::lowerVASTART_ELF(SDValue Op,
3710                                                 SelectionDAG &DAG) const {
3711   MachineFunction &MF = DAG.getMachineFunction();
3712   SystemZMachineFunctionInfo *FuncInfo =
3713     MF.getInfo<SystemZMachineFunctionInfo>();
3714   EVT PtrVT = getPointerTy(DAG.getDataLayout());
3715 
3716   SDValue Chain   = Op.getOperand(0);
3717   SDValue Addr    = Op.getOperand(1);
3718   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
3719   SDLoc DL(Op);
3720 
3721   // The initial values of each field.
3722   const unsigned NumFields = 4;
3723   SDValue Fields[NumFields] = {
3724     DAG.getConstant(FuncInfo->getVarArgsFirstGPR(), DL, PtrVT),
3725     DAG.getConstant(FuncInfo->getVarArgsFirstFPR(), DL, PtrVT),
3726     DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT),
3727     DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT)
3728   };
3729 
3730   // Store each field into its respective slot.
3731   SDValue MemOps[NumFields];
3732   unsigned Offset = 0;
3733   for (unsigned I = 0; I < NumFields; ++I) {
3734     SDValue FieldAddr = Addr;
3735     if (Offset != 0)
3736       FieldAddr = DAG.getNode(ISD::ADD, DL, PtrVT, FieldAddr,
3737                               DAG.getIntPtrConstant(Offset, DL));
3738     MemOps[I] = DAG.getStore(Chain, DL, Fields[I], FieldAddr,
3739                              MachinePointerInfo(SV, Offset));
3740     Offset += 8;
3741   }
3742   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
3743 }
3744 
3745 SDValue SystemZTargetLowering::lowerVACOPY(SDValue Op,
3746                                            SelectionDAG &DAG) const {
3747   SDValue Chain      = Op.getOperand(0);
3748   SDValue DstPtr     = Op.getOperand(1);
3749   SDValue SrcPtr     = Op.getOperand(2);
3750   const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
3751   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
3752   SDLoc DL(Op);
3753 
3754   uint32_t Sz =
3755       Subtarget.isTargetXPLINK64() ? getTargetMachine().getPointerSize(0) : 32;
3756   return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(Sz, DL),
3757                        Align(8), /*isVolatile*/ false, /*AlwaysInline*/ false,
3758                        /*isTailCall*/ false, MachinePointerInfo(DstSV),
3759                        MachinePointerInfo(SrcSV));
3760 }
3761 
3762 SDValue
3763 SystemZTargetLowering::lowerDYNAMIC_STACKALLOC(SDValue Op,
3764                                                SelectionDAG &DAG) const {
3765   if (Subtarget.isTargetXPLINK64())
3766     return lowerDYNAMIC_STACKALLOC_XPLINK(Op, DAG);
3767   else
3768     return lowerDYNAMIC_STACKALLOC_ELF(Op, DAG);
3769 }
3770 
// XPLINK64 flavour of DYNAMIC_STACKALLOC.  Operands of Op: 0 = chain,
// 1 = requested size, 2 = requested alignment (a constant).  The allocation
// itself is done by a call to the external routine "@@ALCAXP"; the result
// address is then derived from the stack pointer register read back after
// the call.  Returns the merged pair {allocated address, output chain}.
SDValue
SystemZTargetLowering::lowerDYNAMIC_STACKALLOC_XPLINK(SDValue Op,
                                                      SelectionDAG &DAG) const {
  const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
  MachineFunction &MF = DAG.getMachineFunction();
  bool RealignOpt = !MF.getFunction().hasFnAttribute("no-realign-stack");
  SDValue Chain = Op.getOperand(0);
  SDValue Size = Op.getOperand(1);
  SDValue Align = Op.getOperand(2);
  SDLoc DL(Op);

  // If user has set the no alignment function attribute, ignore
  // alloca alignments.
  uint64_t AlignVal =
      (RealignOpt ? cast<ConstantSDNode>(Align)->getZExtValue() : 0);

  // ExtraAlignSpace is non-zero only when the requested alignment exceeds the
  // stack alignment; it is the slack needed to round the result up later.
  uint64_t StackAlign = TFI->getStackAlignment();
  uint64_t RequiredAlign = std::max(AlignVal, StackAlign);
  uint64_t ExtraAlignSpace = RequiredAlign - StackAlign;

  SDValue NeededSpace = Size;

  // Add extra space for alignment if needed.
  EVT PtrVT = getPointerTy(MF.getDataLayout());
  if (ExtraAlignSpace)
    NeededSpace = DAG.getNode(ISD::ADD, DL, PtrVT, NeededSpace,
                              DAG.getConstant(ExtraAlignSpace, DL, PtrVT));

  // Call the external allocation routine with the padded size as its only
  // argument.  Only the first element of the returned pair is kept; the
  // chain and glue are taken from its result values 1 and 2 below.
  bool IsSigned = false;
  bool DoesNotReturn = false;
  bool IsReturnValueUsed = false;
  EVT VT = Op.getValueType();
  SDValue AllocaCall =
      makeExternalCall(Chain, DAG, "@@ALCAXP", VT, ArrayRef(NeededSpace),
                       CallingConv::C, IsSigned, DL, DoesNotReturn,
                       IsReturnValueUsed)
          .first;

  // Perform a CopyFromReg from %GPR4 (stack pointer register). Chain and Glue
  // to end of call in order to ensure it isn't broken up from the call
  // sequence.
  auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>();
  Register SPReg = Regs.getStackPointerRegister();
  Chain = AllocaCall.getValue(1);
  SDValue Glue = AllocaCall.getValue(2);
  SDValue NewSPRegNode = DAG.getCopyFromReg(Chain, DL, SPReg, PtrVT, Glue);
  Chain = NewSPRegNode.getValue(1);

  // The result is the new stack pointer plus an ADJDYNALLOC placeholder node.
  // NOTE(review): these two nodes use the pointer *memory* type (PtrMVT) while
  // the rest of the function uses PtrVT -- presumably identical here; confirm.
  MVT PtrMVT = getPointerMemTy(MF.getDataLayout());
  SDValue ArgAdjust = DAG.getNode(SystemZISD::ADJDYNALLOC, DL, PtrMVT);
  SDValue Result = DAG.getNode(ISD::ADD, DL, PtrMVT, NewSPRegNode, ArgAdjust);

  // Dynamically realign if needed.
  if (ExtraAlignSpace) {
    // Round up: add the slack, then clear the low bits below RequiredAlign.
    Result = DAG.getNode(ISD::ADD, DL, PtrVT, Result,
                         DAG.getConstant(ExtraAlignSpace, DL, PtrVT));
    Result = DAG.getNode(ISD::AND, DL, PtrVT, Result,
                         DAG.getConstant(~(RequiredAlign - 1), DL, PtrVT));
  }

  SDValue Ops[2] = {Result, Chain};
  return DAG.getMergeValues(Ops, DL);
}
3834 
// ELF flavour of DYNAMIC_STACKALLOC.  Operands of Op: 0 = chain,
// 1 = requested size, 2 = requested alignment (a constant).  The stack
// pointer is lowered by the (possibly padded) size, either through a
// PROBED_ALLOCA node or a plain subtraction, and the backchain value is
// carried over to the new stack pointer when the function has the
// "backchain" attribute.  Returns the merged pair {allocated address,
// output chain}.
SDValue
SystemZTargetLowering::lowerDYNAMIC_STACKALLOC_ELF(SDValue Op,
                                                   SelectionDAG &DAG) const {
  const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
  MachineFunction &MF = DAG.getMachineFunction();
  bool RealignOpt = !MF.getFunction().hasFnAttribute("no-realign-stack");
  bool StoreBackchain = MF.getFunction().hasFnAttribute("backchain");

  SDValue Chain = Op.getOperand(0);
  SDValue Size  = Op.getOperand(1);
  SDValue Align = Op.getOperand(2);
  SDLoc DL(Op);

  // If user has set the no alignment function attribute, ignore
  // alloca alignments.
  uint64_t AlignVal =
      (RealignOpt ? cast<ConstantSDNode>(Align)->getZExtValue() : 0);

  // ExtraAlignSpace is non-zero only when the requested alignment exceeds the
  // stack alignment; it is the slack needed to round the result up later.
  uint64_t StackAlign = TFI->getStackAlignment();
  uint64_t RequiredAlign = std::max(AlignVal, StackAlign);
  uint64_t ExtraAlignSpace = RequiredAlign - StackAlign;

  Register SPReg = getStackPointerRegisterToSaveRestore();
  SDValue NeededSpace = Size;

  // Get a reference to the stack pointer.
  SDValue OldSP = DAG.getCopyFromReg(Chain, DL, SPReg, MVT::i64);

  // If we need a backchain, save it now.
  SDValue Backchain;
  if (StoreBackchain)
    Backchain = DAG.getLoad(MVT::i64, DL, Chain, getBackchainAddress(OldSP, DAG),
                            MachinePointerInfo());

  // Add extra space for alignment if needed.
  if (ExtraAlignSpace)
    NeededSpace = DAG.getNode(ISD::ADD, DL, MVT::i64, NeededSpace,
                              DAG.getConstant(ExtraAlignSpace, DL, MVT::i64));

  // Get the new stack pointer value.
  SDValue NewSP;
  if (hasInlineStackProbe(MF)) {
    // With inline stack probing, a PROBED_ALLOCA node yields both the new
    // stack pointer and the updated chain; no explicit CopyToReg is emitted
    // on this path.
    NewSP = DAG.getNode(SystemZISD::PROBED_ALLOCA, DL,
                DAG.getVTList(MVT::i64, MVT::Other), Chain, OldSP, NeededSpace);
    Chain = NewSP.getValue(1);
  }
  else {
    NewSP = DAG.getNode(ISD::SUB, DL, MVT::i64, OldSP, NeededSpace);
    // Copy the new stack pointer back.
    Chain = DAG.getCopyToReg(Chain, DL, SPReg, NewSP);
  }

  // The allocated data lives above the 160 bytes allocated for the standard
  // frame, plus any outgoing stack arguments.  We don't know how much that
  // amounts to yet, so emit a special ADJDYNALLOC placeholder.
  SDValue ArgAdjust = DAG.getNode(SystemZISD::ADJDYNALLOC, DL, MVT::i64);
  SDValue Result = DAG.getNode(ISD::ADD, DL, MVT::i64, NewSP, ArgAdjust);

  // Dynamically realign if needed.
  if (RequiredAlign > StackAlign) {
    // Round up: add the slack, then clear the low bits below RequiredAlign.
    Result =
      DAG.getNode(ISD::ADD, DL, MVT::i64, Result,
                  DAG.getConstant(ExtraAlignSpace, DL, MVT::i64));
    Result =
      DAG.getNode(ISD::AND, DL, MVT::i64, Result,
                  DAG.getConstant(~(RequiredAlign - 1), DL, MVT::i64));
  }

  // Re-store the backchain value loaded above at the backchain address
  // computed from the new stack pointer.
  if (StoreBackchain)
    Chain = DAG.getStore(Chain, DL, Backchain, getBackchainAddress(NewSP, DAG),
                         MachinePointerInfo());

  SDValue Ops[2] = { Result, Chain };
  return DAG.getMergeValues(Ops, DL);
}
3910 
3911 SDValue SystemZTargetLowering::lowerGET_DYNAMIC_AREA_OFFSET(
3912     SDValue Op, SelectionDAG &DAG) const {
3913   SDLoc DL(Op);
3914 
3915   return DAG.getNode(SystemZISD::ADJDYNALLOC, DL, MVT::i64);
3916 }
3917 
3918 SDValue SystemZTargetLowering::lowerSMUL_LOHI(SDValue Op,
3919                                               SelectionDAG &DAG) const {
3920   EVT VT = Op.getValueType();
3921   SDLoc DL(Op);
3922   SDValue Ops[2];
3923   if (is32Bit(VT))
3924     // Just do a normal 64-bit multiplication and extract the results.
3925     // We define this so that it can be used for constant division.
3926     lowerMUL_LOHI32(DAG, DL, ISD::SIGN_EXTEND, Op.getOperand(0),
3927                     Op.getOperand(1), Ops[1], Ops[0]);
3928   else if (Subtarget.hasMiscellaneousExtensions2())
3929     // SystemZISD::SMUL_LOHI returns the low result in the odd register and
3930     // the high result in the even register.  ISD::SMUL_LOHI is defined to
3931     // return the low half first, so the results are in reverse order.
3932     lowerGR128Binary(DAG, DL, VT, SystemZISD::SMUL_LOHI,
3933                      Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
3934   else {
3935     // Do a full 128-bit multiplication based on SystemZISD::UMUL_LOHI:
3936     //
3937     //   (ll * rl) + ((lh * rl) << 64) + ((ll * rh) << 64)
3938     //
3939     // but using the fact that the upper halves are either all zeros
3940     // or all ones:
3941     //
3942     //   (ll * rl) - ((lh & rl) << 64) - ((ll & rh) << 64)
3943     //
3944     // and grouping the right terms together since they are quicker than the
3945     // multiplication:
3946     //
3947     //   (ll * rl) - (((lh & rl) + (ll & rh)) << 64)
3948     SDValue C63 = DAG.getConstant(63, DL, MVT::i64);
3949     SDValue LL = Op.getOperand(0);
3950     SDValue RL = Op.getOperand(1);
3951     SDValue LH = DAG.getNode(ISD::SRA, DL, VT, LL, C63);
3952     SDValue RH = DAG.getNode(ISD::SRA, DL, VT, RL, C63);
3953     // SystemZISD::UMUL_LOHI returns the low result in the odd register and
3954     // the high result in the even register.  ISD::SMUL_LOHI is defined to
3955     // return the low half first, so the results are in reverse order.
3956     lowerGR128Binary(DAG, DL, VT, SystemZISD::UMUL_LOHI,
3957                      LL, RL, Ops[1], Ops[0]);
3958     SDValue NegLLTimesRH = DAG.getNode(ISD::AND, DL, VT, LL, RH);
3959     SDValue NegLHTimesRL = DAG.getNode(ISD::AND, DL, VT, LH, RL);
3960     SDValue NegSum = DAG.getNode(ISD::ADD, DL, VT, NegLLTimesRH, NegLHTimesRL);
3961     Ops[1] = DAG.getNode(ISD::SUB, DL, VT, Ops[1], NegSum);
3962   }
3963   return DAG.getMergeValues(Ops, DL);
3964 }
3965 
3966 SDValue SystemZTargetLowering::lowerUMUL_LOHI(SDValue Op,
3967                                               SelectionDAG &DAG) const {
3968   EVT VT = Op.getValueType();
3969   SDLoc DL(Op);
3970   SDValue Ops[2];
3971   if (is32Bit(VT))
3972     // Just do a normal 64-bit multiplication and extract the results.
3973     // We define this so that it can be used for constant division.
3974     lowerMUL_LOHI32(DAG, DL, ISD::ZERO_EXTEND, Op.getOperand(0),
3975                     Op.getOperand(1), Ops[1], Ops[0]);
3976   else
3977     // SystemZISD::UMUL_LOHI returns the low result in the odd register and
3978     // the high result in the even register.  ISD::UMUL_LOHI is defined to
3979     // return the low half first, so the results are in reverse order.
3980     lowerGR128Binary(DAG, DL, VT, SystemZISD::UMUL_LOHI,
3981                      Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
3982   return DAG.getMergeValues(Ops, DL);
3983 }
3984 
3985 SDValue SystemZTargetLowering::lowerSDIVREM(SDValue Op,
3986                                             SelectionDAG &DAG) const {
3987   SDValue Op0 = Op.getOperand(0);
3988   SDValue Op1 = Op.getOperand(1);
3989   EVT VT = Op.getValueType();
3990   SDLoc DL(Op);
3991 
3992   // We use DSGF for 32-bit division.  This means the first operand must
3993   // always be 64-bit, and the second operand should be 32-bit whenever
3994   // that is possible, to improve performance.
3995   if (is32Bit(VT))
3996     Op0 = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Op0);
3997   else if (DAG.ComputeNumSignBits(Op1) > 32)
3998     Op1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op1);
3999 
4000   // DSG(F) returns the remainder in the even register and the
4001   // quotient in the odd register.
4002   SDValue Ops[2];
4003   lowerGR128Binary(DAG, DL, VT, SystemZISD::SDIVREM, Op0, Op1, Ops[1], Ops[0]);
4004   return DAG.getMergeValues(Ops, DL);
4005 }
4006 
4007 SDValue SystemZTargetLowering::lowerUDIVREM(SDValue Op,
4008                                             SelectionDAG &DAG) const {
4009   EVT VT = Op.getValueType();
4010   SDLoc DL(Op);
4011 
4012   // DL(G) returns the remainder in the even register and the
4013   // quotient in the odd register.
4014   SDValue Ops[2];
4015   lowerGR128Binary(DAG, DL, VT, SystemZISD::UDIVREM,
4016                    Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
4017   return DAG.getMergeValues(Ops, DL);
4018 }
4019 
4020 SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const {
4021   assert(Op.getValueType() == MVT::i64 && "Should be 64-bit operation");
4022 
4023   // Get the known-zero masks for each operand.
4024   SDValue Ops[] = {Op.getOperand(0), Op.getOperand(1)};
4025   KnownBits Known[2] = {DAG.computeKnownBits(Ops[0]),
4026                         DAG.computeKnownBits(Ops[1])};
4027 
4028   // See if the upper 32 bits of one operand and the lower 32 bits of the
4029   // other are known zero.  They are the low and high operands respectively.
4030   uint64_t Masks[] = { Known[0].Zero.getZExtValue(),
4031                        Known[1].Zero.getZExtValue() };
4032   unsigned High, Low;
4033   if ((Masks[0] >> 32) == 0xffffffff && uint32_t(Masks[1]) == 0xffffffff)
4034     High = 1, Low = 0;
4035   else if ((Masks[1] >> 32) == 0xffffffff && uint32_t(Masks[0]) == 0xffffffff)
4036     High = 0, Low = 1;
4037   else
4038     return Op;
4039 
4040   SDValue LowOp = Ops[Low];
4041   SDValue HighOp = Ops[High];
4042 
4043   // If the high part is a constant, we're better off using IILH.
4044   if (HighOp.getOpcode() == ISD::Constant)
4045     return Op;
4046 
4047   // If the low part is a constant that is outside the range of LHI,
4048   // then we're better off using IILF.
4049   if (LowOp.getOpcode() == ISD::Constant) {
4050     int64_t Value = int32_t(cast<ConstantSDNode>(LowOp)->getZExtValue());
4051     if (!isInt<16>(Value))
4052       return Op;
4053   }
4054 
4055   // Check whether the high part is an AND that doesn't change the
4056   // high 32 bits and just masks out low bits.  We can skip it if so.
4057   if (HighOp.getOpcode() == ISD::AND &&
4058       HighOp.getOperand(1).getOpcode() == ISD::Constant) {
4059     SDValue HighOp0 = HighOp.getOperand(0);
4060     uint64_t Mask = cast<ConstantSDNode>(HighOp.getOperand(1))->getZExtValue();
4061     if (DAG.MaskedValueIsZero(HighOp0, APInt(64, ~(Mask | 0xffffffff))))
4062       HighOp = HighOp0;
4063   }
4064 
4065   // Take advantage of the fact that all GR32 operations only change the
4066   // low 32 bits by truncating Low to an i32 and inserting it directly
4067   // using a subreg.  The interesting cases are those where the truncation
4068   // can be folded.
4069   SDLoc DL(Op);
4070   SDValue Low32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, LowOp);
4071   return DAG.getTargetInsertSubreg(SystemZ::subreg_l32, DL,
4072                                    MVT::i64, HighOp, Low32);
4073 }
4074 
4075 // Lower SADDO/SSUBO/UADDO/USUBO nodes.
4076 SDValue SystemZTargetLowering::lowerXALUO(SDValue Op,
4077                                           SelectionDAG &DAG) const {
4078   SDNode *N = Op.getNode();
4079   SDValue LHS = N->getOperand(0);
4080   SDValue RHS = N->getOperand(1);
4081   SDLoc DL(N);
4082   unsigned BaseOp = 0;
4083   unsigned CCValid = 0;
4084   unsigned CCMask = 0;
4085 
4086   switch (Op.getOpcode()) {
4087   default: llvm_unreachable("Unknown instruction!");
4088   case ISD::SADDO:
4089     BaseOp = SystemZISD::SADDO;
4090     CCValid = SystemZ::CCMASK_ARITH;
4091     CCMask = SystemZ::CCMASK_ARITH_OVERFLOW;
4092     break;
4093   case ISD::SSUBO:
4094     BaseOp = SystemZISD::SSUBO;
4095     CCValid = SystemZ::CCMASK_ARITH;
4096     CCMask = SystemZ::CCMASK_ARITH_OVERFLOW;
4097     break;
4098   case ISD::UADDO:
4099     BaseOp = SystemZISD::UADDO;
4100     CCValid = SystemZ::CCMASK_LOGICAL;
4101     CCMask = SystemZ::CCMASK_LOGICAL_CARRY;
4102     break;
4103   case ISD::USUBO:
4104     BaseOp = SystemZISD::USUBO;
4105     CCValid = SystemZ::CCMASK_LOGICAL;
4106     CCMask = SystemZ::CCMASK_LOGICAL_BORROW;
4107     break;
4108   }
4109 
4110   SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
4111   SDValue Result = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
4112 
4113   SDValue SetCC = emitSETCC(DAG, DL, Result.getValue(1), CCValid, CCMask);
4114   if (N->getValueType(1) == MVT::i1)
4115     SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
4116 
4117   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, SetCC);
4118 }
4119 
4120 static bool isAddCarryChain(SDValue Carry) {
4121   while (Carry.getOpcode() == ISD::UADDO_CARRY)
4122     Carry = Carry.getOperand(2);
4123   return Carry.getOpcode() == ISD::UADDO;
4124 }
4125 
4126 static bool isSubBorrowChain(SDValue Carry) {
4127   while (Carry.getOpcode() == ISD::USUBO_CARRY)
4128     Carry = Carry.getOperand(2);
4129   return Carry.getOpcode() == ISD::USUBO;
4130 }
4131 
4132 // Lower UADDO_CARRY/USUBO_CARRY nodes.
4133 SDValue SystemZTargetLowering::lowerUADDSUBO_CARRY(SDValue Op,
4134                                                    SelectionDAG &DAG) const {
4135 
4136   SDNode *N = Op.getNode();
4137   MVT VT = N->getSimpleValueType(0);
4138 
4139   // Let legalize expand this if it isn't a legal type yet.
4140   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
4141     return SDValue();
4142 
4143   SDValue LHS = N->getOperand(0);
4144   SDValue RHS = N->getOperand(1);
4145   SDValue Carry = Op.getOperand(2);
4146   SDLoc DL(N);
4147   unsigned BaseOp = 0;
4148   unsigned CCValid = 0;
4149   unsigned CCMask = 0;
4150 
4151   switch (Op.getOpcode()) {
4152   default: llvm_unreachable("Unknown instruction!");
4153   case ISD::UADDO_CARRY:
4154     if (!isAddCarryChain(Carry))
4155       return SDValue();
4156 
4157     BaseOp = SystemZISD::ADDCARRY;
4158     CCValid = SystemZ::CCMASK_LOGICAL;
4159     CCMask = SystemZ::CCMASK_LOGICAL_CARRY;
4160     break;
4161   case ISD::USUBO_CARRY:
4162     if (!isSubBorrowChain(Carry))
4163       return SDValue();
4164 
4165     BaseOp = SystemZISD::SUBCARRY;
4166     CCValid = SystemZ::CCMASK_LOGICAL;
4167     CCMask = SystemZ::CCMASK_LOGICAL_BORROW;
4168     break;
4169   }
4170 
4171   // Set the condition code from the carry flag.
4172   Carry = DAG.getNode(SystemZISD::GET_CCMASK, DL, MVT::i32, Carry,
4173                       DAG.getConstant(CCValid, DL, MVT::i32),
4174                       DAG.getConstant(CCMask, DL, MVT::i32));
4175 
4176   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
4177   SDValue Result = DAG.getNode(BaseOp, DL, VTs, LHS, RHS, Carry);
4178 
4179   SDValue SetCC = emitSETCC(DAG, DL, Result.getValue(1), CCValid, CCMask);
4180   if (N->getValueType(1) == MVT::i1)
4181     SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
4182 
4183   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, SetCC);
4184 }
4185 
// Lower CTPOP.  Vector types start from a per-byte POPCNT on v16i8 and then
// combine the byte counts according to the element width.  Scalar types use
// the scalar POPCNT (per-byte counts) followed by a shift-and-add reduction
// that accumulates the total in the top byte of the significant part, which
// is finally shifted down.
SDValue SystemZTargetLowering::lowerCTPOP(SDValue Op,
                                          SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  // From here on Op is the value being counted, not the CTPOP node itself.
  Op = Op.getOperand(0);

  // Handle vector types via VPOPCT.
  if (VT.isVector()) {
    Op = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Op);
    Op = DAG.getNode(SystemZISD::POPCNT, DL, MVT::v16i8, Op);
    switch (VT.getScalarSizeInBits()) {
    case 8:
      // Byte elements: the per-byte counts are already the answer.
      break;
    case 16: {
      // Add the low byte's count into the high byte (shift left by 8 and
      // add), then shift the sum back down into the low byte.
      Op = DAG.getNode(ISD::BITCAST, DL, VT, Op);
      SDValue Shift = DAG.getConstant(8, DL, MVT::i32);
      SDValue Tmp = DAG.getNode(SystemZISD::VSHL_BY_SCALAR, DL, VT, Op, Shift);
      Op = DAG.getNode(ISD::ADD, DL, VT, Op, Tmp);
      Op = DAG.getNode(SystemZISD::VSRL_BY_SCALAR, DL, VT, Op, Shift);
      break;
    }
    case 32: {
      // VSUM against an all-zero vector; presumably this sums the byte
      // counts within each 32-bit element -- confirm against the VSUM
      // definition.
      SDValue Tmp = DAG.getSplatBuildVector(MVT::v16i8, DL,
                                            DAG.getConstant(0, DL, MVT::i32));
      Op = DAG.getNode(SystemZISD::VSUM, DL, VT, Op, Tmp);
      break;
    }
    case 64: {
      // Two VSUM steps: first to v4i32, then to the 64-bit element type.
      SDValue Tmp = DAG.getSplatBuildVector(MVT::v16i8, DL,
                                            DAG.getConstant(0, DL, MVT::i32));
      Op = DAG.getNode(SystemZISD::VSUM, DL, MVT::v4i32, Op, Tmp);
      Op = DAG.getNode(SystemZISD::VSUM, DL, VT, Op, Tmp);
      break;
    }
    default:
      llvm_unreachable("Unexpected type");
    }
    return Op;
  }

  // Get the known-zero mask for the operand.
  KnownBits Known = DAG.computeKnownBits(Op);
  // Number of bits needed to represent the largest value Op may take.
  unsigned NumSignificantBits = Known.getMaxValue().getActiveBits();
  if (NumSignificantBits == 0)
    return DAG.getConstant(0, DL, VT);

  // Skip known-zero high parts of the operand.
  // BitSize is NumSignificantBits rounded up to a power of two, capped at the
  // width of VT.
  int64_t OrigBitSize = VT.getSizeInBits();
  int64_t BitSize = llvm::bit_ceil(NumSignificantBits);
  BitSize = std::min(BitSize, OrigBitSize);

  // The POPCNT instruction counts the number of bits in each byte.
  Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op);
  Op = DAG.getNode(SystemZISD::POPCNT, DL, MVT::i64, Op);
  Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);

  // Add up per-byte counts in a binary tree.  All bits of Op at
  // position larger than BitSize remain zero throughout.
  for (int64_t I = BitSize / 2; I >= 8; I = I / 2) {
    SDValue Tmp = DAG.getNode(ISD::SHL, DL, VT, Op, DAG.getConstant(I, DL, VT));
    // When only part of the value is significant, mask the shifted copy so
    // nothing spills above bit BitSize.  This branch is only taken when
    // BitSize < OrigBitSize, so the shift below stays within 64 bits.
    if (BitSize != OrigBitSize)
      Tmp = DAG.getNode(ISD::AND, DL, VT, Tmp,
                        DAG.getConstant(((uint64_t)1 << BitSize) - 1, DL, VT));
    Op = DAG.getNode(ISD::ADD, DL, VT, Op, Tmp);
  }

  // Extract overall result from high byte.
  // (For BitSize <= 8 the loop above did not run and Op is returned as is.)
  if (BitSize > 8)
    Op = DAG.getNode(ISD::SRL, DL, VT, Op,
                     DAG.getConstant(BitSize - 8, DL, VT));

  return Op;
}
4259 
4260 SDValue SystemZTargetLowering::lowerATOMIC_FENCE(SDValue Op,
4261                                                  SelectionDAG &DAG) const {
4262   SDLoc DL(Op);
4263   AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
4264     cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
4265   SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
4266     cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
4267 
4268   // The only fence that needs an instruction is a sequentially-consistent
4269   // cross-thread fence.
4270   if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
4271       FenceSSID == SyncScope::System) {
4272     return SDValue(DAG.getMachineNode(SystemZ::Serialize, DL, MVT::Other,
4273                                       Op.getOperand(0)),
4274                    0);
4275   }
4276 
4277   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
4278   return DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0));
4279 }
4280 
4281 // Op is an atomic load.  Lower it into a normal volatile load.
4282 SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
4283                                                 SelectionDAG &DAG) const {
4284   auto *Node = cast<AtomicSDNode>(Op.getNode());
4285   return DAG.getExtLoad(ISD::EXTLOAD, SDLoc(Op), Op.getValueType(),
4286                         Node->getChain(), Node->getBasePtr(),
4287                         Node->getMemoryVT(), Node->getMemOperand());
4288 }
4289 
4290 // Op is an atomic store.  Lower it into a normal volatile store.
4291 SDValue SystemZTargetLowering::lowerATOMIC_STORE(SDValue Op,
4292                                                  SelectionDAG &DAG) const {
4293   auto *Node = cast<AtomicSDNode>(Op.getNode());
4294   SDValue Chain = DAG.getTruncStore(Node->getChain(), SDLoc(Op), Node->getVal(),
4295                                     Node->getBasePtr(), Node->getMemoryVT(),
4296                                     Node->getMemOperand());
4297   // We have to enforce sequential consistency by performing a
4298   // serialization operation after the store.
4299   if (Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent)
4300     Chain = SDValue(DAG.getMachineNode(SystemZ::Serialize, SDLoc(Op),
4301                                        MVT::Other, Chain), 0);
4302   return Chain;
4303 }
4304 
4305 // Op is an 8-, 16-bit or 32-bit ATOMIC_LOAD_* operation.  Lower the first
4306 // two into the fullword ATOMIC_LOADW_* operation given by Opcode.
4307 SDValue SystemZTargetLowering::lowerATOMIC_LOAD_OP(SDValue Op,
4308                                                    SelectionDAG &DAG,
4309                                                    unsigned Opcode) const {
4310   auto *Node = cast<AtomicSDNode>(Op.getNode());
4311 
4312   // 32-bit operations need no code outside the main loop.
4313   EVT NarrowVT = Node->getMemoryVT();
4314   EVT WideVT = MVT::i32;
4315   if (NarrowVT == WideVT)
4316     return Op;
4317 
4318   int64_t BitSize = NarrowVT.getSizeInBits();
4319   SDValue ChainIn = Node->getChain();
4320   SDValue Addr = Node->getBasePtr();
4321   SDValue Src2 = Node->getVal();
4322   MachineMemOperand *MMO = Node->getMemOperand();
4323   SDLoc DL(Node);
4324   EVT PtrVT = Addr.getValueType();
4325 
4326   // Convert atomic subtracts of constants into additions.
4327   if (Opcode == SystemZISD::ATOMIC_LOADW_SUB)
4328     if (auto *Const = dyn_cast<ConstantSDNode>(Src2)) {
4329       Opcode = SystemZISD::ATOMIC_LOADW_ADD;
4330       Src2 = DAG.getConstant(-Const->getSExtValue(), DL, Src2.getValueType());
4331     }
4332 
4333   // Get the address of the containing word.
4334   SDValue AlignedAddr = DAG.getNode(ISD::AND, DL, PtrVT, Addr,
4335                                     DAG.getConstant(-4, DL, PtrVT));
4336 
4337   // Get the number of bits that the word must be rotated left in order
4338   // to bring the field to the top bits of a GR32.
4339   SDValue BitShift = DAG.getNode(ISD::SHL, DL, PtrVT, Addr,
4340                                  DAG.getConstant(3, DL, PtrVT));
4341   BitShift = DAG.getNode(ISD::TRUNCATE, DL, WideVT, BitShift);
4342 
4343   // Get the complementing shift amount, for rotating a field in the top
4344   // bits back to its proper position.
4345   SDValue NegBitShift = DAG.getNode(ISD::SUB, DL, WideVT,
4346                                     DAG.getConstant(0, DL, WideVT), BitShift);
4347 
4348   // Extend the source operand to 32 bits and prepare it for the inner loop.
4349   // ATOMIC_SWAPW uses RISBG to rotate the field left, but all other
4350   // operations require the source to be shifted in advance.  (This shift
4351   // can be folded if the source is constant.)  For AND and NAND, the lower
4352   // bits must be set, while for other opcodes they should be left clear.
4353   if (Opcode != SystemZISD::ATOMIC_SWAPW)
4354     Src2 = DAG.getNode(ISD::SHL, DL, WideVT, Src2,
4355                        DAG.getConstant(32 - BitSize, DL, WideVT));
4356   if (Opcode == SystemZISD::ATOMIC_LOADW_AND ||
4357       Opcode == SystemZISD::ATOMIC_LOADW_NAND)
4358     Src2 = DAG.getNode(ISD::OR, DL, WideVT, Src2,
4359                        DAG.getConstant(uint32_t(-1) >> BitSize, DL, WideVT));
4360 
4361   // Construct the ATOMIC_LOADW_* node.
4362   SDVTList VTList = DAG.getVTList(WideVT, MVT::Other);
4363   SDValue Ops[] = { ChainIn, AlignedAddr, Src2, BitShift, NegBitShift,
4364                     DAG.getConstant(BitSize, DL, WideVT) };
4365   SDValue AtomicOp = DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops,
4366                                              NarrowVT, MMO);
4367 
4368   // Rotate the result of the final CS so that the field is in the lower
4369   // bits of a GR32, then truncate it.
4370   SDValue ResultShift = DAG.getNode(ISD::ADD, DL, WideVT, BitShift,
4371                                     DAG.getConstant(BitSize, DL, WideVT));
4372   SDValue Result = DAG.getNode(ISD::ROTL, DL, WideVT, AtomicOp, ResultShift);
4373 
4374   SDValue RetOps[2] = { Result, AtomicOp.getValue(1) };
4375   return DAG.getMergeValues(RetOps, DL);
4376 }
4377 
4378 // Op is an ATOMIC_LOAD_SUB operation.  Lower 8- and 16-bit operations
4379 // into ATOMIC_LOADW_SUBs and decide whether to convert 32- and 64-bit
4380 // operations into additions.
4381 SDValue SystemZTargetLowering::lowerATOMIC_LOAD_SUB(SDValue Op,
4382                                                     SelectionDAG &DAG) const {
4383   auto *Node = cast<AtomicSDNode>(Op.getNode());
4384   EVT MemVT = Node->getMemoryVT();
4385   if (MemVT == MVT::i32 || MemVT == MVT::i64) {
4386     // A full-width operation.
4387     assert(Op.getValueType() == MemVT && "Mismatched VTs");
4388     SDValue Src2 = Node->getVal();
4389     SDValue NegSrc2;
4390     SDLoc DL(Src2);
4391 
4392     if (auto *Op2 = dyn_cast<ConstantSDNode>(Src2)) {
4393       // Use an addition if the operand is constant and either LAA(G) is
4394       // available or the negative value is in the range of A(G)FHI.
4395       int64_t Value = (-Op2->getAPIntValue()).getSExtValue();
4396       if (isInt<32>(Value) || Subtarget.hasInterlockedAccess1())
4397         NegSrc2 = DAG.getConstant(Value, DL, MemVT);
4398     } else if (Subtarget.hasInterlockedAccess1())
4399       // Use LAA(G) if available.
4400       NegSrc2 = DAG.getNode(ISD::SUB, DL, MemVT, DAG.getConstant(0, DL, MemVT),
4401                             Src2);
4402 
4403     if (NegSrc2.getNode())
4404       return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, MemVT,
4405                            Node->getChain(), Node->getBasePtr(), NegSrc2,
4406                            Node->getMemOperand());
4407 
4408     // Use the node as-is.
4409     return Op;
4410   }
4411 
4412   return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_SUB);
4413 }
4414 
4415 // Lower 8/16/32/64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS node.
4416 SDValue SystemZTargetLowering::lowerATOMIC_CMP_SWAP(SDValue Op,
4417                                                     SelectionDAG &DAG) const {
4418   auto *Node = cast<AtomicSDNode>(Op.getNode());
4419   SDValue ChainIn = Node->getOperand(0);
4420   SDValue Addr = Node->getOperand(1);
4421   SDValue CmpVal = Node->getOperand(2);
4422   SDValue SwapVal = Node->getOperand(3);
4423   MachineMemOperand *MMO = Node->getMemOperand();
4424   SDLoc DL(Node);
4425 
4426   // We have native support for 32-bit and 64-bit compare and swap, but we
4427   // still need to expand extracting the "success" result from the CC.
4428   EVT NarrowVT = Node->getMemoryVT();
4429   EVT WideVT = NarrowVT == MVT::i64 ? MVT::i64 : MVT::i32;
4430   if (NarrowVT == WideVT) {
4431     SDVTList Tys = DAG.getVTList(WideVT, MVT::i32, MVT::Other);
4432     SDValue Ops[] = { ChainIn, Addr, CmpVal, SwapVal };
4433     SDValue AtomicOp = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_CMP_SWAP,
4434                                                DL, Tys, Ops, NarrowVT, MMO);
4435     SDValue Success = emitSETCC(DAG, DL, AtomicOp.getValue(1),
4436                                 SystemZ::CCMASK_CS, SystemZ::CCMASK_CS_EQ);
4437 
4438     DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), AtomicOp.getValue(0));
4439     DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
4440     DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), AtomicOp.getValue(2));
4441     return SDValue();
4442   }
4443 
4444   // Convert 8-bit and 16-bit compare and swap to a loop, implemented
4445   // via a fullword ATOMIC_CMP_SWAPW operation.
4446   int64_t BitSize = NarrowVT.getSizeInBits();
4447   EVT PtrVT = Addr.getValueType();
4448 
4449   // Get the address of the containing word.
4450   SDValue AlignedAddr = DAG.getNode(ISD::AND, DL, PtrVT, Addr,
4451                                     DAG.getConstant(-4, DL, PtrVT));
4452 
4453   // Get the number of bits that the word must be rotated left in order
4454   // to bring the field to the top bits of a GR32.
4455   SDValue BitShift = DAG.getNode(ISD::SHL, DL, PtrVT, Addr,
4456                                  DAG.getConstant(3, DL, PtrVT));
4457   BitShift = DAG.getNode(ISD::TRUNCATE, DL, WideVT, BitShift);
4458 
4459   // Get the complementing shift amount, for rotating a field in the top
4460   // bits back to its proper position.
4461   SDValue NegBitShift = DAG.getNode(ISD::SUB, DL, WideVT,
4462                                     DAG.getConstant(0, DL, WideVT), BitShift);
4463 
4464   // Construct the ATOMIC_CMP_SWAPW node.
4465   SDVTList VTList = DAG.getVTList(WideVT, MVT::i32, MVT::Other);
4466   SDValue Ops[] = { ChainIn, AlignedAddr, CmpVal, SwapVal, BitShift,
4467                     NegBitShift, DAG.getConstant(BitSize, DL, WideVT) };
4468   SDValue AtomicOp = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_CMP_SWAPW, DL,
4469                                              VTList, Ops, NarrowVT, MMO);
4470   SDValue Success = emitSETCC(DAG, DL, AtomicOp.getValue(1),
4471                               SystemZ::CCMASK_ICMP, SystemZ::CCMASK_CMP_EQ);
4472 
4473   // emitAtomicCmpSwapW() will zero extend the result (original value).
4474   SDValue OrigVal = DAG.getNode(ISD::AssertZext, DL, WideVT, AtomicOp.getValue(0),
4475                                 DAG.getValueType(NarrowVT));
4476   DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), OrigVal);
4477   DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
4478   DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), AtomicOp.getValue(2));
4479   return SDValue();
4480 }
4481 
4482 MachineMemOperand::Flags
4483 SystemZTargetLowering::getTargetMMOFlags(const Instruction &I) const {
4484   // Because of how we convert atomic_load and atomic_store to normal loads and
4485   // stores in the DAG, we need to ensure that the MMOs are marked volatile
4486   // since DAGCombine hasn't been updated to account for atomic, but non
4487   // volatile loads.  (See D57601)
4488   if (auto *SI = dyn_cast<StoreInst>(&I))
4489     if (SI->isAtomic())
4490       return MachineMemOperand::MOVolatile;
4491   if (auto *LI = dyn_cast<LoadInst>(&I))
4492     if (LI->isAtomic())
4493       return MachineMemOperand::MOVolatile;
4494   if (auto *AI = dyn_cast<AtomicRMWInst>(&I))
4495     if (AI->isAtomic())
4496       return MachineMemOperand::MOVolatile;
4497   if (auto *AI = dyn_cast<AtomicCmpXchgInst>(&I))
4498     if (AI->isAtomic())
4499       return MachineMemOperand::MOVolatile;
4500   return MachineMemOperand::MONone;
4501 }
4502 
4503 SDValue SystemZTargetLowering::lowerSTACKSAVE(SDValue Op,
4504                                               SelectionDAG &DAG) const {
4505   MachineFunction &MF = DAG.getMachineFunction();
4506   auto *Regs = Subtarget.getSpecialRegisters();
4507   if (MF.getFunction().getCallingConv() == CallingConv::GHC)
4508     report_fatal_error("Variable-sized stack allocations are not supported "
4509                        "in GHC calling convention");
4510   return DAG.getCopyFromReg(Op.getOperand(0), SDLoc(Op),
4511                             Regs->getStackPointerRegister(), Op.getValueType());
4512 }
4513 
4514 SDValue SystemZTargetLowering::lowerSTACKRESTORE(SDValue Op,
4515                                                  SelectionDAG &DAG) const {
4516   MachineFunction &MF = DAG.getMachineFunction();
4517   auto *Regs = Subtarget.getSpecialRegisters();
4518   bool StoreBackchain = MF.getFunction().hasFnAttribute("backchain");
4519 
4520   if (MF.getFunction().getCallingConv() == CallingConv::GHC)
4521     report_fatal_error("Variable-sized stack allocations are not supported "
4522                        "in GHC calling convention");
4523 
4524   SDValue Chain = Op.getOperand(0);
4525   SDValue NewSP = Op.getOperand(1);
4526   SDValue Backchain;
4527   SDLoc DL(Op);
4528 
4529   if (StoreBackchain) {
4530     SDValue OldSP = DAG.getCopyFromReg(
4531         Chain, DL, Regs->getStackPointerRegister(), MVT::i64);
4532     Backchain = DAG.getLoad(MVT::i64, DL, Chain, getBackchainAddress(OldSP, DAG),
4533                             MachinePointerInfo());
4534   }
4535 
4536   Chain = DAG.getCopyToReg(Chain, DL, Regs->getStackPointerRegister(), NewSP);
4537 
4538   if (StoreBackchain)
4539     Chain = DAG.getStore(Chain, DL, Backchain, getBackchainAddress(NewSP, DAG),
4540                          MachinePointerInfo());
4541 
4542   return Chain;
4543 }
4544 
4545 SDValue SystemZTargetLowering::lowerPREFETCH(SDValue Op,
4546                                              SelectionDAG &DAG) const {
4547   bool IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
4548   if (!IsData)
4549     // Just preserve the chain.
4550     return Op.getOperand(0);
4551 
4552   SDLoc DL(Op);
4553   bool IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
4554   unsigned Code = IsWrite ? SystemZ::PFD_WRITE : SystemZ::PFD_READ;
4555   auto *Node = cast<MemIntrinsicSDNode>(Op.getNode());
4556   SDValue Ops[] = {Op.getOperand(0), DAG.getTargetConstant(Code, DL, MVT::i32),
4557                    Op.getOperand(1)};
4558   return DAG.getMemIntrinsicNode(SystemZISD::PREFETCH, DL,
4559                                  Node->getVTList(), Ops,
4560                                  Node->getMemoryVT(), Node->getMemOperand());
4561 }
4562 
4563 // Convert condition code in CCReg to an i32 value.
4564 static SDValue getCCResult(SelectionDAG &DAG, SDValue CCReg) {
4565   SDLoc DL(CCReg);
4566   SDValue IPM = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, CCReg);
4567   return DAG.getNode(ISD::SRL, DL, MVT::i32, IPM,
4568                      DAG.getConstant(SystemZ::IPM_CC, DL, MVT::i32));
4569 }
4570 
4571 SDValue
4572 SystemZTargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op,
4573                                               SelectionDAG &DAG) const {
4574   unsigned Opcode, CCValid;
4575   if (isIntrinsicWithCCAndChain(Op, Opcode, CCValid)) {
4576     assert(Op->getNumValues() == 2 && "Expected only CC result and chain");
4577     SDNode *Node = emitIntrinsicWithCCAndChain(DAG, Op, Opcode);
4578     SDValue CC = getCCResult(DAG, SDValue(Node, 0));
4579     DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), CC);
4580     return SDValue();
4581   }
4582 
4583   return SDValue();
4584 }
4585 
4586 SDValue
4587 SystemZTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
4588                                                SelectionDAG &DAG) const {
4589   unsigned Opcode, CCValid;
4590   if (isIntrinsicWithCC(Op, Opcode, CCValid)) {
4591     SDNode *Node = emitIntrinsicWithCC(DAG, Op, Opcode);
4592     if (Op->getNumValues() == 1)
4593       return getCCResult(DAG, SDValue(Node, 0));
4594     assert(Op->getNumValues() == 2 && "Expected a CC and non-CC result");
4595     return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), Op->getVTList(),
4596                        SDValue(Node, 0), getCCResult(DAG, SDValue(Node, 1)));
4597   }
4598 
4599   unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4600   switch (Id) {
4601   case Intrinsic::thread_pointer:
4602     return lowerThreadPointer(SDLoc(Op), DAG);
4603 
4604   case Intrinsic::s390_vpdi:
4605     return DAG.getNode(SystemZISD::PERMUTE_DWORDS, SDLoc(Op), Op.getValueType(),
4606                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4607 
4608   case Intrinsic::s390_vperm:
4609     return DAG.getNode(SystemZISD::PERMUTE, SDLoc(Op), Op.getValueType(),
4610                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4611 
4612   case Intrinsic::s390_vuphb:
4613   case Intrinsic::s390_vuphh:
4614   case Intrinsic::s390_vuphf:
4615     return DAG.getNode(SystemZISD::UNPACK_HIGH, SDLoc(Op), Op.getValueType(),
4616                        Op.getOperand(1));
4617 
4618   case Intrinsic::s390_vuplhb:
4619   case Intrinsic::s390_vuplhh:
4620   case Intrinsic::s390_vuplhf:
4621     return DAG.getNode(SystemZISD::UNPACKL_HIGH, SDLoc(Op), Op.getValueType(),
4622                        Op.getOperand(1));
4623 
4624   case Intrinsic::s390_vuplb:
4625   case Intrinsic::s390_vuplhw:
4626   case Intrinsic::s390_vuplf:
4627     return DAG.getNode(SystemZISD::UNPACK_LOW, SDLoc(Op), Op.getValueType(),
4628                        Op.getOperand(1));
4629 
4630   case Intrinsic::s390_vupllb:
4631   case Intrinsic::s390_vupllh:
4632   case Intrinsic::s390_vupllf:
4633     return DAG.getNode(SystemZISD::UNPACKL_LOW, SDLoc(Op), Op.getValueType(),
4634                        Op.getOperand(1));
4635 
4636   case Intrinsic::s390_vsumb:
4637   case Intrinsic::s390_vsumh:
4638   case Intrinsic::s390_vsumgh:
4639   case Intrinsic::s390_vsumgf:
4640   case Intrinsic::s390_vsumqf:
4641   case Intrinsic::s390_vsumqg:
4642     return DAG.getNode(SystemZISD::VSUM, SDLoc(Op), Op.getValueType(),
4643                        Op.getOperand(1), Op.getOperand(2));
4644   }
4645 
4646   return SDValue();
4647 }
4648 
namespace {
// Says that SystemZISD operation Opcode can be used to perform the equivalent
// of a VPERM with permute vector Bytes.  If Opcode takes three operands,
// Operand is the constant third operand, otherwise it is the number of
// bytes in each element of the result.
//
// This is a plain aggregate: the table of known forms initialises it with
// braces in the member order Opcode, Operand, Bytes.
struct Permute {
  // The SystemZISD opcode that implements the permute.
  unsigned Opcode;
  // Constant third operand, or bytes per result element (see above).
  unsigned Operand;
  // The equivalent VPERM selector for each byte of the result.
  unsigned char Bytes[SystemZ::VectorBytes];
};
}
4660 
// The fixed two-operand permutes that the shuffle matchers try, in priority
// order (searches return the first matching entry).  Each entry lists the
// opcode, its Operand field and the equivalent VPERM selector bytes; the
// comment above each entry names the instruction it corresponds to.  The
// matchers divide a selector by SystemZ::VectorBytes to get the operand
// number and use the remainder as the byte within that operand.
static const Permute PermuteForms[] = {
  // VMRHG
  { SystemZISD::MERGE_HIGH, 8,
    { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 } },
  // VMRHF
  { SystemZISD::MERGE_HIGH, 4,
    { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 } },
  // VMRHH
  { SystemZISD::MERGE_HIGH, 2,
    { 0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23 } },
  // VMRHB
  { SystemZISD::MERGE_HIGH, 1,
    { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 } },
  // VMRLG
  { SystemZISD::MERGE_LOW, 8,
    { 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 } },
  // VMRLF
  { SystemZISD::MERGE_LOW, 4,
    { 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 } },
  // VMRLH
  { SystemZISD::MERGE_LOW, 2,
    { 8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31 } },
  // VMRLB
  { SystemZISD::MERGE_LOW, 1,
    { 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 } },
  // VPKG
  { SystemZISD::PACK, 4,
    { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 } },
  // VPKF
  { SystemZISD::PACK, 2,
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 } },
  // VPKH
  { SystemZISD::PACK, 1,
    { 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 } },
  // VPDI V1, V2, 4  (low half of V1, high half of V2)
  { SystemZISD::PERMUTE_DWORDS, 4,
    { 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 } },
  // VPDI V1, V2, 1  (high half of V1, low half of V2)
  { SystemZISD::PERMUTE_DWORDS, 1,
    { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 } }
};
4702 
// Called once a vector shuffle has been matched against a two-operand
// pattern.  OpNos[N] is the operand of the original shuffle that must feed
// operand N of the pattern, or -1 if operand N of the pattern may be
// anything.  A -1 entry is resolved by reusing the other entry.  On success
// OpNo0 and OpNo1 receive the shuffle operands to use for pattern operands
// 0 and 1 and true is returned.  If both entries are -1, nothing is written
// and false is returned.
static bool chooseShuffleOpNos(int *OpNos, unsigned &OpNo0, unsigned &OpNo1) {
  const bool Fixed0 = OpNos[0] >= 0;
  const bool Fixed1 = OpNos[1] >= 0;
  if (!Fixed0 && !Fixed1)
    return false;

  // At least one entry is fixed; an unconstrained slot borrows the other.
  const int Choice0 = Fixed0 ? OpNos[0] : OpNos[1];
  const int Choice1 = Fixed1 ? OpNos[1] : OpNos[0];
  OpNo0 = Choice0;
  OpNo1 = Choice1;
  return true;
}
4723 
4724 // Bytes is a VPERM-like permute vector, except that -1 is used for
4725 // undefined bytes.  Return true if the VPERM can be implemented using P.
4726 // When returning true set OpNo0 to the VPERM operand that should be
4727 // used for operand 0 of P and likewise OpNo1 for operand 1 of P.
4728 //
4729 // For example, if swapping the VPERM operands allows P to match, OpNo0
4730 // will be 1 and OpNo1 will be 0.  If instead Bytes only refers to one
4731 // operand, but rewriting it to use two duplicated operands allows it to
4732 // match P, then OpNo0 and OpNo1 will be the same.
4733 static bool matchPermute(const SmallVectorImpl<int> &Bytes, const Permute &P,
4734                          unsigned &OpNo0, unsigned &OpNo1) {
4735   int OpNos[] = { -1, -1 };
4736   for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) {
4737     int Elt = Bytes[I];
4738     if (Elt >= 0) {
4739       // Make sure that the two permute vectors use the same suboperand
4740       // byte number.  Only the operand numbers (the high bits) are
4741       // allowed to differ.
4742       if ((Elt ^ P.Bytes[I]) & (SystemZ::VectorBytes - 1))
4743         return false;
4744       int ModelOpNo = P.Bytes[I] / SystemZ::VectorBytes;
4745       int RealOpNo = unsigned(Elt) / SystemZ::VectorBytes;
4746       // Make sure that the operand mappings are consistent with previous
4747       // elements.
4748       if (OpNos[ModelOpNo] == 1 - RealOpNo)
4749         return false;
4750       OpNos[ModelOpNo] = RealOpNo;
4751     }
4752   }
4753   return chooseShuffleOpNos(OpNos, OpNo0, OpNo1);
4754 }
4755 
4756 // As above, but search for a matching permute.
4757 static const Permute *matchPermute(const SmallVectorImpl<int> &Bytes,
4758                                    unsigned &OpNo0, unsigned &OpNo1) {
4759   for (auto &P : PermuteForms)
4760     if (matchPermute(Bytes, P, OpNo0, OpNo1))
4761       return &P;
4762   return nullptr;
4763 }
4764 
4765 // Bytes is a VPERM-like permute vector, except that -1 is used for
4766 // undefined bytes.  This permute is an operand of an outer permute.
4767 // See whether redistributing the -1 bytes gives a shuffle that can be
4768 // implemented using P.  If so, set Transform to a VPERM-like permute vector
4769 // that, when applied to the result of P, gives the original permute in Bytes.
4770 static bool matchDoublePermute(const SmallVectorImpl<int> &Bytes,
4771                                const Permute &P,
4772                                SmallVectorImpl<int> &Transform) {
4773   unsigned To = 0;
4774   for (unsigned From = 0; From < SystemZ::VectorBytes; ++From) {
4775     int Elt = Bytes[From];
4776     if (Elt < 0)
4777       // Byte number From of the result is undefined.
4778       Transform[From] = -1;
4779     else {
4780       while (P.Bytes[To] != Elt) {
4781         To += 1;
4782         if (To == SystemZ::VectorBytes)
4783           return false;
4784       }
4785       Transform[From] = To;
4786     }
4787   }
4788   return true;
4789 }
4790 
4791 // As above, but search for a matching permute.
4792 static const Permute *matchDoublePermute(const SmallVectorImpl<int> &Bytes,
4793                                          SmallVectorImpl<int> &Transform) {
4794   for (auto &P : PermuteForms)
4795     if (matchDoublePermute(Bytes, P, Transform))
4796       return &P;
4797   return nullptr;
4798 }
4799 
4800 // Convert the mask of the given shuffle op into a byte-level mask,
4801 // as if it had type vNi8.
4802 static bool getVPermMask(SDValue ShuffleOp,
4803                          SmallVectorImpl<int> &Bytes) {
4804   EVT VT = ShuffleOp.getValueType();
4805   unsigned NumElements = VT.getVectorNumElements();
4806   unsigned BytesPerElement = VT.getVectorElementType().getStoreSize();
4807 
4808   if (auto *VSN = dyn_cast<ShuffleVectorSDNode>(ShuffleOp)) {
4809     Bytes.resize(NumElements * BytesPerElement, -1);
4810     for (unsigned I = 0; I < NumElements; ++I) {
4811       int Index = VSN->getMaskElt(I);
4812       if (Index >= 0)
4813         for (unsigned J = 0; J < BytesPerElement; ++J)
4814           Bytes[I * BytesPerElement + J] = Index * BytesPerElement + J;
4815     }
4816     return true;
4817   }
4818   if (SystemZISD::SPLAT == ShuffleOp.getOpcode() &&
4819       isa<ConstantSDNode>(ShuffleOp.getOperand(1))) {
4820     unsigned Index = ShuffleOp.getConstantOperandVal(1);
4821     Bytes.resize(NumElements * BytesPerElement, -1);
4822     for (unsigned I = 0; I < NumElements; ++I)
4823       for (unsigned J = 0; J < BytesPerElement; ++J)
4824         Bytes[I * BytesPerElement + J] = Index * BytesPerElement + J;
4825     return true;
4826   }
4827   return false;
4828 }
4829 
4830 // Bytes is a VPERM-like permute vector, except that -1 is used for
4831 // undefined bytes.  See whether bytes [Start, Start + BytesPerElement) of
4832 // the result come from a contiguous sequence of bytes from one input.
4833 // Set Base to the selector for the first byte if so.
4834 static bool getShuffleInput(const SmallVectorImpl<int> &Bytes, unsigned Start,
4835                             unsigned BytesPerElement, int &Base) {
4836   Base = -1;
4837   for (unsigned I = 0; I < BytesPerElement; ++I) {
4838     if (Bytes[Start + I] >= 0) {
4839       unsigned Elem = Bytes[Start + I];
4840       if (Base < 0) {
4841         Base = Elem - I;
4842         // Make sure the bytes would come from one input operand.
4843         if (unsigned(Base) % Bytes.size() + BytesPerElement > Bytes.size())
4844           return false;
4845       } else if (unsigned(Base) != Elem - I)
4846         return false;
4847     }
4848   }
4849   return true;
4850 }
4851 
4852 // Bytes is a VPERM-like permute vector, except that -1 is used for
4853 // undefined bytes.  Return true if it can be performed using VSLDB.
4854 // When returning true, set StartIndex to the shift amount and OpNo0
4855 // and OpNo1 to the VPERM operands that should be used as the first
4856 // and second shift operand respectively.
4857 static bool isShlDoublePermute(const SmallVectorImpl<int> &Bytes,
4858                                unsigned &StartIndex, unsigned &OpNo0,
4859                                unsigned &OpNo1) {
4860   int OpNos[] = { -1, -1 };
4861   int Shift = -1;
4862   for (unsigned I = 0; I < 16; ++I) {
4863     int Index = Bytes[I];
4864     if (Index >= 0) {
4865       int ExpectedShift = (Index - I) % SystemZ::VectorBytes;
4866       int ModelOpNo = unsigned(ExpectedShift + I) / SystemZ::VectorBytes;
4867       int RealOpNo = unsigned(Index) / SystemZ::VectorBytes;
4868       if (Shift < 0)
4869         Shift = ExpectedShift;
4870       else if (Shift != ExpectedShift)
4871         return false;
4872       // Make sure that the operand mappings are consistent with previous
4873       // elements.
4874       if (OpNos[ModelOpNo] == 1 - RealOpNo)
4875         return false;
4876       OpNos[ModelOpNo] = RealOpNo;
4877     }
4878   }
4879   StartIndex = Shift;
4880   return chooseShuffleOpNos(OpNos, OpNo0, OpNo1);
4881 }
4882 
4883 // Create a node that performs P on operands Op0 and Op1, casting the
4884 // operands to the appropriate type.  The type of the result is determined by P.
4885 static SDValue getPermuteNode(SelectionDAG &DAG, const SDLoc &DL,
4886                               const Permute &P, SDValue Op0, SDValue Op1) {
4887   // VPDI (PERMUTE_DWORDS) always operates on v2i64s.  The input
4888   // elements of a PACK are twice as wide as the outputs.
4889   unsigned InBytes = (P.Opcode == SystemZISD::PERMUTE_DWORDS ? 8 :
4890                       P.Opcode == SystemZISD::PACK ? P.Operand * 2 :
4891                       P.Operand);
4892   // Cast both operands to the appropriate type.
4893   MVT InVT = MVT::getVectorVT(MVT::getIntegerVT(InBytes * 8),
4894                               SystemZ::VectorBytes / InBytes);
4895   Op0 = DAG.getNode(ISD::BITCAST, DL, InVT, Op0);
4896   Op1 = DAG.getNode(ISD::BITCAST, DL, InVT, Op1);
4897   SDValue Op;
4898   if (P.Opcode == SystemZISD::PERMUTE_DWORDS) {
4899     SDValue Op2 = DAG.getTargetConstant(P.Operand, DL, MVT::i32);
4900     Op = DAG.getNode(SystemZISD::PERMUTE_DWORDS, DL, InVT, Op0, Op1, Op2);
4901   } else if (P.Opcode == SystemZISD::PACK) {
4902     MVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(P.Operand * 8),
4903                                  SystemZ::VectorBytes / P.Operand);
4904     Op = DAG.getNode(SystemZISD::PACK, DL, OutVT, Op0, Op1);
4905   } else {
4906     Op = DAG.getNode(P.Opcode, DL, InVT, Op0, Op1);
4907   }
4908   return Op;
4909 }
4910 
4911 static bool isZeroVector(SDValue N) {
4912   if (N->getOpcode() == ISD::BITCAST)
4913     N = N->getOperand(0);
4914   if (N->getOpcode() == ISD::SPLAT_VECTOR)
4915     if (auto *Op = dyn_cast<ConstantSDNode>(N->getOperand(0)))
4916       return Op->getZExtValue() == 0;
4917   return ISD::isBuildVectorAllZeros(N.getNode());
4918 }
4919 
4920 // Return the index of the zero/undef vector, or UINT32_MAX if not found.
4921 static uint32_t findZeroVectorIdx(SDValue *Ops, unsigned Num) {
4922   for (unsigned I = 0; I < Num ; I++)
4923     if (isZeroVector(Ops[I]))
4924       return I;
4925   return UINT32_MAX;
4926 }
4927 
4928 // Bytes is a VPERM-like permute vector, except that -1 is used for
4929 // undefined bytes.  Implement it on operands Ops[0] and Ops[1] using
4930 // VSLDB or VPERM.
4931 static SDValue getGeneralPermuteNode(SelectionDAG &DAG, const SDLoc &DL,
4932                                      SDValue *Ops,
4933                                      const SmallVectorImpl<int> &Bytes) {
4934   for (unsigned I = 0; I < 2; ++I)
4935     Ops[I] = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Ops[I]);
4936 
4937   // First see whether VSLDB can be used.
4938   unsigned StartIndex, OpNo0, OpNo1;
4939   if (isShlDoublePermute(Bytes, StartIndex, OpNo0, OpNo1))
4940     return DAG.getNode(SystemZISD::SHL_DOUBLE, DL, MVT::v16i8, Ops[OpNo0],
4941                        Ops[OpNo1],
4942                        DAG.getTargetConstant(StartIndex, DL, MVT::i32));
4943 
4944   // Fall back on VPERM.  Construct an SDNode for the permute vector.  Try to
4945   // eliminate a zero vector by reusing any zero index in the permute vector.
4946   unsigned ZeroVecIdx = findZeroVectorIdx(&Ops[0], 2);
4947   if (ZeroVecIdx != UINT32_MAX) {
4948     bool MaskFirst = true;
4949     int ZeroIdx = -1;
4950     for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) {
4951       unsigned OpNo = unsigned(Bytes[I]) / SystemZ::VectorBytes;
4952       unsigned Byte = unsigned(Bytes[I]) % SystemZ::VectorBytes;
4953       if (OpNo == ZeroVecIdx && I == 0) {
4954         // If the first byte is zero, use mask as first operand.
4955         ZeroIdx = 0;
4956         break;
4957       }
4958       if (OpNo != ZeroVecIdx && Byte == 0) {
4959         // If mask contains a zero, use it by placing that vector first.
4960         ZeroIdx = I + SystemZ::VectorBytes;
4961         MaskFirst = false;
4962         break;
4963       }
4964     }
4965     if (ZeroIdx != -1) {
4966       SDValue IndexNodes[SystemZ::VectorBytes];
4967       for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) {
4968         if (Bytes[I] >= 0) {
4969           unsigned OpNo = unsigned(Bytes[I]) / SystemZ::VectorBytes;
4970           unsigned Byte = unsigned(Bytes[I]) % SystemZ::VectorBytes;
4971           if (OpNo == ZeroVecIdx)
4972             IndexNodes[I] = DAG.getConstant(ZeroIdx, DL, MVT::i32);
4973           else {
4974             unsigned BIdx = MaskFirst ? Byte + SystemZ::VectorBytes : Byte;
4975             IndexNodes[I] = DAG.getConstant(BIdx, DL, MVT::i32);
4976           }
4977         } else
4978           IndexNodes[I] = DAG.getUNDEF(MVT::i32);
4979       }
4980       SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, IndexNodes);
4981       SDValue Src = ZeroVecIdx == 0 ? Ops[1] : Ops[0];
4982       if (MaskFirst)
4983         return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Mask, Src,
4984                            Mask);
4985       else
4986         return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Src, Mask,
4987                            Mask);
4988     }
4989   }
4990 
4991   SDValue IndexNodes[SystemZ::VectorBytes];
4992   for (unsigned I = 0; I < SystemZ::VectorBytes; ++I)
4993     if (Bytes[I] >= 0)
4994       IndexNodes[I] = DAG.getConstant(Bytes[I], DL, MVT::i32);
4995     else
4996       IndexNodes[I] = DAG.getUNDEF(MVT::i32);
4997   SDValue Op2 = DAG.getBuildVector(MVT::v16i8, DL, IndexNodes);
4998   return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Ops[0],
4999                      (!Ops[1].isUndef() ? Ops[1] : Ops[0]), Op2);
5000 }
5001 
5002 namespace {
5003 // Describes a general N-operand vector shuffle.
5004 struct GeneralShuffle {
5005   GeneralShuffle(EVT vt) : VT(vt), UnpackFromEltSize(UINT_MAX) {}
5006   void addUndef();
5007   bool add(SDValue, unsigned);
5008   SDValue getNode(SelectionDAG &, const SDLoc &);
5009   void tryPrepareForUnpack();
5010   bool unpackWasPrepared() { return UnpackFromEltSize <= 4; }
5011   SDValue insertUnpackIfPrepared(SelectionDAG &DAG, const SDLoc &DL, SDValue Op);
5012 
5013   // The operands of the shuffle.
5014   SmallVector<SDValue, SystemZ::VectorBytes> Ops;
5015 
5016   // Index I is -1 if byte I of the result is undefined.  Otherwise the
5017   // result comes from byte Bytes[I] % SystemZ::VectorBytes of operand
5018   // Bytes[I] / SystemZ::VectorBytes.
5019   SmallVector<int, SystemZ::VectorBytes> Bytes;
5020 
5021   // The type of the shuffle result.
5022   EVT VT;
5023 
5024   // Holds a value of 1, 2 or 4 if a final unpack has been prepared for.
5025   unsigned UnpackFromEltSize;
5026 };
5027 }
5028 
5029 // Add an extra undefined element to the shuffle.
5030 void GeneralShuffle::addUndef() {
5031   unsigned BytesPerElement = VT.getVectorElementType().getStoreSize();
5032   for (unsigned I = 0; I < BytesPerElement; ++I)
5033     Bytes.push_back(-1);
5034 }
5035 
5036 // Add an extra element to the shuffle, taking it from element Elem of Op.
5037 // A null Op indicates a vector input whose value will be calculated later;
5038 // there is at most one such input per shuffle and it always has the same
5039 // type as the result. Aborts and returns false if the source vector elements
5040 // of an EXTRACT_VECTOR_ELT are smaller than the destination elements. Per
5041 // LLVM they become implicitly extended, but this is rare and not optimized.
5042 bool GeneralShuffle::add(SDValue Op, unsigned Elem) {
5043   unsigned BytesPerElement = VT.getVectorElementType().getStoreSize();
5044 
5045   // The source vector can have wider elements than the result,
5046   // either through an explicit TRUNCATE or because of type legalization.
5047   // We want the least significant part.
5048   EVT FromVT = Op.getNode() ? Op.getValueType() : VT;
5049   unsigned FromBytesPerElement = FromVT.getVectorElementType().getStoreSize();
5050 
5051   // Return false if the source elements are smaller than their destination
5052   // elements.
5053   if (FromBytesPerElement < BytesPerElement)
5054     return false;
5055 
5056   unsigned Byte = ((Elem * FromBytesPerElement) % SystemZ::VectorBytes +
5057                    (FromBytesPerElement - BytesPerElement));
5058 
5059   // Look through things like shuffles and bitcasts.
5060   while (Op.getNode()) {
5061     if (Op.getOpcode() == ISD::BITCAST)
5062       Op = Op.getOperand(0);
5063     else if (Op.getOpcode() == ISD::VECTOR_SHUFFLE && Op.hasOneUse()) {
5064       // See whether the bytes we need come from a contiguous part of one
5065       // operand.
5066       SmallVector<int, SystemZ::VectorBytes> OpBytes;
5067       if (!getVPermMask(Op, OpBytes))
5068         break;
5069       int NewByte;
5070       if (!getShuffleInput(OpBytes, Byte, BytesPerElement, NewByte))
5071         break;
5072       if (NewByte < 0) {
5073         addUndef();
5074         return true;
5075       }
5076       Op = Op.getOperand(unsigned(NewByte) / SystemZ::VectorBytes);
5077       Byte = unsigned(NewByte) % SystemZ::VectorBytes;
5078     } else if (Op.isUndef()) {
5079       addUndef();
5080       return true;
5081     } else
5082       break;
5083   }
5084 
5085   // Make sure that the source of the extraction is in Ops.
5086   unsigned OpNo = 0;
5087   for (; OpNo < Ops.size(); ++OpNo)
5088     if (Ops[OpNo] == Op)
5089       break;
5090   if (OpNo == Ops.size())
5091     Ops.push_back(Op);
5092 
5093   // Add the element to Bytes.
5094   unsigned Base = OpNo * SystemZ::VectorBytes + Byte;
5095   for (unsigned I = 0; I < BytesPerElement; ++I)
5096     Bytes.push_back(Base + I);
5097 
5098   return true;
5099 }
5100 
// Return SDNodes for the completed shuffle.
//
// Requires that exactly SystemZ::VectorBytes entries have been added to
// Bytes.  The operands in Ops are combined pairwise into a tree of two-input
// permutes until only two remain, and the root permute (plus an optional
// final unpack) is then emitted and bitcast to VT.  Ops and Bytes are
// modified in the process.
SDValue GeneralShuffle::getNode(SelectionDAG &DAG, const SDLoc &DL) {
  assert(Bytes.size() == SystemZ::VectorBytes && "Incomplete vector");

  // No operand was ever referenced, so every result byte is undefined.
  if (Ops.size() == 0)
    return DAG.getUNDEF(VT);

  // Use a single unpack if possible as the last operation.
  tryPrepareForUnpack();

  // Make sure that there are at least two shuffle operands.
  if (Ops.size() == 1)
    Ops.push_back(DAG.getUNDEF(MVT::v16i8));

  // Create a tree of shuffles, deferring root node until after the loop.
  // Try to redistribute the undefined elements of non-root nodes so that
  // the non-root shuffles match something like a pack or merge, then adjust
  // the parent node's permute vector to compensate for the new order.
  // Among other things, this copes with vectors like <2 x i16> that were
  // padded with undefined elements during type legalization.
  //
  // In the best case this redistribution will lead to the whole tree
  // using packs and merges.  It should rarely be a loss in other cases.
  //
  // Each pass combines Ops[I] with Ops[I + Stride] and stores the result
  // back into Ops[I]; Stride doubles after every pass.
  unsigned Stride = 1;
  for (; Stride * 2 < Ops.size(); Stride *= 2) {
    for (unsigned I = 0; I < Ops.size() - Stride; I += Stride * 2) {
      SDValue SubOps[] = { Ops[I], Ops[I + Stride] };

      // Create a mask for just these two operands.
      SmallVector<int, SystemZ::VectorBytes> NewBytes(SystemZ::VectorBytes);
      for (unsigned J = 0; J < SystemZ::VectorBytes; ++J) {
        // For an undefined byte (-1) the unsigned conversion yields an
        // operand number that matches neither I nor I + Stride, so the byte
        // stays undefined in NewBytes.
        unsigned OpNo = unsigned(Bytes[J]) / SystemZ::VectorBytes;
        unsigned Byte = unsigned(Bytes[J]) % SystemZ::VectorBytes;
        if (OpNo == I)
          NewBytes[J] = Byte;
        else if (OpNo == I + Stride)
          NewBytes[J] = SystemZ::VectorBytes + Byte;
        else
          NewBytes[J] = -1;
      }
      // See if it would be better to reorganize NewBytes to avoid using VPERM.
      SmallVector<int, SystemZ::VectorBytes> NewBytesMap(SystemZ::VectorBytes);
      if (const Permute *P = matchDoublePermute(NewBytes, NewBytesMap)) {
        Ops[I] = getPermuteNode(DAG, DL, *P, SubOps[0], SubOps[1]);
        // Applying NewBytesMap to Ops[I] gets back to NewBytes.
        for (unsigned J = 0; J < SystemZ::VectorBytes; ++J) {
          if (NewBytes[J] >= 0) {
            assert(unsigned(NewBytesMap[J]) < SystemZ::VectorBytes &&
                   "Invalid double permute");
            // The byte now lives at position NewBytesMap[J] of the new
            // Ops[I].
            Bytes[J] = I * SystemZ::VectorBytes + NewBytesMap[J];
          } else
            assert(NewBytesMap[J] < 0 && "Invalid double permute");
        }
      } else {
        // Just use NewBytes on the operands.
        Ops[I] = getGeneralPermuteNode(DAG, DL, SubOps, NewBytes);
        // The new Ops[I] already has each selected byte at result position
        // J, so the parent simply picks byte J of operand I.
        for (unsigned J = 0; J < SystemZ::VectorBytes; ++J)
          if (NewBytes[J] >= 0)
            Bytes[J] = I * SystemZ::VectorBytes + J;
      }
    }
  }

  // Now we just have 2 inputs.  Put the second operand in Ops[1].
  // Byte indices that refer to any operand other than operand 0 are shifted
  // down by Stride - 1 operands so that operand Stride becomes operand 1.
  if (Stride > 1) {
    Ops[1] = Ops[Stride];
    for (unsigned I = 0; I < SystemZ::VectorBytes; ++I)
      if (Bytes[I] >= int(SystemZ::VectorBytes))
        Bytes[I] -= (Stride - 1) * SystemZ::VectorBytes;
  }

  // Look for an instruction that can do the permute without resorting
  // to VPERM.
  unsigned OpNo0, OpNo1;
  SDValue Op;
  if (unpackWasPrepared() && Ops[1].isUndef())
    // Only one real operand remains and an unpack will follow, so no
    // permute node is emitted for the root.
    Op = Ops[0];
  else if (const Permute *P = matchPermute(Bytes, OpNo0, OpNo1))
    Op = getPermuteNode(DAG, DL, *P, Ops[OpNo0], Ops[OpNo1]);
  else
    Op = getGeneralPermuteNode(DAG, DL, &Ops[0], Bytes);

  // Returns Op unchanged unless tryPrepareForUnpack() prepared an unpack.
  Op = insertUnpackIfPrepared(DAG, DL, Op);

  return DAG.getNode(ISD::BITCAST, DL, VT, Op);
}
5187 
5188 #ifndef NDEBUG
5189 static void dumpBytes(const SmallVectorImpl<int> &Bytes, std::string Msg) {
5190   dbgs() << Msg.c_str() << " { ";
5191   for (unsigned i = 0; i < Bytes.size(); i++)
5192     dbgs() << Bytes[i] << " ";
5193   dbgs() << "}\n";
5194 }
5195 #endif
5196 
// If the Bytes vector matches an unpack operation, prepare to do the unpack
// after all else by removing the zero vector and the effect of the unpack on
// Bytes.
//
// On success UnpackFromEltSize is left at 1, 2 or 4 (so unpackWasPrepared()
// returns true), the zero vector is erased from Ops, and Bytes describes the
// packed source of the unpack.  Otherwise Ops and Bytes are left untouched
// and UnpackFromEltSize ends up greater than 4.
void GeneralShuffle::tryPrepareForUnpack() {
  // NOTE(review): findZeroVectorIdx is defined elsewhere; presumably it
  // returns UINT32_MAX when no operand is an all-zeros vector.
  uint32_t ZeroVecOpNo = findZeroVectorIdx(&Ops[0], Ops.size());
  if (ZeroVecOpNo == UINT32_MAX || Ops.size() == 1)
    return;

  // Only do this if removing the zero vector reduces the depth, otherwise
  // the critical path will increase with the final unpack.
  if (Ops.size() > 2 &&
      Log2_32_Ceil(Ops.size()) == Log2_32_Ceil(Ops.size() - 1))
    return;

  // Find an unpack that would allow removing the zero vector from Ops.
  // Source element sizes of 1, 2 and 4 bytes are tried in turn.
  UnpackFromEltSize = 1;
  for (; UnpackFromEltSize <= 4; UnpackFromEltSize *= 2) {
    bool MatchUnpack = true;
    // Collects the result bytes that are not part of a zero extension.
    SmallVector<int, SystemZ::VectorBytes> SrcBytes;
    for (unsigned Elt = 0; Elt < SystemZ::VectorBytes; Elt++) {
      unsigned ToEltSize = UnpackFromEltSize * 2;
      // The first UnpackFromEltSize bytes of every ToEltSize-byte result
      // element are the ones an unpack would fill with zeros.
      bool IsZextByte = (Elt % ToEltSize) < UnpackFromEltSize;
      if (!IsZextByte)
        SrcBytes.push_back(Bytes[Elt]);
      if (Bytes[Elt] != -1) {
        unsigned OpNo = unsigned(Bytes[Elt]) / SystemZ::VectorBytes;
        // A defined byte must come from the zero vector exactly when it is
        // in a zero-extension position.
        if (IsZextByte != (OpNo == ZeroVecOpNo)) {
          MatchUnpack = false;
          break;
        }
      }
    }
    if (MatchUnpack) {
      if (Ops.size() == 2) {
        // Don't use unpack if a single source operand needs rearrangement.
        // That is, every defined source byte must already sit at position i
        // of its operand.
        for (unsigned i = 0; i < SystemZ::VectorBytes / 2; i++)
          if (SrcBytes[i] != -1 && SrcBytes[i] % 16 != int(i)) {
            UnpackFromEltSize = UINT_MAX;
            return;
          }
      }
      break;
    }
  }
  // No element size matched: the loop left UnpackFromEltSize at 8.
  if (UnpackFromEltSize > 4)
    return;

  LLVM_DEBUG(dbgs() << "Preparing for final unpack of element size "
             << UnpackFromEltSize << ". Zero vector is Op#" << ZeroVecOpNo
             << ".\n";
             dumpBytes(Bytes, "Original Bytes vector:"););

  // Apply the unpack in reverse to the Bytes array.
  // Each iteration skips the UnpackFromEltSize zero-extension bytes of one
  // result element and copies its remaining bytes to the next free position
  // B at the front of Bytes.
  unsigned B = 0;
  for (unsigned Elt = 0; Elt < SystemZ::VectorBytes;) {
    Elt += UnpackFromEltSize;
    for (unsigned i = 0; i < UnpackFromEltSize; i++, Elt++, B++)
      Bytes[B] = Bytes[Elt];
  }
  // The rest of the packed vector is not used by the unpack.
  while (B < SystemZ::VectorBytes)
    Bytes[B++] = -1;

  // Remove the zero vector from Ops
  // and renumber bytes that refer to operands after it.
  Ops.erase(&Ops[ZeroVecOpNo]);
  for (unsigned I = 0; I < SystemZ::VectorBytes; ++I)
    if (Bytes[I] >= 0) {
      unsigned OpNo = unsigned(Bytes[I]) / SystemZ::VectorBytes;
      if (OpNo > ZeroVecOpNo)
        Bytes[I] -= SystemZ::VectorBytes;
    }

  LLVM_DEBUG(dumpBytes(Bytes, "Resulting Bytes vector, zero vector removed:");
             dbgs() << "\n";);
}
5271 
5272 SDValue GeneralShuffle::insertUnpackIfPrepared(SelectionDAG &DAG,
5273                                                const SDLoc &DL,
5274                                                SDValue Op) {
5275   if (!unpackWasPrepared())
5276     return Op;
5277   unsigned InBits = UnpackFromEltSize * 8;
5278   EVT InVT = MVT::getVectorVT(MVT::getIntegerVT(InBits),
5279                                 SystemZ::VectorBits / InBits);
5280   SDValue PackedOp = DAG.getNode(ISD::BITCAST, DL, InVT, Op);
5281   unsigned OutBits = InBits * 2;
5282   EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(OutBits),
5283                                SystemZ::VectorBits / OutBits);
5284   return DAG.getNode(SystemZISD::UNPACKL_HIGH, DL, OutVT, PackedOp);
5285 }
5286 
5287 // Return true if the given BUILD_VECTOR is a scalar-to-vector conversion.
5288 static bool isScalarToVector(SDValue Op) {
5289   for (unsigned I = 1, E = Op.getNumOperands(); I != E; ++I)
5290     if (!Op.getOperand(I).isUndef())
5291       return false;
5292   return true;
5293 }
5294 
5295 // Return a vector of type VT that contains Value in the first element.
5296 // The other elements don't matter.
5297 static SDValue buildScalarToVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
5298                                    SDValue Value) {
5299   // If we have a constant, replicate it to all elements and let the
5300   // BUILD_VECTOR lowering take care of it.
5301   if (Value.getOpcode() == ISD::Constant ||
5302       Value.getOpcode() == ISD::ConstantFP) {
5303     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Value);
5304     return DAG.getBuildVector(VT, DL, Ops);
5305   }
5306   if (Value.isUndef())
5307     return DAG.getUNDEF(VT);
5308   return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
5309 }
5310 
5311 // Return a vector of type VT in which Op0 is in element 0 and Op1 is in
5312 // element 1.  Used for cases in which replication is cheap.
5313 static SDValue buildMergeScalars(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
5314                                  SDValue Op0, SDValue Op1) {
5315   if (Op0.isUndef()) {
5316     if (Op1.isUndef())
5317       return DAG.getUNDEF(VT);
5318     return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Op1);
5319   }
5320   if (Op1.isUndef())
5321     return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Op0);
5322   return DAG.getNode(SystemZISD::MERGE_HIGH, DL, VT,
5323                      buildScalarToVector(DAG, DL, VT, Op0),
5324                      buildScalarToVector(DAG, DL, VT, Op1));
5325 }
5326 
5327 // Extend GPR scalars Op0 and Op1 to doublewords and return a v2i64
5328 // vector for them.
5329 static SDValue joinDwords(SelectionDAG &DAG, const SDLoc &DL, SDValue Op0,
5330                           SDValue Op1) {
5331   if (Op0.isUndef() && Op1.isUndef())
5332     return DAG.getUNDEF(MVT::v2i64);
5333   // If one of the two inputs is undefined then replicate the other one,
5334   // in order to avoid using another register unnecessarily.
5335   if (Op0.isUndef())
5336     Op0 = Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op1);
5337   else if (Op1.isUndef())
5338     Op0 = Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0);
5339   else {
5340     Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op0);
5341     Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Op1);
5342   }
5343   return DAG.getNode(SystemZISD::JOIN_DWORDS, DL, MVT::v2i64, Op0, Op1);
5344 }
5345 
5346 // If a BUILD_VECTOR contains some EXTRACT_VECTOR_ELTs, it's usually
5347 // better to use VECTOR_SHUFFLEs on them, only using BUILD_VECTOR for
5348 // the non-EXTRACT_VECTOR_ELT elements.  See if the given BUILD_VECTOR
5349 // would benefit from this representation and return it if so.
5350 static SDValue tryBuildVectorShuffle(SelectionDAG &DAG,
5351                                      BuildVectorSDNode *BVN) {
5352   EVT VT = BVN->getValueType(0);
5353   unsigned NumElements = VT.getVectorNumElements();
5354 
5355   // Represent the BUILD_VECTOR as an N-operand VECTOR_SHUFFLE-like operation
5356   // on byte vectors.  If there are non-EXTRACT_VECTOR_ELT elements that still
5357   // need a BUILD_VECTOR, add an additional placeholder operand for that
5358   // BUILD_VECTOR and store its operands in ResidueOps.
5359   GeneralShuffle GS(VT);
5360   SmallVector<SDValue, SystemZ::VectorBytes> ResidueOps;
5361   bool FoundOne = false;
5362   for (unsigned I = 0; I < NumElements; ++I) {
5363     SDValue Op = BVN->getOperand(I);
5364     if (Op.getOpcode() == ISD::TRUNCATE)
5365       Op = Op.getOperand(0);
5366     if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
5367         Op.getOperand(1).getOpcode() == ISD::Constant) {
5368       unsigned Elem = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
5369       if (!GS.add(Op.getOperand(0), Elem))
5370         return SDValue();
5371       FoundOne = true;
5372     } else if (Op.isUndef()) {
5373       GS.addUndef();
5374     } else {
5375       if (!GS.add(SDValue(), ResidueOps.size()))
5376         return SDValue();
5377       ResidueOps.push_back(BVN->getOperand(I));
5378     }
5379   }
5380 
5381   // Nothing to do if there are no EXTRACT_VECTOR_ELTs.
5382   if (!FoundOne)
5383     return SDValue();
5384 
5385   // Create the BUILD_VECTOR for the remaining elements, if any.
5386   if (!ResidueOps.empty()) {
5387     while (ResidueOps.size() < NumElements)
5388       ResidueOps.push_back(DAG.getUNDEF(ResidueOps[0].getValueType()));
5389     for (auto &Op : GS.Ops) {
5390       if (!Op.getNode()) {
5391         Op = DAG.getBuildVector(VT, SDLoc(BVN), ResidueOps);
5392         break;
5393       }
5394     }
5395   }
5396   return GS.getNode(DAG, SDLoc(BVN));
5397 }
5398 
5399 bool SystemZTargetLowering::isVectorElementLoad(SDValue Op) const {
5400   if (Op.getOpcode() == ISD::LOAD && cast<LoadSDNode>(Op)->isUnindexed())
5401     return true;
5402   if (Subtarget.hasVectorEnhancements2() && Op.getOpcode() == SystemZISD::LRV)
5403     return true;
5404   return false;
5405 }
5406 
// Combine GPR scalar values Elems into a vector of type VT.
//
// The strategies are tried in this order: replicate a single repeated value;
// type-specific sequences for v2i64, v2f64 and v4f32 (unless every element is
// a load); otherwise start from a constant vector, a replicated load, or a
// JOIN_DWORDS of two elements, and insert the remaining defined elements one
// at a time with INSERT_VECTOR_ELT.
SDValue
SystemZTargetLowering::buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
                                   SmallVectorImpl<SDValue> &Elems) const {
  // See whether there is a single replicated value.
  // Single stays null if two different defined values are found; Count is
  // the number of defined elements seen and is only used when Single is set.
  SDValue Single;
  unsigned int NumElements = Elems.size();
  unsigned int Count = 0;
  for (auto Elem : Elems) {
    if (!Elem.isUndef()) {
      if (!Single.getNode())
        Single = Elem;
      else if (Elem != Single) {
        Single = SDValue();
        break;
      }
      Count += 1;
    }
  }
  // There are three cases here:
  //
  // - if the only defined element is a loaded one, the best sequence
  //   is a replicating load.
  //
  // - otherwise, if the only defined element is an i64 value, we will
  //   end up with the same VLVGP sequence regardless of whether we short-cut
  //   for replication or fall through to the later code.
  //
  // - otherwise, if the only defined element is an i32 or smaller value,
  //   we would need 2 instructions to replicate it: VLVGP followed by VREPx.
  //   This is only a win if the single defined element is used more than once.
  //   In other cases we're better off using a single VLVGx.
  if (Single.getNode() && (Count > 1 || isVectorElementLoad(Single)))
    return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Single);

  // If all elements are loads, use VLREP/VLEs (below).
  bool AllLoads = true;
  for (auto Elem : Elems)
    if (!isVectorElementLoad(Elem)) {
      AllLoads = false;
      break;
    }

  // The best way of building a v2i64 from two i64s is to use VLVGP.
  if (VT == MVT::v2i64 && !AllLoads)
    return joinDwords(DAG, DL, Elems[0], Elems[1]);

  // Use a 64-bit merge high to combine two doubles.
  if (VT == MVT::v2f64 && !AllLoads)
    return buildMergeScalars(DAG, DL, VT, Elems[0], Elems[1]);

  // Build v4f32 values directly from the FPRs:
  //
  //   <Axxx> <Bxxx> <Cxxx> <Dxxx>
  //         V              V         VMRHF
  //      <ABxx>         <CDxx>
  //                V                 VMRHG
  //              <ABCD>
  if (VT == MVT::v4f32 && !AllLoads) {
    SDValue Op01 = buildMergeScalars(DAG, DL, VT, Elems[0], Elems[1]);
    SDValue Op23 = buildMergeScalars(DAG, DL, VT, Elems[2], Elems[3]);
    // Avoid unnecessary undefs by reusing the other operand.
    if (Op01.isUndef())
      Op01 = Op23;
    else if (Op23.isUndef())
      Op23 = Op01;
    // Merging identical replications is a no-op.
    if (Op01.getOpcode() == SystemZISD::REPLICATE && Op01 == Op23)
      return Op01;
    // The final merge works on doublewords, so go through v2i64.
    Op01 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op01);
    Op23 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op23);
    SDValue Op = DAG.getNode(SystemZISD::MERGE_HIGH,
                             DL, MVT::v2i64, Op01, Op23);
    return DAG.getNode(ISD::BITCAST, DL, VT, Op);
  }

  // Collect the constant terms.
  // Done[I] records that element I is already present in Result and needs
  // no separate insertion.
  SmallVector<SDValue, SystemZ::VectorBytes> Constants(NumElements, SDValue());
  SmallVector<bool, SystemZ::VectorBytes> Done(NumElements, false);

  unsigned NumConstants = 0;
  for (unsigned I = 0; I < NumElements; ++I) {
    SDValue Elem = Elems[I];
    if (Elem.getOpcode() == ISD::Constant ||
        Elem.getOpcode() == ISD::ConstantFP) {
      NumConstants += 1;
      Constants[I] = Elem;
      Done[I] = true;
    }
  }
  // If there was at least one constant, fill in the other elements of
  // Constants with undefs to get a full vector constant and use that
  // as the starting point.
  SDValue Result;
  // Set only when Result starts out as a REPLICATE of a load.
  SDValue ReplicatedVal;
  if (NumConstants > 0) {
    for (unsigned I = 0; I < NumElements; ++I)
      if (!Constants[I].getNode())
        Constants[I] = DAG.getUNDEF(Elems[I].getValueType());
    Result = DAG.getBuildVector(VT, DL, Constants);
  } else {
    // Otherwise try to use VLREP or VLVGP to start the sequence in order to
    // avoid a false dependency on any previous contents of the vector
    // register.

    // Use a VLREP if at least one element is a load. Make sure to replicate
    // the load with the most elements having its value.
    std::map<const SDNode*, unsigned> UseCounts;
    SDNode *LoadMaxUses = nullptr;
    for (unsigned I = 0; I < NumElements; ++I)
      if (isVectorElementLoad(Elems[I])) {
        SDNode *Ld = Elems[I].getNode();
        UseCounts[Ld]++;
        // Strict comparison: on a tie the load that reached the count first
        // is kept.
        if (LoadMaxUses == nullptr || UseCounts[LoadMaxUses] < UseCounts[Ld])
          LoadMaxUses = Ld;
      }
    if (LoadMaxUses != nullptr) {
      ReplicatedVal = SDValue(LoadMaxUses, 0);
      Result = DAG.getNode(SystemZISD::REPLICATE, DL, VT, ReplicatedVal);
    } else {
      // Try to use VLVGP.
      // I1 and I2 are the last element of the first and second half of the
      // vector respectively.
      unsigned I1 = NumElements / 2 - 1;
      unsigned I2 = NumElements - 1;
      bool Def1 = !Elems[I1].isUndef();
      bool Def2 = !Elems[I2].isUndef();
      if (Def1 || Def2) {
        // If only one of the two is defined, it is used for both operands.
        SDValue Elem1 = Elems[Def1 ? I1 : I2];
        SDValue Elem2 = Elems[Def2 ? I2 : I1];
        Result = DAG.getNode(ISD::BITCAST, DL, VT,
                             joinDwords(DAG, DL, Elem1, Elem2));
        Done[I1] = true;
        Done[I2] = true;
      } else
        Result = DAG.getUNDEF(VT);
    }
  }

  // Use VLVGx to insert the other elements.
  // Elements equal to ReplicatedVal are skipped because the REPLICATE above
  // already placed that value in every element.
  for (unsigned I = 0; I < NumElements; ++I)
    if (!Done[I] && !Elems[I].isUndef() && Elems[I] != ReplicatedVal)
      Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Result, Elems[I],
                           DAG.getConstant(I, DL, MVT::i32));
  return Result;
}
5551 
5552 SDValue SystemZTargetLowering::lowerBUILD_VECTOR(SDValue Op,
5553                                                  SelectionDAG &DAG) const {
5554   auto *BVN = cast<BuildVectorSDNode>(Op.getNode());
5555   SDLoc DL(Op);
5556   EVT VT = Op.getValueType();
5557 
5558   if (BVN->isConstant()) {
5559     if (SystemZVectorConstantInfo(BVN).isVectorConstantLegal(Subtarget))
5560       return Op;
5561 
5562     // Fall back to loading it from memory.
5563     return SDValue();
5564   }
5565 
5566   // See if we should use shuffles to construct the vector from other vectors.
5567   if (SDValue Res = tryBuildVectorShuffle(DAG, BVN))
5568     return Res;
5569 
5570   // Detect SCALAR_TO_VECTOR conversions.
5571   if (isOperationLegal(ISD::SCALAR_TO_VECTOR, VT) && isScalarToVector(Op))
5572     return buildScalarToVector(DAG, DL, VT, Op.getOperand(0));
5573 
5574   // Otherwise use buildVector to build the vector up from GPRs.
5575   unsigned NumElements = Op.getNumOperands();
5576   SmallVector<SDValue, SystemZ::VectorBytes> Ops(NumElements);
5577   for (unsigned I = 0; I < NumElements; ++I)
5578     Ops[I] = Op.getOperand(I);
5579   return buildVector(DAG, DL, VT, Ops);
5580 }
5581 
5582 SDValue SystemZTargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
5583                                                    SelectionDAG &DAG) const {
5584   auto *VSN = cast<ShuffleVectorSDNode>(Op.getNode());
5585   SDLoc DL(Op);
5586   EVT VT = Op.getValueType();
5587   unsigned NumElements = VT.getVectorNumElements();
5588 
5589   if (VSN->isSplat()) {
5590     SDValue Op0 = Op.getOperand(0);
5591     unsigned Index = VSN->getSplatIndex();
5592     assert(Index < VT.getVectorNumElements() &&
5593            "Splat index should be defined and in first operand");
5594     // See whether the value we're splatting is directly available as a scalar.
5595     if ((Index == 0 && Op0.getOpcode() == ISD::SCALAR_TO_VECTOR) ||
5596         Op0.getOpcode() == ISD::BUILD_VECTOR)
5597       return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Op0.getOperand(Index));
5598     // Otherwise keep it as a vector-to-vector operation.
5599     return DAG.getNode(SystemZISD::SPLAT, DL, VT, Op.getOperand(0),
5600                        DAG.getTargetConstant(Index, DL, MVT::i32));
5601   }
5602 
5603   GeneralShuffle GS(VT);
5604   for (unsigned I = 0; I < NumElements; ++I) {
5605     int Elt = VSN->getMaskElt(I);
5606     if (Elt < 0)
5607       GS.addUndef();
5608     else if (!GS.add(Op.getOperand(unsigned(Elt) / NumElements),
5609                      unsigned(Elt) % NumElements))
5610       return SDValue();
5611   }
5612   return GS.getNode(DAG, SDLoc(VSN));
5613 }
5614 
5615 SDValue SystemZTargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
5616                                                      SelectionDAG &DAG) const {
5617   SDLoc DL(Op);
5618   // Just insert the scalar into element 0 of an undefined vector.
5619   return DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
5620                      Op.getValueType(), DAG.getUNDEF(Op.getValueType()),
5621                      Op.getOperand(0), DAG.getConstant(0, DL, MVT::i32));
5622 }
5623 
5624 SDValue SystemZTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
5625                                                       SelectionDAG &DAG) const {
5626   // Handle insertions of floating-point values.
5627   SDLoc DL(Op);
5628   SDValue Op0 = Op.getOperand(0);
5629   SDValue Op1 = Op.getOperand(1);
5630   SDValue Op2 = Op.getOperand(2);
5631   EVT VT = Op.getValueType();
5632 
5633   // Insertions into constant indices of a v2f64 can be done using VPDI.
5634   // However, if the inserted value is a bitcast or a constant then it's
5635   // better to use GPRs, as below.
5636   if (VT == MVT::v2f64 &&
5637       Op1.getOpcode() != ISD::BITCAST &&
5638       Op1.getOpcode() != ISD::ConstantFP &&
5639       Op2.getOpcode() == ISD::Constant) {
5640     uint64_t Index = cast<ConstantSDNode>(Op2)->getZExtValue();
5641     unsigned Mask = VT.getVectorNumElements() - 1;
5642     if (Index <= Mask)
5643       return Op;
5644   }
5645 
5646   // Otherwise bitcast to the equivalent integer form and insert via a GPR.
5647   MVT IntVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
5648   MVT IntVecVT = MVT::getVectorVT(IntVT, VT.getVectorNumElements());
5649   SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, IntVecVT,
5650                             DAG.getNode(ISD::BITCAST, DL, IntVecVT, Op0),
5651                             DAG.getNode(ISD::BITCAST, DL, IntVT, Op1), Op2);
5652   return DAG.getNode(ISD::BITCAST, DL, VT, Res);
5653 }
5654 
5655 SDValue
5656 SystemZTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
5657                                                SelectionDAG &DAG) const {
5658   // Handle extractions of floating-point values.
5659   SDLoc DL(Op);
5660   SDValue Op0 = Op.getOperand(0);
5661   SDValue Op1 = Op.getOperand(1);
5662   EVT VT = Op.getValueType();
5663   EVT VecVT = Op0.getValueType();
5664 
5665   // Extractions of constant indices can be done directly.
5666   if (auto *CIndexN = dyn_cast<ConstantSDNode>(Op1)) {
5667     uint64_t Index = CIndexN->getZExtValue();
5668     unsigned Mask = VecVT.getVectorNumElements() - 1;
5669     if (Index <= Mask)
5670       return Op;
5671   }
5672 
5673   // Otherwise bitcast to the equivalent integer form and extract via a GPR.
5674   MVT IntVT = MVT::getIntegerVT(VT.getSizeInBits());
5675   MVT IntVecVT = MVT::getVectorVT(IntVT, VecVT.getVectorNumElements());
5676   SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, IntVT,
5677                             DAG.getNode(ISD::BITCAST, DL, IntVecVT, Op0), Op1);
5678   return DAG.getNode(ISD::BITCAST, DL, VT, Res);
5679 }
5680 
5681 SDValue SystemZTargetLowering::
5682 lowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const {
5683   SDValue PackedOp = Op.getOperand(0);
5684   EVT OutVT = Op.getValueType();
5685   EVT InVT = PackedOp.getValueType();
5686   unsigned ToBits = OutVT.getScalarSizeInBits();
5687   unsigned FromBits = InVT.getScalarSizeInBits();
5688   do {
5689     FromBits *= 2;
5690     EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(FromBits),
5691                                  SystemZ::VectorBits / FromBits);
5692     PackedOp =
5693       DAG.getNode(SystemZISD::UNPACK_HIGH, SDLoc(PackedOp), OutVT, PackedOp);
5694   } while (FromBits != ToBits);
5695   return PackedOp;
5696 }
5697 
5698 // Lower a ZERO_EXTEND_VECTOR_INREG to a vector shuffle with a zero vector.
5699 SDValue SystemZTargetLowering::
5700 lowerZERO_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const {
5701   SDValue PackedOp = Op.getOperand(0);
5702   SDLoc DL(Op);
5703   EVT OutVT = Op.getValueType();
5704   EVT InVT = PackedOp.getValueType();
5705   unsigned InNumElts = InVT.getVectorNumElements();
5706   unsigned OutNumElts = OutVT.getVectorNumElements();
5707   unsigned NumInPerOut = InNumElts / OutNumElts;
5708 
5709   SDValue ZeroVec =
5710     DAG.getSplatVector(InVT, DL, DAG.getConstant(0, DL, InVT.getScalarType()));
5711 
5712   SmallVector<int, 16> Mask(InNumElts);
5713   unsigned ZeroVecElt = InNumElts;
5714   for (unsigned PackedElt = 0; PackedElt < OutNumElts; PackedElt++) {
5715     unsigned MaskElt = PackedElt * NumInPerOut;
5716     unsigned End = MaskElt + NumInPerOut - 1;
5717     for (; MaskElt < End; MaskElt++)
5718       Mask[MaskElt] = ZeroVecElt++;
5719     Mask[MaskElt] = PackedElt;
5720   }
5721   SDValue Shuf = DAG.getVectorShuffle(InVT, DL, PackedOp, ZeroVec, Mask);
5722   return DAG.getNode(ISD::BITCAST, DL, OutVT, Shuf);
5723 }
5724 
5725 SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG,
5726                                           unsigned ByScalar) const {
5727   // Look for cases where a vector shift can use the *_BY_SCALAR form.
5728   SDValue Op0 = Op.getOperand(0);
5729   SDValue Op1 = Op.getOperand(1);
5730   SDLoc DL(Op);
5731   EVT VT = Op.getValueType();
5732   unsigned ElemBitSize = VT.getScalarSizeInBits();
5733 
5734   // See whether the shift vector is a splat represented as BUILD_VECTOR.
5735   if (auto *BVN = dyn_cast<BuildVectorSDNode>(Op1)) {
5736     APInt SplatBits, SplatUndef;
5737     unsigned SplatBitSize;
5738     bool HasAnyUndefs;
5739     // Check for constant splats.  Use ElemBitSize as the minimum element
5740     // width and reject splats that need wider elements.
5741     if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
5742                              ElemBitSize, true) &&
5743         SplatBitSize == ElemBitSize) {
5744       SDValue Shift = DAG.getConstant(SplatBits.getZExtValue() & 0xfff,
5745                                       DL, MVT::i32);
5746       return DAG.getNode(ByScalar, DL, VT, Op0, Shift);
5747     }
5748     // Check for variable splats.
5749     BitVector UndefElements;
5750     SDValue Splat = BVN->getSplatValue(&UndefElements);
5751     if (Splat) {
5752       // Since i32 is the smallest legal type, we either need a no-op
5753       // or a truncation.
5754       SDValue Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Splat);
5755       return DAG.getNode(ByScalar, DL, VT, Op0, Shift);
5756     }
5757   }
5758 
5759   // See whether the shift vector is a splat represented as SHUFFLE_VECTOR,
5760   // and the shift amount is directly available in a GPR.
5761   if (auto *VSN = dyn_cast<ShuffleVectorSDNode>(Op1)) {
5762     if (VSN->isSplat()) {
5763       SDValue VSNOp0 = VSN->getOperand(0);
5764       unsigned Index = VSN->getSplatIndex();
5765       assert(Index < VT.getVectorNumElements() &&
5766              "Splat index should be defined and in first operand");
5767       if ((Index == 0 && VSNOp0.getOpcode() == ISD::SCALAR_TO_VECTOR) ||
5768           VSNOp0.getOpcode() == ISD::BUILD_VECTOR) {
5769         // Since i32 is the smallest legal type, we either need a no-op
5770         // or a truncation.
5771         SDValue Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
5772                                     VSNOp0.getOperand(Index));
5773         return DAG.getNode(ByScalar, DL, VT, Op0, Shift);
5774       }
5775     }
5776   }
5777 
5778   // Otherwise just treat the current form as legal.
5779   return Op;
5780 }
5781 
5782 SDValue SystemZTargetLowering::lowerIS_FPCLASS(SDValue Op,
5783                                                SelectionDAG &DAG) const {
5784   SDLoc DL(Op);
5785   MVT ResultVT = Op.getSimpleValueType();
5786   SDValue Arg = Op.getOperand(0);
5787   auto CNode = cast<ConstantSDNode>(Op.getOperand(1));
5788   unsigned Check = CNode->getZExtValue();
5789 
5790   unsigned TDCMask = 0;
5791   if (Check & fcSNan)
5792     TDCMask |= SystemZ::TDCMASK_SNAN_PLUS | SystemZ::TDCMASK_SNAN_MINUS;
5793   if (Check & fcQNan)
5794     TDCMask |= SystemZ::TDCMASK_QNAN_PLUS | SystemZ::TDCMASK_QNAN_MINUS;
5795   if (Check & fcPosInf)
5796     TDCMask |= SystemZ::TDCMASK_INFINITY_PLUS;
5797   if (Check & fcNegInf)
5798     TDCMask |= SystemZ::TDCMASK_INFINITY_MINUS;
5799   if (Check & fcPosNormal)
5800     TDCMask |= SystemZ::TDCMASK_NORMAL_PLUS;
5801   if (Check & fcNegNormal)
5802     TDCMask |= SystemZ::TDCMASK_NORMAL_MINUS;
5803   if (Check & fcPosSubnormal)
5804     TDCMask |= SystemZ::TDCMASK_SUBNORMAL_PLUS;
5805   if (Check & fcNegSubnormal)
5806     TDCMask |= SystemZ::TDCMASK_SUBNORMAL_MINUS;
5807   if (Check & fcPosZero)
5808     TDCMask |= SystemZ::TDCMASK_ZERO_PLUS;
5809   if (Check & fcNegZero)
5810     TDCMask |= SystemZ::TDCMASK_ZERO_MINUS;
5811   SDValue TDCMaskV = DAG.getConstant(TDCMask, DL, MVT::i64);
5812 
5813   SDValue Intr = DAG.getNode(SystemZISD::TDC, DL, ResultVT, Arg, TDCMaskV);
5814   return getCCResult(DAG, Intr);
5815 }
5816 
5817 SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
5818                                               SelectionDAG &DAG) const {
5819   switch (Op.getOpcode()) {
5820   case ISD::FRAMEADDR:
5821     return lowerFRAMEADDR(Op, DAG);
5822   case ISD::RETURNADDR:
5823     return lowerRETURNADDR(Op, DAG);
5824   case ISD::BR_CC:
5825     return lowerBR_CC(Op, DAG);
5826   case ISD::SELECT_CC:
5827     return lowerSELECT_CC(Op, DAG);
5828   case ISD::SETCC:
5829     return lowerSETCC(Op, DAG);
5830   case ISD::STRICT_FSETCC:
5831     return lowerSTRICT_FSETCC(Op, DAG, false);
5832   case ISD::STRICT_FSETCCS:
5833     return lowerSTRICT_FSETCC(Op, DAG, true);
5834   case ISD::GlobalAddress:
5835     return lowerGlobalAddress(cast<GlobalAddressSDNode>(Op), DAG);
5836   case ISD::GlobalTLSAddress:
5837     return lowerGlobalTLSAddress(cast<GlobalAddressSDNode>(Op), DAG);
5838   case ISD::BlockAddress:
5839     return lowerBlockAddress(cast<BlockAddressSDNode>(Op), DAG);
5840   case ISD::JumpTable:
5841     return lowerJumpTable(cast<JumpTableSDNode>(Op), DAG);
5842   case ISD::ConstantPool:
5843     return lowerConstantPool(cast<ConstantPoolSDNode>(Op), DAG);
5844   case ISD::BITCAST:
5845     return lowerBITCAST(Op, DAG);
5846   case ISD::VASTART:
5847     return lowerVASTART(Op, DAG);
5848   case ISD::VACOPY:
5849     return lowerVACOPY(Op, DAG);
5850   case ISD::DYNAMIC_STACKALLOC:
5851     return lowerDYNAMIC_STACKALLOC(Op, DAG);
5852   case ISD::GET_DYNAMIC_AREA_OFFSET:
5853     return lowerGET_DYNAMIC_AREA_OFFSET(Op, DAG);
5854   case ISD::SMUL_LOHI:
5855     return lowerSMUL_LOHI(Op, DAG);
5856   case ISD::UMUL_LOHI:
5857     return lowerUMUL_LOHI(Op, DAG);
5858   case ISD::SDIVREM:
5859     return lowerSDIVREM(Op, DAG);
5860   case ISD::UDIVREM:
5861     return lowerUDIVREM(Op, DAG);
5862   case ISD::SADDO:
5863   case ISD::SSUBO:
5864   case ISD::UADDO:
5865   case ISD::USUBO:
5866     return lowerXALUO(Op, DAG);
5867   case ISD::UADDO_CARRY:
5868   case ISD::USUBO_CARRY:
5869     return lowerUADDSUBO_CARRY(Op, DAG);
5870   case ISD::OR:
5871     return lowerOR(Op, DAG);
5872   case ISD::CTPOP:
5873     return lowerCTPOP(Op, DAG);
5874   case ISD::ATOMIC_FENCE:
5875     return lowerATOMIC_FENCE(Op, DAG);
5876   case ISD::ATOMIC_SWAP:
5877     return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_SWAPW);
5878   case ISD::ATOMIC_STORE:
5879     return lowerATOMIC_STORE(Op, DAG);
5880   case ISD::ATOMIC_LOAD:
5881     return lowerATOMIC_LOAD(Op, DAG);
5882   case ISD::ATOMIC_LOAD_ADD:
5883     return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_ADD);
5884   case ISD::ATOMIC_LOAD_SUB:
5885     return lowerATOMIC_LOAD_SUB(Op, DAG);
5886   case ISD::ATOMIC_LOAD_AND:
5887     return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_AND);
5888   case ISD::ATOMIC_LOAD_OR:
5889     return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_OR);
5890   case ISD::ATOMIC_LOAD_XOR:
5891     return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_XOR);
5892   case ISD::ATOMIC_LOAD_NAND:
5893     return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_NAND);
5894   case ISD::ATOMIC_LOAD_MIN:
5895     return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_MIN);
5896   case ISD::ATOMIC_LOAD_MAX:
5897     return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_MAX);
5898   case ISD::ATOMIC_LOAD_UMIN:
5899     return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_UMIN);
5900   case ISD::ATOMIC_LOAD_UMAX:
5901     return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_UMAX);
5902   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
5903     return lowerATOMIC_CMP_SWAP(Op, DAG);
5904   case ISD::STACKSAVE:
5905     return lowerSTACKSAVE(Op, DAG);
5906   case ISD::STACKRESTORE:
5907     return lowerSTACKRESTORE(Op, DAG);
5908   case ISD::PREFETCH:
5909     return lowerPREFETCH(Op, DAG);
5910   case ISD::INTRINSIC_W_CHAIN:
5911     return lowerINTRINSIC_W_CHAIN(Op, DAG);
5912   case ISD::INTRINSIC_WO_CHAIN:
5913     return lowerINTRINSIC_WO_CHAIN(Op, DAG);
5914   case ISD::BUILD_VECTOR:
5915     return lowerBUILD_VECTOR(Op, DAG);
5916   case ISD::VECTOR_SHUFFLE:
5917     return lowerVECTOR_SHUFFLE(Op, DAG);
5918   case ISD::SCALAR_TO_VECTOR:
5919     return lowerSCALAR_TO_VECTOR(Op, DAG);
5920   case ISD::INSERT_VECTOR_ELT:
5921     return lowerINSERT_VECTOR_ELT(Op, DAG);
5922   case ISD::EXTRACT_VECTOR_ELT:
5923     return lowerEXTRACT_VECTOR_ELT(Op, DAG);
5924   case ISD::SIGN_EXTEND_VECTOR_INREG:
5925     return lowerSIGN_EXTEND_VECTOR_INREG(Op, DAG);
5926   case ISD::ZERO_EXTEND_VECTOR_INREG:
5927     return lowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
5928   case ISD::SHL:
5929     return lowerShift(Op, DAG, SystemZISD::VSHL_BY_SCALAR);
5930   case ISD::SRL:
5931     return lowerShift(Op, DAG, SystemZISD::VSRL_BY_SCALAR);
5932   case ISD::SRA:
5933     return lowerShift(Op, DAG, SystemZISD::VSRA_BY_SCALAR);
5934   case ISD::IS_FPCLASS:
5935     return lowerIS_FPCLASS(Op, DAG);
5936   case ISD::GET_ROUNDING:
5937     return lowerGET_ROUNDING(Op, DAG);
5938   default:
5939     llvm_unreachable("Unexpected node to lower");
5940   }
5941 }
5942 
5943 // Lower operations with invalid operand or result types (currently used
5944 // only for 128-bit integer types).
5945 void
5946 SystemZTargetLowering::LowerOperationWrapper(SDNode *N,
5947                                              SmallVectorImpl<SDValue> &Results,
5948                                              SelectionDAG &DAG) const {
5949   switch (N->getOpcode()) {
5950   case ISD::ATOMIC_LOAD: {
5951     SDLoc DL(N);
5952     SDVTList Tys = DAG.getVTList(MVT::Untyped, MVT::Other);
5953     SDValue Ops[] = { N->getOperand(0), N->getOperand(1) };
5954     MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
5955     SDValue Res = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_LOAD_128,
5956                                           DL, Tys, Ops, MVT::i128, MMO);
5957     Results.push_back(lowerGR128ToI128(DAG, Res));
5958     Results.push_back(Res.getValue(1));
5959     break;
5960   }
5961   case ISD::ATOMIC_STORE: {
5962     SDLoc DL(N);
5963     SDVTList Tys = DAG.getVTList(MVT::Other);
5964     SDValue Ops[] = { N->getOperand(0),
5965                       lowerI128ToGR128(DAG, N->getOperand(2)),
5966                       N->getOperand(1) };
5967     MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
5968     SDValue Res = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_STORE_128,
5969                                           DL, Tys, Ops, MVT::i128, MMO);
5970     // We have to enforce sequential consistency by performing a
5971     // serialization operation after the store.
5972     if (cast<AtomicSDNode>(N)->getSuccessOrdering() ==
5973         AtomicOrdering::SequentiallyConsistent)
5974       Res = SDValue(DAG.getMachineNode(SystemZ::Serialize, DL,
5975                                        MVT::Other, Res), 0);
5976     Results.push_back(Res);
5977     break;
5978   }
5979   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
5980     SDLoc DL(N);
5981     SDVTList Tys = DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other);
5982     SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
5983                       lowerI128ToGR128(DAG, N->getOperand(2)),
5984                       lowerI128ToGR128(DAG, N->getOperand(3)) };
5985     MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
5986     SDValue Res = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_CMP_SWAP_128,
5987                                           DL, Tys, Ops, MVT::i128, MMO);
5988     SDValue Success = emitSETCC(DAG, DL, Res.getValue(1),
5989                                 SystemZ::CCMASK_CS, SystemZ::CCMASK_CS_EQ);
5990     Success = DAG.getZExtOrTrunc(Success, DL, N->getValueType(1));
5991     Results.push_back(lowerGR128ToI128(DAG, Res));
5992     Results.push_back(Success);
5993     Results.push_back(Res.getValue(2));
5994     break;
5995   }
5996   case ISD::BITCAST: {
5997     SDValue Src = N->getOperand(0);
5998     if (N->getValueType(0) == MVT::i128 && Src.getValueType() == MVT::f128 &&
5999         !useSoftFloat()) {
6000       SDLoc DL(N);
6001       SDValue Lo, Hi;
6002       if (getRepRegClassFor(MVT::f128) == &SystemZ::VR128BitRegClass) {
6003         SDValue VecBC = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Src);
6004         Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, VecBC,
6005                          DAG.getConstant(1, DL, MVT::i32));
6006         Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, VecBC,
6007                          DAG.getConstant(0, DL, MVT::i32));
6008       } else {
6009         assert(getRepRegClassFor(MVT::f128) == &SystemZ::FP128BitRegClass &&
6010                "Unrecognized register class for f128.");
6011         SDValue LoFP = DAG.getTargetExtractSubreg(SystemZ::subreg_l64,
6012                                                   DL, MVT::f64, Src);
6013         SDValue HiFP = DAG.getTargetExtractSubreg(SystemZ::subreg_h64,
6014                                                   DL, MVT::f64, Src);
6015         Lo = DAG.getNode(ISD::BITCAST, DL, MVT::i64, LoFP);
6016         Hi = DAG.getNode(ISD::BITCAST, DL, MVT::i64, HiFP);
6017       }
6018       Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, Lo, Hi));
6019     }
6020     break;
6021   }
6022   default:
6023     llvm_unreachable("Unexpected node to lower");
6024   }
6025 }
6026 
6027 void
6028 SystemZTargetLowering::ReplaceNodeResults(SDNode *N,
6029                                           SmallVectorImpl<SDValue> &Results,
6030                                           SelectionDAG &DAG) const {
6031   return LowerOperationWrapper(N, Results, DAG);
6032 }
6033 
6034 const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
6035 #define OPCODE(NAME) case SystemZISD::NAME: return "SystemZISD::" #NAME
6036   switch ((SystemZISD::NodeType)Opcode) {
6037     case SystemZISD::FIRST_NUMBER: break;
6038     OPCODE(RET_GLUE);
6039     OPCODE(CALL);
6040     OPCODE(SIBCALL);
6041     OPCODE(TLS_GDCALL);
6042     OPCODE(TLS_LDCALL);
6043     OPCODE(PCREL_WRAPPER);
6044     OPCODE(PCREL_OFFSET);
6045     OPCODE(ICMP);
6046     OPCODE(FCMP);
6047     OPCODE(STRICT_FCMP);
6048     OPCODE(STRICT_FCMPS);
6049     OPCODE(TM);
6050     OPCODE(BR_CCMASK);
6051     OPCODE(SELECT_CCMASK);
6052     OPCODE(ADJDYNALLOC);
6053     OPCODE(PROBED_ALLOCA);
6054     OPCODE(POPCNT);
6055     OPCODE(SMUL_LOHI);
6056     OPCODE(UMUL_LOHI);
6057     OPCODE(SDIVREM);
6058     OPCODE(UDIVREM);
6059     OPCODE(SADDO);
6060     OPCODE(SSUBO);
6061     OPCODE(UADDO);
6062     OPCODE(USUBO);
6063     OPCODE(ADDCARRY);
6064     OPCODE(SUBCARRY);
6065     OPCODE(GET_CCMASK);
6066     OPCODE(MVC);
6067     OPCODE(NC);
6068     OPCODE(OC);
6069     OPCODE(XC);
6070     OPCODE(CLC);
6071     OPCODE(MEMSET_MVC);
6072     OPCODE(STPCPY);
6073     OPCODE(STRCMP);
6074     OPCODE(SEARCH_STRING);
6075     OPCODE(IPM);
6076     OPCODE(TBEGIN);
6077     OPCODE(TBEGIN_NOFLOAT);
6078     OPCODE(TEND);
6079     OPCODE(BYTE_MASK);
6080     OPCODE(ROTATE_MASK);
6081     OPCODE(REPLICATE);
6082     OPCODE(JOIN_DWORDS);
6083     OPCODE(SPLAT);
6084     OPCODE(MERGE_HIGH);
6085     OPCODE(MERGE_LOW);
6086     OPCODE(SHL_DOUBLE);
6087     OPCODE(PERMUTE_DWORDS);
6088     OPCODE(PERMUTE);
6089     OPCODE(PACK);
6090     OPCODE(PACKS_CC);
6091     OPCODE(PACKLS_CC);
6092     OPCODE(UNPACK_HIGH);
6093     OPCODE(UNPACKL_HIGH);
6094     OPCODE(UNPACK_LOW);
6095     OPCODE(UNPACKL_LOW);
6096     OPCODE(VSHL_BY_SCALAR);
6097     OPCODE(VSRL_BY_SCALAR);
6098     OPCODE(VSRA_BY_SCALAR);
6099     OPCODE(VSUM);
6100     OPCODE(VICMPE);
6101     OPCODE(VICMPH);
6102     OPCODE(VICMPHL);
6103     OPCODE(VICMPES);
6104     OPCODE(VICMPHS);
6105     OPCODE(VICMPHLS);
6106     OPCODE(VFCMPE);
6107     OPCODE(STRICT_VFCMPE);
6108     OPCODE(STRICT_VFCMPES);
6109     OPCODE(VFCMPH);
6110     OPCODE(STRICT_VFCMPH);
6111     OPCODE(STRICT_VFCMPHS);
6112     OPCODE(VFCMPHE);
6113     OPCODE(STRICT_VFCMPHE);
6114     OPCODE(STRICT_VFCMPHES);
6115     OPCODE(VFCMPES);
6116     OPCODE(VFCMPHS);
6117     OPCODE(VFCMPHES);
6118     OPCODE(VFTCI);
6119     OPCODE(VEXTEND);
6120     OPCODE(STRICT_VEXTEND);
6121     OPCODE(VROUND);
6122     OPCODE(STRICT_VROUND);
6123     OPCODE(VTM);
6124     OPCODE(VFAE_CC);
6125     OPCODE(VFAEZ_CC);
6126     OPCODE(VFEE_CC);
6127     OPCODE(VFEEZ_CC);
6128     OPCODE(VFENE_CC);
6129     OPCODE(VFENEZ_CC);
6130     OPCODE(VISTR_CC);
6131     OPCODE(VSTRC_CC);
6132     OPCODE(VSTRCZ_CC);
6133     OPCODE(VSTRS_CC);
6134     OPCODE(VSTRSZ_CC);
6135     OPCODE(TDC);
6136     OPCODE(ATOMIC_SWAPW);
6137     OPCODE(ATOMIC_LOADW_ADD);
6138     OPCODE(ATOMIC_LOADW_SUB);
6139     OPCODE(ATOMIC_LOADW_AND);
6140     OPCODE(ATOMIC_LOADW_OR);
6141     OPCODE(ATOMIC_LOADW_XOR);
6142     OPCODE(ATOMIC_LOADW_NAND);
6143     OPCODE(ATOMIC_LOADW_MIN);
6144     OPCODE(ATOMIC_LOADW_MAX);
6145     OPCODE(ATOMIC_LOADW_UMIN);
6146     OPCODE(ATOMIC_LOADW_UMAX);
6147     OPCODE(ATOMIC_CMP_SWAPW);
6148     OPCODE(ATOMIC_CMP_SWAP);
6149     OPCODE(ATOMIC_LOAD_128);
6150     OPCODE(ATOMIC_STORE_128);
6151     OPCODE(ATOMIC_CMP_SWAP_128);
6152     OPCODE(LRV);
6153     OPCODE(STRV);
6154     OPCODE(VLER);
6155     OPCODE(VSTER);
6156     OPCODE(PREFETCH);
6157     OPCODE(ADA_ENTRY);
6158   }
6159   return nullptr;
6160 #undef OPCODE
6161 }
6162 
6163 // Return true if VT is a vector whose elements are a whole number of bytes
6164 // in width. Also check for presence of vector support.
6165 bool SystemZTargetLowering::canTreatAsByteVector(EVT VT) const {
6166   if (!Subtarget.hasVector())
6167     return false;
6168 
6169   return VT.isVector() && VT.getScalarSizeInBits() % 8 == 0 && VT.isSimple();
6170 }
6171 
6172 // Try to simplify an EXTRACT_VECTOR_ELT from a vector of type VecVT
6173 // producing a result of type ResVT.  Op is a possibly bitcast version
6174 // of the input vector and Index is the index (based on type VecVT) that
6175 // should be extracted.  Return the new extraction if a simplification
6176 // was possible or if Force is true.
6177 SDValue SystemZTargetLowering::combineExtract(const SDLoc &DL, EVT ResVT,
6178                                               EVT VecVT, SDValue Op,
6179                                               unsigned Index,
6180                                               DAGCombinerInfo &DCI,
6181                                               bool Force) const {
6182   SelectionDAG &DAG = DCI.DAG;
6183 
6184   // The number of bytes being extracted.
6185   unsigned BytesPerElement = VecVT.getVectorElementType().getStoreSize();
6186 
6187   for (;;) {
6188     unsigned Opcode = Op.getOpcode();
6189     if (Opcode == ISD::BITCAST)
6190       // Look through bitcasts.
6191       Op = Op.getOperand(0);
6192     else if ((Opcode == ISD::VECTOR_SHUFFLE || Opcode == SystemZISD::SPLAT) &&
6193              canTreatAsByteVector(Op.getValueType())) {
6194       // Get a VPERM-like permute mask and see whether the bytes covered
6195       // by the extracted element are a contiguous sequence from one
6196       // source operand.
6197       SmallVector<int, SystemZ::VectorBytes> Bytes;
6198       if (!getVPermMask(Op, Bytes))
6199         break;
6200       int First;
6201       if (!getShuffleInput(Bytes, Index * BytesPerElement,
6202                            BytesPerElement, First))
6203         break;
6204       if (First < 0)
6205         return DAG.getUNDEF(ResVT);
6206       // Make sure the contiguous sequence starts at a multiple of the
6207       // original element size.
6208       unsigned Byte = unsigned(First) % Bytes.size();
6209       if (Byte % BytesPerElement != 0)
6210         break;
6211       // We can get the extracted value directly from an input.
6212       Index = Byte / BytesPerElement;
6213       Op = Op.getOperand(unsigned(First) / Bytes.size());
6214       Force = true;
6215     } else if (Opcode == ISD::BUILD_VECTOR &&
6216                canTreatAsByteVector(Op.getValueType())) {
6217       // We can only optimize this case if the BUILD_VECTOR elements are
6218       // at least as wide as the extracted value.
6219       EVT OpVT = Op.getValueType();
6220       unsigned OpBytesPerElement = OpVT.getVectorElementType().getStoreSize();
6221       if (OpBytesPerElement < BytesPerElement)
6222         break;
6223       // Make sure that the least-significant bit of the extracted value
6224       // is the least significant bit of an input.
6225       unsigned End = (Index + 1) * BytesPerElement;
6226       if (End % OpBytesPerElement != 0)
6227         break;
6228       // We're extracting the low part of one operand of the BUILD_VECTOR.
6229       Op = Op.getOperand(End / OpBytesPerElement - 1);
6230       if (!Op.getValueType().isInteger()) {
6231         EVT VT = MVT::getIntegerVT(Op.getValueSizeInBits());
6232         Op = DAG.getNode(ISD::BITCAST, DL, VT, Op);
6233         DCI.AddToWorklist(Op.getNode());
6234       }
6235       EVT VT = MVT::getIntegerVT(ResVT.getSizeInBits());
6236       Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
6237       if (VT != ResVT) {
6238         DCI.AddToWorklist(Op.getNode());
6239         Op = DAG.getNode(ISD::BITCAST, DL, ResVT, Op);
6240       }
6241       return Op;
6242     } else if ((Opcode == ISD::SIGN_EXTEND_VECTOR_INREG ||
6243                 Opcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
6244                 Opcode == ISD::ANY_EXTEND_VECTOR_INREG) &&
6245                canTreatAsByteVector(Op.getValueType()) &&
6246                canTreatAsByteVector(Op.getOperand(0).getValueType())) {
6247       // Make sure that only the unextended bits are significant.
6248       EVT ExtVT = Op.getValueType();
6249       EVT OpVT = Op.getOperand(0).getValueType();
6250       unsigned ExtBytesPerElement = ExtVT.getVectorElementType().getStoreSize();
6251       unsigned OpBytesPerElement = OpVT.getVectorElementType().getStoreSize();
6252       unsigned Byte = Index * BytesPerElement;
6253       unsigned SubByte = Byte % ExtBytesPerElement;
6254       unsigned MinSubByte = ExtBytesPerElement - OpBytesPerElement;
6255       if (SubByte < MinSubByte ||
6256           SubByte + BytesPerElement > ExtBytesPerElement)
6257         break;
6258       // Get the byte offset of the unextended element
6259       Byte = Byte / ExtBytesPerElement * OpBytesPerElement;
6260       // ...then add the byte offset relative to that element.
6261       Byte += SubByte - MinSubByte;
6262       if (Byte % BytesPerElement != 0)
6263         break;
6264       Op = Op.getOperand(0);
6265       Index = Byte / BytesPerElement;
6266       Force = true;
6267     } else
6268       break;
6269   }
6270   if (Force) {
6271     if (Op.getValueType() != VecVT) {
6272       Op = DAG.getNode(ISD::BITCAST, DL, VecVT, Op);
6273       DCI.AddToWorklist(Op.getNode());
6274     }
6275     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Op,
6276                        DAG.getConstant(Index, DL, MVT::i32));
6277   }
6278   return SDValue();
6279 }
6280 
6281 // Optimize vector operations in scalar value Op on the basis that Op
6282 // is truncated to TruncVT.
6283 SDValue SystemZTargetLowering::combineTruncateExtract(
6284     const SDLoc &DL, EVT TruncVT, SDValue Op, DAGCombinerInfo &DCI) const {
6285   // If we have (trunc (extract_vector_elt X, Y)), try to turn it into
6286   // (extract_vector_elt (bitcast X), Y'), where (bitcast X) has elements
6287   // of type TruncVT.
6288   if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6289       TruncVT.getSizeInBits() % 8 == 0) {
6290     SDValue Vec = Op.getOperand(0);
6291     EVT VecVT = Vec.getValueType();
6292     if (canTreatAsByteVector(VecVT)) {
6293       if (auto *IndexN = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
6294         unsigned BytesPerElement = VecVT.getVectorElementType().getStoreSize();
6295         unsigned TruncBytes = TruncVT.getStoreSize();
6296         if (BytesPerElement % TruncBytes == 0) {
6297           // Calculate the value of Y' in the above description.  We are
6298           // splitting the original elements into Scale equal-sized pieces
6299           // and for truncation purposes want the last (least-significant)
6300           // of these pieces for IndexN.  This is easiest to do by calculating
6301           // the start index of the following element and then subtracting 1.
6302           unsigned Scale = BytesPerElement / TruncBytes;
6303           unsigned NewIndex = (IndexN->getZExtValue() + 1) * Scale - 1;
6304 
6305           // Defer the creation of the bitcast from X to combineExtract,
6306           // which might be able to optimize the extraction.
6307           VecVT = MVT::getVectorVT(MVT::getIntegerVT(TruncBytes * 8),
6308                                    VecVT.getStoreSize() / TruncBytes);
6309           EVT ResVT = (TruncBytes < 4 ? MVT::i32 : TruncVT);
6310           return combineExtract(DL, ResVT, VecVT, Vec, NewIndex, DCI, true);
6311         }
6312       }
6313     }
6314   }
6315   return SDValue();
6316 }
6317 
6318 SDValue SystemZTargetLowering::combineZERO_EXTEND(
6319     SDNode *N, DAGCombinerInfo &DCI) const {
6320   // Convert (zext (select_ccmask C1, C2)) into (select_ccmask C1', C2')
6321   SelectionDAG &DAG = DCI.DAG;
6322   SDValue N0 = N->getOperand(0);
6323   EVT VT = N->getValueType(0);
6324   if (N0.getOpcode() == SystemZISD::SELECT_CCMASK) {
6325     auto *TrueOp = dyn_cast<ConstantSDNode>(N0.getOperand(0));
6326     auto *FalseOp = dyn_cast<ConstantSDNode>(N0.getOperand(1));
6327     if (TrueOp && FalseOp) {
6328       SDLoc DL(N0);
6329       SDValue Ops[] = { DAG.getConstant(TrueOp->getZExtValue(), DL, VT),
6330                         DAG.getConstant(FalseOp->getZExtValue(), DL, VT),
6331                         N0.getOperand(2), N0.getOperand(3), N0.getOperand(4) };
6332       SDValue NewSelect = DAG.getNode(SystemZISD::SELECT_CCMASK, DL, VT, Ops);
6333       // If N0 has multiple uses, change other uses as well.
6334       if (!N0.hasOneUse()) {
6335         SDValue TruncSelect =
6336           DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), NewSelect);
6337         DCI.CombineTo(N0.getNode(), TruncSelect);
6338       }
6339       return NewSelect;
6340     }
6341   }
6342   return SDValue();
6343 }
6344 
6345 SDValue SystemZTargetLowering::combineSIGN_EXTEND_INREG(
6346     SDNode *N, DAGCombinerInfo &DCI) const {
6347   // Convert (sext_in_reg (setcc LHS, RHS, COND), i1)
6348   // and (sext_in_reg (any_extend (setcc LHS, RHS, COND)), i1)
6349   // into (select_cc LHS, RHS, -1, 0, COND)
6350   SelectionDAG &DAG = DCI.DAG;
6351   SDValue N0 = N->getOperand(0);
6352   EVT VT = N->getValueType(0);
6353   EVT EVT = cast<VTSDNode>(N->getOperand(1))->getVT();
6354   if (N0.hasOneUse() && N0.getOpcode() == ISD::ANY_EXTEND)
6355     N0 = N0.getOperand(0);
6356   if (EVT == MVT::i1 && N0.hasOneUse() && N0.getOpcode() == ISD::SETCC) {
6357     SDLoc DL(N0);
6358     SDValue Ops[] = { N0.getOperand(0), N0.getOperand(1),
6359                       DAG.getConstant(-1, DL, VT), DAG.getConstant(0, DL, VT),
6360                       N0.getOperand(2) };
6361     return DAG.getNode(ISD::SELECT_CC, DL, VT, Ops);
6362   }
6363   return SDValue();
6364 }
6365 
6366 SDValue SystemZTargetLowering::combineSIGN_EXTEND(
6367     SDNode *N, DAGCombinerInfo &DCI) const {
6368   // Convert (sext (ashr (shl X, C1), C2)) to
6369   // (ashr (shl (anyext X), C1'), C2')), since wider shifts are as
6370   // cheap as narrower ones.
6371   SelectionDAG &DAG = DCI.DAG;
6372   SDValue N0 = N->getOperand(0);
6373   EVT VT = N->getValueType(0);
6374   if (N0.hasOneUse() && N0.getOpcode() == ISD::SRA) {
6375     auto *SraAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1));
6376     SDValue Inner = N0.getOperand(0);
6377     if (SraAmt && Inner.hasOneUse() && Inner.getOpcode() == ISD::SHL) {
6378       if (auto *ShlAmt = dyn_cast<ConstantSDNode>(Inner.getOperand(1))) {
6379         unsigned Extra = (VT.getSizeInBits() - N0.getValueSizeInBits());
6380         unsigned NewShlAmt = ShlAmt->getZExtValue() + Extra;
6381         unsigned NewSraAmt = SraAmt->getZExtValue() + Extra;
6382         EVT ShiftVT = N0.getOperand(1).getValueType();
6383         SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SDLoc(Inner), VT,
6384                                   Inner.getOperand(0));
6385         SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(Inner), VT, Ext,
6386                                   DAG.getConstant(NewShlAmt, SDLoc(Inner),
6387                                                   ShiftVT));
6388         return DAG.getNode(ISD::SRA, SDLoc(N0), VT, Shl,
6389                            DAG.getConstant(NewSraAmt, SDLoc(N0), ShiftVT));
6390       }
6391     }
6392   }
6393   return SDValue();
6394 }
6395 
6396 SDValue SystemZTargetLowering::combineMERGE(
6397     SDNode *N, DAGCombinerInfo &DCI) const {
6398   SelectionDAG &DAG = DCI.DAG;
6399   unsigned Opcode = N->getOpcode();
6400   SDValue Op0 = N->getOperand(0);
6401   SDValue Op1 = N->getOperand(1);
6402   if (Op0.getOpcode() == ISD::BITCAST)
6403     Op0 = Op0.getOperand(0);
6404   if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
6405     // (z_merge_* 0, 0) -> 0.  This is mostly useful for using VLLEZF
6406     // for v4f32.
6407     if (Op1 == N->getOperand(0))
6408       return Op1;
6409     // (z_merge_? 0, X) -> (z_unpackl_? 0, X).
6410     EVT VT = Op1.getValueType();
6411     unsigned ElemBytes = VT.getVectorElementType().getStoreSize();
6412     if (ElemBytes <= 4) {
6413       Opcode = (Opcode == SystemZISD::MERGE_HIGH ?
6414                 SystemZISD::UNPACKL_HIGH : SystemZISD::UNPACKL_LOW);
6415       EVT InVT = VT.changeVectorElementTypeToInteger();
6416       EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(ElemBytes * 16),
6417                                    SystemZ::VectorBytes / ElemBytes / 2);
6418       if (VT != InVT) {
6419         Op1 = DAG.getNode(ISD::BITCAST, SDLoc(N), InVT, Op1);
6420         DCI.AddToWorklist(Op1.getNode());
6421       }
6422       SDValue Op = DAG.getNode(Opcode, SDLoc(N), OutVT, Op1);
6423       DCI.AddToWorklist(Op.getNode());
6424       return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
6425     }
6426   }
6427   return SDValue();
6428 }
6429 
6430 SDValue SystemZTargetLowering::combineLOAD(
6431     SDNode *N, DAGCombinerInfo &DCI) const {
6432   SelectionDAG &DAG = DCI.DAG;
6433   EVT LdVT = N->getValueType(0);
6434   if (LdVT.isVector() || LdVT.isInteger())
6435     return SDValue();
6436   // Transform a scalar load that is REPLICATEd as well as having other
6437   // use(s) to the form where the other use(s) use the first element of the
6438   // REPLICATE instead of the load. Otherwise instruction selection will not
6439   // produce a VLREP. Avoid extracting to a GPR, so only do this for floating
6440   // point loads.
6441 
6442   SDValue Replicate;
6443   SmallVector<SDNode*, 8> OtherUses;
6444   for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
6445        UI != UE; ++UI) {
6446     if (UI->getOpcode() == SystemZISD::REPLICATE) {
6447       if (Replicate)
6448         return SDValue(); // Should never happen
6449       Replicate = SDValue(*UI, 0);
6450     }
6451     else if (UI.getUse().getResNo() == 0)
6452       OtherUses.push_back(*UI);
6453   }
6454   if (!Replicate || OtherUses.empty())
6455     return SDValue();
6456 
6457   SDLoc DL(N);
6458   SDValue Extract0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, LdVT,
6459                               Replicate, DAG.getConstant(0, DL, MVT::i32));
6460   // Update uses of the loaded Value while preserving old chains.
6461   for (SDNode *U : OtherUses) {
6462     SmallVector<SDValue, 8> Ops;
6463     for (SDValue Op : U->ops())
6464       Ops.push_back((Op.getNode() == N && Op.getResNo() == 0) ? Extract0 : Op);
6465     DAG.UpdateNodeOperands(U, Ops);
6466   }
6467   return SDValue(N, 0);
6468 }
6469 
6470 bool SystemZTargetLowering::canLoadStoreByteSwapped(EVT VT) const {
6471   if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64)
6472     return true;
6473   if (Subtarget.hasVectorEnhancements2())
6474     if (VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v2i64)
6475       return true;
6476   return false;
6477 }
6478 
6479 static bool isVectorElementSwap(ArrayRef<int> M, EVT VT) {
6480   if (!VT.isVector() || !VT.isSimple() ||
6481       VT.getSizeInBits() != 128 ||
6482       VT.getScalarSizeInBits() % 8 != 0)
6483     return false;
6484 
6485   unsigned NumElts = VT.getVectorNumElements();
6486   for (unsigned i = 0; i < NumElts; ++i) {
6487     if (M[i] < 0) continue; // ignore UNDEF indices
6488     if ((unsigned) M[i] != NumElts - 1 - i)
6489       return false;
6490   }
6491 
6492   return true;
6493 }
6494 
6495 static bool isOnlyUsedByStores(SDValue StoredVal, SelectionDAG &DAG) {
6496   for (auto *U : StoredVal->uses()) {
6497     if (StoreSDNode *ST = dyn_cast<StoreSDNode>(U)) {
6498       EVT CurrMemVT = ST->getMemoryVT().getScalarType();
6499       if (CurrMemVT.isRound() && CurrMemVT.getStoreSize() <= 16)
6500         continue;
6501     } else if (isa<BuildVectorSDNode>(U)) {
6502       SDValue BuildVector = SDValue(U, 0);
6503       if (DAG.isSplatValue(BuildVector, true/*AllowUndefs*/) &&
6504           isOnlyUsedByStores(BuildVector, DAG))
6505         continue;
6506     }
6507     return false;
6508   }
6509   return true;
6510 }
6511 
// Target-specific DAG combine for ISD::STORE nodes.  The following rewrites
// are attempted in order, and the first one that applies is returned:
//   1. A truncating integer store whose value combineTruncateExtract() can
//      rewrite is re-emitted as a truncating store of the rewritten value.
//   2. A non-truncating store of a single-use BSWAP becomes a SystemZISD::STRV
//      memory intrinsic node.
//   3. A non-truncating store of a single-use element-reversing shuffle becomes
//      a SystemZISD::VSTER memory intrinsic node (needs vector enhancements 2).
//   4. Before type legalization, a stored value that is a replicated immediate
//      or a replicated register (and is only used by stores) is stored as a
//      splat vector instead.
// Returns an empty SDValue if none of the above applies.
SDValue SystemZTargetLowering::combineSTORE(
    SDNode *N, DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  auto *SN = cast<StoreSDNode>(N);
  // Operand 1 of the store; treated below as the value being stored.
  auto &Op1 = N->getOperand(1);
  EVT MemVT = SN->getMemoryVT();
  // If we have (truncstoreiN (extract_vector_elt X, Y), Z) then it is better
  // for the extraction to be done on a vMiN value, so that we can use VSTE.
  // If X has wider elements then convert it to:
  // (truncstoreiN (extract_vector_elt (bitcast X), Y2), Z).
  if (MemVT.isInteger() && SN->isTruncatingStore()) {
    if (SDValue Value =
            combineTruncateExtract(SDLoc(N), MemVT, SN->getValue(), DCI)) {
      DCI.AddToWorklist(Value.getNode());

      // Rewrite the store with the new form of stored value.
      return DAG.getTruncStore(SN->getChain(), SDLoc(SN), Value,
                               SN->getBasePtr(), SN->getMemoryVT(),
                               SN->getMemOperand());
    }
  }
  // Combine STORE (BSWAP) into STRVH/STRV/STRVG/VSTBR
  if (!SN->isTruncatingStore() &&
      Op1.getOpcode() == ISD::BSWAP &&
      Op1.getNode()->hasOneUse() &&
      canLoadStoreByteSwapped(Op1.getValueType())) {

      // The un-swapped value is what the STRV node takes as its operand.
      SDValue BSwapOp = Op1.getOperand(0);

      // An i16 source is any-extended to i32 first; the memory type passed
      // to the STRV node below is still MemVT.
      if (BSwapOp.getValueType() == MVT::i16)
        BSwapOp = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), MVT::i32, BSwapOp);

      // Same operand layout as the original store, with the swap removed.
      SDValue Ops[] = {
        N->getOperand(0), BSwapOp, N->getOperand(2)
      };

      return
        DAG.getMemIntrinsicNode(SystemZISD::STRV, SDLoc(N), DAG.getVTList(MVT::Other),
                                Ops, MemVT, SN->getMemOperand());
    }
  // Combine STORE (element-swap) into VSTER
  if (!SN->isTruncatingStore() &&
      Op1.getOpcode() == ISD::VECTOR_SHUFFLE &&
      Op1.getNode()->hasOneUse() &&
      Subtarget.hasVectorEnhancements2()) {
    ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op1.getNode());
    ArrayRef<int> ShuffleMask = SVN->getMask();
    if (isVectorElementSwap(ShuffleMask, Op1.getValueType())) {
      // Store the shuffle's first input directly; only operand 0 of the
      // shuffle is used here.
      SDValue Ops[] = {
        N->getOperand(0), Op1.getOperand(0), N->getOperand(2)
      };

      return DAG.getMemIntrinsicNode(SystemZISD::VSTER, SDLoc(N),
                                     DAG.getVTList(MVT::Other),
                                     Ops, MemVT, SN->getMemOperand());
    }
  }

  // Replicate a reg or immediate with VREP instead of scalar multiply or
  // immediate load. It seems best to do this during the first DAGCombine as
  // it is straight-forward to handle the zero-extend node in the initial
  // DAG, and also not worry about keeping the new MemVT legal (e.g. when
  // extracting an i16 element from a v16i8 vector).
  if (Subtarget.hasVector() && DCI.Level == BeforeLegalizeTypes &&
      isOnlyUsedByStores(Op1, DAG)) {
    // Word stays empty unless one of the two finders below succeeds; WordVT
    // is only meaningful when Word is set.
    SDValue Word = SDValue();
    EVT WordVT;

    // Find a replicated immediate and return it if found in Word and its
    // type in WordVT.
    auto FindReplicatedImm = [&](ConstantSDNode *C, unsigned TotBytes) {
      // Some constants are better handled with a scalar store.
      // Rejected here: constants wider than 64 bits, all-ones, values that
      // fit in a signed 16-bit immediate, and stores of at most 2 bytes.
      if (C->getAPIntValue().getBitWidth() > 64 || C->isAllOnes() ||
          isInt<16>(C->getSExtValue()) || MemVT.getStoreSize() <= 2)
        return;
      // Analyze the constant at a width of TotBytes bytes and accept it only
      // if it is a legal vector constant of the REPLICATE kind.
      SystemZVectorConstantInfo VCI(APInt(TotBytes * 8, C->getZExtValue()));
      if (VCI.isVectorConstantLegal(Subtarget) &&
          VCI.Opcode == SystemZISD::REPLICATE) {
        Word = DAG.getConstant(VCI.OpVals[0], SDLoc(SN), MVT::i32);
        WordVT = VCI.VecVT.getScalarType();
      }
    };

    // Find a replicated register and return it if found in Word and its type
    // in WordVT.
    auto FindReplicatedReg = [&](SDValue MulOp) {
      EVT MulVT = MulOp.getValueType();
      if (MulOp->getOpcode() == ISD::MUL &&
          (MulVT == MVT::i16 || MulVT == MVT::i32 || MulVT == MVT::i64)) {
        // Find a zero extended value and its type.
        // Note that WordVT is written here even if no Word is found below;
        // callers must test Word, not WordVT.
        SDValue LHS = MulOp->getOperand(0);
        if (LHS->getOpcode() == ISD::ZERO_EXTEND)
          WordVT = LHS->getOperand(0).getValueType();
        else if (LHS->getOpcode() == ISD::AssertZext)
          WordVT = cast<VTSDNode>(LHS->getOperand(1))->getVT();
        else
          return;
        // Find a replicating constant, e.g. 0x00010001.
        // The multiplier must replicate the value 1 with an element type
        // equal to the zero-extended source type found above.
        if (auto *C = dyn_cast<ConstantSDNode>(MulOp->getOperand(1))) {
          SystemZVectorConstantInfo VCI(
              APInt(MulVT.getSizeInBits(), C->getZExtValue()));
          if (VCI.isVectorConstantLegal(Subtarget) &&
              VCI.Opcode == SystemZISD::REPLICATE && VCI.OpVals[0] == 1 &&
              WordVT == VCI.VecVT.getScalarType())
            Word = DAG.getZExtOrTrunc(LHS->getOperand(0), SDLoc(SN), WordVT);
        }
      }
    };

    // For a splat BUILD_VECTOR inspect its first operand as the splat value;
    // otherwise inspect the stored value itself.
    if (isa<BuildVectorSDNode>(Op1) &&
        DAG.isSplatValue(Op1, true/*AllowUndefs*/)) {
      SDValue SplatVal = Op1->getOperand(0);
      if (auto *C = dyn_cast<ConstantSDNode>(SplatVal))
        FindReplicatedImm(C, SplatVal.getValueType().getStoreSize());
      else
        FindReplicatedReg(SplatVal);
    } else {
      if (auto *C = dyn_cast<ConstantSDNode>(Op1))
        FindReplicatedImm(C, MemVT.getStoreSize());
      else
        FindReplicatedReg(Op1);
    }

    // If a replicated word was found, store a splat of it that covers the
    // whole memory type, using the original chain, pointer and memory operand.
    if (Word != SDValue()) {
      assert(MemVT.getSizeInBits() % WordVT.getSizeInBits() == 0 &&
             "Bad type handling");
      unsigned NumElts = MemVT.getSizeInBits() / WordVT.getSizeInBits();
      EVT SplatVT = EVT::getVectorVT(*DAG.getContext(), WordVT, NumElts);
      SDValue SplatVal = DAG.getSplatVector(SplatVT, SDLoc(SN), Word);
      return DAG.getStore(SN->getChain(), SDLoc(SN), SplatVal,
                          SN->getBasePtr(), SN->getMemOperand());
    }
  }

  return SDValue();
}
6648 
6649 SDValue SystemZTargetLowering::combineVECTOR_SHUFFLE(
6650     SDNode *N, DAGCombinerInfo &DCI) const {
6651   SelectionDAG &DAG = DCI.DAG;
6652   // Combine element-swap (LOAD) into VLER
6653   if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
6654       N->getOperand(0).hasOneUse() &&
6655       Subtarget.hasVectorEnhancements2()) {
6656     ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
6657     ArrayRef<int> ShuffleMask = SVN->getMask();
6658     if (isVectorElementSwap(ShuffleMask, N->getValueType(0))) {
6659       SDValue Load = N->getOperand(0);
6660       LoadSDNode *LD = cast<LoadSDNode>(Load);
6661 
6662       // Create the element-swapping load.
6663       SDValue Ops[] = {
6664         LD->getChain(),    // Chain
6665         LD->getBasePtr()   // Ptr
6666       };
6667       SDValue ESLoad =
6668         DAG.getMemIntrinsicNode(SystemZISD::VLER, SDLoc(N),
6669                                 DAG.getVTList(LD->getValueType(0), MVT::Other),
6670                                 Ops, LD->getMemoryVT(), LD->getMemOperand());
6671 
6672       // First, combine the VECTOR_SHUFFLE away.  This makes the value produced
6673       // by the load dead.
6674       DCI.CombineTo(N, ESLoad);
6675 
6676       // Next, combine the load away, we give it a bogus result value but a real
6677       // chain result.  The result value is dead because the shuffle is dead.
6678       DCI.CombineTo(Load.getNode(), ESLoad, ESLoad.getValue(1));
6679 
6680       // Return N so it doesn't get rechecked!
6681       return SDValue(N, 0);
6682     }
6683   }
6684 
6685   return SDValue();
6686 }
6687 
6688 SDValue SystemZTargetLowering::combineEXTRACT_VECTOR_ELT(
6689     SDNode *N, DAGCombinerInfo &DCI) const {
6690   SelectionDAG &DAG = DCI.DAG;
6691 
6692   if (!Subtarget.hasVector())
6693     return SDValue();
6694 
6695   // Look through bitcasts that retain the number of vector elements.
6696   SDValue Op = N->getOperand(0);
6697   if (Op.getOpcode() == ISD::BITCAST &&
6698       Op.getValueType().isVector() &&
6699       Op.getOperand(0).getValueType().isVector() &&
6700       Op.getValueType().getVectorNumElements() ==
6701       Op.getOperand(0).getValueType().getVectorNumElements())
6702     Op = Op.getOperand(0);
6703 
6704   // Pull BSWAP out of a vector extraction.
6705   if (Op.getOpcode() == ISD::BSWAP && Op.hasOneUse()) {
6706     EVT VecVT = Op.getValueType();
6707     EVT EltVT = VecVT.getVectorElementType();
6708     Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), EltVT,
6709                      Op.getOperand(0), N->getOperand(1));
6710     DCI.AddToWorklist(Op.getNode());
6711     Op = DAG.getNode(ISD::BSWAP, SDLoc(N), EltVT, Op);
6712     if (EltVT != N->getValueType(0)) {
6713       DCI.AddToWorklist(Op.getNode());
6714       Op = DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0), Op);
6715     }
6716     return Op;
6717   }
6718 
6719   // Try to simplify a vector extraction.
6720   if (auto *IndexN = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
6721     SDValue Op0 = N->getOperand(0);
6722     EVT VecVT = Op0.getValueType();
6723     return combineExtract(SDLoc(N), N->getValueType(0), VecVT, Op0,
6724                           IndexN->getZExtValue(), DCI, false);
6725   }
6726   return SDValue();
6727 }
6728 
6729 SDValue SystemZTargetLowering::combineJOIN_DWORDS(
6730     SDNode *N, DAGCombinerInfo &DCI) const {
6731   SelectionDAG &DAG = DCI.DAG;
6732   // (join_dwords X, X) == (replicate X)
6733   if (N->getOperand(0) == N->getOperand(1))
6734     return DAG.getNode(SystemZISD::REPLICATE, SDLoc(N), N->getValueType(0),
6735                        N->getOperand(0));
6736   return SDValue();
6737 }
6738 
6739 static SDValue MergeInputChains(SDNode *N1, SDNode *N2) {
6740   SDValue Chain1 = N1->getOperand(0);
6741   SDValue Chain2 = N2->getOperand(0);
6742 
6743   // Trivial case: both nodes take the same chain.
6744   if (Chain1 == Chain2)
6745     return Chain1;
6746 
6747   // FIXME - we could handle more complex cases via TokenFactor,
6748   // assuming we can verify that this would not create a cycle.
6749   return SDValue();
6750 }
6751 
// Target-specific DAG combine for FP_ROUND / STRICT_FP_ROUND (vector facility
// only).  When N rounds element 0 of a v2f64 to f32 and another node of the
// same opcode rounds element 1 of the same vector, both are replaced by
// extractions from a single (STRICT_)VROUND of the whole vector.  Returns the
// replacement for N, or an empty SDValue if the pattern is not found.
SDValue SystemZTargetLowering::combineFP_ROUND(
    SDNode *N, DAGCombinerInfo &DCI) const {

  if (!Subtarget.hasVector())
    return SDValue();

  // (fpround (extract_vector_elt X 0))
  // (fpround (extract_vector_elt X 1)) ->
  // (extract_vector_elt (VROUND X) 0)
  // (extract_vector_elt (VROUND X) 2)
  //
  // This is a special case since the target doesn't really support v2f32s.
  // For strict opcodes the value operand is operand 1 (operand 0 is used as
  // the chain by MergeInputChains below).
  unsigned OpNo = N->isStrictFPOpcode() ? 1 : 0;
  SelectionDAG &DAG = DCI.DAG;
  SDValue Op0 = N->getOperand(OpNo);
  // N must be an f32 round of a single-use extraction of constant element 0
  // from a v2f64 vector.
  if (N->getValueType(0) == MVT::f32 &&
      Op0.hasOneUse() &&
      Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      Op0.getOperand(0).getValueType() == MVT::v2f64 &&
      Op0.getOperand(1).getOpcode() == ISD::Constant &&
      cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue() == 0) {
    SDValue Vec = Op0.getOperand(0);
    // Search the other users of the vector for a single-use extraction of
    // constant element 1.
    for (auto *U : Vec->uses()) {
      if (U != Op0.getNode() &&
          U->hasOneUse() &&
          U->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          U->getOperand(0) == Vec &&
          U->getOperand(1).getOpcode() == ISD::Constant &&
          cast<ConstantSDNode>(U->getOperand(1))->getZExtValue() == 1) {
        // The sole user of that extraction must be the same kind of round,
        // taking the extraction as its value operand and producing f32.
        SDValue OtherRound = SDValue(*U->use_begin(), 0);
        if (OtherRound.getOpcode() == N->getOpcode() &&
            OtherRound.getOperand(OpNo) == SDValue(U, 0) &&
            OtherRound.getValueType() == MVT::f32) {
          SDValue VRound, Chain;
          if (N->isStrictFPOpcode()) {
            // The strict form needs one common input chain; if the two
            // rounds do not share one, try the next user instead.
            Chain = MergeInputChains(N, OtherRound.getNode());
            if (!Chain)
              continue;
            VRound = DAG.getNode(SystemZISD::STRICT_VROUND, SDLoc(N),
                                 {MVT::v4f32, MVT::Other}, {Chain, Vec});
            // From here on Chain is the output chain of the new node.
            Chain = VRound.getValue(1);
          } else
            VRound = DAG.getNode(SystemZISD::VROUND, SDLoc(N),
                                 MVT::v4f32, Vec);
          DCI.AddToWorklist(VRound.getNode());
          // The rounded second element is read from element 2 of the v4f32
          // result (see the pattern at the top of the function).
          SDValue Extract1 =
            DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(U), MVT::f32,
                        VRound, DAG.getConstant(2, SDLoc(U), MVT::i32));
          DCI.AddToWorklist(Extract1.getNode());
          // Redirect the other round's value (and, for the strict form, its
          // chain result) to the new nodes.
          DAG.ReplaceAllUsesOfValueWith(OtherRound, Extract1);
          if (Chain)
            DAG.ReplaceAllUsesOfValueWith(OtherRound.getValue(1), Chain);
          SDValue Extract0 =
            DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32,
                        VRound, DAG.getConstant(0, SDLoc(Op0), MVT::i32));
          // A strict N has a value and a chain result, so return both via
          // MERGE_VALUES; otherwise the extraction alone replaces N.
          if (Chain)
            return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op0),
                               N->getVTList(), Extract0, Chain);
          return Extract0;
        }
      }
    }
  }
  return SDValue();
}
6817 
// Target-specific DAG combine for FP_EXTEND / STRICT_FP_EXTEND (vector
// facility only).  When N extends element 0 of a v4f32 to f64 and another node
// of the same opcode extends element 2 of the same vector, both are replaced
// by extractions from a single (STRICT_)VEXTEND of the whole vector.  Returns
// the replacement for N, or an empty SDValue if the pattern is not found.
SDValue SystemZTargetLowering::combineFP_EXTEND(
    SDNode *N, DAGCombinerInfo &DCI) const {

  if (!Subtarget.hasVector())
    return SDValue();

  // (fpextend (extract_vector_elt X 0))
  // (fpextend (extract_vector_elt X 2)) ->
  // (extract_vector_elt (VEXTEND X) 0)
  // (extract_vector_elt (VEXTEND X) 1)
  //
  // This is a special case since the target doesn't really support v2f32s.
  // For strict opcodes the value operand is operand 1 (operand 0 is used as
  // the chain by MergeInputChains below).
  unsigned OpNo = N->isStrictFPOpcode() ? 1 : 0;
  SelectionDAG &DAG = DCI.DAG;
  SDValue Op0 = N->getOperand(OpNo);
  // N must be an f64 extension of a single-use extraction of constant
  // element 0 from a v4f32 vector.
  if (N->getValueType(0) == MVT::f64 &&
      Op0.hasOneUse() &&
      Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      Op0.getOperand(0).getValueType() == MVT::v4f32 &&
      Op0.getOperand(1).getOpcode() == ISD::Constant &&
      cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue() == 0) {
    SDValue Vec = Op0.getOperand(0);
    // Search the other users of the vector for a single-use extraction of
    // constant element 2.
    for (auto *U : Vec->uses()) {
      if (U != Op0.getNode() &&
          U->hasOneUse() &&
          U->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          U->getOperand(0) == Vec &&
          U->getOperand(1).getOpcode() == ISD::Constant &&
          cast<ConstantSDNode>(U->getOperand(1))->getZExtValue() == 2) {
        // The sole user of that extraction must be the same kind of
        // extension, taking the extraction as its value operand and
        // producing f64.
        SDValue OtherExtend = SDValue(*U->use_begin(), 0);
        if (OtherExtend.getOpcode() == N->getOpcode() &&
            OtherExtend.getOperand(OpNo) == SDValue(U, 0) &&
            OtherExtend.getValueType() == MVT::f64) {
          SDValue VExtend, Chain;
          if (N->isStrictFPOpcode()) {
            // The strict form needs one common input chain; if the two
            // extensions do not share one, try the next user instead.
            Chain = MergeInputChains(N, OtherExtend.getNode());
            if (!Chain)
              continue;
            VExtend = DAG.getNode(SystemZISD::STRICT_VEXTEND, SDLoc(N),
                                  {MVT::v2f64, MVT::Other}, {Chain, Vec});
            // From here on Chain is the output chain of the new node.
            Chain = VExtend.getValue(1);
          } else
            VExtend = DAG.getNode(SystemZISD::VEXTEND, SDLoc(N),
                                  MVT::v2f64, Vec);
          DCI.AddToWorklist(VExtend.getNode());
          // The extended former element 2 is read from element 1 of the
          // v2f64 result (see the pattern at the top of the function).
          SDValue Extract1 =
            DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(U), MVT::f64,
                        VExtend, DAG.getConstant(1, SDLoc(U), MVT::i32));
          DCI.AddToWorklist(Extract1.getNode());
          // Redirect the other extension's value (and, for the strict form,
          // its chain result) to the new nodes.
          DAG.ReplaceAllUsesOfValueWith(OtherExtend, Extract1);
          if (Chain)
            DAG.ReplaceAllUsesOfValueWith(OtherExtend.getValue(1), Chain);
          SDValue Extract0 =
            DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f64,
                        VExtend, DAG.getConstant(0, SDLoc(Op0), MVT::i32));
          // A strict N has a value and a chain result, so return both via
          // MERGE_VALUES; otherwise the extraction alone replaces N.
          if (Chain)
            return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op0),
                               N->getVTList(), Extract0, Chain);
          return Extract0;
        }
      }
    }
  }
  return SDValue();
}
6883 
6884 SDValue SystemZTargetLowering::combineINT_TO_FP(
6885     SDNode *N, DAGCombinerInfo &DCI) const {
6886   if (DCI.Level != BeforeLegalizeTypes)
6887     return SDValue();
6888   SelectionDAG &DAG = DCI.DAG;
6889   LLVMContext &Ctx = *DAG.getContext();
6890   unsigned Opcode = N->getOpcode();
6891   EVT OutVT = N->getValueType(0);
6892   Type *OutLLVMTy = OutVT.getTypeForEVT(Ctx);
6893   SDValue Op = N->getOperand(0);
6894   unsigned OutScalarBits = OutLLVMTy->getScalarSizeInBits();
6895   unsigned InScalarBits = Op->getValueType(0).getScalarSizeInBits();
6896 
6897   // Insert an extension before type-legalization to avoid scalarization, e.g.:
6898   // v2f64 = uint_to_fp v2i16
6899   // =>
6900   // v2f64 = uint_to_fp (v2i64 zero_extend v2i16)
6901   if (OutLLVMTy->isVectorTy() && OutScalarBits > InScalarBits &&
6902       OutScalarBits <= 64) {
6903     unsigned NumElts = cast<FixedVectorType>(OutLLVMTy)->getNumElements();
6904     EVT ExtVT = EVT::getVectorVT(
6905         Ctx, EVT::getIntegerVT(Ctx, OutLLVMTy->getScalarSizeInBits()), NumElts);
6906     unsigned ExtOpcode =
6907         (Opcode == ISD::UINT_TO_FP ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND);
6908     SDValue ExtOp = DAG.getNode(ExtOpcode, SDLoc(N), ExtVT, Op);
6909     return DAG.getNode(Opcode, SDLoc(N), OutVT, ExtOp);
6910   }
6911   return SDValue();
6912 }
6913 
6914 SDValue SystemZTargetLowering::combineBSWAP(
6915     SDNode *N, DAGCombinerInfo &DCI) const {
6916   SelectionDAG &DAG = DCI.DAG;
6917   // Combine BSWAP (LOAD) into LRVH/LRV/LRVG/VLBR
6918   if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
6919       N->getOperand(0).hasOneUse() &&
6920       canLoadStoreByteSwapped(N->getValueType(0))) {
6921       SDValue Load = N->getOperand(0);
6922       LoadSDNode *LD = cast<LoadSDNode>(Load);
6923 
6924       // Create the byte-swapping load.
6925       SDValue Ops[] = {
6926         LD->getChain(),    // Chain
6927         LD->getBasePtr()   // Ptr
6928       };
6929       EVT LoadVT = N->getValueType(0);
6930       if (LoadVT == MVT::i16)
6931         LoadVT = MVT::i32;
6932       SDValue BSLoad =
6933         DAG.getMemIntrinsicNode(SystemZISD::LRV, SDLoc(N),
6934                                 DAG.getVTList(LoadVT, MVT::Other),
6935                                 Ops, LD->getMemoryVT(), LD->getMemOperand());
6936 
6937       // If this is an i16 load, insert the truncate.
6938       SDValue ResVal = BSLoad;
6939       if (N->getValueType(0) == MVT::i16)
6940         ResVal = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i16, BSLoad);
6941 
6942       // First, combine the bswap away.  This makes the value produced by the
6943       // load dead.
6944       DCI.CombineTo(N, ResVal);
6945 
6946       // Next, combine the load away, we give it a bogus result value but a real
6947       // chain result.  The result value is dead because the bswap is dead.
6948       DCI.CombineTo(Load.getNode(), ResVal, BSLoad.getValue(1));
6949 
6950       // Return N so it doesn't get rechecked!
6951       return SDValue(N, 0);
6952     }
6953 
6954   // Look through bitcasts that retain the number of vector elements.
6955   SDValue Op = N->getOperand(0);
6956   if (Op.getOpcode() == ISD::BITCAST &&
6957       Op.getValueType().isVector() &&
6958       Op.getOperand(0).getValueType().isVector() &&
6959       Op.getValueType().getVectorNumElements() ==
6960       Op.getOperand(0).getValueType().getVectorNumElements())
6961     Op = Op.getOperand(0);
6962 
6963   // Push BSWAP into a vector insertion if at least one side then simplifies.
6964   if (Op.getOpcode() == ISD::INSERT_VECTOR_ELT && Op.hasOneUse()) {
6965     SDValue Vec = Op.getOperand(0);
6966     SDValue Elt = Op.getOperand(1);
6967     SDValue Idx = Op.getOperand(2);
6968 
6969     if (DAG.isConstantIntBuildVectorOrConstantInt(Vec) ||
6970         Vec.getOpcode() == ISD::BSWAP || Vec.isUndef() ||
6971         DAG.isConstantIntBuildVectorOrConstantInt(Elt) ||
6972         Elt.getOpcode() == ISD::BSWAP || Elt.isUndef() ||
6973         (canLoadStoreByteSwapped(N->getValueType(0)) &&
6974          ISD::isNON_EXTLoad(Elt.getNode()) && Elt.hasOneUse())) {
6975       EVT VecVT = N->getValueType(0);
6976       EVT EltVT = N->getValueType(0).getVectorElementType();
6977       if (VecVT != Vec.getValueType()) {
6978         Vec = DAG.getNode(ISD::BITCAST, SDLoc(N), VecVT, Vec);
6979         DCI.AddToWorklist(Vec.getNode());
6980       }
6981       if (EltVT != Elt.getValueType()) {
6982         Elt = DAG.getNode(ISD::BITCAST, SDLoc(N), EltVT, Elt);
6983         DCI.AddToWorklist(Elt.getNode());
6984       }
6985       Vec = DAG.getNode(ISD::BSWAP, SDLoc(N), VecVT, Vec);
6986       DCI.AddToWorklist(Vec.getNode());
6987       Elt = DAG.getNode(ISD::BSWAP, SDLoc(N), EltVT, Elt);
6988       DCI.AddToWorklist(Elt.getNode());
6989       return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(N), VecVT,
6990                          Vec, Elt, Idx);
6991     }
6992   }
6993 
6994   // Push BSWAP into a vector shuffle if at least one side then simplifies.
6995   ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(Op);
6996   if (SV && Op.hasOneUse()) {
6997     SDValue Op0 = Op.getOperand(0);
6998     SDValue Op1 = Op.getOperand(1);
6999 
7000     if (DAG.isConstantIntBuildVectorOrConstantInt(Op0) ||
7001         Op0.getOpcode() == ISD::BSWAP || Op0.isUndef() ||
7002         DAG.isConstantIntBuildVectorOrConstantInt(Op1) ||
7003         Op1.getOpcode() == ISD::BSWAP || Op1.isUndef()) {
7004       EVT VecVT = N->getValueType(0);
7005       if (VecVT != Op0.getValueType()) {
7006         Op0 = DAG.getNode(ISD::BITCAST, SDLoc(N), VecVT, Op0);
7007         DCI.AddToWorklist(Op0.getNode());
7008       }
7009       if (VecVT != Op1.getValueType()) {
7010         Op1 = DAG.getNode(ISD::BITCAST, SDLoc(N), VecVT, Op1);
7011         DCI.AddToWorklist(Op1.getNode());
7012       }
7013       Op0 = DAG.getNode(ISD::BSWAP, SDLoc(N), VecVT, Op0);
7014       DCI.AddToWorklist(Op0.getNode());
7015       Op1 = DAG.getNode(ISD::BSWAP, SDLoc(N), VecVT, Op1);
7016       DCI.AddToWorklist(Op1.getNode());
7017       return DAG.getVectorShuffle(VecVT, SDLoc(N), Op0, Op1, SV->getMask());
7018     }
7019   }
7020 
7021   return SDValue();
7022 }
7023 
7024 static bool combineCCMask(SDValue &CCReg, int &CCValid, int &CCMask) {
7025   // We have a SELECT_CCMASK or BR_CCMASK comparing the condition code
7026   // set by the CCReg instruction using the CCValid / CCMask masks,
7027   // If the CCReg instruction is itself a ICMP testing the condition
7028   // code set by some other instruction, see whether we can directly
7029   // use that condition code.
7030 
7031   // Verify that we have an ICMP against some constant.
7032   if (CCValid != SystemZ::CCMASK_ICMP)
7033     return false;
7034   auto *ICmp = CCReg.getNode();
7035   if (ICmp->getOpcode() != SystemZISD::ICMP)
7036     return false;
7037   auto *CompareLHS = ICmp->getOperand(0).getNode();
7038   auto *CompareRHS = dyn_cast<ConstantSDNode>(ICmp->getOperand(1));
7039   if (!CompareRHS)
7040     return false;
7041 
7042   // Optimize the case where CompareLHS is a SELECT_CCMASK.
7043   if (CompareLHS->getOpcode() == SystemZISD::SELECT_CCMASK) {
7044     // Verify that we have an appropriate mask for a EQ or NE comparison.
7045     bool Invert = false;
7046     if (CCMask == SystemZ::CCMASK_CMP_NE)
7047       Invert = !Invert;
7048     else if (CCMask != SystemZ::CCMASK_CMP_EQ)
7049       return false;
7050 
7051     // Verify that the ICMP compares against one of select values.
7052     auto *TrueVal = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(0));
7053     if (!TrueVal)
7054       return false;
7055     auto *FalseVal = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(1));
7056     if (!FalseVal)
7057       return false;
7058     if (CompareRHS->getZExtValue() == FalseVal->getZExtValue())
7059       Invert = !Invert;
7060     else if (CompareRHS->getZExtValue() != TrueVal->getZExtValue())
7061       return false;
7062 
7063     // Compute the effective CC mask for the new branch or select.
7064     auto *NewCCValid = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(2));
7065     auto *NewCCMask = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(3));
7066     if (!NewCCValid || !NewCCMask)
7067       return false;
7068     CCValid = NewCCValid->getZExtValue();
7069     CCMask = NewCCMask->getZExtValue();
7070     if (Invert)
7071       CCMask ^= CCValid;
7072 
7073     // Return the updated CCReg link.
7074     CCReg = CompareLHS->getOperand(4);
7075     return true;
7076   }
7077 
7078   // Optimize the case where CompareRHS is (SRA (SHL (IPM))).
7079   if (CompareLHS->getOpcode() == ISD::SRA) {
7080     auto *SRACount = dyn_cast<ConstantSDNode>(CompareLHS->getOperand(1));
7081     if (!SRACount || SRACount->getZExtValue() != 30)
7082       return false;
7083     auto *SHL = CompareLHS->getOperand(0).getNode();
7084     if (SHL->getOpcode() != ISD::SHL)
7085       return false;
7086     auto *SHLCount = dyn_cast<ConstantSDNode>(SHL->getOperand(1));
7087     if (!SHLCount || SHLCount->getZExtValue() != 30 - SystemZ::IPM_CC)
7088       return false;
7089     auto *IPM = SHL->getOperand(0).getNode();
7090     if (IPM->getOpcode() != SystemZISD::IPM)
7091       return false;
7092 
7093     // Avoid introducing CC spills (because SRA would clobber CC).
7094     if (!CompareLHS->hasOneUse())
7095       return false;
7096     // Verify that the ICMP compares against zero.
7097     if (CompareRHS->getZExtValue() != 0)
7098       return false;
7099 
7100     // Compute the effective CC mask for the new branch or select.
7101     CCMask = SystemZ::reverseCCMask(CCMask);
7102 
7103     // Return the updated CCReg link.
7104     CCReg = IPM->getOperand(0);
7105     return true;
7106   }
7107 
7108   return false;
7109 }
7110 
7111 SDValue SystemZTargetLowering::combineBR_CCMASK(
7112     SDNode *N, DAGCombinerInfo &DCI) const {
7113   SelectionDAG &DAG = DCI.DAG;
7114 
7115   // Combine BR_CCMASK (ICMP (SELECT_CCMASK)) into a single BR_CCMASK.
7116   auto *CCValid = dyn_cast<ConstantSDNode>(N->getOperand(1));
7117   auto *CCMask = dyn_cast<ConstantSDNode>(N->getOperand(2));
7118   if (!CCValid || !CCMask)
7119     return SDValue();
7120 
7121   int CCValidVal = CCValid->getZExtValue();
7122   int CCMaskVal = CCMask->getZExtValue();
7123   SDValue Chain = N->getOperand(0);
7124   SDValue CCReg = N->getOperand(4);
7125 
7126   if (combineCCMask(CCReg, CCValidVal, CCMaskVal))
7127     return DAG.getNode(SystemZISD::BR_CCMASK, SDLoc(N), N->getValueType(0),
7128                        Chain,
7129                        DAG.getTargetConstant(CCValidVal, SDLoc(N), MVT::i32),
7130                        DAG.getTargetConstant(CCMaskVal, SDLoc(N), MVT::i32),
7131                        N->getOperand(3), CCReg);
7132   return SDValue();
7133 }
7134 
7135 SDValue SystemZTargetLowering::combineSELECT_CCMASK(
7136     SDNode *N, DAGCombinerInfo &DCI) const {
7137   SelectionDAG &DAG = DCI.DAG;
7138 
7139   // Combine SELECT_CCMASK (ICMP (SELECT_CCMASK)) into a single SELECT_CCMASK.
7140   auto *CCValid = dyn_cast<ConstantSDNode>(N->getOperand(2));
7141   auto *CCMask = dyn_cast<ConstantSDNode>(N->getOperand(3));
7142   if (!CCValid || !CCMask)
7143     return SDValue();
7144 
7145   int CCValidVal = CCValid->getZExtValue();
7146   int CCMaskVal = CCMask->getZExtValue();
7147   SDValue CCReg = N->getOperand(4);
7148 
7149   if (combineCCMask(CCReg, CCValidVal, CCMaskVal))
7150     return DAG.getNode(SystemZISD::SELECT_CCMASK, SDLoc(N), N->getValueType(0),
7151                        N->getOperand(0), N->getOperand(1),
7152                        DAG.getTargetConstant(CCValidVal, SDLoc(N), MVT::i32),
7153                        DAG.getTargetConstant(CCMaskVal, SDLoc(N), MVT::i32),
7154                        CCReg);
7155   return SDValue();
7156 }
7157 
7158 
7159 SDValue SystemZTargetLowering::combineGET_CCMASK(
7160     SDNode *N, DAGCombinerInfo &DCI) const {
7161 
7162   // Optimize away GET_CCMASK (SELECT_CCMASK) if the CC masks are compatible
7163   auto *CCValid = dyn_cast<ConstantSDNode>(N->getOperand(1));
7164   auto *CCMask = dyn_cast<ConstantSDNode>(N->getOperand(2));
7165   if (!CCValid || !CCMask)
7166     return SDValue();
7167   int CCValidVal = CCValid->getZExtValue();
7168   int CCMaskVal = CCMask->getZExtValue();
7169 
7170   SDValue Select = N->getOperand(0);
7171   if (Select->getOpcode() == ISD::TRUNCATE)
7172     Select = Select->getOperand(0);
7173   if (Select->getOpcode() != SystemZISD::SELECT_CCMASK)
7174     return SDValue();
7175 
7176   auto *SelectCCValid = dyn_cast<ConstantSDNode>(Select->getOperand(2));
7177   auto *SelectCCMask = dyn_cast<ConstantSDNode>(Select->getOperand(3));
7178   if (!SelectCCValid || !SelectCCMask)
7179     return SDValue();
7180   int SelectCCValidVal = SelectCCValid->getZExtValue();
7181   int SelectCCMaskVal = SelectCCMask->getZExtValue();
7182 
7183   auto *TrueVal = dyn_cast<ConstantSDNode>(Select->getOperand(0));
7184   auto *FalseVal = dyn_cast<ConstantSDNode>(Select->getOperand(1));
7185   if (!TrueVal || !FalseVal)
7186     return SDValue();
7187   if (TrueVal->getZExtValue() == 1 && FalseVal->getZExtValue() == 0)
7188     ;
7189   else if (TrueVal->getZExtValue() == 0 && FalseVal->getZExtValue() == 1)
7190     SelectCCMaskVal ^= SelectCCValidVal;
7191   else
7192     return SDValue();
7193 
7194   if (SelectCCValidVal & ~CCValidVal)
7195     return SDValue();
7196   if (SelectCCMaskVal != (CCMaskVal & SelectCCValidVal))
7197     return SDValue();
7198 
7199   return Select->getOperand(4);
7200 }
7201 
7202 SDValue SystemZTargetLowering::combineIntDIVREM(
7203     SDNode *N, DAGCombinerInfo &DCI) const {
7204   SelectionDAG &DAG = DCI.DAG;
7205   EVT VT = N->getValueType(0);
7206   // In the case where the divisor is a vector of constants a cheaper
7207   // sequence of instructions can replace the divide. BuildSDIV is called to
7208   // do this during DAG combining, but it only succeeds when it can build a
7209   // multiplication node. The only option for SystemZ is ISD::SMUL_LOHI, and
7210   // since it is not Legal but Custom it can only happen before
7211   // legalization. Therefore we must scalarize this early before Combine
7212   // 1. For widened vectors, this is already the result of type legalization.
7213   if (DCI.Level == BeforeLegalizeTypes && VT.isVector() && isTypeLegal(VT) &&
7214       DAG.isConstantIntBuildVectorOrConstantInt(N->getOperand(1)))
7215     return DAG.UnrollVectorOp(N);
7216   return SDValue();
7217 }
7218 
7219 SDValue SystemZTargetLowering::combineINTRINSIC(
7220     SDNode *N, DAGCombinerInfo &DCI) const {
7221   SelectionDAG &DAG = DCI.DAG;
7222 
7223   unsigned Id = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
7224   switch (Id) {
7225   // VECTOR LOAD (RIGHTMOST) WITH LENGTH with a length operand of 15
7226   // or larger is simply a vector load.
7227   case Intrinsic::s390_vll:
7228   case Intrinsic::s390_vlrl:
7229     if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(2)))
7230       if (C->getZExtValue() >= 15)
7231         return DAG.getLoad(N->getValueType(0), SDLoc(N), N->getOperand(0),
7232                            N->getOperand(3), MachinePointerInfo());
7233     break;
7234   // Likewise for VECTOR STORE (RIGHTMOST) WITH LENGTH.
7235   case Intrinsic::s390_vstl:
7236   case Intrinsic::s390_vstrl:
7237     if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(3)))
7238       if (C->getZExtValue() >= 15)
7239         return DAG.getStore(N->getOperand(0), SDLoc(N), N->getOperand(2),
7240                             N->getOperand(4), MachinePointerInfo());
7241     break;
7242   }
7243 
7244   return SDValue();
7245 }
7246 
7247 SDValue SystemZTargetLowering::unwrapAddress(SDValue N) const {
7248   if (N->getOpcode() == SystemZISD::PCREL_WRAPPER)
7249     return N->getOperand(0);
7250   return N;
7251 }
7252 
7253 SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
7254                                                  DAGCombinerInfo &DCI) const {
7255   switch(N->getOpcode()) {
7256   default: break;
7257   case ISD::ZERO_EXTEND:        return combineZERO_EXTEND(N, DCI);
7258   case ISD::SIGN_EXTEND:        return combineSIGN_EXTEND(N, DCI);
7259   case ISD::SIGN_EXTEND_INREG:  return combineSIGN_EXTEND_INREG(N, DCI);
7260   case SystemZISD::MERGE_HIGH:
7261   case SystemZISD::MERGE_LOW:   return combineMERGE(N, DCI);
7262   case ISD::LOAD:               return combineLOAD(N, DCI);
7263   case ISD::STORE:              return combineSTORE(N, DCI);
7264   case ISD::VECTOR_SHUFFLE:     return combineVECTOR_SHUFFLE(N, DCI);
7265   case ISD::EXTRACT_VECTOR_ELT: return combineEXTRACT_VECTOR_ELT(N, DCI);
7266   case SystemZISD::JOIN_DWORDS: return combineJOIN_DWORDS(N, DCI);
7267   case ISD::STRICT_FP_ROUND:
7268   case ISD::FP_ROUND:           return combineFP_ROUND(N, DCI);
7269   case ISD::STRICT_FP_EXTEND:
7270   case ISD::FP_EXTEND:          return combineFP_EXTEND(N, DCI);
7271   case ISD::SINT_TO_FP:
7272   case ISD::UINT_TO_FP:         return combineINT_TO_FP(N, DCI);
7273   case ISD::BSWAP:              return combineBSWAP(N, DCI);
7274   case SystemZISD::BR_CCMASK:   return combineBR_CCMASK(N, DCI);
7275   case SystemZISD::SELECT_CCMASK: return combineSELECT_CCMASK(N, DCI);
7276   case SystemZISD::GET_CCMASK:  return combineGET_CCMASK(N, DCI);
7277   case ISD::SDIV:
7278   case ISD::UDIV:
7279   case ISD::SREM:
7280   case ISD::UREM:               return combineIntDIVREM(N, DCI);
7281   case ISD::INTRINSIC_W_CHAIN:
7282   case ISD::INTRINSIC_VOID:     return combineINTRINSIC(N, DCI);
7283   }
7284 
7285   return SDValue();
7286 }
7287 
7288 // Return the demanded elements for the OpNo source operand of Op. DemandedElts
7289 // are for Op.
7290 static APInt getDemandedSrcElements(SDValue Op, const APInt &DemandedElts,
7291                                     unsigned OpNo) {
7292   EVT VT = Op.getValueType();
7293   unsigned NumElts = (VT.isVector() ? VT.getVectorNumElements() : 1);
7294   APInt SrcDemE;
7295   unsigned Opcode = Op.getOpcode();
7296   if (Opcode == ISD::INTRINSIC_WO_CHAIN) {
7297     unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
7298     switch (Id) {
7299     case Intrinsic::s390_vpksh:   // PACKS
7300     case Intrinsic::s390_vpksf:
7301     case Intrinsic::s390_vpksg:
7302     case Intrinsic::s390_vpkshs:  // PACKS_CC
7303     case Intrinsic::s390_vpksfs:
7304     case Intrinsic::s390_vpksgs:
7305     case Intrinsic::s390_vpklsh:  // PACKLS
7306     case Intrinsic::s390_vpklsf:
7307     case Intrinsic::s390_vpklsg:
7308     case Intrinsic::s390_vpklshs: // PACKLS_CC
7309     case Intrinsic::s390_vpklsfs:
7310     case Intrinsic::s390_vpklsgs:
7311       // VECTOR PACK truncates the elements of two source vectors into one.
7312       SrcDemE = DemandedElts;
7313       if (OpNo == 2)
7314         SrcDemE.lshrInPlace(NumElts / 2);
7315       SrcDemE = SrcDemE.trunc(NumElts / 2);
7316       break;
7317       // VECTOR UNPACK extends half the elements of the source vector.
7318     case Intrinsic::s390_vuphb:  // VECTOR UNPACK HIGH
7319     case Intrinsic::s390_vuphh:
7320     case Intrinsic::s390_vuphf:
7321     case Intrinsic::s390_vuplhb: // VECTOR UNPACK LOGICAL HIGH
7322     case Intrinsic::s390_vuplhh:
7323     case Intrinsic::s390_vuplhf:
7324       SrcDemE = APInt(NumElts * 2, 0);
7325       SrcDemE.insertBits(DemandedElts, 0);
7326       break;
7327     case Intrinsic::s390_vuplb:  // VECTOR UNPACK LOW
7328     case Intrinsic::s390_vuplhw:
7329     case Intrinsic::s390_vuplf:
7330     case Intrinsic::s390_vupllb: // VECTOR UNPACK LOGICAL LOW
7331     case Intrinsic::s390_vupllh:
7332     case Intrinsic::s390_vupllf:
7333       SrcDemE = APInt(NumElts * 2, 0);
7334       SrcDemE.insertBits(DemandedElts, NumElts);
7335       break;
7336     case Intrinsic::s390_vpdi: {
7337       // VECTOR PERMUTE DWORD IMMEDIATE selects one element from each source.
7338       SrcDemE = APInt(NumElts, 0);
7339       if (!DemandedElts[OpNo - 1])
7340         break;
7341       unsigned Mask = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
7342       unsigned MaskBit = ((OpNo - 1) ? 1 : 4);
7343       // Demand input element 0 or 1, given by the mask bit value.
7344       SrcDemE.setBit((Mask & MaskBit)? 1 : 0);
7345       break;
7346     }
7347     case Intrinsic::s390_vsldb: {
7348       // VECTOR SHIFT LEFT DOUBLE BY BYTE
7349       assert(VT == MVT::v16i8 && "Unexpected type.");
7350       unsigned FirstIdx = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
7351       assert (FirstIdx > 0 && FirstIdx < 16 && "Unused operand.");
7352       unsigned NumSrc0Els = 16 - FirstIdx;
7353       SrcDemE = APInt(NumElts, 0);
7354       if (OpNo == 1) {
7355         APInt DemEls = DemandedElts.trunc(NumSrc0Els);
7356         SrcDemE.insertBits(DemEls, FirstIdx);
7357       } else {
7358         APInt DemEls = DemandedElts.lshr(NumSrc0Els);
7359         SrcDemE.insertBits(DemEls, 0);
7360       }
7361       break;
7362     }
7363     case Intrinsic::s390_vperm:
7364       SrcDemE = APInt(NumElts, 1);
7365       break;
7366     default:
7367       llvm_unreachable("Unhandled intrinsic.");
7368       break;
7369     }
7370   } else {
7371     switch (Opcode) {
7372     case SystemZISD::JOIN_DWORDS:
7373       // Scalar operand.
7374       SrcDemE = APInt(1, 1);
7375       break;
7376     case SystemZISD::SELECT_CCMASK:
7377       SrcDemE = DemandedElts;
7378       break;
7379     default:
7380       llvm_unreachable("Unhandled opcode.");
7381       break;
7382     }
7383   }
7384   return SrcDemE;
7385 }
7386 
7387 static void computeKnownBitsBinOp(const SDValue Op, KnownBits &Known,
7388                                   const APInt &DemandedElts,
7389                                   const SelectionDAG &DAG, unsigned Depth,
7390                                   unsigned OpNo) {
7391   APInt Src0DemE = getDemandedSrcElements(Op, DemandedElts, OpNo);
7392   APInt Src1DemE = getDemandedSrcElements(Op, DemandedElts, OpNo + 1);
7393   KnownBits LHSKnown =
7394       DAG.computeKnownBits(Op.getOperand(OpNo), Src0DemE, Depth + 1);
7395   KnownBits RHSKnown =
7396       DAG.computeKnownBits(Op.getOperand(OpNo + 1), Src1DemE, Depth + 1);
7397   Known = LHSKnown.intersectWith(RHSKnown);
7398 }
7399 
7400 void
7401 SystemZTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
7402                                                      KnownBits &Known,
7403                                                      const APInt &DemandedElts,
7404                                                      const SelectionDAG &DAG,
7405                                                      unsigned Depth) const {
7406   Known.resetAll();
7407 
7408   // Intrinsic CC result is returned in the two low bits.
7409   unsigned tmp0, tmp1; // not used
7410   if (Op.getResNo() == 1 && isIntrinsicWithCC(Op, tmp0, tmp1)) {
7411     Known.Zero.setBitsFrom(2);
7412     return;
7413   }
7414   EVT VT = Op.getValueType();
7415   if (Op.getResNo() != 0 || VT == MVT::Untyped)
7416     return;
7417   assert (Known.getBitWidth() == VT.getScalarSizeInBits() &&
7418           "KnownBits does not match VT in bitwidth");
7419   assert ((!VT.isVector() ||
7420            (DemandedElts.getBitWidth() == VT.getVectorNumElements())) &&
7421           "DemandedElts does not match VT number of elements");
7422   unsigned BitWidth = Known.getBitWidth();
7423   unsigned Opcode = Op.getOpcode();
7424   if (Opcode == ISD::INTRINSIC_WO_CHAIN) {
7425     bool IsLogical = false;
7426     unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
7427     switch (Id) {
7428     case Intrinsic::s390_vpksh:   // PACKS
7429     case Intrinsic::s390_vpksf:
7430     case Intrinsic::s390_vpksg:
7431     case Intrinsic::s390_vpkshs:  // PACKS_CC
7432     case Intrinsic::s390_vpksfs:
7433     case Intrinsic::s390_vpksgs:
7434     case Intrinsic::s390_vpklsh:  // PACKLS
7435     case Intrinsic::s390_vpklsf:
7436     case Intrinsic::s390_vpklsg:
7437     case Intrinsic::s390_vpklshs: // PACKLS_CC
7438     case Intrinsic::s390_vpklsfs:
7439     case Intrinsic::s390_vpklsgs:
7440     case Intrinsic::s390_vpdi:
7441     case Intrinsic::s390_vsldb:
7442     case Intrinsic::s390_vperm:
7443       computeKnownBitsBinOp(Op, Known, DemandedElts, DAG, Depth, 1);
7444       break;
7445     case Intrinsic::s390_vuplhb: // VECTOR UNPACK LOGICAL HIGH
7446     case Intrinsic::s390_vuplhh:
7447     case Intrinsic::s390_vuplhf:
7448     case Intrinsic::s390_vupllb: // VECTOR UNPACK LOGICAL LOW
7449     case Intrinsic::s390_vupllh:
7450     case Intrinsic::s390_vupllf:
7451       IsLogical = true;
7452       [[fallthrough]];
7453     case Intrinsic::s390_vuphb:  // VECTOR UNPACK HIGH
7454     case Intrinsic::s390_vuphh:
7455     case Intrinsic::s390_vuphf:
7456     case Intrinsic::s390_vuplb:  // VECTOR UNPACK LOW
7457     case Intrinsic::s390_vuplhw:
7458     case Intrinsic::s390_vuplf: {
7459       SDValue SrcOp = Op.getOperand(1);
7460       APInt SrcDemE = getDemandedSrcElements(Op, DemandedElts, 0);
7461       Known = DAG.computeKnownBits(SrcOp, SrcDemE, Depth + 1);
7462       if (IsLogical) {
7463         Known = Known.zext(BitWidth);
7464       } else
7465         Known = Known.sext(BitWidth);
7466       break;
7467     }
7468     default:
7469       break;
7470     }
7471   } else {
7472     switch (Opcode) {
7473     case SystemZISD::JOIN_DWORDS:
7474     case SystemZISD::SELECT_CCMASK:
7475       computeKnownBitsBinOp(Op, Known, DemandedElts, DAG, Depth, 0);
7476       break;
7477     case SystemZISD::REPLICATE: {
7478       SDValue SrcOp = Op.getOperand(0);
7479       Known = DAG.computeKnownBits(SrcOp, Depth + 1);
7480       if (Known.getBitWidth() < BitWidth && isa<ConstantSDNode>(SrcOp))
7481         Known = Known.sext(BitWidth); // VREPI sign extends the immedate.
7482       break;
7483     }
7484     default:
7485       break;
7486     }
7487   }
7488 
7489   // Known has the width of the source operand(s). Adjust if needed to match
7490   // the passed bitwidth.
7491   if (Known.getBitWidth() != BitWidth)
7492     Known = Known.anyextOrTrunc(BitWidth);
7493 }
7494 
7495 static unsigned computeNumSignBitsBinOp(SDValue Op, const APInt &DemandedElts,
7496                                         const SelectionDAG &DAG, unsigned Depth,
7497                                         unsigned OpNo) {
7498   APInt Src0DemE = getDemandedSrcElements(Op, DemandedElts, OpNo);
7499   unsigned LHS = DAG.ComputeNumSignBits(Op.getOperand(OpNo), Src0DemE, Depth + 1);
7500   if (LHS == 1) return 1; // Early out.
7501   APInt Src1DemE = getDemandedSrcElements(Op, DemandedElts, OpNo + 1);
7502   unsigned RHS = DAG.ComputeNumSignBits(Op.getOperand(OpNo + 1), Src1DemE, Depth + 1);
7503   if (RHS == 1) return 1; // Early out.
7504   unsigned Common = std::min(LHS, RHS);
7505   unsigned SrcBitWidth = Op.getOperand(OpNo).getScalarValueSizeInBits();
7506   EVT VT = Op.getValueType();
7507   unsigned VTBits = VT.getScalarSizeInBits();
7508   if (SrcBitWidth > VTBits) { // PACK
7509     unsigned SrcExtraBits = SrcBitWidth - VTBits;
7510     if (Common > SrcExtraBits)
7511       return (Common - SrcExtraBits);
7512     return 1;
7513   }
7514   assert (SrcBitWidth == VTBits && "Expected operands of same bitwidth.");
7515   return Common;
7516 }
7517 
7518 unsigned
7519 SystemZTargetLowering::ComputeNumSignBitsForTargetNode(
7520     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
7521     unsigned Depth) const {
7522   if (Op.getResNo() != 0)
7523     return 1;
7524   unsigned Opcode = Op.getOpcode();
7525   if (Opcode == ISD::INTRINSIC_WO_CHAIN) {
7526     unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
7527     switch (Id) {
7528     case Intrinsic::s390_vpksh:   // PACKS
7529     case Intrinsic::s390_vpksf:
7530     case Intrinsic::s390_vpksg:
7531     case Intrinsic::s390_vpkshs:  // PACKS_CC
7532     case Intrinsic::s390_vpksfs:
7533     case Intrinsic::s390_vpksgs:
7534     case Intrinsic::s390_vpklsh:  // PACKLS
7535     case Intrinsic::s390_vpklsf:
7536     case Intrinsic::s390_vpklsg:
7537     case Intrinsic::s390_vpklshs: // PACKLS_CC
7538     case Intrinsic::s390_vpklsfs:
7539     case Intrinsic::s390_vpklsgs:
7540     case Intrinsic::s390_vpdi:
7541     case Intrinsic::s390_vsldb:
7542     case Intrinsic::s390_vperm:
7543       return computeNumSignBitsBinOp(Op, DemandedElts, DAG, Depth, 1);
7544     case Intrinsic::s390_vuphb:  // VECTOR UNPACK HIGH
7545     case Intrinsic::s390_vuphh:
7546     case Intrinsic::s390_vuphf:
7547     case Intrinsic::s390_vuplb:  // VECTOR UNPACK LOW
7548     case Intrinsic::s390_vuplhw:
7549     case Intrinsic::s390_vuplf: {
7550       SDValue PackedOp = Op.getOperand(1);
7551       APInt SrcDemE = getDemandedSrcElements(Op, DemandedElts, 1);
7552       unsigned Tmp = DAG.ComputeNumSignBits(PackedOp, SrcDemE, Depth + 1);
7553       EVT VT = Op.getValueType();
7554       unsigned VTBits = VT.getScalarSizeInBits();
7555       Tmp += VTBits - PackedOp.getScalarValueSizeInBits();
7556       return Tmp;
7557     }
7558     default:
7559       break;
7560     }
7561   } else {
7562     switch (Opcode) {
7563     case SystemZISD::SELECT_CCMASK:
7564       return computeNumSignBitsBinOp(Op, DemandedElts, DAG, Depth, 0);
7565     default:
7566       break;
7567     }
7568   }
7569 
7570   return 1;
7571 }
7572 
7573 bool SystemZTargetLowering::
7574 isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op,
7575          const APInt &DemandedElts, const SelectionDAG &DAG,
7576          bool PoisonOnly, unsigned Depth) const {
7577   switch (Op->getOpcode()) {
7578   case SystemZISD::PCREL_WRAPPER:
7579   case SystemZISD::PCREL_OFFSET:
7580     return true;
7581   }
7582   return false;
7583 }
7584 
7585 unsigned
7586 SystemZTargetLowering::getStackProbeSize(const MachineFunction &MF) const {
7587   const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
7588   unsigned StackAlign = TFI->getStackAlignment();
7589   assert(StackAlign >=1 && isPowerOf2_32(StackAlign) &&
7590          "Unexpected stack alignment");
7591   // The default stack probe size is 4096 if the function has no
7592   // stack-probe-size attribute.
7593   unsigned StackProbeSize =
7594       MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size", 4096);
7595   // Round down to the stack alignment.
7596   StackProbeSize &= ~(StackAlign - 1);
7597   return StackProbeSize ? StackProbeSize : StackAlign;
7598 }
7599 
7600 //===----------------------------------------------------------------------===//
7601 // Custom insertion
7602 //===----------------------------------------------------------------------===//
7603 
7604 // Force base value Base into a register before MI.  Return the register.
7605 static Register forceReg(MachineInstr &MI, MachineOperand &Base,
7606                          const SystemZInstrInfo *TII) {
7607   MachineBasicBlock *MBB = MI.getParent();
7608   MachineFunction &MF = *MBB->getParent();
7609   MachineRegisterInfo &MRI = MF.getRegInfo();
7610 
7611   if (Base.isReg()) {
7612     // Copy Base into a new virtual register to help register coalescing in
7613     // cases with multiple uses.
7614     Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
7615     BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::COPY), Reg)
7616       .add(Base);
7617     return Reg;
7618   }
7619 
7620   Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
7621   BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LA), Reg)
7622       .add(Base)
7623       .addImm(0)
7624       .addReg(0);
7625   return Reg;
7626 }
7627 
7628 // The CC operand of MI might be missing a kill marker because there
7629 // were multiple uses of CC, and ISel didn't know which to mark.
7630 // Figure out whether MI should have had a kill marker.
7631 static bool checkCCKill(MachineInstr &MI, MachineBasicBlock *MBB) {
7632   // Scan forward through BB for a use/def of CC.
7633   MachineBasicBlock::iterator miI(std::next(MachineBasicBlock::iterator(MI)));
7634   for (MachineBasicBlock::iterator miE = MBB->end(); miI != miE; ++miI) {
7635     const MachineInstr& mi = *miI;
7636     if (mi.readsRegister(SystemZ::CC))
7637       return false;
7638     if (mi.definesRegister(SystemZ::CC))
7639       break; // Should have kill-flag - update below.
7640   }
7641 
7642   // If we hit the end of the block, check whether CC is live into a
7643   // successor.
7644   if (miI == MBB->end()) {
7645     for (const MachineBasicBlock *Succ : MBB->successors())
7646       if (Succ->isLiveIn(SystemZ::CC))
7647         return false;
7648   }
7649 
7650   return true;
7651 }
7652 
7653 // Return true if it is OK for this Select pseudo-opcode to be cascaded
7654 // together with other Select pseudo-opcodes into a single basic-block with
7655 // a conditional jump around it.
7656 static bool isSelectPseudo(MachineInstr &MI) {
7657   switch (MI.getOpcode()) {
7658   case SystemZ::Select32:
7659   case SystemZ::Select64:
7660   case SystemZ::SelectF32:
7661   case SystemZ::SelectF64:
7662   case SystemZ::SelectF128:
7663   case SystemZ::SelectVR32:
7664   case SystemZ::SelectVR64:
7665   case SystemZ::SelectVR128:
7666     return true;
7667 
7668   default:
7669     return false;
7670   }
7671 }
7672 
7673 // Helper function, which inserts PHI functions into SinkMBB:
7674 //   %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
7675 // where %FalseValue(i) and %TrueValue(i) are taken from Selects.
7676 static void createPHIsForSelects(SmallVector<MachineInstr*, 8> &Selects,
7677                                  MachineBasicBlock *TrueMBB,
7678                                  MachineBasicBlock *FalseMBB,
7679                                  MachineBasicBlock *SinkMBB) {
7680   MachineFunction *MF = TrueMBB->getParent();
7681   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
7682 
7683   MachineInstr *FirstMI = Selects.front();
7684   unsigned CCValid = FirstMI->getOperand(3).getImm();
7685   unsigned CCMask = FirstMI->getOperand(4).getImm();
7686 
7687   MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
7688 
7689   // As we are creating the PHIs, we have to be careful if there is more than
7690   // one.  Later Selects may reference the results of earlier Selects, but later
7691   // PHIs have to reference the individual true/false inputs from earlier PHIs.
7692   // That also means that PHI construction must work forward from earlier to
7693   // later, and that the code must maintain a mapping from earlier PHI's
7694   // destination registers, and the registers that went into the PHI.
7695   DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
7696 
7697   for (auto *MI : Selects) {
7698     Register DestReg = MI->getOperand(0).getReg();
7699     Register TrueReg = MI->getOperand(1).getReg();
7700     Register FalseReg = MI->getOperand(2).getReg();
7701 
7702     // If this Select we are generating is the opposite condition from
7703     // the jump we generated, then we have to swap the operands for the
7704     // PHI that is going to be generated.
7705     if (MI->getOperand(4).getImm() == (CCValid ^ CCMask))
7706       std::swap(TrueReg, FalseReg);
7707 
7708     if (RegRewriteTable.contains(TrueReg))
7709       TrueReg = RegRewriteTable[TrueReg].first;
7710 
7711     if (RegRewriteTable.contains(FalseReg))
7712       FalseReg = RegRewriteTable[FalseReg].second;
7713 
7714     DebugLoc DL = MI->getDebugLoc();
7715     BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(SystemZ::PHI), DestReg)
7716       .addReg(TrueReg).addMBB(TrueMBB)
7717       .addReg(FalseReg).addMBB(FalseMBB);
7718 
7719     // Add this PHI to the rewrite table.
7720     RegRewriteTable[DestReg] = std::make_pair(TrueReg, FalseReg);
7721   }
7722 
7723   MF->getProperties().reset(MachineFunctionProperties::Property::NoPHIs);
7724 }
7725 
7726 // Implement EmitInstrWithCustomInserter for pseudo Select* instruction MI.
7727 MachineBasicBlock *
7728 SystemZTargetLowering::emitSelect(MachineInstr &MI,
7729                                   MachineBasicBlock *MBB) const {
7730   assert(isSelectPseudo(MI) && "Bad call to emitSelect()");
7731   const SystemZInstrInfo *TII = Subtarget.getInstrInfo();
7732 
7733   unsigned CCValid = MI.getOperand(3).getImm();
7734   unsigned CCMask = MI.getOperand(4).getImm();
7735 
7736   // If we have a sequence of Select* pseudo instructions using the
7737   // same condition code value, we want to expand all of them into
7738   // a single pair of basic blocks using the same condition.
7739   SmallVector<MachineInstr*, 8> Selects;
7740   SmallVector<MachineInstr*, 8> DbgValues;
7741   Selects.push_back(&MI);
7742   unsigned Count = 0;
7743   for (MachineInstr &NextMI : llvm::make_range(
7744            std::next(MachineBasicBlock::iterator(MI)), MBB->end())) {
7745     if (isSelectPseudo(NextMI)) {
7746       assert(NextMI.getOperand(3).getImm() == CCValid &&
7747              "Bad CCValid operands since CC was not redefined.");
7748       if (NextMI.getOperand(4).getImm() == CCMask ||
7749           NextMI.getOperand(4).getImm() == (CCValid ^ CCMask)) {
7750         Selects.push_back(&NextMI);
7751         continue;
7752       }
7753       break;
7754     }
7755     if (NextMI.definesRegister(SystemZ::CC) || NextMI.usesCustomInsertionHook())
7756       break;
7757     bool User = false;
7758     for (auto *SelMI : Selects)
7759       if (NextMI.readsVirtualRegister(SelMI->getOperand(0).getReg())) {
7760         User = true;
7761         break;
7762       }
7763     if (NextMI.isDebugInstr()) {
7764       if (User) {
7765         assert(NextMI.isDebugValue() && "Unhandled debug opcode.");
7766         DbgValues.push_back(&NextMI);
7767       }
7768     } else if (User || ++Count > 20)
7769       break;
7770   }
7771 
7772   MachineInstr *LastMI = Selects.back();
7773   bool CCKilled =
7774       (LastMI->killsRegister(SystemZ::CC) || checkCCKill(*LastMI, MBB));
7775   MachineBasicBlock *StartMBB = MBB;
7776   MachineBasicBlock *JoinMBB  = SystemZ::splitBlockAfter(LastMI, MBB);
7777   MachineBasicBlock *FalseMBB = SystemZ::emitBlockAfter(StartMBB);
7778 
7779   // Unless CC was killed in the last Select instruction, mark it as
7780   // live-in to both FalseMBB and JoinMBB.
7781   if (!CCKilled) {
7782     FalseMBB->addLiveIn(SystemZ::CC);
7783     JoinMBB->addLiveIn(SystemZ::CC);
7784   }
7785 
7786   //  StartMBB:
7787   //   BRC CCMask, JoinMBB
7788   //   # fallthrough to FalseMBB
7789   MBB = StartMBB;
7790   BuildMI(MBB, MI.getDebugLoc(), TII->get(SystemZ::BRC))
7791     .addImm(CCValid).addImm(CCMask).addMBB(JoinMBB);
7792   MBB->addSuccessor(JoinMBB);
7793   MBB->addSuccessor(FalseMBB);
7794 
7795   //  FalseMBB:
7796   //   # fallthrough to JoinMBB
7797   MBB = FalseMBB;
7798   MBB->addSuccessor(JoinMBB);
7799 
7800   //  JoinMBB:
7801   //   %Result = phi [ %FalseReg, FalseMBB ], [ %TrueReg, StartMBB ]
7802   //  ...
7803   MBB = JoinMBB;
7804   createPHIsForSelects(Selects, StartMBB, FalseMBB, MBB);
7805   for (auto *SelMI : Selects)
7806     SelMI->eraseFromParent();
7807 
7808   MachineBasicBlock::iterator InsertPos = MBB->getFirstNonPHI();
7809   for (auto *DbgMI : DbgValues)
7810     MBB->splice(InsertPos, StartMBB, DbgMI);
7811 
7812   return JoinMBB;
7813 }
7814 
7815 // Implement EmitInstrWithCustomInserter for pseudo CondStore* instruction MI.
7816 // StoreOpcode is the store to use and Invert says whether the store should
7817 // happen when the condition is false rather than true.  If a STORE ON
7818 // CONDITION is available, STOCOpcode is its opcode, otherwise it is 0.
7819 MachineBasicBlock *SystemZTargetLowering::emitCondStore(MachineInstr &MI,
7820                                                         MachineBasicBlock *MBB,
7821                                                         unsigned StoreOpcode,
7822                                                         unsigned STOCOpcode,
7823                                                         bool Invert) const {
7824   const SystemZInstrInfo *TII = Subtarget.getInstrInfo();
7825 
7826   Register SrcReg = MI.getOperand(0).getReg();
7827   MachineOperand Base = MI.getOperand(1);
7828   int64_t Disp = MI.getOperand(2).getImm();
7829   Register IndexReg = MI.getOperand(3).getReg();
7830   unsigned CCValid = MI.getOperand(4).getImm();
7831   unsigned CCMask = MI.getOperand(5).getImm();
7832   DebugLoc DL = MI.getDebugLoc();
7833 
7834   StoreOpcode = TII->getOpcodeForOffset(StoreOpcode, Disp);
7835 
7836   // ISel pattern matching also adds a load memory operand of the same
7837   // address, so take special care to find the storing memory operand.
7838   MachineMemOperand *MMO = nullptr;
7839   for (auto *I : MI.memoperands())
7840     if (I->isStore()) {
7841       MMO = I;
7842       break;
7843     }
7844 
7845   // Use STOCOpcode if possible.  We could use different store patterns in
7846   // order to avoid matching the index register, but the performance trade-offs
7847   // might be more complicated in that case.
7848   if (STOCOpcode && !IndexReg && Subtarget.hasLoadStoreOnCond()) {
7849     if (Invert)
7850       CCMask ^= CCValid;
7851 
7852     BuildMI(*MBB, MI, DL, TII->get(STOCOpcode))
7853       .addReg(SrcReg)
7854       .add(Base)
7855       .addImm(Disp)
7856       .addImm(CCValid)
7857       .addImm(CCMask)
7858       .addMemOperand(MMO);
7859 
7860     MI.eraseFromParent();
7861     return MBB;
7862   }
7863 
7864   // Get the condition needed to branch around the store.
7865   if (!Invert)
7866     CCMask ^= CCValid;
7867 
7868   MachineBasicBlock *StartMBB = MBB;
7869   MachineBasicBlock *JoinMBB  = SystemZ::splitBlockBefore(MI, MBB);
7870   MachineBasicBlock *FalseMBB = SystemZ::emitBlockAfter(StartMBB);
7871 
7872   // Unless CC was killed in the CondStore instruction, mark it as
7873   // live-in to both FalseMBB and JoinMBB.
7874   if (!MI.killsRegister(SystemZ::CC) && !checkCCKill(MI, JoinMBB)) {
7875     FalseMBB->addLiveIn(SystemZ::CC);
7876     JoinMBB->addLiveIn(SystemZ::CC);
7877   }
7878 
7879   //  StartMBB:
7880   //   BRC CCMask, JoinMBB
7881   //   # fallthrough to FalseMBB
7882   MBB = StartMBB;
7883   BuildMI(MBB, DL, TII->get(SystemZ::BRC))
7884     .addImm(CCValid).addImm(CCMask).addMBB(JoinMBB);
7885   MBB->addSuccessor(JoinMBB);
7886   MBB->addSuccessor(FalseMBB);
7887 
7888   //  FalseMBB:
7889   //   store %SrcReg, %Disp(%Index,%Base)
7890   //   # fallthrough to JoinMBB
7891   MBB = FalseMBB;
7892   BuildMI(MBB, DL, TII->get(StoreOpcode))
7893       .addReg(SrcReg)
7894       .add(Base)
7895       .addImm(Disp)
7896       .addReg(IndexReg)
7897       .addMemOperand(MMO);
7898   MBB->addSuccessor(JoinMBB);
7899 
7900   MI.eraseFromParent();
7901   return JoinMBB;
7902 }
7903 
7904 // Implement EmitInstrWithCustomInserter for pseudo ATOMIC_LOAD{,W}_*
7905 // or ATOMIC_SWAP{,W} instruction MI.  BinOpcode is the instruction that
7906 // performs the binary operation elided by "*", or 0 for ATOMIC_SWAP{,W}.
7907 // BitSize is the width of the field in bits, or 0 if this is a partword
7908 // ATOMIC_LOADW_* or ATOMIC_SWAPW instruction, in which case the bitsize
7909 // is one of the operands.  Invert says whether the field should be
7910 // inverted after performing BinOpcode (e.g. for NAND).
7911 MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary(
7912     MachineInstr &MI, MachineBasicBlock *MBB, unsigned BinOpcode,
7913     unsigned BitSize, bool Invert) const {
7914   MachineFunction &MF = *MBB->getParent();
7915   const SystemZInstrInfo *TII = Subtarget.getInstrInfo();
7916   MachineRegisterInfo &MRI = MF.getRegInfo();
7917   bool IsSubWord = (BitSize < 32);
7918 
7919   // Extract the operands.  Base can be a register or a frame index.
7920   // Src2 can be a register or immediate.
7921   Register Dest = MI.getOperand(0).getReg();
7922   MachineOperand Base = earlyUseOperand(MI.getOperand(1));
7923   int64_t Disp = MI.getOperand(2).getImm();
7924   MachineOperand Src2 = earlyUseOperand(MI.getOperand(3));
7925   Register BitShift = IsSubWord ? MI.getOperand(4).getReg() : Register();
7926   Register NegBitShift = IsSubWord ? MI.getOperand(5).getReg() : Register();
7927   DebugLoc DL = MI.getDebugLoc();
7928   if (IsSubWord)
7929     BitSize = MI.getOperand(6).getImm();
7930 
7931   // Subword operations use 32-bit registers.
7932   const TargetRegisterClass *RC = (BitSize <= 32 ?
7933                                    &SystemZ::GR32BitRegClass :
7934                                    &SystemZ::GR64BitRegClass);
7935   unsigned LOpcode  = BitSize <= 32 ? SystemZ::L  : SystemZ::LG;
7936   unsigned CSOpcode = BitSize <= 32 ? SystemZ::CS : SystemZ::CSG;
7937 
7938   // Get the right opcodes for the displacement.
7939   LOpcode  = TII->getOpcodeForOffset(LOpcode,  Disp);
7940   CSOpcode = TII->getOpcodeForOffset(CSOpcode, Disp);
7941   assert(LOpcode && CSOpcode && "Displacement out of range");
7942 
7943   // Create virtual registers for temporary results.
7944   Register OrigVal       = MRI.createVirtualRegister(RC);
7945   Register OldVal        = MRI.createVirtualRegister(RC);
7946   Register NewVal        = (BinOpcode || IsSubWord ?
7947                             MRI.createVirtualRegister(RC) : Src2.getReg());
7948   Register RotatedOldVal = (IsSubWord ? MRI.createVirtualRegister(RC) : OldVal);
7949   Register RotatedNewVal = (IsSubWord ? MRI.createVirtualRegister(RC) : NewVal);
7950 
7951   // Insert a basic block for the main loop.
7952   MachineBasicBlock *StartMBB = MBB;
7953   MachineBasicBlock *DoneMBB  = SystemZ::splitBlockBefore(MI, MBB);
7954   MachineBasicBlock *LoopMBB  = SystemZ::emitBlockAfter(StartMBB);
7955 
7956   //  StartMBB:
7957   //   ...
7958   //   %OrigVal = L Disp(%Base)
7959   //   # fall through to LoopMBB
7960   MBB = StartMBB;
7961   BuildMI(MBB, DL, TII->get(LOpcode), OrigVal).add(Base).addImm(Disp).addReg(0);
7962   MBB->addSuccessor(LoopMBB);
7963 
7964   //  LoopMBB:
7965   //   %OldVal        = phi [ %OrigVal, StartMBB ], [ %Dest, LoopMBB ]
7966   //   %RotatedOldVal = RLL %OldVal, 0(%BitShift)
7967   //   %RotatedNewVal = OP %RotatedOldVal, %Src2
7968   //   %NewVal        = RLL %RotatedNewVal, 0(%NegBitShift)
7969   //   %Dest          = CS %OldVal, %NewVal, Disp(%Base)
7970   //   JNE LoopMBB
7971   //   # fall through to DoneMBB
7972   MBB = LoopMBB;
7973   BuildMI(MBB, DL, TII->get(SystemZ::PHI), OldVal)
7974     .addReg(OrigVal).addMBB(StartMBB)
7975     .addReg(Dest).addMBB(LoopMBB);
7976   if (IsSubWord)
7977     BuildMI(MBB, DL, TII->get(SystemZ::RLL), RotatedOldVal)
7978       .addReg(OldVal).addReg(BitShift).addImm(0);
7979   if (Invert) {
7980     // Perform the operation normally and then invert every bit of the field.
7981     Register Tmp = MRI.createVirtualRegister(RC);
7982     BuildMI(MBB, DL, TII->get(BinOpcode), Tmp).addReg(RotatedOldVal).add(Src2);
7983     if (BitSize <= 32)
7984       // XILF with the upper BitSize bits set.
7985       BuildMI(MBB, DL, TII->get(SystemZ::XILF), RotatedNewVal)
7986         .addReg(Tmp).addImm(-1U << (32 - BitSize));
7987     else {
7988       // Use LCGR and add -1 to the result, which is more compact than
7989       // an XILF, XILH pair.
7990       Register Tmp2 = MRI.createVirtualRegister(RC);
7991       BuildMI(MBB, DL, TII->get(SystemZ::LCGR), Tmp2).addReg(Tmp);
7992       BuildMI(MBB, DL, TII->get(SystemZ::AGHI), RotatedNewVal)
7993         .addReg(Tmp2).addImm(-1);
7994     }
7995   } else if (BinOpcode)
7996     // A simply binary operation.
7997     BuildMI(MBB, DL, TII->get(BinOpcode), RotatedNewVal)
7998         .addReg(RotatedOldVal)
7999         .add(Src2);
8000   else if (IsSubWord)
8001     // Use RISBG to rotate Src2 into position and use it to replace the
8002     // field in RotatedOldVal.
8003     BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RotatedNewVal)
8004       .addReg(RotatedOldVal).addReg(Src2.getReg())
8005       .addImm(32).addImm(31 + BitSize).addImm(32 - BitSize);
8006   if (IsSubWord)
8007     BuildMI(MBB, DL, TII->get(SystemZ::RLL), NewVal)
8008       .addReg(RotatedNewVal).addReg(NegBitShift).addImm(0);
8009   BuildMI(MBB, DL, TII->get(CSOpcode), Dest)
8010       .addReg(OldVal)
8011       .addReg(NewVal)
8012       .add(Base)
8013       .addImm(Disp);
8014   BuildMI(MBB, DL, TII->get(SystemZ::BRC))
8015     .addImm(SystemZ::CCMASK_CS).addImm(SystemZ::CCMASK_CS_NE).addMBB(LoopMBB);
8016   MBB->addSuccessor(LoopMBB);
8017   MBB->addSuccessor(DoneMBB);
8018 
8019   MI.eraseFromParent();
8020   return DoneMBB;
8021 }
8022 
8023 // Implement EmitInstrWithCustomInserter for pseudo
8024 // ATOMIC_LOAD{,W}_{,U}{MIN,MAX} instruction MI.  CompareOpcode is the
8025 // instruction that should be used to compare the current field with the
8026 // minimum or maximum value.  KeepOldMask is the BRC condition-code mask
8027 // for when the current field should be kept.  BitSize is the width of
8028 // the field in bits, or 0 if this is a partword ATOMIC_LOADW_* instruction.
8029 MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadMinMax(
8030     MachineInstr &MI, MachineBasicBlock *MBB, unsigned CompareOpcode,
8031     unsigned KeepOldMask, unsigned BitSize) const {
8032   MachineFunction &MF = *MBB->getParent();
8033   const SystemZInstrInfo *TII = Subtarget.getInstrInfo();
8034   MachineRegisterInfo &MRI = MF.getRegInfo();
8035   bool IsSubWord = (BitSize < 32);
8036 
8037   // Extract the operands.  Base can be a register or a frame index.
8038   Register Dest = MI.getOperand(0).getReg();
8039   MachineOperand Base = earlyUseOperand(MI.getOperand(1));
8040   int64_t Disp = MI.getOperand(2).getImm();
8041   Register Src2 = MI.getOperand(3).getReg();
8042   Register BitShift = (IsSubWord ? MI.getOperand(4).getReg() : Register());
8043   Register NegBitShift = (IsSubWord ? MI.getOperand(5).getReg() : Register());
8044   DebugLoc DL = MI.getDebugLoc();
8045   if (IsSubWord)
8046     BitSize = MI.getOperand(6).getImm();
8047 
8048   // Subword operations use 32-bit registers.
8049   const TargetRegisterClass *RC = (BitSize <= 32 ?
8050                                    &SystemZ::GR32BitRegClass :
8051                                    &SystemZ::GR64BitRegClass);
8052   unsigned LOpcode  = BitSize <= 32 ? SystemZ::L  : SystemZ::LG;
8053   unsigned CSOpcode = BitSize <= 32 ? SystemZ::CS : SystemZ::CSG;
8054 
8055   // Get the right opcodes for the displacement.
8056   LOpcode  = TII->getOpcodeForOffset(LOpcode,  Disp);
8057   CSOpcode = TII->getOpcodeForOffset(CSOpcode, Disp);
8058   assert(LOpcode && CSOpcode && "Displacement out of range");
8059 
8060   // Create virtual registers for temporary results.
8061   Register OrigVal       = MRI.createVirtualRegister(RC);
8062   Register OldVal        = MRI.createVirtualRegister(RC);
8063   Register NewVal        = MRI.createVirtualRegister(RC);
8064   Register RotatedOldVal = (IsSubWord ? MRI.createVirtualRegister(RC) : OldVal);
8065   Register RotatedAltVal = (IsSubWord ? MRI.createVirtualRegister(RC) : Src2);
8066   Register RotatedNewVal = (IsSubWord ? MRI.createVirtualRegister(RC) : NewVal);
8067 
8068   // Insert 3 basic blocks for the loop.
8069   MachineBasicBlock *StartMBB  = MBB;
8070   MachineBasicBlock *DoneMBB   = SystemZ::splitBlockBefore(MI, MBB);
8071   MachineBasicBlock *LoopMBB   = SystemZ::emitBlockAfter(StartMBB);
8072   MachineBasicBlock *UseAltMBB = SystemZ::emitBlockAfter(LoopMBB);
8073   MachineBasicBlock *UpdateMBB = SystemZ::emitBlockAfter(UseAltMBB);
8074 
8075   //  StartMBB:
8076   //   ...
8077   //   %OrigVal     = L Disp(%Base)
8078   //   # fall through to LoopMBB
8079   MBB = StartMBB;
8080   BuildMI(MBB, DL, TII->get(LOpcode), OrigVal).add(Base).addImm(Disp).addReg(0);
8081   MBB->addSuccessor(LoopMBB);
8082 
8083   //  LoopMBB:
8084   //   %OldVal        = phi [ %OrigVal, StartMBB ], [ %Dest, UpdateMBB ]
8085   //   %RotatedOldVal = RLL %OldVal, 0(%BitShift)
8086   //   CompareOpcode %RotatedOldVal, %Src2
8087   //   BRC KeepOldMask, UpdateMBB
8088   MBB = LoopMBB;
8089   BuildMI(MBB, DL, TII->get(SystemZ::PHI), OldVal)
8090     .addReg(OrigVal).addMBB(StartMBB)
8091     .addReg(Dest).addMBB(UpdateMBB);
8092   if (IsSubWord)
8093     BuildMI(MBB, DL, TII->get(SystemZ::RLL), RotatedOldVal)
8094       .addReg(OldVal).addReg(BitShift).addImm(0);
8095   BuildMI(MBB, DL, TII->get(CompareOpcode))
8096     .addReg(RotatedOldVal).addReg(Src2);
8097   BuildMI(MBB, DL, TII->get(SystemZ::BRC))
8098     .addImm(SystemZ::CCMASK_ICMP).addImm(KeepOldMask).addMBB(UpdateMBB);
8099   MBB->addSuccessor(UpdateMBB);
8100   MBB->addSuccessor(UseAltMBB);
8101 
8102   //  UseAltMBB:
8103   //   %RotatedAltVal = RISBG %RotatedOldVal, %Src2, 32, 31 + BitSize, 0
8104   //   # fall through to UpdateMBB
8105   MBB = UseAltMBB;
8106   if (IsSubWord)
8107     BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RotatedAltVal)
8108       .addReg(RotatedOldVal).addReg(Src2)
8109       .addImm(32).addImm(31 + BitSize).addImm(0);
8110   MBB->addSuccessor(UpdateMBB);
8111 
8112   //  UpdateMBB:
8113   //   %RotatedNewVal = PHI [ %RotatedOldVal, LoopMBB ],
8114   //                        [ %RotatedAltVal, UseAltMBB ]
8115   //   %NewVal        = RLL %RotatedNewVal, 0(%NegBitShift)
8116   //   %Dest          = CS %OldVal, %NewVal, Disp(%Base)
8117   //   JNE LoopMBB
8118   //   # fall through to DoneMBB
8119   MBB = UpdateMBB;
8120   BuildMI(MBB, DL, TII->get(SystemZ::PHI), RotatedNewVal)
8121     .addReg(RotatedOldVal).addMBB(LoopMBB)
8122     .addReg(RotatedAltVal).addMBB(UseAltMBB);
8123   if (IsSubWord)
8124     BuildMI(MBB, DL, TII->get(SystemZ::RLL), NewVal)
8125       .addReg(RotatedNewVal).addReg(NegBitShift).addImm(0);
8126   BuildMI(MBB, DL, TII->get(CSOpcode), Dest)
8127       .addReg(OldVal)
8128       .addReg(NewVal)
8129       .add(Base)
8130       .addImm(Disp);
8131   BuildMI(MBB, DL, TII->get(SystemZ::BRC))
8132     .addImm(SystemZ::CCMASK_CS).addImm(SystemZ::CCMASK_CS_NE).addMBB(LoopMBB);
8133   MBB->addSuccessor(LoopMBB);
8134   MBB->addSuccessor(DoneMBB);
8135 
8136   MI.eraseFromParent();
8137   return DoneMBB;
8138 }
8139 
8140 // Implement EmitInstrWithCustomInserter for pseudo ATOMIC_CMP_SWAPW
8141 // instruction MI.
8142 MachineBasicBlock *
8143 SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr &MI,
8144                                           MachineBasicBlock *MBB) const {
8145   MachineFunction &MF = *MBB->getParent();
8146   const SystemZInstrInfo *TII = Subtarget.getInstrInfo();
8147   MachineRegisterInfo &MRI = MF.getRegInfo();
8148 
8149   // Extract the operands.  Base can be a register or a frame index.
8150   Register Dest = MI.getOperand(0).getReg();
8151   MachineOperand Base = earlyUseOperand(MI.getOperand(1));
8152   int64_t Disp = MI.getOperand(2).getImm();
8153   Register CmpVal = MI.getOperand(3).getReg();
8154   Register OrigSwapVal = MI.getOperand(4).getReg();
8155   Register BitShift = MI.getOperand(5).getReg();
8156   Register NegBitShift = MI.getOperand(6).getReg();
8157   int64_t BitSize = MI.getOperand(7).getImm();
8158   DebugLoc DL = MI.getDebugLoc();
8159 
8160   const TargetRegisterClass *RC = &SystemZ::GR32BitRegClass;
8161 
8162   // Get the right opcodes for the displacement and zero-extension.
8163   unsigned LOpcode  = TII->getOpcodeForOffset(SystemZ::L,  Disp);
8164   unsigned CSOpcode = TII->getOpcodeForOffset(SystemZ::CS, Disp);
8165   unsigned ZExtOpcode  = BitSize == 8 ? SystemZ::LLCR : SystemZ::LLHR;
8166   assert(LOpcode && CSOpcode && "Displacement out of range");
8167 
8168   // Create virtual registers for temporary results.
8169   Register OrigOldVal = MRI.createVirtualRegister(RC);
8170   Register OldVal = MRI.createVirtualRegister(RC);
8171   Register SwapVal = MRI.createVirtualRegister(RC);
8172   Register StoreVal = MRI.createVirtualRegister(RC);
8173   Register OldValRot = MRI.createVirtualRegister(RC);
8174   Register RetryOldVal = MRI.createVirtualRegister(RC);
8175   Register RetrySwapVal = MRI.createVirtualRegister(RC);
8176 
8177   // Insert 2 basic blocks for the loop.
8178   MachineBasicBlock *StartMBB = MBB;
8179   MachineBasicBlock *DoneMBB  = SystemZ::splitBlockBefore(MI, MBB);
8180   MachineBasicBlock *LoopMBB  = SystemZ::emitBlockAfter(StartMBB);
8181   MachineBasicBlock *SetMBB   = SystemZ::emitBlockAfter(LoopMBB);
8182 
8183   //  StartMBB:
8184   //   ...
8185   //   %OrigOldVal     = L Disp(%Base)
8186   //   # fall through to LoopMBB
8187   MBB = StartMBB;
8188   BuildMI(MBB, DL, TII->get(LOpcode), OrigOldVal)
8189       .add(Base)
8190       .addImm(Disp)
8191       .addReg(0);
8192   MBB->addSuccessor(LoopMBB);
8193 
8194   //  LoopMBB:
8195   //   %OldVal        = phi [ %OrigOldVal, EntryBB ], [ %RetryOldVal, SetMBB ]
8196   //   %SwapVal       = phi [ %OrigSwapVal, EntryBB ], [ %RetrySwapVal, SetMBB ]
8197   //   %OldValRot     = RLL %OldVal, BitSize(%BitShift)
8198   //                      ^^ The low BitSize bits contain the field
8199   //                         of interest.
8200   //   %RetrySwapVal = RISBG32 %SwapVal, %OldValRot, 32, 63-BitSize, 0
8201   //                      ^^ Replace the upper 32-BitSize bits of the
8202   //                         swap value with those that we loaded and rotated.
8203   //   %Dest = LL[CH] %OldValRot
8204   //   CR %Dest, %CmpVal
8205   //   JNE DoneMBB
8206   //   # Fall through to SetMBB
8207   MBB = LoopMBB;
8208   BuildMI(MBB, DL, TII->get(SystemZ::PHI), OldVal)
8209     .addReg(OrigOldVal).addMBB(StartMBB)
8210     .addReg(RetryOldVal).addMBB(SetMBB);
8211   BuildMI(MBB, DL, TII->get(SystemZ::PHI), SwapVal)
8212     .addReg(OrigSwapVal).addMBB(StartMBB)
8213     .addReg(RetrySwapVal).addMBB(SetMBB);
8214   BuildMI(MBB, DL, TII->get(SystemZ::RLL), OldValRot)
8215     .addReg(OldVal).addReg(BitShift).addImm(BitSize);
8216   BuildMI(MBB, DL, TII->get(SystemZ::RISBG32), RetrySwapVal)
8217     .addReg(SwapVal).addReg(OldValRot).addImm(32).addImm(63 - BitSize).addImm(0);
8218   BuildMI(MBB, DL, TII->get(ZExtOpcode), Dest)
8219     .addReg(OldValRot);
8220   BuildMI(MBB, DL, TII->get(SystemZ::CR))
8221     .addReg(Dest).addReg(CmpVal);
8222   BuildMI(MBB, DL, TII->get(SystemZ::BRC))
8223     .addImm(SystemZ::CCMASK_ICMP)
8224     .addImm(SystemZ::CCMASK_CMP_NE).addMBB(DoneMBB);
8225   MBB->addSuccessor(DoneMBB);
8226   MBB->addSuccessor(SetMBB);
8227 
8228   //  SetMBB:
8229   //   %StoreVal     = RLL %RetrySwapVal, -BitSize(%NegBitShift)
8230   //                      ^^ Rotate the new field to its proper position.
8231   //   %RetryOldVal  = CS %OldVal, %StoreVal, Disp(%Base)
8232   //   JNE LoopMBB
8233   //   # fall through to ExitMBB
8234   MBB = SetMBB;
8235   BuildMI(MBB, DL, TII->get(SystemZ::RLL), StoreVal)
8236     .addReg(RetrySwapVal).addReg(NegBitShift).addImm(-BitSize);
8237   BuildMI(MBB, DL, TII->get(CSOpcode), RetryOldVal)
8238       .addReg(OldVal)
8239       .addReg(StoreVal)
8240       .add(Base)
8241       .addImm(Disp);
8242   BuildMI(MBB, DL, TII->get(SystemZ::BRC))
8243     .addImm(SystemZ::CCMASK_CS).addImm(SystemZ::CCMASK_CS_NE).addMBB(LoopMBB);
8244   MBB->addSuccessor(LoopMBB);
8245   MBB->addSuccessor(DoneMBB);
8246 
8247   // If the CC def wasn't dead in the ATOMIC_CMP_SWAPW, mark CC as live-in
8248   // to the block after the loop.  At this point, CC may have been defined
8249   // either by the CR in LoopMBB or by the CS in SetMBB.
8250   if (!MI.registerDefIsDead(SystemZ::CC))
8251     DoneMBB->addLiveIn(SystemZ::CC);
8252 
8253   MI.eraseFromParent();
8254   return DoneMBB;
8255 }
8256 
8257 // Emit a move from two GR64s to a GR128.
8258 MachineBasicBlock *
8259 SystemZTargetLowering::emitPair128(MachineInstr &MI,
8260                                    MachineBasicBlock *MBB) const {
8261   MachineFunction &MF = *MBB->getParent();
8262   const SystemZInstrInfo *TII = Subtarget.getInstrInfo();
8263   MachineRegisterInfo &MRI = MF.getRegInfo();
8264   DebugLoc DL = MI.getDebugLoc();
8265 
8266   Register Dest = MI.getOperand(0).getReg();
8267   Register Hi = MI.getOperand(1).getReg();
8268   Register Lo = MI.getOperand(2).getReg();
8269   Register Tmp1 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass);
8270   Register Tmp2 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass);
8271 
8272   BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Tmp1);
8273   BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), Tmp2)
8274     .addReg(Tmp1).addReg(Hi).addImm(SystemZ::subreg_h64);
8275   BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dest)
8276     .addReg(Tmp2).addReg(Lo).addImm(SystemZ::subreg_l64);
8277 
8278   MI.eraseFromParent();
8279   return MBB;
8280 }
8281 
8282 // Emit an extension from a GR64 to a GR128.  ClearEven is true
8283 // if the high register of the GR128 value must be cleared or false if
8284 // it's "don't care".
8285 MachineBasicBlock *SystemZTargetLowering::emitExt128(MachineInstr &MI,
8286                                                      MachineBasicBlock *MBB,
8287                                                      bool ClearEven) const {
8288   MachineFunction &MF = *MBB->getParent();
8289   const SystemZInstrInfo *TII = Subtarget.getInstrInfo();
8290   MachineRegisterInfo &MRI = MF.getRegInfo();
8291   DebugLoc DL = MI.getDebugLoc();
8292 
8293   Register Dest = MI.getOperand(0).getReg();
8294   Register Src = MI.getOperand(1).getReg();
8295   Register In128 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass);
8296 
8297   BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::IMPLICIT_DEF), In128);
8298   if (ClearEven) {
8299     Register NewIn128 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass);
8300     Register Zero64 = MRI.createVirtualRegister(&SystemZ::GR64BitRegClass);
8301 
8302     BuildMI(*MBB, MI, DL, TII->get(SystemZ::LLILL), Zero64)
8303       .addImm(0);
8304     BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewIn128)
8305       .addReg(In128).addReg(Zero64).addImm(SystemZ::subreg_h64);
8306     In128 = NewIn128;
8307   }
8308   BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dest)
8309     .addReg(In128).addReg(Src).addImm(SystemZ::subreg_l64);
8310 
8311   MI.eraseFromParent();
8312   return MBB;
8313 }
8314 
8315 MachineBasicBlock *
8316 SystemZTargetLowering::emitMemMemWrapper(MachineInstr &MI,
8317                                          MachineBasicBlock *MBB,
8318                                          unsigned Opcode, bool IsMemset) const {
8319   MachineFunction &MF = *MBB->getParent();
8320   const SystemZInstrInfo *TII = Subtarget.getInstrInfo();
8321   MachineRegisterInfo &MRI = MF.getRegInfo();
8322   DebugLoc DL = MI.getDebugLoc();
8323 
8324   MachineOperand DestBase = earlyUseOperand(MI.getOperand(0));
8325   uint64_t DestDisp = MI.getOperand(1).getImm();
8326   MachineOperand SrcBase = MachineOperand::CreateReg(0U, false);
8327   uint64_t SrcDisp;
8328 
8329   // Fold the displacement Disp if it is out of range.
8330   auto foldDisplIfNeeded = [&](MachineOperand &Base, uint64_t &Disp) -> void {
8331     if (!isUInt<12>(Disp)) {
8332       Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
8333       unsigned Opcode = TII->getOpcodeForOffset(SystemZ::LA, Disp);
8334       BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opcode), Reg)
8335         .add(Base).addImm(Disp).addReg(0);
8336       Base = MachineOperand::CreateReg(Reg, false);
8337       Disp = 0;
8338     }
8339   };
8340 
8341   if (!IsMemset) {
8342     SrcBase = earlyUseOperand(MI.getOperand(2));
8343     SrcDisp = MI.getOperand(3).getImm();
8344   } else {
8345     SrcBase = DestBase;
8346     SrcDisp = DestDisp++;
8347     foldDisplIfNeeded(DestBase, DestDisp);
8348   }
8349 
8350   MachineOperand &LengthMO = MI.getOperand(IsMemset ? 2 : 4);
8351   bool IsImmForm = LengthMO.isImm();
8352   bool IsRegForm = !IsImmForm;
8353 
8354   // Build and insert one Opcode of Length, with special treatment for memset.
8355   auto insertMemMemOp = [&](MachineBasicBlock *InsMBB,
8356                             MachineBasicBlock::iterator InsPos,
8357                             MachineOperand DBase, uint64_t DDisp,
8358                             MachineOperand SBase, uint64_t SDisp,
8359                             unsigned Length) -> void {
8360     assert(Length > 0 && Length <= 256 && "Building memory op with bad length.");
8361     if (IsMemset) {
8362       MachineOperand ByteMO = earlyUseOperand(MI.getOperand(3));
8363       if (ByteMO.isImm())
8364         BuildMI(*InsMBB, InsPos, DL, TII->get(SystemZ::MVI))
8365           .add(SBase).addImm(SDisp).add(ByteMO);
8366       else
8367         BuildMI(*InsMBB, InsPos, DL, TII->get(SystemZ::STC))
8368           .add(ByteMO).add(SBase).addImm(SDisp).addReg(0);
8369       if (--Length == 0)
8370         return;
8371     }
8372     BuildMI(*MBB, InsPos, DL, TII->get(Opcode))
8373       .add(DBase).addImm(DDisp).addImm(Length)
8374       .add(SBase).addImm(SDisp)
8375       .setMemRefs(MI.memoperands());
8376   };
8377 
8378   bool NeedsLoop = false;
8379   uint64_t ImmLength = 0;
8380   Register LenAdjReg = SystemZ::NoRegister;
8381   if (IsImmForm) {
8382     ImmLength = LengthMO.getImm();
8383     ImmLength += IsMemset ? 2 : 1; // Add back the subtracted adjustment.
8384     if (ImmLength == 0) {
8385       MI.eraseFromParent();
8386       return MBB;
8387     }
8388     if (Opcode == SystemZ::CLC) {
8389       if (ImmLength > 3 * 256)
8390         // A two-CLC sequence is a clear win over a loop, not least because
8391         // it needs only one branch.  A three-CLC sequence needs the same
8392         // number of branches as a loop (i.e. 2), but is shorter.  That
8393         // brings us to lengths greater than 768 bytes.  It seems relatively
8394         // likely that a difference will be found within the first 768 bytes,
8395         // so we just optimize for the smallest number of branch
8396         // instructions, in order to avoid polluting the prediction buffer
8397         // too much.
8398         NeedsLoop = true;
8399     } else if (ImmLength > 6 * 256)
8400       // The heuristic we use is to prefer loops for anything that would
8401       // require 7 or more MVCs.  With these kinds of sizes there isn't much
8402       // to choose between straight-line code and looping code, since the
8403       // time will be dominated by the MVCs themselves.
8404       NeedsLoop = true;
8405   } else {
8406     NeedsLoop = true;
8407     LenAdjReg = LengthMO.getReg();
8408   }
8409 
8410   // When generating more than one CLC, all but the last will need to
8411   // branch to the end when a difference is found.
8412   MachineBasicBlock *EndMBB =
8413       (Opcode == SystemZ::CLC && (ImmLength > 256 || NeedsLoop)
8414            ? SystemZ::splitBlockAfter(MI, MBB)
8415            : nullptr);
8416 
8417   if (NeedsLoop) {
8418     Register StartCountReg =
8419       MRI.createVirtualRegister(&SystemZ::GR64BitRegClass);
8420     if (IsImmForm) {
8421       TII->loadImmediate(*MBB, MI, StartCountReg, ImmLength / 256);
8422       ImmLength &= 255;
8423     } else {
8424       BuildMI(*MBB, MI, DL, TII->get(SystemZ::SRLG), StartCountReg)
8425         .addReg(LenAdjReg)
8426         .addReg(0)
8427         .addImm(8);
8428     }
8429 
8430     bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
8431     auto loadZeroAddress = [&]() -> MachineOperand {
8432       Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
8433       BuildMI(*MBB, MI, DL, TII->get(SystemZ::LGHI), Reg).addImm(0);
8434       return MachineOperand::CreateReg(Reg, false);
8435     };
8436     if (DestBase.isReg() && DestBase.getReg() == SystemZ::NoRegister)
8437       DestBase = loadZeroAddress();
8438     if (SrcBase.isReg() && SrcBase.getReg() == SystemZ::NoRegister)
8439       SrcBase = HaveSingleBase ? DestBase : loadZeroAddress();
8440 
8441     MachineBasicBlock *StartMBB = nullptr;
8442     MachineBasicBlock *LoopMBB = nullptr;
8443     MachineBasicBlock *NextMBB = nullptr;
8444     MachineBasicBlock *DoneMBB = nullptr;
8445     MachineBasicBlock *AllDoneMBB = nullptr;
8446 
8447     Register StartSrcReg = forceReg(MI, SrcBase, TII);
8448     Register StartDestReg =
8449         (HaveSingleBase ? StartSrcReg : forceReg(MI, DestBase, TII));
8450 
8451     const TargetRegisterClass *RC = &SystemZ::ADDR64BitRegClass;
8452     Register ThisSrcReg  = MRI.createVirtualRegister(RC);
8453     Register ThisDestReg =
8454         (HaveSingleBase ? ThisSrcReg : MRI.createVirtualRegister(RC));
8455     Register NextSrcReg  = MRI.createVirtualRegister(RC);
8456     Register NextDestReg =
8457         (HaveSingleBase ? NextSrcReg : MRI.createVirtualRegister(RC));
8458     RC = &SystemZ::GR64BitRegClass;
8459     Register ThisCountReg = MRI.createVirtualRegister(RC);
8460     Register NextCountReg = MRI.createVirtualRegister(RC);
8461 
8462     if (IsRegForm) {
8463       AllDoneMBB = SystemZ::splitBlockBefore(MI, MBB);
8464       StartMBB = SystemZ::emitBlockAfter(MBB);
8465       LoopMBB = SystemZ::emitBlockAfter(StartMBB);
8466       NextMBB = (EndMBB ? SystemZ::emitBlockAfter(LoopMBB) : LoopMBB);
8467       DoneMBB = SystemZ::emitBlockAfter(NextMBB);
8468 
8469       //  MBB:
8470       //   # Jump to AllDoneMBB if LenAdjReg means 0, or fall thru to StartMBB.
8471       BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
8472         .addReg(LenAdjReg).addImm(IsMemset ? -2 : -1);
8473       BuildMI(MBB, DL, TII->get(SystemZ::BRC))
8474         .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ)
8475         .addMBB(AllDoneMBB);
8476       MBB->addSuccessor(AllDoneMBB);
8477       if (!IsMemset)
8478         MBB->addSuccessor(StartMBB);
8479       else {
8480         // MemsetOneCheckMBB:
8481         // # Jump to MemsetOneMBB for a memset of length 1, or
8482         // # fall thru to StartMBB.
8483         MachineBasicBlock *MemsetOneCheckMBB = SystemZ::emitBlockAfter(MBB);
8484         MachineBasicBlock *MemsetOneMBB = SystemZ::emitBlockAfter(&*MF.rbegin());
8485         MBB->addSuccessor(MemsetOneCheckMBB);
8486         MBB = MemsetOneCheckMBB;
8487         BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
8488           .addReg(LenAdjReg).addImm(-1);
8489         BuildMI(MBB, DL, TII->get(SystemZ::BRC))
8490           .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ)
8491           .addMBB(MemsetOneMBB);
8492         MBB->addSuccessor(MemsetOneMBB, {10, 100});
8493         MBB->addSuccessor(StartMBB, {90, 100});
8494 
8495         // MemsetOneMBB:
8496         // # Jump back to AllDoneMBB after a single MVI or STC.
8497         MBB = MemsetOneMBB;
8498         insertMemMemOp(MBB, MBB->end(),
8499                        MachineOperand::CreateReg(StartDestReg, false), DestDisp,
8500                        MachineOperand::CreateReg(StartSrcReg, false), SrcDisp,
8501                        1);
8502         BuildMI(MBB, DL, TII->get(SystemZ::J)).addMBB(AllDoneMBB);
8503         MBB->addSuccessor(AllDoneMBB);
8504       }
8505 
8506       // StartMBB:
8507       // # Jump to DoneMBB if %StartCountReg is zero, or fall through to LoopMBB.
8508       MBB = StartMBB;
8509       BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
8510         .addReg(StartCountReg).addImm(0);
8511       BuildMI(MBB, DL, TII->get(SystemZ::BRC))
8512         .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ)
8513         .addMBB(DoneMBB);
8514       MBB->addSuccessor(DoneMBB);
8515       MBB->addSuccessor(LoopMBB);
8516     }
8517     else {
8518       StartMBB = MBB;
8519       DoneMBB = SystemZ::splitBlockBefore(MI, MBB);
8520       LoopMBB = SystemZ::emitBlockAfter(StartMBB);
8521       NextMBB = (EndMBB ? SystemZ::emitBlockAfter(LoopMBB) : LoopMBB);
8522 
8523       //  StartMBB:
8524       //   # fall through to LoopMBB
8525       MBB->addSuccessor(LoopMBB);
8526 
8527       DestBase = MachineOperand::CreateReg(NextDestReg, false);
8528       SrcBase = MachineOperand::CreateReg(NextSrcReg, false);
8529       if (EndMBB && !ImmLength)
8530         // If the loop handled the whole CLC range, DoneMBB will be empty with
8531         // CC live-through into EndMBB, so add it as live-in.
8532         DoneMBB->addLiveIn(SystemZ::CC);
8533     }
8534 
8535     //  LoopMBB:
8536     //   %ThisDestReg = phi [ %StartDestReg, StartMBB ],
8537     //                      [ %NextDestReg, NextMBB ]
8538     //   %ThisSrcReg = phi [ %StartSrcReg, StartMBB ],
8539     //                     [ %NextSrcReg, NextMBB ]
8540     //   %ThisCountReg = phi [ %StartCountReg, StartMBB ],
8541     //                       [ %NextCountReg, NextMBB ]
8542     //   ( PFD 2, 768+DestDisp(%ThisDestReg) )
8543     //   Opcode DestDisp(256,%ThisDestReg), SrcDisp(%ThisSrcReg)
8544     //   ( JLH EndMBB )
8545     //
8546     // The prefetch is used only for MVC.  The JLH is used only for CLC.
8547     MBB = LoopMBB;
8548     BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisDestReg)
8549       .addReg(StartDestReg).addMBB(StartMBB)
8550       .addReg(NextDestReg).addMBB(NextMBB);
8551     if (!HaveSingleBase)
8552       BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisSrcReg)
8553         .addReg(StartSrcReg).addMBB(StartMBB)
8554         .addReg(NextSrcReg).addMBB(NextMBB);
8555     BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisCountReg)
8556       .addReg(StartCountReg).addMBB(StartMBB)
8557       .addReg(NextCountReg).addMBB(NextMBB);
8558     if (Opcode == SystemZ::MVC)
8559       BuildMI(MBB, DL, TII->get(SystemZ::PFD))
8560         .addImm(SystemZ::PFD_WRITE)
8561         .addReg(ThisDestReg).addImm(DestDisp - IsMemset + 768).addReg(0);
8562     insertMemMemOp(MBB, MBB->end(),
8563                    MachineOperand::CreateReg(ThisDestReg, false), DestDisp,
8564                    MachineOperand::CreateReg(ThisSrcReg, false), SrcDisp, 256);
8565     if (EndMBB) {
8566       BuildMI(MBB, DL, TII->get(SystemZ::BRC))
8567         .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
8568         .addMBB(EndMBB);
8569       MBB->addSuccessor(EndMBB);
8570       MBB->addSuccessor(NextMBB);
8571     }
8572 
8573     // NextMBB:
8574     //   %NextDestReg = LA 256(%ThisDestReg)
8575     //   %NextSrcReg = LA 256(%ThisSrcReg)
8576     //   %NextCountReg = AGHI %ThisCountReg, -1
8577     //   CGHI %NextCountReg, 0
8578     //   JLH LoopMBB
8579     //   # fall through to DoneMBB
8580     //
8581     // The AGHI, CGHI and JLH should be converted to BRCTG by later passes.
8582     MBB = NextMBB;
8583     BuildMI(MBB, DL, TII->get(SystemZ::LA), NextDestReg)
8584       .addReg(ThisDestReg).addImm(256).addReg(0);
8585     if (!HaveSingleBase)
8586       BuildMI(MBB, DL, TII->get(SystemZ::LA), NextSrcReg)
8587         .addReg(ThisSrcReg).addImm(256).addReg(0);
8588     BuildMI(MBB, DL, TII->get(SystemZ::AGHI), NextCountReg)
8589       .addReg(ThisCountReg).addImm(-1);
8590     BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
8591       .addReg(NextCountReg).addImm(0);
8592     BuildMI(MBB, DL, TII->get(SystemZ::BRC))
8593       .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
8594       .addMBB(LoopMBB);
8595     MBB->addSuccessor(LoopMBB);
8596     MBB->addSuccessor(DoneMBB);
8597 
8598     MBB = DoneMBB;
8599     if (IsRegForm) {
8600       // DoneMBB:
8601       // # Make PHIs for RemDestReg/RemSrcReg as the loop may or may not run.
8602       // # Use EXecute Relative Long for the remainder of the bytes. The target
8603       //   instruction of the EXRL will have a length field of 1 since 0 is an
8604       //   illegal value. The number of bytes processed becomes (%LenAdjReg &
8605       //   0xff) + 1.
8606       // # Fall through to AllDoneMBB.
8607       Register RemSrcReg  = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
8608       Register RemDestReg = HaveSingleBase ? RemSrcReg
8609         : MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
8610       BuildMI(MBB, DL, TII->get(SystemZ::PHI), RemDestReg)
8611         .addReg(StartDestReg).addMBB(StartMBB)
8612         .addReg(NextDestReg).addMBB(NextMBB);
8613       if (!HaveSingleBase)
8614         BuildMI(MBB, DL, TII->get(SystemZ::PHI), RemSrcReg)
8615           .addReg(StartSrcReg).addMBB(StartMBB)
8616           .addReg(NextSrcReg).addMBB(NextMBB);
8617       if (IsMemset)
8618         insertMemMemOp(MBB, MBB->end(),
8619                        MachineOperand::CreateReg(RemDestReg, false), DestDisp,
8620                        MachineOperand::CreateReg(RemSrcReg, false), SrcDisp, 1);
8621       MachineInstrBuilder EXRL_MIB =
8622         BuildMI(MBB, DL, TII->get(SystemZ::EXRL_Pseudo))
8623           .addImm(Opcode)
8624           .addReg(LenAdjReg)
8625           .addReg(RemDestReg).addImm(DestDisp)
8626           .addReg(RemSrcReg).addImm(SrcDisp);
8627       MBB->addSuccessor(AllDoneMBB);
8628       MBB = AllDoneMBB;
8629       if (Opcode != SystemZ::MVC) {
8630         EXRL_MIB.addReg(SystemZ::CC, RegState::ImplicitDefine);
8631         if (EndMBB)
8632           MBB->addLiveIn(SystemZ::CC);
8633       }
8634     }
8635     MF.getProperties().reset(MachineFunctionProperties::Property::NoPHIs);
8636   }
8637 
8638   // Handle any remaining bytes with straight-line code.
8639   while (ImmLength > 0) {
8640     uint64_t ThisLength = std::min(ImmLength, uint64_t(256));
8641     // The previous iteration might have created out-of-range displacements.
8642     // Apply them using LA/LAY if so.
8643     foldDisplIfNeeded(DestBase, DestDisp);
8644     foldDisplIfNeeded(SrcBase, SrcDisp);
8645     insertMemMemOp(MBB, MI, DestBase, DestDisp, SrcBase, SrcDisp, ThisLength);
8646     DestDisp += ThisLength;
8647     SrcDisp += ThisLength;
8648     ImmLength -= ThisLength;
8649     // If there's another CLC to go, branch to the end if a difference
8650     // was found.
8651     if (EndMBB && ImmLength > 0) {
8652       MachineBasicBlock *NextMBB = SystemZ::splitBlockBefore(MI, MBB);
8653       BuildMI(MBB, DL, TII->get(SystemZ::BRC))
8654         .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
8655         .addMBB(EndMBB);
8656       MBB->addSuccessor(EndMBB);
8657       MBB->addSuccessor(NextMBB);
8658       MBB = NextMBB;
8659     }
8660   }
8661   if (EndMBB) {
8662     MBB->addSuccessor(EndMBB);
8663     MBB = EndMBB;
8664     MBB->addLiveIn(SystemZ::CC);
8665   }
8666 
8667   MI.eraseFromParent();
8668   return MBB;
8669 }
8670 
8671 // Decompose string pseudo-instruction MI into a loop that continually performs
8672 // Opcode until CC != 3.
8673 MachineBasicBlock *SystemZTargetLowering::emitStringWrapper(
8674     MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const {
8675   MachineFunction &MF = *MBB->getParent();
8676   const SystemZInstrInfo *TII = Subtarget.getInstrInfo();
8677   MachineRegisterInfo &MRI = MF.getRegInfo();
8678   DebugLoc DL = MI.getDebugLoc();
8679 
8680   uint64_t End1Reg = MI.getOperand(0).getReg();
8681   uint64_t Start1Reg = MI.getOperand(1).getReg();
8682   uint64_t Start2Reg = MI.getOperand(2).getReg();
8683   uint64_t CharReg = MI.getOperand(3).getReg();
8684 
8685   const TargetRegisterClass *RC = &SystemZ::GR64BitRegClass;
8686   uint64_t This1Reg = MRI.createVirtualRegister(RC);
8687   uint64_t This2Reg = MRI.createVirtualRegister(RC);
8688   uint64_t End2Reg  = MRI.createVirtualRegister(RC);
8689 
8690   MachineBasicBlock *StartMBB = MBB;
8691   MachineBasicBlock *DoneMBB = SystemZ::splitBlockBefore(MI, MBB);
8692   MachineBasicBlock *LoopMBB = SystemZ::emitBlockAfter(StartMBB);
8693 
8694   //  StartMBB:
8695   //   # fall through to LoopMBB
8696   MBB->addSuccessor(LoopMBB);
8697 
8698   //  LoopMBB:
8699   //   %This1Reg = phi [ %Start1Reg, StartMBB ], [ %End1Reg, LoopMBB ]
8700   //   %This2Reg = phi [ %Start2Reg, StartMBB ], [ %End2Reg, LoopMBB ]
8701   //   R0L = %CharReg
8702   //   %End1Reg, %End2Reg = CLST %This1Reg, %This2Reg -- uses R0L
8703   //   JO LoopMBB
8704   //   # fall through to DoneMBB
8705   //
8706   // The load of R0L can be hoisted by post-RA LICM.
8707   MBB = LoopMBB;
8708 
8709   BuildMI(MBB, DL, TII->get(SystemZ::PHI), This1Reg)
8710     .addReg(Start1Reg).addMBB(StartMBB)
8711     .addReg(End1Reg).addMBB(LoopMBB);
8712   BuildMI(MBB, DL, TII->get(SystemZ::PHI), This2Reg)
8713     .addReg(Start2Reg).addMBB(StartMBB)
8714     .addReg(End2Reg).addMBB(LoopMBB);
8715   BuildMI(MBB, DL, TII->get(TargetOpcode::COPY), SystemZ::R0L).addReg(CharReg);
8716   BuildMI(MBB, DL, TII->get(Opcode))
8717     .addReg(End1Reg, RegState::Define).addReg(End2Reg, RegState::Define)
8718     .addReg(This1Reg).addReg(This2Reg);
8719   BuildMI(MBB, DL, TII->get(SystemZ::BRC))
8720     .addImm(SystemZ::CCMASK_ANY).addImm(SystemZ::CCMASK_3).addMBB(LoopMBB);
8721   MBB->addSuccessor(LoopMBB);
8722   MBB->addSuccessor(DoneMBB);
8723 
8724   DoneMBB->addLiveIn(SystemZ::CC);
8725 
8726   MI.eraseFromParent();
8727   return DoneMBB;
8728 }
8729 
8730 // Update TBEGIN instruction with final opcode and register clobbers.
8731 MachineBasicBlock *SystemZTargetLowering::emitTransactionBegin(
8732     MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode,
8733     bool NoFloat) const {
8734   MachineFunction &MF = *MBB->getParent();
8735   const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
8736   const SystemZInstrInfo *TII = Subtarget.getInstrInfo();
8737 
8738   // Update opcode.
8739   MI.setDesc(TII->get(Opcode));
8740 
8741   // We cannot handle a TBEGIN that clobbers the stack or frame pointer.
8742   // Make sure to add the corresponding GRSM bits if they are missing.
8743   uint64_t Control = MI.getOperand(2).getImm();
8744   static const unsigned GPRControlBit[16] = {
8745     0x8000, 0x8000, 0x4000, 0x4000, 0x2000, 0x2000, 0x1000, 0x1000,
8746     0x0800, 0x0800, 0x0400, 0x0400, 0x0200, 0x0200, 0x0100, 0x0100
8747   };
8748   Control |= GPRControlBit[15];
8749   if (TFI->hasFP(MF))
8750     Control |= GPRControlBit[11];
8751   MI.getOperand(2).setImm(Control);
8752 
8753   // Add GPR clobbers.
8754   for (int I = 0; I < 16; I++) {
8755     if ((Control & GPRControlBit[I]) == 0) {
8756       unsigned Reg = SystemZMC::GR64Regs[I];
8757       MI.addOperand(MachineOperand::CreateReg(Reg, true, true));
8758     }
8759   }
8760 
8761   // Add FPR/VR clobbers.
8762   if (!NoFloat && (Control & 4) != 0) {
8763     if (Subtarget.hasVector()) {
8764       for (unsigned Reg : SystemZMC::VR128Regs) {
8765         MI.addOperand(MachineOperand::CreateReg(Reg, true, true));
8766       }
8767     } else {
8768       for (unsigned Reg : SystemZMC::FP64Regs) {
8769         MI.addOperand(MachineOperand::CreateReg(Reg, true, true));
8770       }
8771     }
8772   }
8773 
8774   return MBB;
8775 }
8776 
8777 MachineBasicBlock *SystemZTargetLowering::emitLoadAndTestCmp0(
8778     MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const {
8779   MachineFunction &MF = *MBB->getParent();
8780   MachineRegisterInfo *MRI = &MF.getRegInfo();
8781   const SystemZInstrInfo *TII = Subtarget.getInstrInfo();
8782   DebugLoc DL = MI.getDebugLoc();
8783 
8784   Register SrcReg = MI.getOperand(0).getReg();
8785 
8786   // Create new virtual register of the same class as source.
8787   const TargetRegisterClass *RC = MRI->getRegClass(SrcReg);
8788   Register DstReg = MRI->createVirtualRegister(RC);
8789 
8790   // Replace pseudo with a normal load-and-test that models the def as
8791   // well.
8792   BuildMI(*MBB, MI, DL, TII->get(Opcode), DstReg)
8793     .addReg(SrcReg)
8794     .setMIFlags(MI.getFlags());
8795   MI.eraseFromParent();
8796 
8797   return MBB;
8798 }
8799 
8800 MachineBasicBlock *SystemZTargetLowering::emitProbedAlloca(
8801     MachineInstr &MI, MachineBasicBlock *MBB) const {
8802   MachineFunction &MF = *MBB->getParent();
8803   MachineRegisterInfo *MRI = &MF.getRegInfo();
8804   const SystemZInstrInfo *TII = Subtarget.getInstrInfo();
8805   DebugLoc DL = MI.getDebugLoc();
8806   const unsigned ProbeSize = getStackProbeSize(MF);
8807   Register DstReg = MI.getOperand(0).getReg();
8808   Register SizeReg = MI.getOperand(2).getReg();
8809 
8810   MachineBasicBlock *StartMBB = MBB;
8811   MachineBasicBlock *DoneMBB  = SystemZ::splitBlockAfter(MI, MBB);
8812   MachineBasicBlock *LoopTestMBB  = SystemZ::emitBlockAfter(StartMBB);
8813   MachineBasicBlock *LoopBodyMBB = SystemZ::emitBlockAfter(LoopTestMBB);
8814   MachineBasicBlock *TailTestMBB = SystemZ::emitBlockAfter(LoopBodyMBB);
8815   MachineBasicBlock *TailMBB = SystemZ::emitBlockAfter(TailTestMBB);
8816 
8817   MachineMemOperand *VolLdMMO = MF.getMachineMemOperand(MachinePointerInfo(),
8818     MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad, 8, Align(1));
8819 
8820   Register PHIReg = MRI->createVirtualRegister(&SystemZ::ADDR64BitRegClass);
8821   Register IncReg = MRI->createVirtualRegister(&SystemZ::ADDR64BitRegClass);
8822 
8823   //  LoopTestMBB
8824   //  BRC TailTestMBB
8825   //  # fallthrough to LoopBodyMBB
8826   StartMBB->addSuccessor(LoopTestMBB);
8827   MBB = LoopTestMBB;
8828   BuildMI(MBB, DL, TII->get(SystemZ::PHI), PHIReg)
8829     .addReg(SizeReg)
8830     .addMBB(StartMBB)
8831     .addReg(IncReg)
8832     .addMBB(LoopBodyMBB);
8833   BuildMI(MBB, DL, TII->get(SystemZ::CLGFI))
8834     .addReg(PHIReg)
8835     .addImm(ProbeSize);
8836   BuildMI(MBB, DL, TII->get(SystemZ::BRC))
8837     .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_LT)
8838     .addMBB(TailTestMBB);
8839   MBB->addSuccessor(LoopBodyMBB);
8840   MBB->addSuccessor(TailTestMBB);
8841 
8842   //  LoopBodyMBB: Allocate and probe by means of a volatile compare.
8843   //  J LoopTestMBB
8844   MBB = LoopBodyMBB;
8845   BuildMI(MBB, DL, TII->get(SystemZ::SLGFI), IncReg)
8846     .addReg(PHIReg)
8847     .addImm(ProbeSize);
8848   BuildMI(MBB, DL, TII->get(SystemZ::SLGFI), SystemZ::R15D)
8849     .addReg(SystemZ::R15D)
8850     .addImm(ProbeSize);
8851   BuildMI(MBB, DL, TII->get(SystemZ::CG)).addReg(SystemZ::R15D)
8852     .addReg(SystemZ::R15D).addImm(ProbeSize - 8).addReg(0)
8853     .setMemRefs(VolLdMMO);
8854   BuildMI(MBB, DL, TII->get(SystemZ::J)).addMBB(LoopTestMBB);
8855   MBB->addSuccessor(LoopTestMBB);
8856 
8857   //  TailTestMBB
8858   //  BRC DoneMBB
8859   //  # fallthrough to TailMBB
8860   MBB = TailTestMBB;
8861   BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
8862     .addReg(PHIReg)
8863     .addImm(0);
8864   BuildMI(MBB, DL, TII->get(SystemZ::BRC))
8865     .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ)
8866     .addMBB(DoneMBB);
8867   MBB->addSuccessor(TailMBB);
8868   MBB->addSuccessor(DoneMBB);
8869 
8870   //  TailMBB
8871   //  # fallthrough to DoneMBB
8872   MBB = TailMBB;
8873   BuildMI(MBB, DL, TII->get(SystemZ::SLGR), SystemZ::R15D)
8874     .addReg(SystemZ::R15D)
8875     .addReg(PHIReg);
8876   BuildMI(MBB, DL, TII->get(SystemZ::CG)).addReg(SystemZ::R15D)
8877     .addReg(SystemZ::R15D).addImm(-8).addReg(PHIReg)
8878     .setMemRefs(VolLdMMO);
8879   MBB->addSuccessor(DoneMBB);
8880 
8881   //  DoneMBB
8882   MBB = DoneMBB;
8883   BuildMI(*MBB, MBB->begin(), DL, TII->get(TargetOpcode::COPY), DstReg)
8884     .addReg(SystemZ::R15D);
8885 
8886   MI.eraseFromParent();
8887   return DoneMBB;
8888 }
8889 
8890 SDValue SystemZTargetLowering::
8891 getBackchainAddress(SDValue SP, SelectionDAG &DAG) const {
8892   MachineFunction &MF = DAG.getMachineFunction();
8893   auto *TFL = Subtarget.getFrameLowering<SystemZELFFrameLowering>();
8894   SDLoc DL(SP);
8895   return DAG.getNode(ISD::ADD, DL, MVT::i64, SP,
8896                      DAG.getIntPtrConstant(TFL->getBackchainOffset(MF), DL));
8897 }
8898 
// Expand a pseudo instruction that requires custom insertion.
//
// Dispatches on MI's opcode to the matching emit* helper, forwarding MI and
// MBB together with the opcode-specific arguments listed in each case, and
// returns whatever block that helper returns.  Any opcode that is not listed
// here is a programming error and hits llvm_unreachable.
MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter(
    MachineInstr &MI, MachineBasicBlock *MBB) const {
  switch (MI.getOpcode()) {
  // Select pseudos: all register kinds share one expansion.
  case SystemZ::Select32:
  case SystemZ::Select64:
  case SystemZ::SelectF32:
  case SystemZ::SelectF64:
  case SystemZ::SelectF128:
  case SystemZ::SelectVR32:
  case SystemZ::SelectVR64:
  case SystemZ::SelectVR128:
    return emitSelect(MI, MBB);

  // Conditional stores.  Each call passes a store opcode, a second opcode
  // that is 0 for most variants, and a flag that is true for the *Inv
  // pseudos.
  // NOTE(review): the second opcode is presumably the store-on-condition
  // form and the flag an inverted condition -- confirm against emitCondStore.
  case SystemZ::CondStore8Mux:
    return emitCondStore(MI, MBB, SystemZ::STCMux, 0, false);
  case SystemZ::CondStore8MuxInv:
    return emitCondStore(MI, MBB, SystemZ::STCMux, 0, true);
  case SystemZ::CondStore16Mux:
    return emitCondStore(MI, MBB, SystemZ::STHMux, 0, false);
  case SystemZ::CondStore16MuxInv:
    return emitCondStore(MI, MBB, SystemZ::STHMux, 0, true);
  case SystemZ::CondStore32Mux:
    return emitCondStore(MI, MBB, SystemZ::STMux, SystemZ::STOCMux, false);
  case SystemZ::CondStore32MuxInv:
    return emitCondStore(MI, MBB, SystemZ::STMux, SystemZ::STOCMux, true);
  case SystemZ::CondStore8:
    return emitCondStore(MI, MBB, SystemZ::STC, 0, false);
  case SystemZ::CondStore8Inv:
    return emitCondStore(MI, MBB, SystemZ::STC, 0, true);
  case SystemZ::CondStore16:
    return emitCondStore(MI, MBB, SystemZ::STH, 0, false);
  case SystemZ::CondStore16Inv:
    return emitCondStore(MI, MBB, SystemZ::STH, 0, true);
  case SystemZ::CondStore32:
    return emitCondStore(MI, MBB, SystemZ::ST, SystemZ::STOC, false);
  case SystemZ::CondStore32Inv:
    return emitCondStore(MI, MBB, SystemZ::ST, SystemZ::STOC, true);
  case SystemZ::CondStore64:
    return emitCondStore(MI, MBB, SystemZ::STG, SystemZ::STOCG, false);
  case SystemZ::CondStore64Inv:
    return emitCondStore(MI, MBB, SystemZ::STG, SystemZ::STOCG, true);
  case SystemZ::CondStoreF32:
    return emitCondStore(MI, MBB, SystemZ::STE, 0, false);
  case SystemZ::CondStoreF32Inv:
    return emitCondStore(MI, MBB, SystemZ::STE, 0, true);
  case SystemZ::CondStoreF64:
    return emitCondStore(MI, MBB, SystemZ::STD, 0, false);
  case SystemZ::CondStoreF64Inv:
    return emitCondStore(MI, MBB, SystemZ::STD, 0, true);

  // 128-bit pair construction and extension pseudos.
  case SystemZ::PAIR128:
    return emitPair128(MI, MBB);
  case SystemZ::AEXT128:
    return emitExt128(MI, MBB, false);
  case SystemZ::ZEXT128:
    return emitExt128(MI, MBB, true);

  // Atomic read-modify-write pseudos.  The arguments after MBB are the
  // binary opcode to apply (0 for a plain swap) and a size argument that is
  // 0 for the *W pseudos and 32 or 64 for the others.
  // NOTE(review): the size argument is presumably the operand width in bits,
  // with 0 selecting the partword path -- confirm against
  // emitAtomicLoadBinary.
  case SystemZ::ATOMIC_SWAPW:
    return emitAtomicLoadBinary(MI, MBB, 0, 0);
  case SystemZ::ATOMIC_SWAP_32:
    return emitAtomicLoadBinary(MI, MBB, 0, 32);
  case SystemZ::ATOMIC_SWAP_64:
    return emitAtomicLoadBinary(MI, MBB, 0, 64);

  // Atomic add.
  case SystemZ::ATOMIC_LOADW_AR:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::AR, 0);
  case SystemZ::ATOMIC_LOADW_AFI:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::AFI, 0);
  case SystemZ::ATOMIC_LOAD_AR:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::AR, 32);
  case SystemZ::ATOMIC_LOAD_AHI:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::AHI, 32);
  case SystemZ::ATOMIC_LOAD_AFI:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::AFI, 32);
  case SystemZ::ATOMIC_LOAD_AGR:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::AGR, 64);
  case SystemZ::ATOMIC_LOAD_AGHI:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::AGHI, 64);
  case SystemZ::ATOMIC_LOAD_AGFI:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::AGFI, 64);

  // Atomic subtract.
  case SystemZ::ATOMIC_LOADW_SR:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::SR, 0);
  case SystemZ::ATOMIC_LOAD_SR:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::SR, 32);
  case SystemZ::ATOMIC_LOAD_SGR:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::SGR, 64);

  // Atomic AND.
  case SystemZ::ATOMIC_LOADW_NR:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 0);
  case SystemZ::ATOMIC_LOADW_NILH:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 0);
  case SystemZ::ATOMIC_LOAD_NR:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 32);
  case SystemZ::ATOMIC_LOAD_NILL:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL, 32);
  case SystemZ::ATOMIC_LOAD_NILH:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 32);
  case SystemZ::ATOMIC_LOAD_NILF:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF, 32);
  case SystemZ::ATOMIC_LOAD_NGR:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::NGR, 64);
  case SystemZ::ATOMIC_LOAD_NILL64:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL64, 64);
  case SystemZ::ATOMIC_LOAD_NILH64:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH64, 64);
  case SystemZ::ATOMIC_LOAD_NIHL64:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHL64, 64);
  case SystemZ::ATOMIC_LOAD_NIHH64:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHH64, 64);
  case SystemZ::ATOMIC_LOAD_NILF64:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF64, 64);
  case SystemZ::ATOMIC_LOAD_NIHF64:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHF64, 64);

  // Atomic OR.
  case SystemZ::ATOMIC_LOADW_OR:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::OR, 0);
  case SystemZ::ATOMIC_LOADW_OILH:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH, 0);
  case SystemZ::ATOMIC_LOAD_OR:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::OR, 32);
  case SystemZ::ATOMIC_LOAD_OILL:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILL, 32);
  case SystemZ::ATOMIC_LOAD_OILH:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH, 32);
  case SystemZ::ATOMIC_LOAD_OILF:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILF, 32);
  case SystemZ::ATOMIC_LOAD_OGR:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::OGR, 64);
  case SystemZ::ATOMIC_LOAD_OILL64:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILL64, 64);
  case SystemZ::ATOMIC_LOAD_OILH64:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH64, 64);
  case SystemZ::ATOMIC_LOAD_OIHL64:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHL64, 64);
  case SystemZ::ATOMIC_LOAD_OIHH64:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHH64, 64);
  case SystemZ::ATOMIC_LOAD_OILF64:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILF64, 64);
  case SystemZ::ATOMIC_LOAD_OIHF64:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHF64, 64);

  // Atomic XOR.
  case SystemZ::ATOMIC_LOADW_XR:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::XR, 0);
  case SystemZ::ATOMIC_LOADW_XILF:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF, 0);
  case SystemZ::ATOMIC_LOAD_XR:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::XR, 32);
  case SystemZ::ATOMIC_LOAD_XILF:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF, 32);
  case SystemZ::ATOMIC_LOAD_XGR:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::XGR, 64);
  case SystemZ::ATOMIC_LOAD_XILF64:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF64, 64);
  case SystemZ::ATOMIC_LOAD_XIHF64:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::XIHF64, 64);

  // The *i pseudos reuse the AND opcodes but pass a trailing true.
  // NOTE(review): presumably this requests inversion of the result, i.e.
  // NAND -- confirm against emitAtomicLoadBinary.
  case SystemZ::ATOMIC_LOADW_NRi:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 0, true);
  case SystemZ::ATOMIC_LOADW_NILHi:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 0, true);
  case SystemZ::ATOMIC_LOAD_NRi:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 32, true);
  case SystemZ::ATOMIC_LOAD_NILLi:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL, 32, true);
  case SystemZ::ATOMIC_LOAD_NILHi:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 32, true);
  case SystemZ::ATOMIC_LOAD_NILFi:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF, 32, true);
  case SystemZ::ATOMIC_LOAD_NGRi:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::NGR, 64, true);
  case SystemZ::ATOMIC_LOAD_NILL64i:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL64, 64, true);
  case SystemZ::ATOMIC_LOAD_NILH64i:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH64, 64, true);
  case SystemZ::ATOMIC_LOAD_NIHL64i:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHL64, 64, true);
  case SystemZ::ATOMIC_LOAD_NIHH64i:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHH64, 64, true);
  case SystemZ::ATOMIC_LOAD_NILF64i:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF64, 64, true);
  case SystemZ::ATOMIC_LOAD_NIHF64i:
    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHF64, 64, true);

  // Atomic min/max.  Each call passes a compare opcode, a CC mask and the
  // same kind of size argument as the binary operations above.  MIN/MAX use
  // CR/CGR, UMIN/UMAX use CLR/CLGR; the MIN forms pass CCMASK_CMP_LE and the
  // MAX forms CCMASK_CMP_GE.
  case SystemZ::ATOMIC_LOADW_MIN:
    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
                                SystemZ::CCMASK_CMP_LE, 0);
  case SystemZ::ATOMIC_LOAD_MIN_32:
    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
                                SystemZ::CCMASK_CMP_LE, 32);
  case SystemZ::ATOMIC_LOAD_MIN_64:
    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CGR,
                                SystemZ::CCMASK_CMP_LE, 64);

  case SystemZ::ATOMIC_LOADW_MAX:
    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
                                SystemZ::CCMASK_CMP_GE, 0);
  case SystemZ::ATOMIC_LOAD_MAX_32:
    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
                                SystemZ::CCMASK_CMP_GE, 32);
  case SystemZ::ATOMIC_LOAD_MAX_64:
    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CGR,
                                SystemZ::CCMASK_CMP_GE, 64);

  case SystemZ::ATOMIC_LOADW_UMIN:
    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR,
                                SystemZ::CCMASK_CMP_LE, 0);
  case SystemZ::ATOMIC_LOAD_UMIN_32:
    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR,
                                SystemZ::CCMASK_CMP_LE, 32);
  case SystemZ::ATOMIC_LOAD_UMIN_64:
    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLGR,
                                SystemZ::CCMASK_CMP_LE, 64);

  case SystemZ::ATOMIC_LOADW_UMAX:
    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR,
                                SystemZ::CCMASK_CMP_GE, 0);
  case SystemZ::ATOMIC_LOAD_UMAX_32:
    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLR,
                                SystemZ::CCMASK_CMP_GE, 32);
  case SystemZ::ATOMIC_LOAD_UMAX_64:
    return emitAtomicLoadMinMax(MI, MBB, SystemZ::CLGR,
                                SystemZ::CCMASK_CMP_GE, 64);

  case SystemZ::ATOMIC_CMP_SWAPW:
    return emitAtomicCmpSwapW(MI, MBB);
  // Memory-to-memory pseudos: the Imm and Reg forms of a pseudo expand to
  // the same underlying instruction.
  case SystemZ::MVCImm:
  case SystemZ::MVCReg:
    return emitMemMemWrapper(MI, MBB, SystemZ::MVC);
  case SystemZ::NCImm:
    return emitMemMemWrapper(MI, MBB, SystemZ::NC);
  case SystemZ::OCImm:
    return emitMemMemWrapper(MI, MBB, SystemZ::OC);
  case SystemZ::XCImm:
  case SystemZ::XCReg:
    return emitMemMemWrapper(MI, MBB, SystemZ::XC);
  case SystemZ::CLCImm:
  case SystemZ::CLCReg:
    return emitMemMemWrapper(MI, MBB, SystemZ::CLC);
  // Memset pseudos are expanded with MVC and the IsMemset flag set.
  case SystemZ::MemsetImmImm:
  case SystemZ::MemsetImmReg:
  case SystemZ::MemsetRegImm:
  case SystemZ::MemsetRegReg:
    return emitMemMemWrapper(MI, MBB, SystemZ::MVC, true/*IsMemset*/);
  // String pseudos: loop on the given instruction until CC != 3.
  case SystemZ::CLSTLoop:
    return emitStringWrapper(MI, MBB, SystemZ::CLST);
  case SystemZ::MVSTLoop:
    return emitStringWrapper(MI, MBB, SystemZ::MVST);
  case SystemZ::SRSTLoop:
    return emitStringWrapper(MI, MBB, SystemZ::SRST);
  // Transaction begin: the last argument is emitTransactionBegin's NoFloat
  // flag.  TBEGIN and TBEGIN_nofloat share the final opcode SystemZ::TBEGIN.
  case SystemZ::TBEGIN:
    return emitTransactionBegin(MI, MBB, SystemZ::TBEGIN, false);
  case SystemZ::TBEGIN_nofloat:
    return emitTransactionBegin(MI, MBB, SystemZ::TBEGIN, true);
  case SystemZ::TBEGINC:
    return emitTransactionBegin(MI, MBB, SystemZ::TBEGINC, true);
  // Load-and-test compare pseudos become the real load-and-test instruction.
  case SystemZ::LTEBRCompare_VecPseudo:
    return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTEBR);
  case SystemZ::LTDBRCompare_VecPseudo:
    return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTDBR);
  case SystemZ::LTXBRCompare_VecPseudo:
    return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTXBR);

  case SystemZ::PROBED_ALLOCA:
    return emitProbedAlloca(MI, MBB);

  // Target-independent stackmap and patchpoint pseudos.
  case TargetOpcode::STACKMAP:
  case TargetOpcode::PATCHPOINT:
    return emitPatchPoint(MI, MBB);

  default:
    llvm_unreachable("Unexpected instr type to insert");
  }
}
9173 
9174 // This is only used by the isel schedulers, and is needed only to prevent
9175 // compiler from crashing when list-ilp is used.
9176 const TargetRegisterClass *
9177 SystemZTargetLowering::getRepRegClassFor(MVT VT) const {
9178   if (VT == MVT::Untyped)
9179     return &SystemZ::ADDR128BitRegClass;
9180   return TargetLowering::getRepRegClassFor(VT);
9181 }
9182 
9183 SDValue SystemZTargetLowering::lowerGET_ROUNDING(SDValue Op,
9184                                                  SelectionDAG &DAG) const {
9185   SDLoc dl(Op);
9186   /*
9187    The rounding method is in FPC Byte 3 bits 6-7, and has the following
9188    settings:
9189      00 Round to nearest
9190      01 Round to 0
9191      10 Round to +inf
9192      11 Round to -inf
9193 
9194   FLT_ROUNDS, on the other hand, expects the following:
9195     -1 Undefined
9196      0 Round to 0
9197      1 Round to nearest
9198      2 Round to +inf
9199      3 Round to -inf
9200   */
9201 
9202   // Save FPC to register.
9203   SDValue Chain = Op.getOperand(0);
9204   SDValue EFPC(
9205       DAG.getMachineNode(SystemZ::EFPC, dl, {MVT::i32, MVT::Other}, Chain), 0);
9206   Chain = EFPC.getValue(1);
9207 
9208   // Transform as necessary
9209   SDValue CWD1 = DAG.getNode(ISD::AND, dl, MVT::i32, EFPC,
9210                              DAG.getConstant(3, dl, MVT::i32));
9211   // RetVal = (CWD1 ^ (CWD1 >> 1)) ^ 1
9212   SDValue CWD2 = DAG.getNode(ISD::XOR, dl, MVT::i32, CWD1,
9213                              DAG.getNode(ISD::SRL, dl, MVT::i32, CWD1,
9214                                          DAG.getConstant(1, dl, MVT::i32)));
9215 
9216   SDValue RetVal = DAG.getNode(ISD::XOR, dl, MVT::i32, CWD2,
9217                                DAG.getConstant(1, dl, MVT::i32));
9218   RetVal = DAG.getZExtOrTrunc(RetVal, dl, Op.getValueType());
9219 
9220   return DAG.getMergeValues({RetVal, Chain}, dl);
9221 }
9222