xref: /freebsd/contrib/llvm-project/llvm/lib/Target/X86/X86ISelLowering.h (revision e92ffd9b626833ebdbf2742c8ffddc6cd94b963e)
1 //===-- X86ISelLowering.h - X86 DAG Lowering Interface ----------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the interfaces that X86 uses to lower LLVM code into a
10 // selection DAG.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
15 #define LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
16 
17 #include "llvm/CodeGen/TargetLowering.h"
18 
19 namespace llvm {
20   class X86Subtarget;
21   class X86TargetMachine;
22 
23   namespace X86ISD {
24     // X86 Specific DAG Nodes
25   enum NodeType : unsigned {
26     // Start the numbering where the builtin ops leave off.
27     FIRST_NUMBER = ISD::BUILTIN_OP_END,
28 
29     /// Bit scan forward.
30     BSF,
31     /// Bit scan reverse.
32     BSR,
33 
34     /// X86 funnel/double shift i16 instructions. These correspond to
35     /// X86::SHLDW and X86::SHRDW instructions which have different amt
36     /// modulo rules to generic funnel shifts.
37     /// NOTE: The operand order matches ISD::FSHL/FSHR not SHLD/SHRD.
38     FSHL,
39     FSHR,
40 
41     /// Bitwise logical AND of floating point values. This corresponds
42     /// to X86::ANDPS or X86::ANDPD.
43     FAND,
44 
45     /// Bitwise logical OR of floating point values. This corresponds
46     /// to X86::ORPS or X86::ORPD.
47     FOR,
48 
49     /// Bitwise logical XOR of floating point values. This corresponds
50     /// to X86::XORPS or X86::XORPD.
51     FXOR,
52 
53     /// Bitwise logical ANDNOT of floating point values. This
54     /// corresponds to X86::ANDNPS or X86::ANDNPD.
55     FANDN,
56 
57     /// These operations represent an abstract X86 call
58     /// instruction, which includes a bunch of information.  In particular the
59     /// operands of these nodes are:
60     ///
61     ///     #0 - The incoming token chain
62     ///     #1 - The callee
63     ///     #2 - The number of arg bytes the caller pushes on the stack.
64     ///     #3 - The number of arg bytes the callee pops off the stack.
65     ///     #4 - The value to pass in AL/AX/EAX (optional)
66     ///     #5 - The value to pass in DL/DX/EDX (optional)
67     ///
68     /// The result values of these nodes are:
69     ///
70     ///     #0 - The outgoing token chain
71     ///     #1 - The first register result value (optional)
72     ///     #2 - The second register result value (optional)
73     ///
74     CALL,
75 
76     /// Same as call except it adds the NoTrack prefix.
77     NT_CALL,
78 
79     // Pseudo for an ObjC call that gets emitted together with a special
80     // marker instruction.
81     CALL_RVMARKER,
82 
83     /// X86 compare and logical compare instructions.
84     CMP,
85     FCMP,
86     COMI,
87     UCOMI,
88 
89     /// X86 bit-test instructions.
90     BT,
91 
92     /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS
93     /// operand, usually produced by a CMP instruction.
94     SETCC,
95 
96     /// X86 Select
97     SELECTS,
98 
99     // Same as SETCC except it's materialized with a sbb and the value is all
100     // one's or all zero's.
101     SETCC_CARRY, // R = carry_bit ? ~0 : 0
102 
103     /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
104     /// Operands are two FP values to compare; result is a mask of
105     /// 0s or 1s.  Generally DTRT for C/C++ with NaNs.
106     FSETCC,
107 
108     /// X86 FP SETCC, similar to above, but with output as an i1 mask, and
109     /// a version with SAE.
110     FSETCCM,
111     FSETCCM_SAE,
112 
113     /// X86 conditional moves. Operand 0 and operand 1 are the two values
114     /// to select from. Operand 2 is the condition code, and operand 3 is the
115     /// flag operand produced by a CMP or TEST instruction.
116     CMOV,
117 
118     /// X86 conditional branches. Operand 0 is the chain operand, operand 1
119     /// is the block to branch if condition is true, operand 2 is the
120     /// condition code, and operand 3 is the flag operand produced by a CMP
121     /// or TEST instruction.
122     BRCOND,
123 
124     /// BRIND node with NoTrack prefix. Operand 0 is the chain operand and
125     /// operand 1 is the target address.
126     NT_BRIND,
127 
128     /// Return with a flag operand. Operand 0 is the chain operand, operand
129     /// 1 is the number of bytes of stack to pop.
130     RET_FLAG,
131 
132     /// Return from interrupt. Operand 0 is the number of bytes to pop.
133     IRET,
134 
135     /// Repeat fill, corresponds to X86::REP_STOSx.
136     REP_STOS,
137 
138     /// Repeat move, corresponds to X86::REP_MOVSx.
139     REP_MOVS,
140 
141     /// On Darwin, this node represents the result of the popl
142     /// at function entry, used for PIC code.
143     GlobalBaseReg,
144 
145     /// A wrapper node for TargetConstantPool, TargetJumpTable,
146     /// TargetExternalSymbol, TargetGlobalAddress, TargetGlobalTLSAddress,
147     /// MCSymbol and TargetBlockAddress.
148     Wrapper,
149 
150     /// Special wrapper used under X86-64 PIC mode for RIP
151     /// relative displacements.
152     WrapperRIP,
153 
154     /// Copies a 64-bit value from an MMX vector to the low word
155     /// of an XMM vector, with the high word zero filled.
156     MOVQ2DQ,
157 
158     /// Copies a 64-bit value from the low word of an XMM vector
159     /// to an MMX vector.
160     MOVDQ2Q,
161 
162     /// Copies a 32-bit value from the low word of a MMX
163     /// vector to a GPR.
164     MMX_MOVD2W,
165 
166     /// Copies a GPR into the low 32-bit word of a MMX vector
167     /// and zero out the high word.
168     MMX_MOVW2D,
169 
170     /// Extract an 8-bit value from a vector and zero extend it to
171     /// i32, corresponds to X86::PEXTRB.
172     PEXTRB,
173 
174     /// Extract a 16-bit value from a vector and zero extend it to
175     /// i32, corresponds to X86::PEXTRW.
176     PEXTRW,
177 
178     /// Insert any element of a 4 x float vector into any element
179     /// of a destination 4 x float vector.
180     INSERTPS,
181 
182     /// Insert the lower 8-bits of a 32-bit value to a vector,
183     /// corresponds to X86::PINSRB.
184     PINSRB,
185 
186     /// Insert the lower 16-bits of a 32-bit value to a vector,
187     /// corresponds to X86::PINSRW.
188     PINSRW,
189 
190     /// Shuffle 16 8-bit values within a vector.
191     PSHUFB,
192 
193     /// Compute Sum of Absolute Differences.
194     PSADBW,
195     /// Compute Double Block Packed Sum-Absolute-Differences
196     DBPSADBW,
197 
198     /// Bitwise Logical AND NOT of Packed FP values.
199     ANDNP,
200 
201     /// Blend where the selector is an immediate.
202     BLENDI,
203 
204     /// Dynamic (non-constant condition) vector blend where only the sign bits
205     /// of the condition elements are used. This is used to enforce that the
206     /// condition mask is not valid for generic VSELECT optimizations. This
207     /// is also used to implement the intrinsics.
208     /// Operands are in VSELECT order: MASK, TRUE, FALSE
209     BLENDV,
210 
211     /// Combined add and sub on an FP vector.
212     ADDSUB,
213 
214     // FP vector ops with rounding mode.
215     FADD_RND,
216     FADDS,
217     FADDS_RND,
218     FSUB_RND,
219     FSUBS,
220     FSUBS_RND,
221     FMUL_RND,
222     FMULS,
223     FMULS_RND,
224     FDIV_RND,
225     FDIVS,
226     FDIVS_RND,
227     FMAX_SAE,
228     FMAXS_SAE,
229     FMIN_SAE,
230     FMINS_SAE,
231     FSQRT_RND,
232     FSQRTS,
233     FSQRTS_RND,
234 
235     // FP vector get exponent.
236     FGETEXP,
237     FGETEXP_SAE,
238     FGETEXPS,
239     FGETEXPS_SAE,
240     // Extract Normalized Mantissas.
241     VGETMANT,
242     VGETMANT_SAE,
243     VGETMANTS,
244     VGETMANTS_SAE,
245     // FP Scale.
246     SCALEF,
247     SCALEF_RND,
248     SCALEFS,
249     SCALEFS_RND,
250 
251     // Unsigned Integer average.
252     AVG,
253 
254     /// Integer horizontal add/sub.
255     HADD,
256     HSUB,
257 
258     /// Floating point horizontal add/sub.
259     FHADD,
260     FHSUB,
261 
262     // Detect Conflicts Within a Vector
263     CONFLICT,
264 
265     /// Floating point max and min.
266     FMAX,
267     FMIN,
268 
269     /// Commutative FMIN and FMAX.
270     FMAXC,
271     FMINC,
272 
273     /// Scalar intrinsic floating point max and min.
274     FMAXS,
275     FMINS,
276 
277     /// Floating point reciprocal-sqrt and reciprocal approximation.
278     /// Note that these typically require refinement
279     /// in order to obtain suitable precision.
280     FRSQRT,
281     FRCP,
282 
283     // AVX-512 reciprocal approximations with a little more precision.
284     RSQRT14,
285     RSQRT14S,
286     RCP14,
287     RCP14S,
288 
289     // Thread Local Storage.
290     TLSADDR,
291 
292     // Thread Local Storage. A call to get the start address
293     // of the TLS block for the current module.
294     TLSBASEADDR,
295 
296     // Thread Local Storage.  When calling to an OS provided
297     // thunk at the address from an earlier relocation.
298     TLSCALL,
299 
300     // Exception Handling helpers.
301     EH_RETURN,
302 
303     // SjLj exception handling setjmp.
304     EH_SJLJ_SETJMP,
305 
306     // SjLj exception handling longjmp.
307     EH_SJLJ_LONGJMP,
308 
309     // SjLj exception handling dispatch.
310     EH_SJLJ_SETUP_DISPATCH,
311 
312     /// Tail call return. See X86TargetLowering::LowerCall for
313     /// the list of operands.
314     TC_RETURN,
315 
316     // Vector move to low scalar and zero higher vector elements.
317     VZEXT_MOVL,
318 
319     // Vector integer truncate.
320     VTRUNC,
321     // Vector integer truncate with unsigned/signed saturation.
322     VTRUNCUS,
323     VTRUNCS,
324 
325     // Masked version of the above. Used when less than a 128-bit result is
326     // produced since the mask only applies to the lower elements and can't
327     // be represented by a select.
328     // SRC, PASSTHRU, MASK
329     VMTRUNC,
330     VMTRUNCUS,
331     VMTRUNCS,
332 
333     // Vector FP extend.
334     VFPEXT,
335     VFPEXT_SAE,
336     VFPEXTS,
337     VFPEXTS_SAE,
338 
339     // Vector FP round.
340     VFPROUND,
341     VFPROUND_RND,
342     VFPROUNDS,
343     VFPROUNDS_RND,
344 
345     // Masked version of above. Used for v2f64->v4f32.
346     // SRC, PASSTHRU, MASK
347     VMFPROUND,
348 
349     // 128-bit vector logical left / right shift
350     VSHLDQ,
351     VSRLDQ,
352 
353     // Vector shift elements
354     VSHL,
355     VSRL,
356     VSRA,
357 
358     // Vector variable shift
359     VSHLV,
360     VSRLV,
361     VSRAV,
362 
363     // Vector shift elements by immediate
364     VSHLI,
365     VSRLI,
366     VSRAI,
367 
368     // Shifts of mask registers.
369     KSHIFTL,
370     KSHIFTR,
371 
372     // Bit rotate by immediate
373     VROTLI,
374     VROTRI,
375 
376     // Vector packed double/float comparison.
377     CMPP,
378 
379     // Vector integer comparisons.
380     PCMPEQ,
381     PCMPGT,
382 
383     // v8i16 Horizontal minimum and position.
384     PHMINPOS,
385 
386     MULTISHIFT,
387 
388     /// Vector comparison generating mask bits for fp and
389     /// integer signed and unsigned data types.
390     CMPM,
391     // Vector mask comparison generating mask bits for FP values.
392     CMPMM,
393     // Vector mask comparison with SAE for FP values.
394     CMPMM_SAE,
395 
396     // Arithmetic operations with FLAGS results.
397     ADD,
398     SUB,
399     ADC,
400     SBB,
401     SMUL,
402     UMUL,
403     OR,
404     XOR,
405     AND,
406 
407     // Bit field extract.
408     BEXTR,
409     BEXTRI,
410 
411     // Zero High Bits Starting with Specified Bit Position.
412     BZHI,
413 
414     // Parallel extract and deposit.
415     PDEP,
416     PEXT,
417 
418     // X86-specific multiply by immediate.
419     MUL_IMM,
420 
421     // Vector sign bit extraction.
422     MOVMSK,
423 
424     // Vector bitwise comparisons.
425     PTEST,
426 
427     // Vector packed fp sign bitwise comparisons.
428     TESTP,
429 
430     // OR/AND test for masks.
431     KORTEST,
432     KTEST,
433 
434     // ADD for masks.
435     KADD,
436 
437     // Several flavors of instructions with vector shuffle behaviors.
438     // Saturated signed/unsigned packing.
439     PACKSS,
440     PACKUS,
441     // Intra-lane alignr.
442     PALIGNR,
443     // AVX512 inter-lane alignr.
444     VALIGN,
445     PSHUFD,
446     PSHUFHW,
447     PSHUFLW,
448     SHUFP,
449     // VBMI2 Concat & Shift.
450     VSHLD,
451     VSHRD,
452     VSHLDV,
453     VSHRDV,
454     // Shuffle Packed Values at 128-bit granularity.
455     SHUF128,
456     MOVDDUP,
457     MOVSHDUP,
458     MOVSLDUP,
459     MOVLHPS,
460     MOVHLPS,
461     MOVSD,
462     MOVSS,
463     UNPCKL,
464     UNPCKH,
465     VPERMILPV,
466     VPERMILPI,
467     VPERMI,
468     VPERM2X128,
469 
470     // Variable Permute (VPERM).
471     // Res = VPERMV MaskV, V0
472     VPERMV,
473 
474     // 3-op Variable Permute (VPERMT2).
475     // Res = VPERMV3 V0, MaskV, V1
476     VPERMV3,
477 
478     // Bitwise ternary logic.
479     VPTERNLOG,
480     // Fix Up Special Packed Float32/64 values.
481     VFIXUPIMM,
482     VFIXUPIMM_SAE,
483     VFIXUPIMMS,
484     VFIXUPIMMS_SAE,
485     // Range Restriction Calculation For Packed Pairs of Float32/64 values.
486     VRANGE,
487     VRANGE_SAE,
488     VRANGES,
489     VRANGES_SAE,
490     // Reduce - Perform Reduction Transformation on scalar/packed FP.
491     VREDUCE,
492     VREDUCE_SAE,
493     VREDUCES,
494     VREDUCES_SAE,
495     // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
496     // Also used by the legacy (V)ROUND intrinsics where we mask out the
497     // scaling part of the immediate.
498     VRNDSCALE,
499     VRNDSCALE_SAE,
500     VRNDSCALES,
501     VRNDSCALES_SAE,
502     // Tests types of FP values for packed types.
503     VFPCLASS,
504     // Tests types of FP values for scalar types.
505     VFPCLASSS,
506 
507     // Broadcast (splat) scalar or element 0 of a vector. If the operand is
508     // a vector, this node may change the vector length as part of the splat.
509     VBROADCAST,
510     // Broadcast mask to vector.
511     VBROADCASTM,
512 
513     /// SSE4A Extraction and Insertion.
514     EXTRQI,
515     INSERTQI,
516 
517     // XOP arithmetic/logical shifts.
518     VPSHA,
519     VPSHL,
520     // XOP signed/unsigned integer comparisons.
521     VPCOM,
522     VPCOMU,
523     // XOP packed permute bytes.
524     VPPERM,
525     // XOP two source permutation.
526     VPERMIL2,
527 
528     // Vector multiply packed unsigned doubleword integers.
529     PMULUDQ,
530     // Vector multiply packed signed doubleword integers.
531     PMULDQ,
532     // Vector Multiply Packed Unsigned Integers with Round and Scale.
533     MULHRS,
534 
535     // Multiply and Add Packed Integers.
536     VPMADDUBSW,
537     VPMADDWD,
538 
539     // AVX512IFMA multiply and add.
540     // NOTE: These are different than the instruction and perform
541     // op0 x op1 + op2.
542     VPMADD52L,
543     VPMADD52H,
544 
545     // VNNI
546     VPDPBUSD,
547     VPDPBUSDS,
548     VPDPWSSD,
549     VPDPWSSDS,
550 
551     // FMA nodes.
552     // We use the target independent ISD::FMA for the non-inverted case.
553     FNMADD,
554     FMSUB,
555     FNMSUB,
556     FMADDSUB,
557     FMSUBADD,
558 
559     // FMA with rounding mode.
560     FMADD_RND,
561     FNMADD_RND,
562     FMSUB_RND,
563     FNMSUB_RND,
564     FMADDSUB_RND,
565     FMSUBADD_RND,
566 
567     // Compress and expand.
568     COMPRESS,
569     EXPAND,
570 
571     // Bits shuffle
572     VPSHUFBITQMB,
573 
574     // Convert Signed/Unsigned Integer to Floating-Point Value with rounding mode.
575     SINT_TO_FP_RND,
576     UINT_TO_FP_RND,
577     SCALAR_SINT_TO_FP,
578     SCALAR_UINT_TO_FP,
579     SCALAR_SINT_TO_FP_RND,
580     SCALAR_UINT_TO_FP_RND,
581 
582     // Vector float/double to signed/unsigned integer.
583     CVTP2SI,
584     CVTP2UI,
585     CVTP2SI_RND,
586     CVTP2UI_RND,
587     // Scalar float/double to signed/unsigned integer.
588     CVTS2SI,
589     CVTS2UI,
590     CVTS2SI_RND,
591     CVTS2UI_RND,
592 
593     // Vector float/double to signed/unsigned integer with truncation.
594     CVTTP2SI,
595     CVTTP2UI,
596     CVTTP2SI_SAE,
597     CVTTP2UI_SAE,
598     // Scalar float/double to signed/unsigned integer with truncation.
599     CVTTS2SI,
600     CVTTS2UI,
601     CVTTS2SI_SAE,
602     CVTTS2UI_SAE,
603 
604     // Vector signed/unsigned integer to float/double.
605     CVTSI2P,
606     CVTUI2P,
607 
608     // Masked versions of above. Used for v2f64->v4f32.
609     // SRC, PASSTHRU, MASK
610     MCVTP2SI,
611     MCVTP2UI,
612     MCVTTP2SI,
613     MCVTTP2UI,
614     MCVTSI2P,
615     MCVTUI2P,
616 
617     // Vector float to bfloat16.
618     // Convert TWO packed single data to one packed BF16 data
619     CVTNE2PS2BF16,
620     // Convert packed single data to packed BF16 data
621     CVTNEPS2BF16,
622     // Masked version of above.
623     // SRC, PASSTHRU, MASK
624     MCVTNEPS2BF16,
625 
626     // Dot product of BF16 pairs accumulated into
627     // packed single precision.
628     DPBF16PS,
629 
630     // Save xmm argument registers to the stack, according to %al. An operator
631     // is needed so that this can be expanded with control flow.
632     VASTART_SAVE_XMM_REGS,
633 
634     // Windows's _chkstk call to do stack probing.
635     WIN_ALLOCA,
636 
637     // For allocating variable amounts of stack space when using
638     // segmented stacks. Check if the current stacklet has enough space, and
639     // falls back to heap allocation if not.
640     SEG_ALLOCA,
641 
642     // For allocating stack space when using stack clash protector.
643     // Allocation is performed by block, and each block is probed.
644     PROBED_ALLOCA,
645 
646     // Memory barriers.
647     MEMBARRIER,
648     MFENCE,
649 
650     // Get a random integer and indicate whether it is valid in CF.
651     RDRAND,
652 
653     // Get a NIST SP800-90B & C compliant random integer and
654     // indicate whether it is valid in CF.
655     RDSEED,
656 
657     // Protection keys
658     // RDPKRU - Operand 0 is chain. Operand 1 is value for ECX.
659     // WRPKRU - Operand 0 is chain. Operand 1 is value for EDX. Operand 2 is
660     // value for ECX.
661     RDPKRU,
662     WRPKRU,
663 
664     // SSE42 string comparisons.
665     // These nodes produce 3 results, index, mask, and flags. X86ISelDAGToDAG
666     // will emit one or two instructions based on which results are used. If
667     // flags and index/mask are both used, this allows a single instruction since
668     // we won't have to pick an opcode for flags. Instead we can rely on the
669     // DAG to CSE everything and decide at isel.
670     PCMPISTR,
671     PCMPESTR,
672 
673     // Test if in transactional execution.
674     XTEST,
675 
676     // ERI instructions.
677     RSQRT28,
678     RSQRT28_SAE,
679     RSQRT28S,
680     RSQRT28S_SAE,
681     RCP28,
682     RCP28_SAE,
683     RCP28S,
684     RCP28S_SAE,
685     EXP2,
686     EXP2_SAE,
687 
688     // Conversions between float and half-float.
689     CVTPS2PH,
690     CVTPH2PS,
691     CVTPH2PS_SAE,
692 
693     // Masked version of above.
694     // SRC, RND, PASSTHRU, MASK
695     MCVTPS2PH,
696 
697     // Galois Field Arithmetic Instructions
698     GF2P8AFFINEINVQB,
699     GF2P8AFFINEQB,
700     GF2P8MULB,
701 
702     // LWP insert record.
703     LWPINS,
704 
705     // User level wait
706     UMWAIT,
707     TPAUSE,
708 
709     // Enqueue Stores Instructions
710     ENQCMD,
711     ENQCMDS,
712 
713     // For avx512-vp2intersect
714     VP2INTERSECT,
715 
716     // User level interrupts - testui
717     TESTUI,
718 
719     /// X86 strict FP compare instructions.
720     STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE,
721     STRICT_FCMPS,
722 
723     // Vector packed double/float comparison.
724     STRICT_CMPP,
725 
726     /// Vector comparison generating mask bits for fp and
727     /// integer signed and unsigned data types.
728     STRICT_CMPM,
729 
730     // Vector float/double to signed/unsigned integer with truncation.
731     STRICT_CVTTP2SI,
732     STRICT_CVTTP2UI,
733 
734     // Vector FP extend.
735     STRICT_VFPEXT,
736 
737     // Vector FP round.
738     STRICT_VFPROUND,
739 
740     // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
741     // Also used by the legacy (V)ROUND intrinsics where we mask out the
742     // scaling part of the immediate.
743     STRICT_VRNDSCALE,
744 
745     // Vector signed/unsigned integer to float/double.
746     STRICT_CVTSI2P,
747     STRICT_CVTUI2P,
748 
749     // Strict FMA nodes.
750     STRICT_FNMADD,
751     STRICT_FMSUB,
752     STRICT_FNMSUB,
753 
754     // Conversions between float and half-float.
755     STRICT_CVTPS2PH,
756     STRICT_CVTPH2PS,
757 
758     // WARNING: Only add nodes here if they are strict FP nodes. Non-memory and
759     // non-strict FP nodes should be above FIRST_TARGET_STRICTFP_OPCODE.
760 
761     // Compare and swap.
762     LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
763     LCMPXCHG8_DAG,
764     LCMPXCHG16_DAG,
765     LCMPXCHG16_SAVE_RBX_DAG,
766 
767     /// LOCK-prefixed arithmetic read-modify-write instructions.
768     /// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS)
769     LADD,
770     LSUB,
771     LOR,
772     LXOR,
773     LAND,
774 
775     // Load, scalar_to_vector, and zero extend.
776     VZEXT_LOAD,
777 
778     // extract_vector_elt, store.
779     VEXTRACT_STORE,
780 
781     // scalar broadcast from memory.
782     VBROADCAST_LOAD,
783 
784     // subvector broadcast from memory.
785     SUBV_BROADCAST_LOAD,
786 
787     // Store FP control word into i16 memory.
788     FNSTCW16m,
789 
790     // Load FP control word from i16 memory.
791     FLDCW16m,
792 
793     /// This instruction implements FP_TO_SINT with the
794     /// integer destination in memory and a FP reg source.  This corresponds
795     /// to the X86::FIST*m instructions and the rounding mode change stuff. It
796     /// has two inputs (token chain and address) and two outputs (int value
797     /// and token chain). Memory VT specifies the type to store to.
798     FP_TO_INT_IN_MEM,
799 
800     /// This instruction implements SINT_TO_FP with the
801     /// integer source in memory and FP reg result.  This corresponds to the
802     /// X86::FILD*m instructions. It has two inputs (token chain and address)
803     /// and two outputs (FP value and token chain). The integer source type is
804     /// specified by the memory VT.
805     FILD,
806 
807     /// This instruction implements a fp->int store from FP stack
808     /// slots. This corresponds to the fist instruction. It takes a
809     /// chain operand, value to store, address, and glue. The memory VT
810     /// specifies the type to store as.
811     FIST,
812 
813     /// This instruction implements an extending load to FP stack slots.
814     /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
815     /// operand, and ptr to load from. The memory VT specifies the type to
816     /// load from.
817     FLD,
818 
819     /// This instruction implements a truncating store from FP stack
820     /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
821     /// chain operand, value to store, address, and glue. The memory VT
822     /// specifies the type to store as.
823     FST,
824 
825     /// These instructions grab the address of the next argument
826     /// from a va_list. (reads and modifies the va_list in memory)
827     VAARG_64,
828     VAARG_X32,
829 
830     // Vector truncating store with unsigned/signed saturation
831     VTRUNCSTOREUS,
832     VTRUNCSTORES,
833     // Vector truncating masked store with unsigned/signed saturation
834     VMTRUNCSTOREUS,
835     VMTRUNCSTORES,
836 
837     // X86 specific gather and scatter
838     MGATHER,
839     MSCATTER,
840 
841     // Key locker nodes that produce flags.
842     AESENC128KL,
843     AESDEC128KL,
844     AESENC256KL,
845     AESDEC256KL,
846     AESENCWIDE128KL,
847     AESDECWIDE128KL,
848     AESENCWIDE256KL,
849     AESDECWIDE256KL,
850 
851     // WARNING: Do not add anything in the end unless you want the node to
852     // have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all
853     // opcodes will be treated as target memory ops!
854   };
855   } // end namespace X86ISD
856 
857   namespace X86 {
858     /// Rounding-mode encodings. The mode occupies bits 11:10 of FPSR, so
859     /// each enumerator is a two-bit value already shifted into that field.
860     /// The values match the rounding-mode constants used in glibc.
861     enum RoundingMode {
862       rmToNearest   = 0x000,  // FE_TONEAREST
863       rmDownward    = 0x400,  // FE_DOWNWARD   (1 << 10)
864       rmUpward      = 0x800,  // FE_UPWARD     (2 << 10)
865       rmTowardZero  = 0xC00,  // FE_TOWARDZERO (3 << 10)
866       rmMask        = 0xC00   // Bit mask selecting rounding mode (3 << 10)
867     };
868   }
869 
870   /// Define some predicates that are used for node matching.
871   namespace X86 {
872     /// Returns true if Elt is a constant zero or floating point constant +0.0.
873     bool isZeroNode(SDValue Elt);
874 
875     /// Returns true if the given offset can
876     /// fit into the displacement field of the instruction.
877     bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
878                                       bool hasSymbolicDisplacement);
879 
880     /// Determines whether the callee is required to pop its
881     /// own arguments. Callee pop is necessary to support tail calls.
882     bool isCalleePop(CallingConv::ID CallingConv,
883                      bool is64Bit, bool IsVarArg, bool GuaranteeTCO);
884 
885     /// If Op is a constant whose elements are all the same constant or
886     /// undefined, return true and return the constant value in \p SplatVal.
887     /// If we have undef bits that don't cover an entire element, we treat these
888     /// as zero if AllowPartialUndefs is set, else we fail and return false.
889     bool isConstantSplat(SDValue Op, APInt &SplatVal,
890                          bool AllowPartialUndefs = true);
891   } // end namespace X86
892 
893   //===--------------------------------------------------------------------===//
894   //  X86 Implementation of the TargetLowering interface
895   class X86TargetLowering final : public TargetLowering {
896   public:
897     explicit X86TargetLowering(const X86TargetMachine &TM,
898                                const X86Subtarget &STI);
899     // The members below override virtuals inherited via TargetLowering.
900     unsigned getJumpTableEncoding() const override;
901     bool useSoftFloat() const override;
902 
903     void markLibCallAttributes(MachineFunction *MF, unsigned CC,
904                                ArgListTy &Args) const override;
905 
906     MVT getScalarShiftAmountTy(const DataLayout &, EVT VT) const override {
      // The scalar shift amount type is always i8 here; neither the data
      // layout argument nor VT influences the answer.
907       return MVT(MVT::i8);
908     }
909 
910     const MCExpr *
911     LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
912                               const MachineBasicBlock *MBB, unsigned uid,
913                               MCContext &Ctx) const override;
914 
915     /// Returns the relocation base for the given PIC jump table.
916     SDValue getPICJumpTableRelocBase(SDValue Table,
917                                      SelectionDAG &DAG) const override;
918     const MCExpr *
919     getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
920                                  unsigned JTI, MCContext &Ctx) const override;
921 
922     /// Return the desired alignment for ByVal aggregate
923     /// function arguments in the caller parameter area. For X86, aggregates
924     /// that contain SSE vectors are placed at 16-byte boundaries while the rest
925     /// are at 4-byte boundaries.
    /// NOTE(review): "SSE vectors" restores words that were missing from the
    /// original comment; confirm against the definition.
926     unsigned getByValTypeAlignment(Type *Ty,
927                                    const DataLayout &DL) const override;
928 
929     EVT getOptimalMemOpType(const MemOp &Op,
930                             const AttributeList &FuncAttributes) const override;
931 
932     /// Returns true if it's safe to use load / store of the
933     /// specified type to expand memcpy / memset inline. This is mostly true
934     /// for all types except for some special cases. For example, on X86
935     /// targets without SSE2, f64 load / store are done with fldl / fstpl which
936     /// also do type conversion. Note the specified type doesn't have to be
937     /// legal as the hook is used before type legalization.
938     bool isSafeMemOpType(MVT VT) const override;
939 
940     /// Returns true if the target allows unaligned memory accesses of the
941     /// specified type. Whether the access is "fast" is reported via \p Fast.
942     bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, Align Alignment,
943                                         MachineMemOperand::Flags Flags,
944                                         bool *Fast) const override;
945 
946     /// Provide custom lowering hooks for some operations.
947     ///
948     SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
949 
950     /// Replace the results of a node that has an illegal result
951     /// type with new values built out of custom code.
952     ///
953     void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
954                             SelectionDAG &DAG) const override;
955 
956     SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
957 
958     /// Return true if the target has native support for
959     /// the specified value type and it is 'desirable' to use the type for the
960     /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
961     /// instruction encodings are longer and some i16 instructions are slow.
962     bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override;
963 
964     /// Return true if the target has native support for the
965     /// specified value type and it is 'desirable' to use the type. e.g. On x86
966     /// i16 is legal, but undesirable since i16 instruction encodings are longer
967     /// and some i16 instructions are slow.
    /// NOTE(review): this text nearly duplicates the comment on
    /// isTypeDesirableForOp; the EVT &PVT reference parameter suggests a
    /// promotion type is reported to the caller - confirm and reword.
968     bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override;
969 
970     /// Return the newly negated expression if the cost is not expensive, and
971     /// set \p Cost to indicate whether it is cheaper or neutral to
972     /// do the negation.
973     SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG,
974                                  bool LegalOperations, bool ForCodeSize,
975                                  NegatibleCost &Cost,
976                                  unsigned Depth) const override;
977 
978     MachineBasicBlock *
979     EmitInstrWithCustomInserter(MachineInstr &MI,
980                                 MachineBasicBlock *MBB) const override;
981 
982     /// This method returns the name of a target specific DAG node.
983     const char *getTargetNodeName(unsigned Opcode) const override;
984 
985     /// Vector stores are not merged once legalization has run, because doing
986     /// so may conflict with x86-specific store splitting optimizations.
987     bool mergeStoresAfterLegalization(EVT MemVT) const override {
      const bool IsVectorStore = MemVT.isVector();
988       return !IsVectorStore;
989     }
990 
991     bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
992                           const SelectionDAG &DAG) const override;
993     // cttz/ctlz query overrides; declared here without an inline body.
994     bool isCheapToSpeculateCttz() const override;
995 
996     bool isCheapToSpeculateCtlz() const override;
997 
998     bool isCtlzFast() const override;
999 
1000     bool hasBitPreservingFPLogic(EVT VT) const override {
1001       return VT == MVT::f32 || VT == MVT::f64 || VT.isVector();
1002     }
1003 
1004     bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const override {
1005       // Storing a float/int mixture as two separate stores costs one extra
1006       // store but saves two bitwise instructions and one float-to-int
1007       // instruction. Avoiding the float->int domain switch for the input
1008       // value is potentially an even bigger benefit, so this is likely a win.
1009       const bool FPThenInt = LTy.isFloatingPoint() && HTy.isInteger();
1010       const bool IntThenFP = LTy.isInteger() && HTy.isFloatingPoint();
1011       // For an all-integer pair the trade is two bitwise instructions against
1012       // one more store instruction (costing one more store buffer). That
1013       // benefit is less clear-cut, so such pairs are left out until a
1014       // testcase proves they are a win.
1015       // Consequently every pair that is not a float/int mixture is rejected
1016       // here.
1017       return FPThenInt || IntThenFP;
1018     }
1019 
1020     bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
1021 
1022     bool hasAndNotCompare(SDValue Y) const override;
1023 
1024     bool hasAndNot(SDValue Y) const override;
1025 
1026     bool hasBitTest(SDValue X, SDValue Y) const override;
1027 
1028     bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
1029         SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
1030         unsigned OldShiftOpcode, unsigned NewShiftOpcode,
1031         SelectionDAG &DAG) const override;
1032 
1033     bool shouldFoldConstantShiftPairToMask(const SDNode *N,
1034                                            CombineLevel Level) const override;
1035 
1036     bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override;
1037 
1038     bool
1039     shouldTransformSignedTruncationCheck(EVT XVT,
1040                                          unsigned KeptBits) const override {
1041       // No preference is expressed for vector types.
1042       if (XVT.isVector())
1043         return false;
1044       // Only the scalar integer types i8, i16, i32 and i64 are acceptable,
1045       // both for XVT and for the integer type that is KeptBits wide.
1046       const auto IsOkVT = [](EVT Ty) {
1047         return Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32 ||
1048                Ty == MVT::i64;
1049       };
1050       // The KeptBits-wide type being byte/word/dword sized is what MOVS
1051       // supports. XVT will be larger than that type, so both types have to
1052       // pass the check.
1053       return IsOkVT(XVT) && IsOkVT(MVT::getIntegerVT(KeptBits));
1054     }
1055 
1056     bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override;
1057 
1058     bool shouldSplatInsEltVarIndex(EVT VT) const override;
1059 
1060     bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
1061       return VT.isScalarInteger();
1062     }
1063 
1064     /// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
1065     MVT hasFastEqualityCompare(unsigned NumBits) const override;
1066 
1067     /// Return the value type to use for ISD::SETCC.
1068     EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
1069                            EVT VT) const override;
1070 
1071     bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits,
1072                                       const APInt &DemandedElts,
1073                                       TargetLoweringOpt &TLO) const override;
1074 
1075     /// Determine which bits of Op are known to be either zero or one and
1076     /// return them in Known.
1077     void computeKnownBitsForTargetNode(const SDValue Op,
1078                                        KnownBits &Known,
1079                                        const APInt &DemandedElts,
1080                                        const SelectionDAG &DAG,
1081                                        unsigned Depth = 0) const override;
1082 
1083     /// Determine the number of bits in the operation that are sign bits.
1084     unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
1085                                              const APInt &DemandedElts,
1086                                              const SelectionDAG &DAG,
1087                                              unsigned Depth) const override;
1088 
1089     bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op,
1090                                                  const APInt &DemandedElts,
1091                                                  APInt &KnownUndef,
1092                                                  APInt &KnownZero,
1093                                                  TargetLoweringOpt &TLO,
1094                                                  unsigned Depth) const override;
1095 
1096     bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op,
1097                                                     const APInt &DemandedElts,
1098                                                     unsigned MaskIndex,
1099                                                     TargetLoweringOpt &TLO,
1100                                                     unsigned Depth) const;
1101 
1102     bool SimplifyDemandedBitsForTargetNode(SDValue Op,
1103                                            const APInt &DemandedBits,
1104                                            const APInt &DemandedElts,
1105                                            KnownBits &Known,
1106                                            TargetLoweringOpt &TLO,
1107                                            unsigned Depth) const override;
1108 
1109     SDValue SimplifyMultipleUseDemandedBitsForTargetNode(
1110         SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
1111         SelectionDAG &DAG, unsigned Depth) const override;
1112 
1113     const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override;
1114 
1115     SDValue unwrapAddress(SDValue N) const override;
1116 
1117     SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
1118 
1119     bool ExpandInlineAsm(CallInst *CI) const override;
1120 
1121     ConstraintType getConstraintType(StringRef Constraint) const override;
1122 
1123     /// Examine constraint string and operand type and determine a weight value.
1124     /// The operand object must already have been set up with the operand type.
1125     ConstraintWeight
1126       getSingleConstraintMatchWeight(AsmOperandInfo &info,
1127                                      const char *constraint) const override;
1128 
1129     const char *LowerXConstraint(EVT ConstraintVT) const override;
1130 
1131     /// Lower the specified operand into the Ops vector according to the given
1132     /// constraint string. If the operand is invalid, don't add anything to
1133     /// Ops.
1134     void LowerAsmOperandForConstraint(SDValue Op,
1135                                       std::string &Constraint,
1136                                       std::vector<SDValue> &Ops,
1137                                       SelectionDAG &DAG) const override;
1138 
1139     unsigned
1140     getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
1141       return ConstraintCode != "v"
1142                  ? TargetLowering::getInlineAsmMemConstraint(ConstraintCode)
1143                  : InlineAsm::Constraint_v;
1144     }
1145 
1146     /// Handle Lowering flag assembly outputs.
1147     SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag,
1148                                         const SDLoc &DL,
1149                                         const AsmOperandInfo &Constraint,
1150                                         SelectionDAG &DAG) const override;
1151 
1152     /// Given a physical register constraint
1153     /// (e.g. {edx}), return the register number and the register class for the
1154     /// register.  This should only be used for C_Register constraints.  On
1155     /// error, this returns a register number of 0.
1156     std::pair<unsigned, const TargetRegisterClass *>
1157     getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
1158                                  StringRef Constraint, MVT VT) const override;
1159 
1160     /// Return true if the addressing mode represented
1161     /// by AM is legal for this target, for a load/store of the specified type.
1162     bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
1163                                Type *Ty, unsigned AS,
1164                                Instruction *I = nullptr) const override;
1165 
1166     /// Return true if the specified immediate is legal
1167     /// icmp immediate, that is the target has icmp instructions which can
1168     /// compare a register against the immediate without having to materialize
1169     /// the immediate into a register.
1170     bool isLegalICmpImmediate(int64_t Imm) const override;
1171 
1172     /// Return true if the specified immediate is legal
1173     /// add immediate, that is the target has add instructions which can
1174     /// add a register and the immediate without having to materialize
1175     /// the immediate into a register.
1176     bool isLegalAddImmediate(int64_t Imm) const override;
1177 
1178     bool isLegalStoreImmediate(int64_t Imm) const override;
1179 
1180     /// Return the cost of the scaling factor used in the addressing
1181     /// mode represented by AM for this target, for a load/store
1182     /// of the specified type.
1183     /// If the AM is supported, the return value must be >= 0.
1184     /// If the AM is not supported, it returns a negative value.
1185     InstructionCost getScalingFactorCost(const DataLayout &DL,
1186                                          const AddrMode &AM, Type *Ty,
1187                                          unsigned AS) const override;
1188 
1189     /// This is used to enable splatted operand transforms for vector shifts
1190     /// and vector funnel shifts.
1191     bool isVectorShiftByScalarCheap(Type *Ty) const override;
1192 
1193     /// Add x86-specific opcodes to the default list.
1194     bool isBinOp(unsigned Opcode) const override;
1195 
1196     /// Returns true if the opcode is a commutative binary operation.
1197     bool isCommutativeBinOp(unsigned Opcode) const override;
1198 
1199     /// Return true if it's free to truncate a value of
1200     /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in
1201     /// register EAX to i16 by referencing its sub-register AX.
1202     bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
1203     bool isTruncateFree(EVT VT1, EVT VT2) const override;
1204 
1205     bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
1206 
1207     /// Return true if any actual instruction that defines a
1208     /// value of type Ty1 implicitly zero-extends the value to Ty2 in the result
1209     /// register. This does not necessarily include registers defined in
1210     /// unknown ways, such as incoming arguments, or copies from unknown
1211     /// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this
1212     /// does not necessarily apply to truncate instructions. e.g. on x86-64,
1213     /// all instructions that define 32-bit values implicitly zero-extend the
1214     /// result out to 64 bits.
1215     bool isZExtFree(Type *Ty1, Type *Ty2) const override;
1216     bool isZExtFree(EVT VT1, EVT VT2) const override;
1217     bool isZExtFree(SDValue Val, EVT VT2) const override;
1218 
1219     bool shouldSinkOperands(Instruction *I,
1220                             SmallVectorImpl<Use *> &Ops) const override;
1221     bool shouldConvertPhiType(Type *From, Type *To) const override;
1222 
1223     /// Return true if folding a vector load into ExtVal (a sign, zero, or any
1224     /// extend node) is profitable.
1225     bool isVectorLoadExtDesirable(SDValue) const override;
1226 
1227     /// Return true if an FMA operation is faster than a pair of fmul and fadd
1228     /// instructions. fmuladd intrinsics will be expanded to FMAs when this
1229     /// method returns true, otherwise fmuladd is expanded to fmul + fadd.
1230     bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
1231                                     EVT VT) const override;
1232 
1233     /// Return true if it's profitable to narrow
1234     /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
1235     /// from i32 to i8 but not from i32 to i16.
1236     bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
1237 
1238     /// Given an intrinsic, checks if on the target the intrinsic will need to map
1239     /// to a MemIntrinsicNode (touches memory). If this is the case, it returns
1240     /// true and stores the intrinsic information into the IntrinsicInfo that was
1241     /// passed to the function.
1242     bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
1243                             MachineFunction &MF,
1244                             unsigned Intrinsic) const override;
1245 
1246     /// Returns true if the target can instruction select the
1247     /// specified FP immediate natively. If false, the legalizer will
1248     /// materialize the FP immediate as a load from a constant pool.
1249     bool isFPImmLegal(const APFloat &Imm, EVT VT,
1250                       bool ForCodeSize) const override;
1251 
1252     /// Targets can use this to indicate that they only support *some*
1253     /// VECTOR_SHUFFLE operations, those with specific masks. By default, if a
1254     /// target supports the VECTOR_SHUFFLE node, all mask values are assumed to
1255     /// be legal.
1256     bool isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const override;
1257 
1258     /// Similar to isShuffleMaskLegal. Targets can use this to indicate if there
1259     /// is a suitable VECTOR_SHUFFLE that can be used to replace a VAND with a
1260     /// constant pool entry.
1261     bool isVectorClearMaskLegal(ArrayRef<int> Mask, EVT VT) const override;
1262 
1263     /// Returns true if lowering to a jump table is allowed.
1264     bool areJTsAllowed(const Function *Fn) const override;
1265 
1266     /// Tell instruction selection whether it should seek to shrink an FP
1267     /// constant of the specified type to a smaller type in order to save
1268     /// space and / or reduce runtime.
1269     bool ShouldShrinkFPConstant(EVT VT) const override {
1270       // Long double constants are always worth shrinking since fldt is very
1271       // slow. Other constants are only shrunk when SSE2 is not available,
1272       // because cvtss2sd is more expensive than a straight movsd.
1273       return VT == MVT::f80 || !X86ScalarSSEf64;
1274     }
1275 
1276     /// Return true if we believe it is correct and profitable to reduce the
1277     /// load node to a smaller type.
1278     bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
1279                                EVT NewVT) const override;
1280 
1281     /// Tell whether the specified scalar FP type is computed in an SSE
1282     /// register rather than on the X87 floating point stack.
1283     bool isScalarFPTypeInSSEReg(EVT VT) const {
1284       return VT == MVT::f64 ? X86ScalarSSEf64                      // SSE2
1285                             : (VT == MVT::f32 && X86ScalarSSEf32); // SSE1
1286     }
1287 
1288     /// Returns true if it is beneficial to convert a load of a constant
1289     /// to just the constant itself.
1290     bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
1291                                            Type *Ty) const override;
1292 
1293     bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override;
1294 
1295     bool convertSelectOfConstantsToMath(EVT VT) const override;
1296 
1297     bool decomposeMulByConstant(LLVMContext &Context, EVT VT,
1298                                 SDValue C) const override;
1299 
1300     /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
1301     /// with this index.
1302     bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
1303                                  unsigned Index) const override;
1304 
1305     /// Scalar ops always have equal or better analysis/performance/power than
1306     /// the vector equivalent, so this always makes sense if the scalar op is
1307     /// supported.
1308     bool shouldScalarizeBinop(SDValue) const override;
1309 
1310     /// Extracting element 0 of a vector of f32 or f64 is treated as free.
1311     bool isExtractVecEltCheap(EVT VT, unsigned Index) const override {
1312       EVT ScalarVT = VT.getScalarType();
1313       return Index == 0 && (ScalarVT == MVT::f32 || ScalarVT == MVT::f64);
1314     }
1315 
1316     /// Overflow nodes should get combined/lowered to optimal instructions
1317     /// (they should allow eliminating explicit compares by getting flags from
1318     /// math ops).
1319     bool shouldFormOverflowOp(unsigned Opcode, EVT VT,
1320                               bool MathUsed) const override;
1321 
1322     bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem,
1323                                       unsigned AddrSpace) const override {
1324       // Replacing three or more scalar stores still reduces the instruction
1325       // count, even after a vector constant load has been added.
1326       return NumElem >= 3;
1327     }
1328 
1329     bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
1330                                  const SelectionDAG &DAG,
1331                                  const MachineMemOperand &MMO) const override;
1332 
1333     /// No clear-cache builtin is named here (nullptr is returned): Intel
1334     /// processors have a unified instruction and data cache, so there is
1335     /// nothing to do.
1336     const char *getClearCacheBuiltinName() const override { return nullptr; }
1337 
1338     Register getRegisterByName(const char* RegName, LLT VT,
1339                                const MachineFunction &MF) const override;
1340 
1341     /// If a physical register, this returns the register that receives the
1342     /// exception address on entry to an EH pad.
1343     Register
1344     getExceptionPointerRegister(const Constant *PersonalityFn) const override;
1345 
1346     /// If a physical register, this returns the register that receives the
1347     /// exception typeid on entry to a landing pad.
1348     Register
1349     getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
1350 
1351     bool needsFixedCatchObjects() const override;
1352 
1353     /// This method returns a target specific FastISel object,
1354     /// or null if the target does not support "fast" ISel.
1355     FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
1356                              const TargetLibraryInfo *libInfo) const override;
1357 
1358     /// If the target has a standard location for the stack protector cookie,
1359     /// returns the address of that location. Otherwise, returns nullptr.
1360     Value *getIRStackGuard(IRBuilderBase &IRB) const override;
1361 
1362     bool useLoadStackGuardNode() const override;
1363     bool useStackGuardXorFP() const override;
1364     void insertSSPDeclarations(Module &M) const override;
1365     Value *getSDagStackGuard(const Module &M) const override;
1366     Function *getSSPStackGuardCheck(const Module &M) const override;
1367     SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
1368                                 const SDLoc &DL) const override;
1369 
1370 
1371     /// If the target stores the SafeStack pointer at a fixed offset in some
1372     /// non-standard address space, presumably returns the address of that
1373     /// location, built via IRB -- TODO confirm against the definition.
1374     Value *getSafeStackPointerLocation(IRBuilderBase &IRB) const override;
1375 
1376     std::pair<SDValue, SDValue> BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL,
1377                                           SDValue Chain, SDValue Pointer,
1378                                           MachinePointerInfo PtrInfo,
1379                                           Align Alignment,
1380                                           SelectionDAG &DAG) const;
1381 
1382     /// Customize the preferred legalization strategy for certain types.
1383     LegalizeTypeAction getPreferredVectorAction(MVT VT) const override;
1384 
1385     bool softPromoteHalfType() const override { return true; }
1386 
1387     MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
1388                                       EVT VT) const override;
1389 
1390     unsigned getNumRegistersForCallingConv(LLVMContext &Context,
1391                                            CallingConv::ID CC,
1392                                            EVT VT) const override;
1393 
1394     unsigned getVectorTypeBreakdownForCallingConv(
1395         LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1396         unsigned &NumIntermediates, MVT &RegisterVT) const override;
1397 
1398     bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
1399 
1400     bool supportSwiftError() const override;
1401 
1402     bool hasStackProbeSymbol(MachineFunction &MF) const override;
1403     bool hasInlineStackProbe(MachineFunction &MF) const override;
1404     StringRef getStackProbeSymbolName(MachineFunction &MF) const override;
1405 
1406     unsigned getStackProbeSize(MachineFunction &MF) const;
1407 
1408     bool hasVectorBlend() const override { return true; }
1409 
1410     unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
1411 
1412     /// Lower interleaved load(s) into target specific
1413     /// instructions/intrinsics.
1414     bool lowerInterleavedLoad(LoadInst *LI,
1415                               ArrayRef<ShuffleVectorInst *> Shuffles,
1416                               ArrayRef<unsigned> Indices,
1417                               unsigned Factor) const override;
1418 
1419     /// Lower interleaved store(s) into target specific
1420     /// instructions/intrinsics.
1421     bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
1422                                unsigned Factor) const override;
1423 
1424     SDValue expandIndirectJTBranch(const SDLoc& dl, SDValue Value,
1425                                    SDValue Addr, SelectionDAG &DAG)
1426                                    const override;
1427 
1428     Align getPrefLoopAlignment(MachineLoop *ML) const override;
1429 
1430   protected:
1431     std::pair<const TargetRegisterClass *, uint8_t>
1432     findRepresentativeClass(const TargetRegisterInfo *TRI,
1433                             MVT VT) const override;
1434 
1435   private:
1436     /// Keep a reference to the X86Subtarget around so that we can
1437     /// make the right decision when generating code for different targets.
1438     const X86Subtarget &Subtarget;
1439 
1440     /// Select between SSE or x87 floating point ops.
1441     /// When SSE is available, use it for f32 operations.
1442     /// When SSE2 is available, use it for f64 operations.
1443     bool X86ScalarSSEf32;
1444     bool X86ScalarSSEf64;
1445 
1446     /// A list of legal FP immediates.
1447     std::vector<APFloat> LegalFPImmediates;
1448 
1449     /// Indicate that this x86 target can instruction
1450     /// select the specified FP immediate natively.
1451     void addLegalFPImmediate(const APFloat& Imm) {
1452       LegalFPImmediates.push_back(Imm);
1453     }
1454 
1455     SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
1456                             CallingConv::ID CallConv, bool isVarArg,
1457                             const SmallVectorImpl<ISD::InputArg> &Ins,
1458                             const SDLoc &dl, SelectionDAG &DAG,
1459                             SmallVectorImpl<SDValue> &InVals,
1460                             uint32_t *RegMask) const;
1461     SDValue LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
1462                              const SmallVectorImpl<ISD::InputArg> &ArgInfo,
1463                              const SDLoc &dl, SelectionDAG &DAG,
1464                              const CCValAssign &VA, MachineFrameInfo &MFI,
1465                              unsigned i) const;
1466     SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
1467                              const SDLoc &dl, SelectionDAG &DAG,
1468                              const CCValAssign &VA,
1469                              ISD::ArgFlagsTy Flags, bool isByval) const;
1470 
1471     // Call lowering helpers.
1472 
1473     /// Check whether the call is eligible for tail call optimization. Targets
1474     /// that want to do tail call optimization should implement this function.
1475     bool IsEligibleForTailCallOptimization(SDValue Callee,
1476                                            CallingConv::ID CalleeCC,
1477                                            bool isVarArg,
1478                                            bool isCalleeStructRet,
1479                                            bool isCallerStructRet,
1480                                            Type *RetTy,
1481                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
1482                                     const SmallVectorImpl<SDValue> &OutVals,
1483                                     const SmallVectorImpl<ISD::InputArg> &Ins,
1484                                            SelectionDAG& DAG) const;
1485     SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
1486                                     SDValue Chain, bool IsTailCall,
1487                                     bool Is64Bit, int FPDiff,
1488                                     const SDLoc &dl) const;
1489 
1490     unsigned GetAlignedArgumentStackSize(unsigned StackSize,
1491                                          SelectionDAG &DAG) const;
1492 
1493     unsigned getAddressSpace() const;
1494 
1495     SDValue FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned,
1496                             SDValue &Chain) const;
1497     SDValue LRINT_LLRINTHelper(SDNode *N, SelectionDAG &DAG) const;
1498 
1499     SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
1500     SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
1501     SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
1502     SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
1503 
1504     unsigned getGlobalWrapperKind(const GlobalValue *GV = nullptr,
1505                                   const unsigned char OpFlags = 0) const;
1506     SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
1507     SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
1508     SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
1509     SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
1510     SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
1511 
1512     /// Creates target global address or external symbol nodes for calls or
1513     /// other uses.
1514     SDValue LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
1515                                   bool ForCall) const;
1516 
1517     SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
1518     SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
1519     SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
1520     SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
1521     SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const;
1522     SDValue LowerLRINT_LLRINT(SDValue Op, SelectionDAG &DAG) const;
1523     SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
1524     SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
1525     SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
1526     SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
1527     SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
1528     SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
1529     SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
1530     SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
1531     SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
1532     SDValue LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
1533     SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
1534     SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
1535     SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
1536     SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
1537     SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
1538     SDValue lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const;
1539     SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
1540     SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
1541     SDValue LowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
1542     SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const;
1543     SDValue LowerGC_TRANSITION(SDValue Op, SelectionDAG &DAG) const;
1544     SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
1545     SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const;
1546     SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
1547     SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
1548 
1549     SDValue
1550     LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
1551                          const SmallVectorImpl<ISD::InputArg> &Ins,
1552                          const SDLoc &dl, SelectionDAG &DAG,
1553                          SmallVectorImpl<SDValue> &InVals) const override;
1554     SDValue LowerCall(CallLoweringInfo &CLI,
1555                       SmallVectorImpl<SDValue> &InVals) const override;
1556 
1557     SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
1558                         const SmallVectorImpl<ISD::OutputArg> &Outs,
1559                         const SmallVectorImpl<SDValue> &OutVals,
1560                         const SDLoc &dl, SelectionDAG &DAG) const override;
1561 
1562     bool supportSplitCSR(MachineFunction *MF) const override {
1563       return MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS &&
1564           MF->getFunction().hasFnAttribute(Attribute::NoUnwind);
1565     }
1566     void initializeSplitCSR(MachineBasicBlock *Entry) const override;
1567     void insertCopiesSplitCSR(
1568       MachineBasicBlock *Entry,
1569       const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
1570 
1571     bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
1572 
1573     bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
1574 
1575     EVT getTypeForExtReturn(LLVMContext &Context, EVT VT,
1576                             ISD::NodeType ExtendKind) const override;
1577 
1578     bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
1579                         bool isVarArg,
1580                         const SmallVectorImpl<ISD::OutputArg> &Outs,
1581                         LLVMContext &Context) const override;
1582 
1583     const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
1584 
1585     TargetLoweringBase::AtomicExpansionKind
1586     shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
1587     bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
1588     TargetLoweringBase::AtomicExpansionKind
1589     shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
1590 
/// Override returning a LoadInst for \p AI; declared here only.
/// NOTE(review): per the name, presumably replaces an idempotent atomic RMW
/// with a fenced load and may return null when that is not possible — confirm
/// in the .cpp.
1591     LoadInst *
1592     lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
1593 
/// Override; declared here only.
/// NOTE(review): presumably true when the atomic store \p SI may be lowered
/// as an ordinary StoreSDNode — confirm in the .cpp.
1594     bool lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const override;
/// Override; declared here only.
/// NOTE(review): presumably true when the atomic load \p LI may be lowered as
/// an ordinary LoadSDNode — confirm in the .cpp.
1595     bool lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const override;
1596 
/// X86-specific query (not an override); declared here only.
/// NOTE(review): per the name, presumably true when an access of \p MemType
/// needs a cmpxchg8b/cmpxchg16b sequence — confirm in the .cpp.
1597     bool needsCmpXchgNb(Type *MemType) const;
1598 
/// X86-specific helper (not an override); declared here only.
/// NOTE(review): presumably prepares \p MBB for SjLj exception handling using
/// the dispatch block \p DispatchBB and frame index \p FI — confirm in the
/// .cpp.
1599     void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB,
1600                                 MachineBasicBlock *DispatchBB, int FI) const;
1601 
1602     /// Utility function to emit the low-level va_arg code for X86-64.
1603     MachineBasicBlock *
1604     EmitVAARGWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const;
1605 
1606     /// Emit the lowering for a pair of cascaded selects (\p MI1 and \p MI2).
1607     MachineBasicBlock *EmitLoweredCascadedSelect(MachineInstr &MI1,
1608                                                  MachineInstr &MI2,
1609                                                  MachineBasicBlock *BB) const;
1610 
/// X86-specific emission helper; declared here only.
/// NOTE(review): presumably expands the select pseudo-instruction \p I and
/// returns the block in which insertion continues — confirm in the .cpp.
1611     MachineBasicBlock *EmitLoweredSelect(MachineInstr &I,
1612                                          MachineBasicBlock *BB) const;
1613 
/// X86-specific emission helper; declared here only.
/// NOTE(review): presumably expands the catchret pseudo-instruction \p MI —
/// confirm in the .cpp.
1614     MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
1615                                            MachineBasicBlock *BB) const;
1616 
/// X86-specific emission helper; declared here only.
/// NOTE(review): per the name, presumably expands an alloca pseudo for
/// segmented stacks — confirm in the .cpp.
1617     MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr &MI,
1618                                             MachineBasicBlock *BB) const;
1619 
/// X86-specific emission helper; declared here only.
/// NOTE(review): per the name, presumably expands an alloca pseudo that
/// requires stack probing — confirm in the .cpp.
1620     MachineBasicBlock *EmitLoweredProbedAlloca(MachineInstr &MI,
1621                                                MachineBasicBlock *BB) const;
1622 
/// X86-specific emission helper; declared here only.
/// NOTE(review): per the name, presumably expands a TLS-address
/// pseudo-instruction \p MI — confirm in the .cpp.
1623     MachineBasicBlock *EmitLoweredTLSAddr(MachineInstr &MI,
1624                                           MachineBasicBlock *BB) const;
1625 
/// X86-specific emission helper; declared here only.
/// NOTE(review): per the name, presumably expands a TLS-call
/// pseudo-instruction \p MI — confirm in the .cpp.
1626     MachineBasicBlock *EmitLoweredTLSCall(MachineInstr &MI,
1627                                           MachineBasicBlock *BB) const;
1628 
/// X86-specific emission helper; declared here only.
/// NOTE(review): per the name, presumably rewrites the indirect call/branch
/// pseudo \p MI to go through a thunk — confirm in the .cpp.
1629     MachineBasicBlock *EmitLoweredIndirectThunk(MachineInstr &MI,
1630                                                 MachineBasicBlock *BB) const;
1631 
/// X86-specific emission helper; declared here only.
/// NOTE(review): per the name, presumably expands the SjLj exception-handling
/// setjmp pseudo \p MI — confirm in the .cpp.
1632     MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI,
1633                                         MachineBasicBlock *MBB) const;
1634 
/// X86-specific emission helper returning nothing; declared here only.
/// NOTE(review): per the name, presumably emits shadow-stack handling for the
/// setjmp lowering of \p MI into \p MBB — confirm in the .cpp.
1635     void emitSetJmpShadowStackFix(MachineInstr &MI,
1636                                   MachineBasicBlock *MBB) const;
1637 
/// X86-specific emission helper; declared here only.
/// NOTE(review): per the name, presumably expands the SjLj exception-handling
/// longjmp pseudo \p MI — confirm in the .cpp.
1638     MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr &MI,
1639                                          MachineBasicBlock *MBB) const;
1640 
/// X86-specific emission helper; declared here only.
/// NOTE(review): per the name, presumably emits shadow-stack handling for the
/// longjmp lowering of \p MI and returns the block in which insertion
/// continues — confirm in the .cpp.
1641     MachineBasicBlock *emitLongJmpShadowStackFix(MachineInstr &MI,
1642                                                  MachineBasicBlock *MBB) const;
1643 
/// X86-specific emission helper; declared here only.
/// NOTE(review): per the name, presumably builds the SjLj exception dispatch
/// block for \p MI — confirm in the .cpp.
1644     MachineBasicBlock *EmitSjLjDispatchBlock(MachineInstr &MI,
1645                                              MachineBasicBlock *MBB) const;
1646 
1647     /// Emit flags for setcc condition \p CC on operands \p Op0 and \p Op1. Also
1648     /// returns the corresponding X86 condition code constant in \p X86CC.
1649     SDValue emitFlagsForSetcc(SDValue Op0, SDValue Op1, ISD::CondCode CC,
1650                               const SDLoc &dl, SelectionDAG &DAG,
1651                               SDValue &X86CC) const;
1652 
1653     /// Check if replacement of SQRT with RSQRT should be disabled for \p Op.
1654     bool isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const override;
1655 
1656     /// Use rsqrt* to speed up sqrt calculations of \p Op.
1657     SDValue getSqrtEstimate(SDValue Op, SelectionDAG &DAG, int Enabled,
1658                             int &RefinementSteps, bool &UseOneConstNR,
1659                             bool Reciprocal) const override;
1660 
1661     /// Use rcp* to speed up fdiv calculations with operand \p Op.
1662     SDValue getRecipEstimate(SDValue Op, SelectionDAG &DAG, int Enabled,
1663                              int &RefinementSteps) const override;
1664 
1665     /// Reassociate repeated FP divisions into multiplies by the reciprocal.
1666     unsigned combineRepeatedFPDivisors() const override;
1667 
/// Override; declared here only.
/// NOTE(review): per the name, presumably builds the DAG for a signed
/// division of \p N by the power-of-two \p Divisor and records any new nodes
/// in \p Created — confirm against TargetLowering::BuildSDIVPow2.
1668     SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
1669                           SmallVectorImpl<SDNode *> &Created) const override;
1670   };
1671 
1672   namespace X86 {
/// Factory returning a FastISel built from \p funcInfo and \p libInfo;
/// declared here only.
/// NOTE(review): ownership of the returned pointer is not visible here —
/// presumably the caller owns it; confirm at the call site.
1673     FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
1674                              const TargetLibraryInfo *libInfo);
1675   } // end namespace X86
1676 
1677   // X86 specific Gather/Scatter nodes. Operands: 2 = mask, 3 = base pointer,
1678   // 4 = index, 5 = scale; the same order as MaskedGatherScatterSDNode, for
1679   // convenience.
1680   class X86MaskedGatherScatterSDNode : public MemIntrinsicSDNode {
1681   public:
1682     // This is intended as a utility and should never be directly created.
1683     X86MaskedGatherScatterSDNode() = delete;
1684     ~X86MaskedGatherScatterSDNode() = delete;
1685 
1686     const SDValue &getBasePtr() const { return getOperand(3); }
1687     const SDValue &getIndex()   const { return getOperand(4); }
1688     const SDValue &getMask()    const { return getOperand(2); }
1689     const SDValue &getScale()   const { return getOperand(5); }
1690 
1691     static bool classof(const SDNode *N) {
1692       return N->getOpcode() == X86ISD::MGATHER ||
1693              N->getOpcode() == X86ISD::MSCATTER;
1694     }
1695   };
1696 
/// Typed view of an X86ISD::MGATHER node (the only opcode classof accepts);
/// operand 1 is exposed as the pass-through value.
1697   class X86MaskedGatherSDNode : public X86MaskedGatherScatterSDNode {
1698   public:
1699     const SDValue &getPassThru() const { return getOperand(1); }
1700 
1701     static bool classof(const SDNode *N) {
1702       return N->getOpcode() == X86ISD::MGATHER;
1703     }
1704   };
1705 
/// Typed view of an X86ISD::MSCATTER node (the only opcode classof accepts);
/// operand 1 is exposed through getValue().
/// NOTE(review): presumably the value being scattered — confirm at the node's
/// creation site.
1706   class X86MaskedScatterSDNode : public X86MaskedGatherScatterSDNode {
1707   public:
1708     const SDValue &getValue() const { return getOperand(1); }
1709 
1710     static bool classof(const SDNode *N) {
1711       return N->getOpcode() == X86ISD::MSCATTER;
1712     }
1713   };
1714 
1715   /// Generate unpacklo/unpackhi shuffle mask for type \p VT into \p Mask.
1716   void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask, bool Lo,
1717                                bool Unary);
1718 
1719   /// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
1720   /// imposed by AVX and specific to the unary pattern. Example for \p Mask:
1721   /// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
1722   /// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
1723   void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo);
1724 
1725 } // end namespace llvm
1726 
1727 #endif // LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
1728