1 //===-- X86ISelLowering.h - X86 DAG Lowering Interface ----------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the interfaces that X86 uses to lower LLVM code into a
10 // selection DAG.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
15 #define LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
16 
17 #include "llvm/CodeGen/TargetLowering.h"
18 
19 namespace llvm {
20   class X86Subtarget;
21   class X86TargetMachine;
22 
23   namespace X86ISD {
24     // X86 Specific DAG Nodes
25   enum NodeType : unsigned {
26     // Start the numbering where the builtin ops leave off.
27     FIRST_NUMBER = ISD::BUILTIN_OP_END,
28 
29     /// Bit scan forward.
30     BSF,
31     /// Bit scan reverse.
32     BSR,
33 
34     /// X86 funnel/double shift i16 instructions. These correspond to
35     /// X86::SHLDW and X86::SHRDW instructions, which have different
36     /// shift-amount modulo rules from generic funnel shifts.
37     /// NOTE: The operand order matches ISD::FSHL/FSHR not SHLD/SHRD.
38     FSHL,
39     FSHR,
40 
41     /// Bitwise logical AND of floating point values. This corresponds
42     /// to X86::ANDPS or X86::ANDPD.
43     FAND,
44 
45     /// Bitwise logical OR of floating point values. This corresponds
46     /// to X86::ORPS or X86::ORPD.
47     FOR,
48 
49     /// Bitwise logical XOR of floating point values. This corresponds
50     /// to X86::XORPS or X86::XORPD.
51     FXOR,
52 
53     ///  Bitwise logical ANDNOT of floating point values. This
54     /// corresponds to X86::ANDNPS or X86::ANDNPD.
55     FANDN,
56 
57     /// These operations represent an abstract X86 call
58     /// instruction, which includes a bunch of information.  In particular the
59     /// operands of these nodes are:
60     ///
61     ///     #0 - The incoming token chain
62     ///     #1 - The callee
63     ///     #2 - The number of arg bytes the caller pushes on the stack.
64     ///     #3 - The number of arg bytes the callee pops off the stack.
65     ///     #4 - The value to pass in AL/AX/EAX (optional)
66     ///     #5 - The value to pass in DL/DX/EDX (optional)
67     ///
68     /// The result values of these nodes are:
69     ///
70     ///     #0 - The outgoing token chain
71     ///     #1 - The first register result value (optional)
72     ///     #2 - The second register result value (optional)
73     ///
74     CALL,
75 
76     /// Same as call except it adds the NoTrack prefix.
77     NT_CALL,
78 
79     /// X86 compare and logical compare instructions.
80     CMP,
81     FCMP,
82     COMI,
83     UCOMI,
84 
85     /// X86 bit-test instructions.
86     BT,
87 
88     /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS
89     /// operand, usually produced by a CMP instruction.
90     SETCC,
91 
92     /// X86 Select
93     SELECTS,
94 
95     // Same as SETCC except it's materialized with an SBB and the value is
96     // all ones or all zeros.
97     SETCC_CARRY, // R = carry_bit ? ~0 : 0
98 
99     /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
100     /// Operands are two FP values to compare; result is a mask of
101     /// 0s or 1s.  Generally does the right thing for C/C++ with NaNs.
102     FSETCC,
103 
104     /// X86 FP SETCC, similar to above, but with output as an i1 mask,
105     /// and a version with SAE.
106     FSETCCM,
107     FSETCCM_SAE,
108 
109     /// X86 conditional moves. Operand 0 and operand 1 are the two values
110     /// to select from. Operand 2 is the condition code, and operand 3 is the
111     /// flag operand produced by a CMP or TEST instruction.
112     CMOV,
113 
114     /// X86 conditional branches. Operand 0 is the chain operand, operand 1
115     /// is the block to branch if condition is true, operand 2 is the
116     /// condition code, and operand 3 is the flag operand produced by a CMP
117     /// or TEST instruction.
118     BRCOND,
119 
120     /// BRIND node with NoTrack prefix. Operand 0 is the chain operand and
121     /// operand 1 is the target address.
122     NT_BRIND,
123 
124     /// Return with a flag operand. Operand 0 is the chain operand, operand
125     /// 1 is the number of bytes of stack to pop.
126     RET_FLAG,
127 
128     /// Return from interrupt. Operand 0 is the number of bytes to pop.
129     IRET,
130 
131     /// Repeat fill, corresponds to X86::REP_STOSx.
132     REP_STOS,
133 
134     /// Repeat move, corresponds to X86::REP_MOVSx.
135     REP_MOVS,
136 
137     /// On Darwin, this node represents the result of the popl
138     /// at function entry, used for PIC code.
139     GlobalBaseReg,
140 
141     /// A wrapper node for TargetConstantPool, TargetJumpTable,
142     /// TargetExternalSymbol, TargetGlobalAddress, TargetGlobalTLSAddress,
143     /// MCSymbol and TargetBlockAddress.
144     Wrapper,
145 
146     /// Special wrapper used under X86-64 PIC mode for RIP
147     /// relative displacements.
148     WrapperRIP,
149 
150     /// Copies a 64-bit value from an MMX vector to the low word
151     /// of an XMM vector, with the high word zero filled.
152     MOVQ2DQ,
153 
154     /// Copies a 64-bit value from the low word of an XMM vector
155     /// to an MMX vector.
156     MOVDQ2Q,
157 
158     /// Copies a 32-bit value from the low word of an MMX
159     /// vector to a GPR.
160     MMX_MOVD2W,
161 
162     /// Copies a GPR into the low 32-bit word of an MMX vector
163     /// and zeros out the high word.
164     MMX_MOVW2D,
165 
166     /// Extract an 8-bit value from a vector and zero extend it to
167     /// i32, corresponds to X86::PEXTRB.
168     PEXTRB,
169 
170     /// Extract a 16-bit value from a vector and zero extend it to
171     /// i32, corresponds to X86::PEXTRW.
172     PEXTRW,
173 
174     /// Insert any element of a 4 x float vector into any element
175     /// of a destination 4 x float vector.
176     INSERTPS,
177 
178     /// Insert the lower 8 bits of a 32-bit value into a vector,
179     /// corresponds to X86::PINSRB.
180     PINSRB,
181 
182     /// Insert the lower 16 bits of a 32-bit value into a vector,
183     /// corresponds to X86::PINSRW.
184     PINSRW,
185 
186     /// Shuffle 16 8-bit values within a vector.
187     PSHUFB,
188 
189     /// Compute Sum of Absolute Differences.
190     PSADBW,
191     /// Compute Double Block Packed Sum-Absolute-Differences
192     DBPSADBW,
193 
194     /// Bitwise Logical AND NOT of Packed FP values.
195     ANDNP,
196 
197     /// Blend where the selector is an immediate.
198     BLENDI,
199 
200     /// Dynamic (non-constant condition) vector blend where only the sign bits
201     /// of the condition elements are used. This is used to enforce that the
202     /// condition mask is not valid for generic VSELECT optimizations. This
203     /// is also used to implement the intrinsics.
204     /// Operands are in VSELECT order: MASK, TRUE, FALSE
205     BLENDV,
206 
207     /// Combined add and sub on an FP vector.
208     ADDSUB,
209 
210     //  FP vector ops with rounding mode.
211     FADD_RND,
212     FADDS,
213     FADDS_RND,
214     FSUB_RND,
215     FSUBS,
216     FSUBS_RND,
217     FMUL_RND,
218     FMULS,
219     FMULS_RND,
220     FDIV_RND,
221     FDIVS,
222     FDIVS_RND,
223     FMAX_SAE,
224     FMAXS_SAE,
225     FMIN_SAE,
226     FMINS_SAE,
227     FSQRT_RND,
228     FSQRTS,
229     FSQRTS_RND,
230 
231     // FP vector get exponent.
232     FGETEXP,
233     FGETEXP_SAE,
234     FGETEXPS,
235     FGETEXPS_SAE,
236     // Extract Normalized Mantissas.
237     VGETMANT,
238     VGETMANT_SAE,
239     VGETMANTS,
240     VGETMANTS_SAE,
241     // FP Scale.
242     SCALEF,
243     SCALEF_RND,
244     SCALEFS,
245     SCALEFS_RND,
246 
247     // Unsigned Integer average.
248     AVG,
249 
250     /// Integer horizontal add/sub.
251     HADD,
252     HSUB,
253 
254     /// Floating point horizontal add/sub.
255     FHADD,
256     FHSUB,
257 
258     // Detect Conflicts Within a Vector
259     CONFLICT,
260 
261     /// Floating point max and min.
262     FMAX,
263     FMIN,
264 
265     /// Commutative FMIN and FMAX.
266     FMAXC,
267     FMINC,
268 
269     /// Scalar intrinsic floating point max and min.
270     FMAXS,
271     FMINS,
272 
273     /// Floating point reciprocal-sqrt and reciprocal approximation.
274     /// Note that these typically require refinement
275     /// in order to obtain suitable precision.
276     FRSQRT,
277     FRCP,
278 
279     // AVX-512 reciprocal approximations with a little more precision.
280     RSQRT14,
281     RSQRT14S,
282     RCP14,
283     RCP14S,
284 
285     // Thread Local Storage.
286     TLSADDR,
287 
288     // Thread Local Storage. A call to get the start address
289     // of the TLS block for the current module.
290     TLSBASEADDR,
291 
292     // Thread Local Storage. A call to an OS-provided thunk at the
293     // address from an earlier relocation.
294     TLSCALL,
295 
296     // Exception Handling helpers.
297     EH_RETURN,
298 
299     // SjLj exception handling setjmp.
300     EH_SJLJ_SETJMP,
301 
302     // SjLj exception handling longjmp.
303     EH_SJLJ_LONGJMP,
304 
305     // SjLj exception handling dispatch.
306     EH_SJLJ_SETUP_DISPATCH,
307 
308     /// Tail call return. See X86TargetLowering::LowerCall for
309     /// the list of operands.
310     TC_RETURN,
311 
312     // Vector move to low scalar and zero higher vector elements.
313     VZEXT_MOVL,
314 
315     // Vector integer truncate.
316     VTRUNC,
317     // Vector integer truncate with unsigned/signed saturation.
318     VTRUNCUS,
319     VTRUNCS,
320 
321     // Masked version of the above. Used when less than a 128-bit result is
322     // produced since the mask only applies to the lower elements and can't
323     // be represented by a select.
324     // SRC, PASSTHRU, MASK
325     VMTRUNC,
326     VMTRUNCUS,
327     VMTRUNCS,
328 
329     // Vector FP extend.
330     VFPEXT,
331     VFPEXT_SAE,
332     VFPEXTS,
333     VFPEXTS_SAE,
334 
335     // Vector FP round.
336     VFPROUND,
337     VFPROUND_RND,
338     VFPROUNDS,
339     VFPROUNDS_RND,
340 
341     // Masked version of above. Used for v2f64->v4f32.
342     // SRC, PASSTHRU, MASK
343     VMFPROUND,
344 
345     // 128-bit vector logical left / right shift
346     VSHLDQ,
347     VSRLDQ,
348 
349     // Vector shift elements
350     VSHL,
351     VSRL,
352     VSRA,
353 
354     // Vector variable shift
355     VSHLV,
356     VSRLV,
357     VSRAV,
358 
359     // Vector shift elements by immediate
360     VSHLI,
361     VSRLI,
362     VSRAI,
363 
364     // Shifts of mask registers.
365     KSHIFTL,
366     KSHIFTR,
367 
368     // Bit rotate by immediate
369     VROTLI,
370     VROTRI,
371 
372     // Vector packed double/float comparison.
373     CMPP,
374 
375     // Vector integer comparisons.
376     PCMPEQ,
377     PCMPGT,
378 
379     // v8i16 Horizontal minimum and position.
380     PHMINPOS,
381 
382     MULTISHIFT,
383 
384     /// Vector comparison generating mask bits for fp and
385     /// integer signed and unsigned data types.
386     CMPM,
387     // Vector mask comparison generating mask bits for FP values.
388     CMPMM,
389     // Vector mask comparison with SAE for FP values.
390     CMPMM_SAE,
391 
392     // Arithmetic operations with FLAGS results.
393     ADD,
394     SUB,
395     ADC,
396     SBB,
397     SMUL,
398     UMUL,
399     OR,
400     XOR,
401     AND,
402 
403     // Bit field extract.
404     BEXTR,
405     BEXTRI,
406 
407     // Zero High Bits Starting with Specified Bit Position.
408     BZHI,
409 
410     // Parallel extract and deposit.
411     PDEP,
412     PEXT,
413 
414     // X86-specific multiply by immediate.
415     MUL_IMM,
416 
417     // Vector sign bit extraction.
418     MOVMSK,
419 
420     // Vector bitwise comparisons.
421     PTEST,
422 
423     // Vector packed fp sign bitwise comparisons.
424     TESTP,
425 
426     // OR/AND test for masks.
427     KORTEST,
428     KTEST,
429 
430     // ADD for masks.
431     KADD,
432 
433     // Several flavors of instructions with vector shuffle behaviors.
434     // Saturated signed/unsigned packing.
435     PACKSS,
436     PACKUS,
437     // Intra-lane alignr.
438     PALIGNR,
439     // AVX512 inter-lane alignr.
440     VALIGN,
441     PSHUFD,
442     PSHUFHW,
443     PSHUFLW,
444     SHUFP,
445     // VBMI2 Concat & Shift.
446     VSHLD,
447     VSHRD,
448     VSHLDV,
449     VSHRDV,
450     // Shuffle Packed Values at 128-bit granularity.
451     SHUF128,
452     MOVDDUP,
453     MOVSHDUP,
454     MOVSLDUP,
455     MOVLHPS,
456     MOVHLPS,
457     MOVSD,
458     MOVSS,
459     UNPCKL,
460     UNPCKH,
461     VPERMILPV,
462     VPERMILPI,
463     VPERMI,
464     VPERM2X128,
465 
466     // Variable Permute (VPERM).
467     // Res = VPERMV MaskV, V0
468     VPERMV,
469 
470     // 3-op Variable Permute (VPERMT2).
471     // Res = VPERMV3 V0, MaskV, V1
472     VPERMV3,
473 
474     // Bitwise ternary logic.
475     VPTERNLOG,
476     // Fix Up Special Packed Float32/64 values.
477     VFIXUPIMM,
478     VFIXUPIMM_SAE,
479     VFIXUPIMMS,
480     VFIXUPIMMS_SAE,
481     // Range Restriction Calculation For Packed Pairs of Float32/64 values.
482     VRANGE,
483     VRANGE_SAE,
484     VRANGES,
485     VRANGES_SAE,
486     // Reduce - Perform Reduction Transformation on scalar/packed FP.
487     VREDUCE,
488     VREDUCE_SAE,
489     VREDUCES,
490     VREDUCES_SAE,
491     // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
492     // Also used by the legacy (V)ROUND intrinsics where we mask out the
493     // scaling part of the immediate.
494     VRNDSCALE,
495     VRNDSCALE_SAE,
496     VRNDSCALES,
497     VRNDSCALES_SAE,
498     // Tests the types of packed FP values.
499     VFPCLASS,
500     // Tests the types of scalar FP values.
501     VFPCLASSS,
502 
503     // Broadcast (splat) scalar or element 0 of a vector. If the operand is
504     // a vector, this node may change the vector length as part of the splat.
505     VBROADCAST,
506     // Broadcast mask to vector.
507     VBROADCASTM,
508 
509     /// SSE4A Extraction and Insertion.
510     EXTRQI,
511     INSERTQI,
512 
513     // XOP arithmetic/logical shifts.
514     VPSHA,
515     VPSHL,
516     // XOP signed/unsigned integer comparisons.
517     VPCOM,
518     VPCOMU,
519     // XOP packed permute bytes.
520     VPPERM,
521     // XOP two source permutation.
522     VPERMIL2,
523 
524     // Vector multiply packed unsigned doubleword integers.
525     PMULUDQ,
526     // Vector multiply packed signed doubleword integers.
527     PMULDQ,
528     // Vector multiply packed signed word integers with round and scale.
529     MULHRS,
530 
531     // Multiply and Add Packed Integers.
532     VPMADDUBSW,
533     VPMADDWD,
534 
535     // AVX512IFMA multiply and add.
536     // NOTE: These are different from the instructions and perform
537     // op0 x op1 + op2.
538     VPMADD52L,
539     VPMADD52H,
540 
541     // VNNI
542     VPDPBUSD,
543     VPDPBUSDS,
544     VPDPWSSD,
545     VPDPWSSDS,
546 
547     // FMA nodes.
548     // We use the target independent ISD::FMA for the non-inverted case.
549     FNMADD,
550     FMSUB,
551     FNMSUB,
552     FMADDSUB,
553     FMSUBADD,
554 
555     // FMA with rounding mode.
556     FMADD_RND,
557     FNMADD_RND,
558     FMSUB_RND,
559     FNMSUB_RND,
560     FMADDSUB_RND,
561     FMSUBADD_RND,
562 
563     // Compress and expand.
564     COMPRESS,
565     EXPAND,
566 
567     // Bits shuffle
568     VPSHUFBITQMB,
569 
570     // Convert Unsigned/Integer to Floating-Point Value with rounding mode.
571     SINT_TO_FP_RND,
572     UINT_TO_FP_RND,
573     SCALAR_SINT_TO_FP,
574     SCALAR_UINT_TO_FP,
575     SCALAR_SINT_TO_FP_RND,
576     SCALAR_UINT_TO_FP_RND,
577 
578     // Vector float/double to signed/unsigned integer.
579     CVTP2SI,
580     CVTP2UI,
581     CVTP2SI_RND,
582     CVTP2UI_RND,
583     // Scalar float/double to signed/unsigned integer.
584     CVTS2SI,
585     CVTS2UI,
586     CVTS2SI_RND,
587     CVTS2UI_RND,
588 
589     // Vector float/double to signed/unsigned integer with truncation.
590     CVTTP2SI,
591     CVTTP2UI,
592     CVTTP2SI_SAE,
593     CVTTP2UI_SAE,
594     // Scalar float/double to signed/unsigned integer with truncation.
595     CVTTS2SI,
596     CVTTS2UI,
597     CVTTS2SI_SAE,
598     CVTTS2UI_SAE,
599 
600     // Vector signed/unsigned integer to float/double.
601     CVTSI2P,
602     CVTUI2P,
603 
604     // Masked versions of above. Used for v2f64->v4f32.
605     // SRC, PASSTHRU, MASK
606     MCVTP2SI,
607     MCVTP2UI,
608     MCVTTP2SI,
609     MCVTTP2UI,
610     MCVTSI2P,
611     MCVTUI2P,
612 
613     // Vector float to bfloat16.
614     // Convert two packed single-precision vectors to one packed BF16 vector.
615     CVTNE2PS2BF16,
616     // Convert packed single-precision data to packed BF16 data.
617     CVTNEPS2BF16,
618     // Masked version of above.
619     // SRC, PASSTHRU, MASK
620     MCVTNEPS2BF16,
621 
622     // Dot product of BF16 pairs accumulated into
623     // packed single precision.
624     DPBF16PS,
625 
626     // Save xmm argument registers to the stack, according to %al. An operator
627     // is needed so that this can be expanded with control flow.
628     VASTART_SAVE_XMM_REGS,
629 
630     // Windows's _chkstk call to do stack probing.
631     WIN_ALLOCA,
632 
633     // For allocating variable amounts of stack space when using
634     // segmented stacks. Checks if the current stacklet has enough space, and
635     // falls back to heap allocation if not.
636     SEG_ALLOCA,
637 
638     // For allocating stack space when using stack clash protector.
639     // Allocation is performed by block, and each block is probed.
640     PROBED_ALLOCA,
641 
642     // Memory barriers.
643     MEMBARRIER,
644     MFENCE,
645 
646     // Get a random integer and indicate whether it is valid in CF.
647     RDRAND,
648 
649     // Get a NIST SP800-90B & C compliant random integer and
650     // indicate whether it is valid in CF.
651     RDSEED,
652 
653     // Protection keys
654     // RDPKRU - Operand 0 is chain. Operand 1 is value for ECX.
655     // WRPKRU - Operand 0 is chain. Operand 1 is value for EDX. Operand 2 is
656     // value for ECX.
657     RDPKRU,
658     WRPKRU,
659 
660     // SSE42 string comparisons.
661     // These nodes produce 3 results: index, mask, and flags. X86ISelDAGToDAG
662     // will emit one or two instructions based on which results are used. If
663     // both flags and index/mask are used, this allows us to use a single
664     // instruction since we won't have to pick an opcode for flags. Instead we
665     // can rely on the DAG to CSE everything and decide at isel.
666     PCMPISTR,
667     PCMPESTR,
668 
669     // Test if in transactional execution.
670     XTEST,
671 
672     // ERI instructions.
673     RSQRT28,
674     RSQRT28_SAE,
675     RSQRT28S,
676     RSQRT28S_SAE,
677     RCP28,
678     RCP28_SAE,
679     RCP28S,
680     RCP28S_SAE,
681     EXP2,
682     EXP2_SAE,
683 
684     // Conversions between float and half-float.
685     CVTPS2PH,
686     CVTPH2PS,
687     CVTPH2PS_SAE,
688 
689     // Masked version of above.
690     // SRC, RND, PASSTHRU, MASK
691     MCVTPS2PH,
692 
693     // Galois Field Arithmetic Instructions
694     GF2P8AFFINEINVQB,
695     GF2P8AFFINEQB,
696     GF2P8MULB,
697 
698     // LWP insert record.
699     LWPINS,
700 
701     // User level wait
702     UMWAIT,
703     TPAUSE,
704 
705     // Enqueue Stores Instructions
706     ENQCMD,
707     ENQCMDS,
708 
709     // For avx512-vp2intersect
710     VP2INTERSECT,
711 
712     // User level interrupts - testui
713     TESTUI,
714 
715     /// X86 strict FP compare instructions.
716     STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE,
717     STRICT_FCMPS,
718 
719     // Vector packed double/float comparison.
720     STRICT_CMPP,
721 
722     /// Vector comparison generating mask bits for fp and
723     /// integer signed and unsigned data types.
724     STRICT_CMPM,
725 
726     // Vector float/double to signed/unsigned integer with truncation.
727     STRICT_CVTTP2SI,
728     STRICT_CVTTP2UI,
729 
730     // Vector FP extend.
731     STRICT_VFPEXT,
732 
733     // Vector FP round.
734     STRICT_VFPROUND,
735 
736     // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
737     // Also used by the legacy (V)ROUND intrinsics where we mask out the
738     // scaling part of the immediate.
739     STRICT_VRNDSCALE,
740 
741     // Vector signed/unsigned integer to float/double.
742     STRICT_CVTSI2P,
743     STRICT_CVTUI2P,
744 
745     // Strict FMA nodes.
746     STRICT_FNMADD,
747     STRICT_FMSUB,
748     STRICT_FNMSUB,
749 
750     // Conversions between float and half-float.
751     STRICT_CVTPS2PH,
752     STRICT_CVTPH2PS,
753 
754     // WARNING: Only add nodes here if they are strict FP nodes. Non-memory and
755     // non-strict FP nodes should be above FIRST_TARGET_STRICTFP_OPCODE.
756 
757     // Compare and swap.
758     LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
759     LCMPXCHG8_DAG,
760     LCMPXCHG16_DAG,
761     LCMPXCHG16_SAVE_RBX_DAG,
762 
763     /// LOCK-prefixed arithmetic read-modify-write instructions.
764     /// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS)
765     LADD,
766     LSUB,
767     LOR,
768     LXOR,
769     LAND,
770 
771     // Load, scalar_to_vector, and zero extend.
772     VZEXT_LOAD,
773 
774     // extract_vector_elt, store.
775     VEXTRACT_STORE,
776 
777     // scalar broadcast from memory.
778     VBROADCAST_LOAD,
779 
780     // subvector broadcast from memory.
781     SUBV_BROADCAST_LOAD,
782 
783     // Store FP control word into i16 memory.
784     FNSTCW16m,
785 
786     /// This instruction implements FP_TO_SINT with the
787     /// integer destination in memory and a FP reg source.  This corresponds
788     /// to the X86::FIST*m instructions and the rounding mode change stuff. It
789     /// has two inputs (token chain and address) and two outputs (int value
790     /// and token chain). Memory VT specifies the type to store to.
791     FP_TO_INT_IN_MEM,
792 
793     /// This instruction implements SINT_TO_FP with the
794     /// integer source in memory and an FP reg result.  This corresponds to the
795     /// X86::FILD*m instructions. It has two inputs (token chain and address)
796     /// and two outputs (FP value and token chain). The integer source type is
797     /// specified by the memory VT.
798     FILD,
799 
800     /// This instruction implements a fp->int store from FP stack
801     /// slots. This corresponds to the fist instruction. It takes a
802     /// chain operand, value to store, address, and glue. The memory VT
803     /// specifies the type to store as.
804     FIST,
805 
806     /// This instruction implements an extending load to FP stack slots.
807     /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
808     /// operand, and ptr to load from. The memory VT specifies the type to
809     /// load from.
810     FLD,
811 
812     /// This instruction implements a truncating store from FP stack
813     /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
814     /// chain operand, value to store, address, and glue. The memory VT
815     /// specifies the type to store as.
816     FST,
817 
818     /// These instructions grab the address of the next argument
819     /// from a va_list. (reads and modifies the va_list in memory)
820     VAARG_64,
821     VAARG_X32,
822 
823     // Vector truncating store with unsigned/signed saturation
824     VTRUNCSTOREUS,
825     VTRUNCSTORES,
826     // Vector truncating masked store with unsigned/signed saturation
827     VMTRUNCSTOREUS,
828     VMTRUNCSTORES,
829 
830     // X86 specific gather and scatter
831     MGATHER,
832     MSCATTER,
833 
834     // Key locker nodes that produce flags.
835     AESENC128KL,
836     AESDEC128KL,
837     AESENC256KL,
838     AESDEC256KL,
839     AESENCWIDE128KL,
840     AESDECWIDE128KL,
841     AESENCWIDE256KL,
842     AESDECWIDE256KL,
843 
844     // WARNING: Do not add anything at the end unless you want the node to
845     // have a memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all
846     // opcodes will be treated as target memory ops!
847   };
848   } // end namespace X86ISD
849 
850   /// Define some predicates that are used for node matching.
851   namespace X86 {
852     /// Returns true if Elt is a constant zero or floating point constant +0.0.
853     bool isZeroNode(SDValue Elt);
854 
855     /// Returns true if the given offset can
856     /// fit into the displacement field of the instruction.
857     bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
858                                       bool hasSymbolicDisplacement);
859 
860     /// Determines whether the callee is required to pop its
861     /// own arguments. Callee pop is necessary to support tail calls.
862     bool isCalleePop(CallingConv::ID CallingConv,
863                      bool is64Bit, bool IsVarArg, bool GuaranteeTCO);
864 
865     /// If Op is a constant whose elements are all the same constant or
866     /// undefined, return true and return the constant value in \p SplatVal.
867     /// If we have undef bits that don't cover an entire element, we treat these
868     /// as zero if AllowPartialUndefs is set, else we fail and return false.
869     bool isConstantSplat(SDValue Op, APInt &SplatVal,
870                          bool AllowPartialUndefs = true);
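
    // A minimal usage sketch (hypothetical caller, not part of this interface):
    // detect a uniform constant operand and inspect the splatted value.
    //
    //   APInt SplatVal;
    //   if (X86::isConstantSplat(N->getOperand(1), SplatVal) &&
    //       SplatVal.isAllOnesValue())
    //     ... // operand 1 is a uniform all-ones constant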
871   } // end namespace X86
872 
873   //===--------------------------------------------------------------------===//
874   //  X86 Implementation of the TargetLowering interface
875   class X86TargetLowering final : public TargetLowering {
876   public:
877     explicit X86TargetLowering(const X86TargetMachine &TM,
878                                const X86Subtarget &STI);
879 
880     unsigned getJumpTableEncoding() const override;
881     bool useSoftFloat() const override;
882 
883     void markLibCallAttributes(MachineFunction *MF, unsigned CC,
884                                ArgListTy &Args) const override;
885 
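    // Shift amounts are modeled as i8: the x86 variable-count shift and rotate
    // instructions take their count in CL, and immediate counts fit in a byte.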
886     MVT getScalarShiftAmountTy(const DataLayout &, EVT VT) const override {
887       return MVT::i8;
888     }
889 
890     const MCExpr *
891     LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
892                               const MachineBasicBlock *MBB, unsigned uid,
893                               MCContext &Ctx) const override;
894 
895     /// Returns relocation base for the given PIC jumptable.
896     SDValue getPICJumpTableRelocBase(SDValue Table,
897                                      SelectionDAG &DAG) const override;
898     const MCExpr *
899     getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
900                                  unsigned JTI, MCContext &Ctx) const override;
901 
902     /// Return the desired alignment for ByVal aggregate
903     /// function arguments in the caller parameter area. For X86, aggregates
904     /// that contain SSE vectors are placed at 16-byte boundaries while the
905     /// rest are at 4-byte boundaries.
906     unsigned getByValTypeAlignment(Type *Ty,
907                                    const DataLayout &DL) const override;
908 
909     EVT getOptimalMemOpType(const MemOp &Op,
910                             const AttributeList &FuncAttributes) const override;
911 
912     /// Returns true if it's safe to use load / store of the
913     /// specified type to expand memcpy / memset inline. This is mostly true
914     /// for all types except for some special cases. For example, on X86
915     /// targets without SSE2 f64 load / store are done with fldl / fstpl which
916     /// also does type conversion. Note the specified type doesn't have to be
917     /// legal as the hook is used before type legalization.
918     bool isSafeMemOpType(MVT VT) const override;
919 
920     /// Returns true if the target allows unaligned memory accesses of the
921     /// specified type. Returns whether it is "fast" in the last argument.
922     bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align,
923                                         MachineMemOperand::Flags Flags,
924                                         bool *Fast) const override;
925 
926     /// Provide custom lowering hooks for some operations.
927     ///
928     SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
929 
930     /// Replace the results of node with an illegal result
931     /// type with new values built out of custom code.
932     ///
933     void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
934                             SelectionDAG &DAG) const override;
935 
936     SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
937 
938     /// Return true if the target has native support for
939     /// the specified value type and it is 'desirable' to use the type for the
940     /// given node type. e.g. On x86 i16 is legal, but undesirable since i16
941     /// instruction encodings are longer and some i16 instructions are slow.
942     bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override;
943 
944     /// Return true if the target has native support for the
945     /// specified value type and it is 'desirable' to use the type. e.g. On x86
946     /// i16 is legal, but undesirable since i16 instruction encodings are longer
947     /// and some i16 instructions are slow.
948     bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override;
949 
950     /// Return the newly negated expression if the cost is not expensive, and
951     /// set \p Cost to indicate whether it is cheaper or neutral to
952     /// do the negation.
953     SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG,
954                                  bool LegalOperations, bool ForCodeSize,
955                                  NegatibleCost &Cost,
956                                  unsigned Depth) const override;
957 
958     MachineBasicBlock *
959     EmitInstrWithCustomInserter(MachineInstr &MI,
960                                 MachineBasicBlock *MBB) const override;
961 
962     /// This method returns the name of a target specific DAG node.
963     const char *getTargetNodeName(unsigned Opcode) const override;
964 
965     /// Do not merge vector stores after legalization because that may conflict
966     /// with x86-specific store splitting optimizations.
967     bool mergeStoresAfterLegalization(EVT MemVT) const override {
968       return !MemVT.isVector();
969     }
970 
971     bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
972                           const SelectionDAG &DAG) const override;
973 
974     bool isCheapToSpeculateCttz() const override;
975 
976     bool isCheapToSpeculateCtlz() const override;
977 
978     bool isCtlzFast() const override;
979 
980     bool hasBitPreservingFPLogic(EVT VT) const override {
981       return VT == MVT::f32 || VT == MVT::f64 || VT.isVector();
982     }
983 
984     bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const override {
985       // If the pair to store is a mixture of float and int values, we will
986       // save two bitwise instructions and one float-to-int instruction at
987       // the cost of one extra store. There is potentially a more
988       // significant benefit because it avoids the float->int domain switch
989       // for the input value, so it is more likely a win.
990       if ((LTy.isFloatingPoint() && HTy.isInteger()) ||
991           (LTy.isInteger() && HTy.isFloatingPoint()))
992         return true;
993       // If the pair only contains int values, we will save two bitwise
994       // instructions at the cost of one extra store (and one more store
995       // buffer entry). Since the benefit is less clear, we leave such
996       // pairs out until we have a test case proving it is a win.
997       return false;
998     }
999 
1000     bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
1001 
1002     bool hasAndNotCompare(SDValue Y) const override;
1003 
1004     bool hasAndNot(SDValue Y) const override;
1005 
1006     bool hasBitTest(SDValue X, SDValue Y) const override;
1007 
1008     bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
1009         SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
1010         unsigned OldShiftOpcode, unsigned NewShiftOpcode,
1011         SelectionDAG &DAG) const override;
1012 
1013     bool shouldFoldConstantShiftPairToMask(const SDNode *N,
1014                                            CombineLevel Level) const override;
1015 
1016     bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override;
1017 
1018     bool
1019     shouldTransformSignedTruncationCheck(EVT XVT,
1020                                          unsigned KeptBits) const override {
1021       // For vectors, we don't have a preference.
1022       if (XVT.isVector())
1023         return false;
1024 
1025       auto VTIsOk = [](EVT VT) -> bool {
1026         return VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 ||
1027                VT == MVT::i64;
1028       };
1029 
1030       // We are ok with KeptBitsVT being byte/word/dword, which is what MOVSX supports.
1031       // XVT will be larger than KeptBitsVT.
1032       MVT KeptBitsVT = MVT::getIntegerVT(KeptBits);
1033       return VTIsOk(XVT) && VTIsOk(KeptBitsVT);
1034     }
1035 
1036     bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override;
1037 
1038     bool shouldSplatInsEltVarIndex(EVT VT) const override;
1039 
1040     bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
1041       return VT.isScalarInteger();
1042     }
1043 
1044     /// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
1045     MVT hasFastEqualityCompare(unsigned NumBits) const override;
1046 
1047     /// Return the value type to use for ISD::SETCC.
1048     EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
1049                            EVT VT) const override;
1050 
1051     bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits,
1052                                       const APInt &DemandedElts,
1053                                       TargetLoweringOpt &TLO) const override;
1054 
1055     /// Determine which of the bits specified in Mask are known to be either
1056     /// zero or one and return them in the Known.Zero/Known.One bitsets.
1057     void computeKnownBitsForTargetNode(const SDValue Op,
1058                                        KnownBits &Known,
1059                                        const APInt &DemandedElts,
1060                                        const SelectionDAG &DAG,
1061                                        unsigned Depth = 0) const override;
1062 
1063     /// Determine the number of bits in the operation that are sign bits.
1064     unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
1065                                              const APInt &DemandedElts,
1066                                              const SelectionDAG &DAG,
1067                                              unsigned Depth) const override;
1068 
1069     bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op,
1070                                                  const APInt &DemandedElts,
1071                                                  APInt &KnownUndef,
1072                                                  APInt &KnownZero,
1073                                                  TargetLoweringOpt &TLO,
1074                                                  unsigned Depth) const override;
1075 
1076     bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op,
1077                                                     const APInt &DemandedElts,
1078                                                     unsigned MaskIndex,
1079                                                     TargetLoweringOpt &TLO,
1080                                                     unsigned Depth) const;
1081 
1082     bool SimplifyDemandedBitsForTargetNode(SDValue Op,
1083                                            const APInt &DemandedBits,
1084                                            const APInt &DemandedElts,
1085                                            KnownBits &Known,
1086                                            TargetLoweringOpt &TLO,
1087                                            unsigned Depth) const override;
1088 
1089     SDValue SimplifyMultipleUseDemandedBitsForTargetNode(
1090         SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
1091         SelectionDAG &DAG, unsigned Depth) const override;
1092 
1093     const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const override;
1094 
1095     SDValue unwrapAddress(SDValue N) const override;
1096 
1097     SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
1098 
1099     bool ExpandInlineAsm(CallInst *CI) const override;
1100 
1101     ConstraintType getConstraintType(StringRef Constraint) const override;
1102 
1103     /// Examine constraint string and operand type and determine a weight value.
1104     /// The operand object must already have been set up with the operand type.
1105     ConstraintWeight
1106       getSingleConstraintMatchWeight(AsmOperandInfo &info,
1107                                      const char *constraint) const override;
1108 
1109     const char *LowerXConstraint(EVT ConstraintVT) const override;
1110 
1111     /// Lower the specified operand into the Ops vector. If it is invalid, don't
1112     /// add anything to Ops. If hasMemory is true it means one of the asm
1113     /// constraints of the inline asm instruction being processed is 'm'.
1114     void LowerAsmOperandForConstraint(SDValue Op,
1115                                       std::string &Constraint,
1116                                       std::vector<SDValue> &Ops,
1117                                       SelectionDAG &DAG) const override;
1118 
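    // Map the inline-asm constraint codes "o", "v" and "X" onto their
    // InlineAsm constraint kinds; any other code falls back to the generic
    // TargetLowering handling.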
1119     unsigned
1120     getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
1121       if (ConstraintCode == "o")
1122         return InlineAsm::Constraint_o;
1123       else if (ConstraintCode == "v")
1124         return InlineAsm::Constraint_v;
1125       else if (ConstraintCode == "X")
1126         return InlineAsm::Constraint_X;
1127       return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
1128     }
1129 
1130     /// Handle lowering of flag assembly outputs.
1131     SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag,
1132                                         const SDLoc &DL,
1133                                         const AsmOperandInfo &Constraint,
1134                                         SelectionDAG &DAG) const override;
1135 
1136     /// Given a physical register constraint
1137     /// (e.g. {edx}), return the register number and the register class for the
1138     /// register.  This should only be used for C_Register constraints.  On
1139     /// error, this returns a register number of 0.
1140     std::pair<unsigned, const TargetRegisterClass *>
1141     getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
1142                                  StringRef Constraint, MVT VT) const override;
1143 
1144     /// Return true if the addressing mode represented
1145     /// by AM is legal for this target, for a load/store of the specified type.
1146     bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
1147                                Type *Ty, unsigned AS,
1148                                Instruction *I = nullptr) const override;
1149 
1150     /// Return true if the specified immediate is a legal
1151     /// icmp immediate, that is, the target has icmp instructions which can
1152     /// compare a register against the immediate without having to materialize
1153     /// the immediate into a register.
1154     bool isLegalICmpImmediate(int64_t Imm) const override;
1155 
1156     /// Return true if the specified immediate is a legal
1157     /// add immediate, that is, the target has add instructions which can
1158     /// add a register and the immediate without having to materialize
1159     /// the immediate into a register.
1160     bool isLegalAddImmediate(int64_t Imm) const override;
1161 
1162     bool isLegalStoreImmediate(int64_t Imm) const override;
1163 
1164     /// Return the cost of the scaling factor used in the addressing
1165     /// mode represented by AM for this target, for a load/store
1166     /// of the specified type.
1167     /// If the AM is supported, the return value must be >= 0.
1168     /// If the AM is not supported, it returns a negative value.
1169     int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty,
1170                              unsigned AS) const override;
1171 
1172     /// This is used to enable splatted operand transforms for vector shifts
1173     /// and vector funnel shifts.
1174     bool isVectorShiftByScalarCheap(Type *Ty) const override;
1175 
1176     /// Add x86-specific opcodes to the default list.
1177     bool isBinOp(unsigned Opcode) const override;
1178 
1179     /// Returns true if the opcode is a commutative binary operation.
1180     bool isCommutativeBinOp(unsigned Opcode) const override;
1181 
1182     /// Return true if it's free to truncate a value of
1183     /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate an i32 value in
1184     /// register EAX to i16 by referencing its sub-register AX.
1185     bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
1186     bool isTruncateFree(EVT VT1, EVT VT2) const override;
1187 
1188     bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
1189 
1190     /// Return true if any actual instruction that defines a
1191     /// value of type Ty1 implicitly zero-extends the value to Ty2 in the result
1192     /// register. This does not necessarily include registers defined in
1193     /// unknown ways, such as incoming arguments, or copies from unknown
1194     /// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this
1195     /// does not necessarily apply to truncate instructions. e.g. on x86-64,
1196     /// all instructions that define 32-bit values implicitly zero-extend the
1197     /// result out to 64 bits.
1198     bool isZExtFree(Type *Ty1, Type *Ty2) const override;
1199     bool isZExtFree(EVT VT1, EVT VT2) const override;
1200     bool isZExtFree(SDValue Val, EVT VT2) const override;
1201 
1202     bool shouldSinkOperands(Instruction *I,
1203                             SmallVectorImpl<Use *> &Ops) const override;
1204     bool shouldConvertPhiType(Type *From, Type *To) const override;
1205 
1206     /// Return true if folding a vector load into ExtVal (a sign, zero, or any
1207     /// extend node) is profitable.
1208     bool isVectorLoadExtDesirable(SDValue) const override;
1209 
1210     /// Return true if an FMA operation is faster than a pair of fmul and fadd
1211     /// instructions. fmuladd intrinsics will be expanded to FMAs when this
1212     /// method returns true, otherwise fmuladd is expanded to fmul + fadd.
1213     bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
1214                                     EVT VT) const override;
1215 
1216     /// Return true if it's profitable to narrow
1217     /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
1218     /// from i32 to i8 but not from i32 to i16.
1219     bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
1220 
1221     /// Given an intrinsic, checks whether on this target the intrinsic needs to map
1222     /// to a MemIntrinsicNode (touches memory). If this is the case, it returns
1223     /// true and stores the intrinsic information into the IntrinsicInfo that was
1224     /// passed to the function.
1225     bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
1226                             MachineFunction &MF,
1227                             unsigned Intrinsic) const override;
1228 
1229     /// Returns true if the target can instruction select the
1230     /// specified FP immediate natively. If false, the legalizer will
1231     /// materialize the FP immediate as a load from a constant pool.
1232     bool isFPImmLegal(const APFloat &Imm, EVT VT,
1233                       bool ForCodeSize) const override;
1234 
1235     /// Targets can use this to indicate that they only support *some*
1236     /// VECTOR_SHUFFLE operations, those with specific masks. By default, if a
1237     /// target supports the VECTOR_SHUFFLE node, all mask values are assumed to
1238     /// be legal.
1239     bool isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const override;
1240 
1241     /// Similar to isShuffleMaskLegal. Targets can use this to indicate if there
1242     /// is a suitable VECTOR_SHUFFLE that can be used to replace a VAND with a
1243     /// constant pool entry.
1244     bool isVectorClearMaskLegal(ArrayRef<int> Mask, EVT VT) const override;
1245 
1246     /// Returns true if lowering to a jump table is allowed.
1247     bool areJTsAllowed(const Function *Fn) const override;
1248 
1249     /// If true, then instruction selection should
1250     /// seek to shrink the FP constant of the specified type to a smaller type
1251     /// in order to save space and / or reduce runtime.
1252     bool ShouldShrinkFPConstant(EVT VT) const override {
1253       // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
1254       // expensive than a straight movsd. On the other hand, it's important to
1255       // shrink long double fp constants since fldt is very slow.
1256       return !X86ScalarSSEf64 || VT == MVT::f80;
1257     }
1258 
1259     /// Return true if we believe it is correct and profitable to reduce the
1260     /// load node to a smaller type.
1261     bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
1262                                EVT NewVT) const override;
1263 
1264     /// Return true if the specified scalar FP type is computed in an SSE
1265     /// register, not on the X87 floating point stack.
1266     bool isScalarFPTypeInSSEReg(EVT VT) const {
1267       return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
1268              (VT == MVT::f32 && X86ScalarSSEf32);   // f32 is when SSE1
1269     }
1270 
1271     /// Returns true if it is beneficial to convert a load of a constant
1272     /// to just the constant itself.
1273     bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
1274                                            Type *Ty) const override;
1275 
1276     bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override;
1277 
1278     bool convertSelectOfConstantsToMath(EVT VT) const override;
1279 
1280     bool decomposeMulByConstant(LLVMContext &Context, EVT VT,
1281                                 SDValue C) const override;
1282 
1283     /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
1284     /// with this index.
1285     bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
1286                                  unsigned Index) const override;
1287 
1288     /// Scalar ops always have equal or better analysis/performance/power than
1289     /// the vector equivalent, so this always makes sense if the scalar op is
1290     /// supported.
1291     bool shouldScalarizeBinop(SDValue) const override;
1292 
1293     /// Extract of a scalar FP value from index 0 of a vector is free.
1294     bool isExtractVecEltCheap(EVT VT, unsigned Index) const override {
1295       EVT EltVT = VT.getScalarType();
1296       return (EltVT == MVT::f32 || EltVT == MVT::f64) && Index == 0;
1297     }
1298 
1299     /// Overflow nodes should get combined/lowered to optimal instructions
1300     /// (they should allow eliminating explicit compares by getting flags from
1301     /// math ops).
1302     bool shouldFormOverflowOp(unsigned Opcode, EVT VT,
1303                               bool MathUsed) const override;
1304 
1305     bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem,
1306                                       unsigned AddrSpace) const override {
1307       // If we can replace more than 2 scalar stores, there will be a reduction
1308       // in instructions even after we add a vector constant load.
1309       return NumElem > 2;
1310     }
1311 
1312     bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
1313                                  const SelectionDAG &DAG,
1314                                  const MachineMemOperand &MMO) const override;
1315 
1316     /// Intel processors have a unified instruction and data cache
1317     const char * getClearCacheBuiltinName() const override {
1318       return nullptr; // nothing to do, move along.
1319     }
1320 
1321     Register getRegisterByName(const char* RegName, LLT VT,
1322                                const MachineFunction &MF) const override;
1323 
1324     /// If a physical register, this returns the register that receives the
1325     /// exception address on entry to an EH pad.
1326     Register
1327     getExceptionPointerRegister(const Constant *PersonalityFn) const override;
1328 
1329     /// If a physical register, this returns the register that receives the
1330     /// exception typeid on entry to a landing pad.
1331     Register
1332     getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
1333 
1334     bool needsFixedCatchObjects() const override;
1335 
1336     /// This method returns a target specific FastISel object,
1337     /// or null if the target does not support "fast" ISel.
1338     FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
1339                              const TargetLibraryInfo *libInfo) const override;
1340 
1341     /// If the target has a standard location for the stack protector cookie,
1342     /// returns the address of that location. Otherwise, returns nullptr.
1343     Value *getIRStackGuard(IRBuilder<> &IRB) const override;
1344 
1345     bool useLoadStackGuardNode() const override;
1346     bool useStackGuardXorFP() const override;
1347     void insertSSPDeclarations(Module &M) const override;
1348     Value *getSDagStackGuard(const Module &M) const override;
1349     Function *getSSPStackGuardCheck(const Module &M) const override;
1350     SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
1351                                 const SDLoc &DL) const override;
1352 
1353 
1354     /// Return the location where the target stores the SafeStack pointer,
1355     /// which on x86 may be at a fixed offset in a non-standard (segment)
1356     /// address space.
1357     Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override;
1358 
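    /// Build an X86ISD::FILD node (and its output chain) that loads an integer
    /// of type SrcVT from Pointer and converts it to a DstVT floating-point
    /// value.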
1359     std::pair<SDValue, SDValue> BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL,
1360                                           SDValue Chain, SDValue Pointer,
1361                                           MachinePointerInfo PtrInfo,
1362                                           Align Alignment,
1363                                           SelectionDAG &DAG) const;
1364 
1365     /// Customize the preferred legalization strategy for certain types.
1366     LegalizeTypeAction getPreferredVectorAction(MVT VT) const override;
1367 
1368     bool softPromoteHalfType() const override { return true; }
1369 
1370     MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
1371                                       EVT VT) const override;
1372 
1373     unsigned getNumRegistersForCallingConv(LLVMContext &Context,
1374                                            CallingConv::ID CC,
1375                                            EVT VT) const override;
1376 
1377     unsigned getVectorTypeBreakdownForCallingConv(
1378         LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1379         unsigned &NumIntermediates, MVT &RegisterVT) const override;
1380 
1381     bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
1382 
1383     bool supportSwiftError() const override;
1384 
1385     bool hasStackProbeSymbol(MachineFunction &MF) const override;
1386     bool hasInlineStackProbe(MachineFunction &MF) const override;
1387     StringRef getStackProbeSymbolName(MachineFunction &MF) const override;
1388 
1389     unsigned getStackProbeSize(MachineFunction &MF) const;
1390 
1391     bool hasVectorBlend() const override { return true; }
1392 
1393     unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
1394 
1395     /// Lower interleaved load(s) into target specific
1396     /// instructions/intrinsics.
1397     bool lowerInterleavedLoad(LoadInst *LI,
1398                               ArrayRef<ShuffleVectorInst *> Shuffles,
1399                               ArrayRef<unsigned> Indices,
1400                               unsigned Factor) const override;
1401 
1402     /// Lower interleaved store(s) into target specific
1403     /// instructions/intrinsics.
1404     bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
1405                                unsigned Factor) const override;
1406 
1407     SDValue expandIndirectJTBranch(const SDLoc& dl, SDValue Value,
1408                                    SDValue Addr, SelectionDAG &DAG)
1409                                    const override;
1410 
1411     Align getPrefLoopAlignment(MachineLoop *ML) const override;
1412 
1413   protected:
1414     std::pair<const TargetRegisterClass *, uint8_t>
1415     findRepresentativeClass(const TargetRegisterInfo *TRI,
1416                             MVT VT) const override;
1417 
1418   private:
1419     /// Keep a reference to the X86Subtarget around so that we can
1420     /// make the right decision when generating code for different targets.
1421     const X86Subtarget &Subtarget;
1422 
1423     /// Select between SSE or x87 floating point ops.
1424     /// When SSE is available, use it for f32 operations.
1425     /// When SSE2 is available, use it for f64 operations.
1426     bool X86ScalarSSEf32;
1427     bool X86ScalarSSEf64;
1428 
1429     /// A list of legal FP immediates.
1430     std::vector<APFloat> LegalFPImmediates;
1431 
1432     /// Indicate that this x86 target can instruction
1433     /// select the specified FP immediate natively.
1434     void addLegalFPImmediate(const APFloat& Imm) {
1435       LegalFPImmediates.push_back(Imm);
1436     }
1437 
1438     SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
1439                             CallingConv::ID CallConv, bool isVarArg,
1440                             const SmallVectorImpl<ISD::InputArg> &Ins,
1441                             const SDLoc &dl, SelectionDAG &DAG,
1442                             SmallVectorImpl<SDValue> &InVals,
1443                             uint32_t *RegMask) const;
1444     SDValue LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
1445                              const SmallVectorImpl<ISD::InputArg> &ArgInfo,
1446                              const SDLoc &dl, SelectionDAG &DAG,
1447                              const CCValAssign &VA, MachineFrameInfo &MFI,
1448                              unsigned i) const;
1449     SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
1450                              const SDLoc &dl, SelectionDAG &DAG,
1451                              const CCValAssign &VA,
1452                              ISD::ArgFlagsTy Flags, bool isByval) const;
1453 
1454     // Call lowering helpers.
1455 
1456     /// Check whether the call is eligible for tail call optimization. Targets
1457     /// that want to do tail call optimization should implement this function.
1458     bool IsEligibleForTailCallOptimization(SDValue Callee,
1459                                            CallingConv::ID CalleeCC,
1460                                            bool isVarArg,
1461                                            bool isCalleeStructRet,
1462                                            bool isCallerStructRet,
1463                                            Type *RetTy,
1464                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
1465                                     const SmallVectorImpl<SDValue> &OutVals,
1466                                     const SmallVectorImpl<ISD::InputArg> &Ins,
1467                                            SelectionDAG& DAG) const;
1468     SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
1469                                     SDValue Chain, bool IsTailCall,
1470                                     bool Is64Bit, int FPDiff,
1471                                     const SDLoc &dl) const;
1472 
1473     unsigned GetAlignedArgumentStackSize(unsigned StackSize,
1474                                          SelectionDAG &DAG) const;
1475 
1476     unsigned getAddressSpace() const;
1477 
1478     SDValue FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned,
1479                             SDValue &Chain) const;
1480     SDValue LRINT_LLRINTHelper(SDNode *N, SelectionDAG &DAG) const;
1481 
1482     SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
1483     SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
1484     SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
1485     SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
1486 
1487     unsigned getGlobalWrapperKind(const GlobalValue *GV = nullptr,
1488                                   const unsigned char OpFlags = 0) const;
1489     SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
1490     SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
1491     SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
1492     SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
1493     SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
1494 
1495     /// Creates target global address or external symbol nodes for calls or
1496     /// other uses.
1497     SDValue LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
1498                                   bool ForCall) const;
1499 
1500     SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
1501     SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
1502     SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
1503     SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
1504     SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const;
1505     SDValue LowerLRINT_LLRINT(SDValue Op, SelectionDAG &DAG) const;
1506     SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
1507     SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
1508     SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
1509     SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
1510     SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
1511     SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
1512     SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
1513     SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
1514     SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
1515     SDValue LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
1516     SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
1517     SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
1518     SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
1519     SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
1520     SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
1521     SDValue lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const;
1522     SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
1523     SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
1524     SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const;
1525     SDValue LowerGC_TRANSITION(SDValue Op, SelectionDAG &DAG) const;
1526     SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
1527     SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const;
1528     SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
1529     SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
1530 
1531     SDValue
1532     LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
1533                          const SmallVectorImpl<ISD::InputArg> &Ins,
1534                          const SDLoc &dl, SelectionDAG &DAG,
1535                          SmallVectorImpl<SDValue> &InVals) const override;
1536     SDValue LowerCall(CallLoweringInfo &CLI,
1537                       SmallVectorImpl<SDValue> &InVals) const override;
1538 
1539     SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
1540                         const SmallVectorImpl<ISD::OutputArg> &Outs,
1541                         const SmallVectorImpl<SDValue> &OutVals,
1542                         const SDLoc &dl, SelectionDAG &DAG) const override;
1543 
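    /// Callee-saved-register splitting is only supported for CXX_FAST_TLS
    /// functions that are also marked nounwind, per the check below.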
1544     bool supportSplitCSR(MachineFunction *MF) const override {
1545       return MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS &&
1546           MF->getFunction().hasFnAttribute(Attribute::NoUnwind);
1547     }
1548     void initializeSplitCSR(MachineBasicBlock *Entry) const override;
1549     void insertCopiesSplitCSR(
1550       MachineBasicBlock *Entry,
1551       const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
1552 
1553     bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
1554 
1555     bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
1556 
1557     EVT getTypeForExtReturn(LLVMContext &Context, EVT VT,
1558                             ISD::NodeType ExtendKind) const override;
1559 
1560     bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
1561                         bool isVarArg,
1562                         const SmallVectorImpl<ISD::OutputArg> &Outs,
1563                         LLVMContext &Context) const override;
1564 
1565     const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
1566 
1567     TargetLoweringBase::AtomicExpansionKind
1568     shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
1569     bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
1570     TargetLoweringBase::AtomicExpansionKind
1571     shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
1572 
1573     LoadInst *
1574     lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
1575 
1576     bool lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const override;
1577     bool lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const override;
1578 
1579     bool needsCmpXchgNb(Type *MemType) const;
1580 
1581     void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB,
1582                                 MachineBasicBlock *DispatchBB, int FI) const;
1583 
1584     /// Utility function to emit the low-level va_arg code for X86-64.
1585     MachineBasicBlock *
1586     EmitVAARGWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const;
1587 
1588     /// Utility function to emit the xmm reg save portion of va_start.
1589     MachineBasicBlock *
1590     EmitVAStartSaveXMMRegsWithCustomInserter(MachineInstr &BInstr,
1591                                              MachineBasicBlock *BB) const;
1592 
1593     MachineBasicBlock *EmitLoweredCascadedSelect(MachineInstr &MI1,
1594                                                  MachineInstr &MI2,
1595                                                  MachineBasicBlock *BB) const;
1596 
1597     MachineBasicBlock *EmitLoweredSelect(MachineInstr &I,
1598                                          MachineBasicBlock *BB) const;
1599 
1600     MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
1601                                            MachineBasicBlock *BB) const;
1602 
1603     MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr &MI,
1604                                             MachineBasicBlock *BB) const;
1605 
1606     MachineBasicBlock *EmitLoweredProbedAlloca(MachineInstr &MI,
1607                                                MachineBasicBlock *BB) const;
1608 
1609     MachineBasicBlock *EmitLoweredTLSAddr(MachineInstr &MI,
1610                                           MachineBasicBlock *BB) const;
1611 
1612     MachineBasicBlock *EmitLoweredTLSCall(MachineInstr &MI,
1613                                           MachineBasicBlock *BB) const;
1614 
1615     MachineBasicBlock *EmitLoweredIndirectThunk(MachineInstr &MI,
1616                                                 MachineBasicBlock *BB) const;
1617 
1618     MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI,
1619                                         MachineBasicBlock *MBB) const;
1620 
1621     void emitSetJmpShadowStackFix(MachineInstr &MI,
1622                                   MachineBasicBlock *MBB) const;
1623 
1624     MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr &MI,
1625                                          MachineBasicBlock *MBB) const;
1626 
1627     MachineBasicBlock *emitLongJmpShadowStackFix(MachineInstr &MI,
1628                                                  MachineBasicBlock *MBB) const;
1629 
1630     MachineBasicBlock *EmitSjLjDispatchBlock(MachineInstr &MI,
1631                                              MachineBasicBlock *MBB) const;
1632 
1633     /// Emit flags for the given setcc condition and operands. Also returns the
1634     /// corresponding X86 condition code constant in X86CC.
1635     SDValue emitFlagsForSetcc(SDValue Op0, SDValue Op1, ISD::CondCode CC,
1636                               const SDLoc &dl, SelectionDAG &DAG,
1637                               SDValue &X86CC) const;
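    // Illustrative call site (a sketch; LHS, RHS and the follow-up node
    // construction are hypothetical):
    //   SDValue X86CC;
    //   SDValue EFLAGS =
    //       emitFlagsForSetcc(LHS, RHS, ISD::SETLT, dl, DAG, X86CC);
    //   // ...then feed {X86CC, EFLAGS} into an X86ISD::SETCC or CMOV node.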
1638 
1639     /// Check if replacement of SQRT with RSQRT should be disabled.
1640     bool isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const override;
1641 
1642     /// Use rsqrt* to speed up sqrt calculations.
1643     SDValue getSqrtEstimate(SDValue Op, SelectionDAG &DAG, int Enabled,
1644                             int &RefinementSteps, bool &UseOneConstNR,
1645                             bool Reciprocal) const override;
1646 
1647     /// Use rcp* to speed up fdiv calculations.
1648     SDValue getRecipEstimate(SDValue Op, SelectionDAG &DAG, int Enabled,
1649                              int &RefinementSteps) const override;
1650 
1651     /// Reassociate floating point divisions into multiply by reciprocal.
1652     unsigned combineRepeatedFPDivisors() const override;
1653 
1654     SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
1655                           SmallVectorImpl<SDNode *> &Created) const override;
1656   };
1657 
1658   namespace X86 {
1659     FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
1660                              const TargetLibraryInfo *libInfo);
1661   } // end namespace X86
1662 
1663   // X86-specific Gather/Scatter nodes.
1664   // The class has the same order of operands as MaskedGatherScatterSDNode for
1665   // convenience.
1666   class X86MaskedGatherScatterSDNode : public MemIntrinsicSDNode {
1667   public:
1668     // This is intended as a utility and should never be created directly.
1669     X86MaskedGatherScatterSDNode() = delete;
1670     ~X86MaskedGatherScatterSDNode() = delete;
1671 
1672     const SDValue &getBasePtr() const { return getOperand(3); }
1673     const SDValue &getIndex()   const { return getOperand(4); }
1674     const SDValue &getMask()    const { return getOperand(2); }
1675     const SDValue &getScale()   const { return getOperand(5); }
1676 
1677     static bool classof(const SDNode *N) {
1678       return N->getOpcode() == X86ISD::MGATHER ||
1679              N->getOpcode() == X86ISD::MSCATTER;
1680     }
1681   };
1682 
1683   class X86MaskedGatherSDNode : public X86MaskedGatherScatterSDNode {
1684   public:
1685     const SDValue &getPassThru() const { return getOperand(1); }
1686 
1687     static bool classof(const SDNode *N) {
1688       return N->getOpcode() == X86ISD::MGATHER;
1689     }
1690   };
1691 
1692   class X86MaskedScatterSDNode : public X86MaskedGatherScatterSDNode {
1693   public:
1694     const SDValue &getValue() const { return getOperand(1); }
1695 
1696     static bool classof(const SDNode *N) {
1697       return N->getOpcode() == X86ISD::MSCATTER;
1698     }
1699   };
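  // Illustrative dispatch over this small hierarchy via the classof hooks
  // above (a sketch; N is a hypothetical SDNode pointer):
  //   if (const auto *G = dyn_cast<X86MaskedGatherSDNode>(N)) {
  //     SDValue PassThru = G->getPassThru();   // gather: operand #1
  //   } else if (const auto *S = dyn_cast<X86MaskedScatterSDNode>(N)) {
  //     SDValue StoredVal = S->getValue();     // scatter: operand #1
  //   }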
1700 
1701   /// Generate unpacklo/unpackhi shuffle mask.
1702   void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask, bool Lo,
1703                                bool Unary);
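  // Illustrative expected masks (a sketch, not normative), for VT = v4i32:
  //   Lo  && !Unary --> <0, 4, 1, 5>  (unpacklo of two distinct vectors)
  //   !Lo && !Unary --> <2, 6, 3, 7>  (unpackhi of two distinct vectors)
  //   Lo  &&  Unary --> <0, 0, 1, 1>  (unpacklo of a vector with itself)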
1704 
1705   /// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
1706   /// imposed by AVX; this mask is specific to the unary pattern. Example:
1707   /// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
1708   /// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
1709   void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo);
1710 
1711 } // end namespace llvm
1712 
1713 #endif // LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
1714