xref: /freebsd/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSSE.td (revision 43e29d03f416d7dda52112a29600a7c82ee1a91e)
1//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file describes the X86 SSE instruction set, defining the instructions,
10// and properties of the instructions which are needed for code generation,
11// machine code emission, and analysis.
12//
13//===----------------------------------------------------------------------===//
14
15//===----------------------------------------------------------------------===//
16// SSE 1 & 2 Instructions Classes
17//===----------------------------------------------------------------------===//
18
19/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
///
/// Each instantiation yields two isCodeGenOnly records sharing opcode `opc`:
///   rr - MRMSrcReg form with both sources in RC; marked isCommutable.
///   rm - MRMSrcMem form whose $src2 is an x86memop matched through `load`.
/// Is2Addr selects the assembly string: when set, only $src2 and $dst are
/// printed; otherwise $src1 is printed as well (three-operand spelling).
20multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
21                           RegisterClass RC, X86MemOperand x86memop,
22                           Domain d, X86FoldableSchedWrite sched,
23                           bit Is2Addr = 1> {
24let isCodeGenOnly = 1 in {
25  let isCommutable = 1 in {
26    def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
27       !if(Is2Addr,
28           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
29           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
30       [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], d>,
31       Sched<[sched]>;
32  }
  // The memory form is not inside the isCommutable scope and is scheduled
  // with the folded-load entries of `sched`.
33  def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
34       !if(Is2Addr,
35           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
36           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
37       [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>,
38       Sched<[sched.Folded, sched.ReadAfterFold]>;
39}
40}
41
42/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
///
/// Produces rr_Int (MRMSrcReg) and rm_Int (MRMSrcMem) records built on SI_Int.
/// Both are declared with hasSideEffects = 0; rm_Int additionally sets
/// mayLoad = 1 and matches its memory operand through `mem_frags`.
/// The selected result is typed as VT. Is2Addr chooses between the
/// two-operand and three-operand assembly strings, as in sse12_fp_scalar.
43multiclass sse12_fp_scalar_int<bits<8> opc,
44                               SDPatternOperator OpNode, RegisterClass RC,
45                               ValueType VT, string asm, Operand memopr,
46                               PatFrags mem_frags, Domain d,
47                               X86FoldableSchedWrite sched, bit Is2Addr = 1> {
48let hasSideEffects = 0 in {
49  def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
50       !if(Is2Addr,
51           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
52           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
53       [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], d>,
54       Sched<[sched]>;
55  let mayLoad = 1 in
56  def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
57       !if(Is2Addr,
58           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
59           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
60       [(set RC:$dst, (VT (OpNode RC:$src1, (mem_frags addr:$src2))))], d>,
61       Sched<[sched.Folded, sched.ReadAfterFold]>;
62}
63}
64
65/// sse12_fp_packed - SSE 1 & 2 packed instructions class
///
/// Produces rr (MRMSrcReg, isCommutable) and rm (MRMSrcMem, mayLoad) records
/// built on PI. The memory operand of rm is matched through `mem_frag`.
/// Is2Addr chooses between the two-operand and three-operand asm strings.
66multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
67                           RegisterClass RC, ValueType vt,
68                           X86MemOperand x86memop, PatFrag mem_frag,
69                           Domain d, X86FoldableSchedWrite sched,
70                           bit Is2Addr = 1> {
71  let isCommutable = 1 in
72    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
73       !if(Is2Addr,
74           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
75           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
76       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>,
77       Sched<[sched]>;
  // Unlike rr, the rm pattern below does not wrap the OpNode result in `vt`.
78  let mayLoad = 1 in
79    def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
80       !if(Is2Addr,
81           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
82           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
83       [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
84          d>,
85       Sched<[sched.Folded, sched.ReadAfterFold]>;
86}
87
88/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
///
/// Same rr/rm shape as sse12_fp_packed, except that the selection patterns
/// are supplied by the instantiation as `pat_rr` and `pat_rm` instead of being
/// built from an OpNode. Both records set hasSideEffects = 0; rr is also
/// isCommutable and rm is mayLoad.
89multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
90                                      string OpcodeStr, X86MemOperand x86memop,
91                                      X86FoldableSchedWrite sched,
92                                      list<dag> pat_rr, list<dag> pat_rm,
93                                      bit Is2Addr = 1> {
94  let isCommutable = 1, hasSideEffects = 0 in
95    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
96       !if(Is2Addr,
97           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
98           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
99       pat_rr, d>,
100       Sched<[sched]>;
101  let hasSideEffects = 0, mayLoad = 1 in
102  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
103       !if(Is2Addr,
104           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
105           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
106       pat_rm, d>,
107       Sched<[sched.Folded, sched.ReadAfterFold]>;
108}
109
110
111// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
112// This is expanded by ExpandPostRAPseudos.
// One zero-materializing pseudo per register class: FR16, FR32, FR64 and
// VR128 (for fp128). Each takes no inputs, has an empty asm string, and is
// marked rematerializable, as cheap as a move, and foldable as a load.
// All four are restricted to NoAVX512; the SS and F128 forms need SSE1,
// the SH and SD forms need SSE2.
113let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
114    isPseudo = 1, SchedRW = [WriteZero] in {
115  def FsFLD0SH : I<0, Pseudo, (outs FR16:$dst), (ins), "",
116                   [(set FR16:$dst, fp16imm0)]>, Requires<[HasSSE2, NoAVX512]>;
117  def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
118                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
119  def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
120                   [(set FR64:$dst, fp64imm0)]>, Requires<[HasSSE2, NoAVX512]>;
121  def FsFLD0F128 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
122                     [(set VR128:$dst, fp128imm0)]>, Requires<[HasSSE1, NoAVX512]>;
123}
124
125//===----------------------------------------------------------------------===//
126// AVX & SSE - Zero/One Vectors
127//===----------------------------------------------------------------------===//
128
129// Alias instruction that maps zero vector to pxor / xorp* for sse.
130// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
131// swizzled by ExecutionDomainFix to pxor.
132// We set canFoldAsLoad because this can be converted to a constant-pool
133// load of an all-zeros value if folding it would be beneficial.
// The pattern attached to the def covers v4f32 only; the other 128-bit
// element types need separate Pat records.
134let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
135    isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
136def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
137               [(set VR128:$dst, (v4f32 immAllZerosV))]>;
138}
139
// Select V_SET0 for an all-zeros vector of every other 128-bit type listed
// here (v16i8, v8i16, v8f16, v4i32, v2i64, v2f64) when AVX512 is unavailable.
140let Predicates = [NoAVX512] in {
141def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
142def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
143def : Pat<(v8f16 immAllZerosV), (V_SET0)>;
144def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
145def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
146def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
147}
148
149
150// The same as done above but for AVX.  The 256-bit AVX1 ISA doesn't support PI,
151// and doesn't need it because on sandy bridge the register is set to zero
152// at the rename stage without using any execution unit, so SET0PSY
153// and SET0PDY can be used for vector int instructions without penalty
// NOTE(review): SET0PSY / SET0PDY are not defined anywhere in this view; the
// only 256-bit zeroing pseudo defined here is AVX_SET0 - confirm whether the
// names above are stale.
// The pattern attached to the def covers v8i32; other 256-bit element types
// need separate Pat records.
154let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
155    isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
156def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
157                 [(set VR256:$dst, (v8i32 immAllZerosV))]>;
158}
159
// Select AVX_SET0 for an all-zeros vector of every other 256-bit type listed
// here (v32i8, v16i16, v16f16, v4i64, v8f32, v4f64) when AVX512 is unavailable.
160let Predicates = [NoAVX512] in {
161def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
162def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
163def : Pat<(v16f16 immAllZerosV), (AVX_SET0)>;
164def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
165def : Pat<(v8f32 immAllZerosV), (AVX_SET0)>;
166def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
167}
168
169// We set canFoldAsLoad because this can be converted to a constant-pool
170// load of an all-ones value if folding it would be beneficial.
// Three all-ones pseudos are defined:
//   V_SETALLONES    - 128-bit (v4i32); no predicate is attached in this block.
//   AVX1_SETALLONES - 256-bit (v8i32); only with HasAVX1Only and OptForMinSize.
//   AVX2_SETALLONES - 256-bit (v8i32); requires HasAVX2.
171let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
172    isPseudo = 1, SchedRW = [WriteZero] in {
173  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
174                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
175  let Predicates = [HasAVX1Only, OptForMinSize] in {
176  def AVX1_SETALLONES: I<0, Pseudo, (outs VR256:$dst), (ins), "",
177                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
178  }
179  let Predicates = [HasAVX2] in
180  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
181                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
182}
183
184//===----------------------------------------------------------------------===//
185// SSE 1 & 2 - Move FP Scalar Instructions
186//
187// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
188// register copies because it's a partial register update; Register-to-register
189// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
190// that the insert be implementable in terms of a copy, and, as just mentioned, we
191// don't use movss/movsd for copies.
192//===----------------------------------------------------------------------===//
193
// sse12_move_rr - register/register scalar move pair.
//
//   rr     - opcode 0x10, MRMSrcReg; carries the selection pattern for OpNode
//            and is marked isCommutable.
//   rr_REV - opcode 0x11, MRMDestReg; no pattern, isCodeGenOnly with
//            ForceDisassemble set, and tied to the rr record via FoldGenData.
// The asm string is base_opc followed by asm_opr; `Name` is only used to
// build the FoldGenData reference (Name#rr).
194multiclass sse12_move_rr<SDNode OpNode, ValueType vt, string base_opc,
195                         string asm_opr, Domain d, string Name> {
196  let isCommutable = 1 in
197  def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
198              (ins VR128:$src1, VR128:$src2),
199              !strconcat(base_opc, asm_opr),
200              [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], d>,
201              Sched<[SchedWriteFShuffle.XMM]>;
202
203  // For the disassembler
204  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
205  def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
206                  (ins VR128:$src1, VR128:$src2),
207                  !strconcat(base_opc, asm_opr), []>,
208                  Sched<[SchedWriteFShuffle.XMM]>, FoldGenData<Name#rr>;
209}
210
// sse12_move - scalar move instructions in both VEX (V#NAME) and legacy
// (NAME) encodings.
//
// For each encoding this defines the rr/rr_REV pair via sse12_move_rr and an
// `mr` store record (opcode 0x11, MRMDestMem) that stores RC:$src to
// addr:$dst. It also adds ".s"-suffixed assembler aliases that map onto the
// rr_REV records.
//   pred - feature predicate combined with NoSSE41_Or_OptForSize for the
//          legacy rr forms.
//   Name - string form of the instruction name, forwarded to sse12_move_rr.
211multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
212                      X86MemOperand x86memop, string OpcodeStr,
213                      Domain d, string Name, Predicate pred> {
214  // AVX
  // The VEX rr forms use the three-operand asm string and are only
  // selected under UseAVX together with OptForSize.
215  let Predicates = [UseAVX, OptForSize] in
216  defm V#NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
217                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d,
218                              "V"#Name>,
219                              VEX_4V, VEX_LIG, VEX_WIG;
220
221  def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
222                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
223                     [(store RC:$src, addr:$dst)], d>,
224                     VEX, VEX_LIG, Sched<[WriteFStore]>, VEX_WIG;
225  // SSE1 & 2
  // The legacy rr forms are two-address: $src1 is tied to $dst.
226  let Constraints = "$src1 = $dst" in {
227    let Predicates = [pred, NoSSE41_Or_OptForSize] in
228    defm NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
229                              "\t{$src2, $dst|$dst, $src2}", d, Name>;
230  }
231
232  def NAME#mr   : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
233                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
234                     [(store RC:$src, addr:$dst)], d>,
235                     Sched<[WriteFStore]>;
236
  // ".s" suffixed spellings resolve to the reversed (rr_REV) encodings.
237  def : InstAlias<"v"#OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
238                  (!cast<Instruction>("V"#NAME#"rr_REV")
239                   VR128:$dst, VR128:$src1, VR128:$src2), 0>;
240  def : InstAlias<OpcodeStr#".s\t{$src2, $dst|$dst, $src2}",
241                  (!cast<Instruction>(NAME#"rr_REV")
242                   VR128:$dst, VR128:$src2), 0>;
243}
244
245// Loading from memory automatically zeroing upper bits.
// Defines four load records, all opcode 0x10 / MRMSrcMem:
//   V#NAME#rm, NAME#rm         - result in VR128, matched through vzloadfrag.
//   V#NAME#rm_alt, NAME#rm_alt - result in RC, matched through mem_pat;
//                                these two are isCodeGenOnly.
246multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
247                         PatFrag mem_pat, PatFrag vzloadfrag, string OpcodeStr,
248                         Domain d> {
249  def V#NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
250                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
251                     [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
252                     VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
253  def NAME#rm   : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
254                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
255                     [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
256                     Sched<[WriteFLoad]>;
257
258  // _alt version uses FR32/FR64 register class.
259  let isCodeGenOnly = 1 in {
260  def V#NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
261                         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
262                         [(set RC:$dst, (mem_pat addr:$src))], d>,
263                         VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
264  def NAME#rm_alt   : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
265                         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
266                         [(set RC:$dst, (mem_pat addr:$src))], d>,
267                         Sched<[WriteFLoad]>;
268  }
269}
270
// Instantiate the scalar moves: MOVSS (FR32 / v4f32, XS prefix, UseSSE1) and
// MOVSD (FR64 / v2f64, XD prefix, UseSSE2). The first pair of defms creates
// the rr / rr_REV / mr records, the second pair the rm / rm_alt loads.
271defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
272                        SSEPackedSingle, "MOVSS", UseSSE1>, XS;
273defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
274                        SSEPackedDouble, "MOVSD", UseSSE2>, XD;
275
// The load forms are additionally foldable as loads and rematerializable.
276let canFoldAsLoad = 1, isReMaterializable = 1 in {
277  defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss",
278                             SSEPackedSingle>, XS;
279  defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, X86vzload64, "movsd",
280                             SSEPackedDouble>, XD;
281}
282
283// Patterns
// Under UseAVX, a scalar load placed into a vector (scalar_to_vector) is
// selected as the corresponding VMOVSS / VMOVSD load.
284let Predicates = [UseAVX] in {
285  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
286            (VMOVSSrm addr:$src)>;
287  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
288            (VMOVSDrm addr:$src)>;
289
290  // Represent the same patterns above but in the form they appear for
291  // 256-bit types
  // The 128-bit load result is placed in the sub_xmm subregister through
  // SUBREG_TO_REG.
292  def : Pat<(v8f32 (X86vzload32 addr:$src)),
293            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
294  def : Pat<(v4f64 (X86vzload64 addr:$src)),
295            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
296}
297
// X86vzmovl selection for AVX when optimizing for size: VMOVSSrr with a
// V_SET0 zero vector as its first source. The same instruction is used for
// both the f32 and i32 element types.
298let Predicates = [UseAVX, OptForSize] in {
299  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
300  // MOVSS to the lower bits.
301  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
302            (VMOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
303  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
304            (VMOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
305
306  // Move low f32 and clear high bits.
  // 256-bit inputs: extract the low xmm half, do the 128-bit VMOVSSrr, then
  // rebuild the 256-bit value with SUBREG_TO_REG.
307  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
308            (SUBREG_TO_REG (i32 0),
309             (v4f32 (VMOVSSrr (v4f32 (V_SET0)),
310              (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))), sub_xmm)>;
311  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
312            (SUBREG_TO_REG (i32 0),
313             (v4i32 (VMOVSSrr (v4i32 (V_SET0)),
314              (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
315}
316
// Legacy-SSE counterpart of the X86vzmovl selection: MOVSSrr with a V_SET0
// zero vector as its first source, for both v4f32 and v4i32.
317let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
318// Move scalar to XMM zero-extended, zeroing a VR128 then do a
319// MOVSS to the lower bits.
320def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
321          (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
322def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
323          (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
324}
325
// Legacy-SSE selection of a scalar load placed into a vector: f64 uses
// MOVSDrm (UseSSE2), f32 uses MOVSSrm (UseSSE1).
326let Predicates = [UseSSE2] in
327def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
328          (MOVSDrm addr:$src)>;
329
330let Predicates = [UseSSE1] in
331def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
332          (MOVSSrm addr:$src)>;
333
334//===----------------------------------------------------------------------===//
335// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
336//===----------------------------------------------------------------------===//
337
// sse12_mov_packed - packed FP move, register and load forms.
//
//   rr - MRMSrcReg register copy; no selection pattern, hasSideEffects = 0
//        and isMoveReg = 1. Scheduled with sched.RR.
//   rm - MRMSrcMem load matched through ld_frag; foldable as a load and
//        rematerializable. Scheduled with sched.RM.
338multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
339                            X86MemOperand x86memop, PatFrag ld_frag,
340                            string asm, Domain d,
341                            X86SchedWriteMoveLS sched> {
342let hasSideEffects = 0, isMoveReg = 1 in
343  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
344              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
345           Sched<[sched.RR]>;
346let canFoldAsLoad = 1, isReMaterializable = 1 in
347  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
348              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
349                   [(set RC:$dst, (ld_frag addr:$src))], d>,
350           Sched<[sched.RM]>;
351}
352
// VEX-encoded packed moves, available with AVX when VLX is not. Aligned forms
// use opcode 0x28 with the alignedload fragments, unaligned forms use opcode
// 0x10 with the plain load fragments. The first group operates on VR128,
// the second (Y suffix, VEX_L) on VR256.
// NOTE(review): the asm strings passed here have no "v" prefix; presumably the
// VEX modifier or the PI class adds it - confirm in the format definitions.
353let Predicates = [HasAVX, NoVLX] in {
354defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
355                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
356                                PS, VEX, VEX_WIG;
357defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
358                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
359                                PD, VEX, VEX_WIG;
360defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
361                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
362                                PS, VEX, VEX_WIG;
363defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
364                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
365                                PD, VEX, VEX_WIG;
366
367defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps",
368                                 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
369                                 PS, VEX, VEX_L, VEX_WIG;
370defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd",
371                                 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
372                                 PD, VEX, VEX_L, VEX_WIG;
373defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups",
374                                 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
375                                 PS, VEX, VEX_L, VEX_WIG;
376defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
377                                 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
378                                 PD, VEX, VEX_L, VEX_WIG;
379}
380
// Legacy-encoded packed moves on VR128. The single-precision forms (PS)
// require UseSSE1, the double-precision forms (PD) require UseSSE2.
// Opcode 0x28 is the aligned form, 0x10 the unaligned form.
381let Predicates = [UseSSE1] in {
382defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
383                               SSEPackedSingle, SchedWriteFMoveLS.XMM>,
384                               PS;
385defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
386                               SSEPackedSingle, SchedWriteFMoveLS.XMM>,
387                               PS;
388}
389let Predicates = [UseSSE2] in {
390defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
391                               SSEPackedDouble, SchedWriteFMoveLS.XMM>,
392                               PD;
393defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
394                               SSEPackedDouble, SchedWriteFMoveLS.XMM>,
395                               PD;
396}
397
// VEX-encoded packed FP stores (MRMDestMem). Aligned forms use opcode 0x29
// and match `alignedstore`; unaligned forms use opcode 0x11 and match `store`.
// NOTE(review): the asm strings have no "v" prefix; presumably VPSI / VPDI
// prepend it - confirm in the instruction format definitions.
398let Predicates = [HasAVX, NoVLX]  in {
// 128-bit stores from VR128.
399let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
400def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
401                   "movaps\t{$src, $dst|$dst, $src}",
402                   [(alignedstore (v4f32 VR128:$src), addr:$dst)]>,
403                   VEX, VEX_WIG;
404def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
405                   "movapd\t{$src, $dst|$dst, $src}",
406                   [(alignedstore (v2f64 VR128:$src), addr:$dst)]>,
407                   VEX, VEX_WIG;
408def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
409                   "movups\t{$src, $dst|$dst, $src}",
410                   [(store (v4f32 VR128:$src), addr:$dst)]>,
411                   VEX, VEX_WIG;
412def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
413                   "movupd\t{$src, $dst|$dst, $src}",
414                   [(store (v2f64 VR128:$src), addr:$dst)]>,
415                   VEX, VEX_WIG;
416} // SchedRW
417
// 256-bit stores from VR256 (VEX_L).
418let SchedRW = [SchedWriteFMoveLS.YMM.MR] in {
419def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
420                   "movaps\t{$src, $dst|$dst, $src}",
421                   [(alignedstore (v8f32 VR256:$src), addr:$dst)]>,
422                   VEX, VEX_L, VEX_WIG;
423def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
424                   "movapd\t{$src, $dst|$dst, $src}",
425                   [(alignedstore (v4f64 VR256:$src), addr:$dst)]>,
426                   VEX, VEX_L, VEX_WIG;
427def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
428                   "movups\t{$src, $dst|$dst, $src}",
429                   [(store (v8f32 VR256:$src), addr:$dst)]>,
430                   VEX, VEX_L, VEX_WIG;
431def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
432                   "movupd\t{$src, $dst|$dst, $src}",
433                   [(store (v4f64 VR256:$src), addr:$dst)]>,
434                   VEX, VEX_L, VEX_WIG;
435} // SchedRW
436} // Predicate
437
438// For disassembler
// Reversed-encoding (MRMDestReg) register copies for the VEX packed moves.
// They carry no selection pattern, are isCodeGenOnly with ForceDisassemble
// set, and each is tied to its forward-encoded rr record via FoldGenData.
// Note that no Predicates are attached to this block.
439let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
440    isMoveReg = 1 in {
441let SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
442  def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
443                          (ins VR128:$src),
444                          "movaps\t{$src, $dst|$dst, $src}", []>,
445                          VEX, VEX_WIG, FoldGenData<"VMOVAPSrr">;
446  def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
447                           (ins VR128:$src),
448                           "movapd\t{$src, $dst|$dst, $src}", []>,
449                           VEX, VEX_WIG, FoldGenData<"VMOVAPDrr">;
450  def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
451                           (ins VR128:$src),
452                           "movups\t{$src, $dst|$dst, $src}", []>,
453                           VEX, VEX_WIG, FoldGenData<"VMOVUPSrr">;
454  def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
455                           (ins VR128:$src),
456                           "movupd\t{$src, $dst|$dst, $src}", []>,
457                           VEX, VEX_WIG, FoldGenData<"VMOVUPDrr">;
458} // SchedRW
459
460let SchedRW = [SchedWriteFMoveLS.YMM.RR] in {
461  def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
462                            (ins VR256:$src),
463                            "movaps\t{$src, $dst|$dst, $src}", []>,
464                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPSYrr">;
465  def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
466                            (ins VR256:$src),
467                            "movapd\t{$src, $dst|$dst, $src}", []>,
468                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPDYrr">;
469  def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
470                            (ins VR256:$src),
471                            "movups\t{$src, $dst|$dst, $src}", []>,
472                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPSYrr">;
473  def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
474                            (ins VR256:$src),
475                            "movupd\t{$src, $dst|$dst, $src}", []>,
476                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPDYrr">;
477} // SchedRW
478} // isCodeGenOnly, ForceDisassemble, hasSideEffects, isMoveReg
479
480// Reversed version with ".s" suffix for GAS compatibility.
// Each alias maps a ".s" mnemonic onto the matching rr_REV record; the same
// mnemonic is listed once for VR128 operands and once for VR256 operands.
// The trailing 0 is the final InstAlias argument for every entry.
481def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
482                (VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
483def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
484                (VMOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
485def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
486                (VMOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
487def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
488                (VMOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
489def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
490                (VMOVAPSYrr_REV VR256:$dst, VR256:$src), 0>;
491def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
492                (VMOVAPDYrr_REV VR256:$dst, VR256:$src), 0>;
493def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
494                (VMOVUPSYrr_REV VR256:$dst, VR256:$src), 0>;
495def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
496                (VMOVUPDYrr_REV VR256:$dst, VR256:$src), 0>;
497
// Legacy-encoded 128-bit packed FP stores (MRMDestMem). Aligned forms use
// opcode 0x29 and match `alignedstore`; unaligned forms use opcode 0x11 and
// match `store`. No Predicates are attached in this block.
498let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
499def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
500                   "movaps\t{$src, $dst|$dst, $src}",
501                   [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
502def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
503                   "movapd\t{$src, $dst|$dst, $src}",
504                   [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
505def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
506                   "movups\t{$src, $dst|$dst, $src}",
507                   [(store (v4f32 VR128:$src), addr:$dst)]>;
508def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
509                   "movupd\t{$src, $dst|$dst, $src}",
510                   [(store (v2f64 VR128:$src), addr:$dst)]>;
511} // SchedRW
512
513// For disassembler
// Reversed-encoding (MRMDestReg) register copies for the legacy packed moves.
// They carry no selection pattern, are isCodeGenOnly with ForceDisassemble
// set, and each is tied to its forward-encoded rr record via FoldGenData.
514let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
515    isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
516  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
517                         "movaps\t{$src, $dst|$dst, $src}", []>,
518                         FoldGenData<"MOVAPSrr">;
519  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
520                         "movapd\t{$src, $dst|$dst, $src}", []>,
521                         FoldGenData<"MOVAPDrr">;
522  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
523                         "movups\t{$src, $dst|$dst, $src}", []>,
524                         FoldGenData<"MOVUPSrr">;
525  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
526                         "movupd\t{$src, $dst|$dst, $src}", []>,
527                         FoldGenData<"MOVUPDrr">;
528}
529
530// Reversed version with ".s" suffix for GAS compatibility.
// Each alias maps a ".s" mnemonic onto the matching legacy rr_REV record.
531def : InstAlias<"movaps.s\t{$src, $dst|$dst, $src}",
532                (MOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
533def : InstAlias<"movapd.s\t{$src, $dst|$dst, $src}",
534                (MOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
535def : InstAlias<"movups.s\t{$src, $dst|$dst, $src}",
536                (MOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
537def : InstAlias<"movupd.s\t{$src, $dst|$dst, $src}",
538                (MOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
539
// Extra load/store selection patterns for AVX without VLX. Every type listed
// here is routed to the single-precision packed moves: aligned accesses to
// VMOVAPS*, all other accesses to VMOVUPS*.
540let Predicates = [HasAVX, NoVLX] in {
541  // 256-bit load/store need to use floating point load/store in case we don't
542  // have AVX2. Execution domain fixing will convert to integer if AVX2 is
543  // available and changing the domain is beneficial.
544  def : Pat<(alignedloadv4i64 addr:$src),
545            (VMOVAPSYrm addr:$src)>;
546  def : Pat<(alignedloadv8i32 addr:$src),
547            (VMOVAPSYrm addr:$src)>;
548  def : Pat<(alignedloadv16i16 addr:$src),
549            (VMOVAPSYrm addr:$src)>;
550  def : Pat<(alignedloadv32i8 addr:$src),
551            (VMOVAPSYrm addr:$src)>;
552  def : Pat<(loadv4i64 addr:$src),
553            (VMOVUPSYrm addr:$src)>;
554  def : Pat<(loadv8i32 addr:$src),
555            (VMOVUPSYrm addr:$src)>;
556  def : Pat<(loadv16i16 addr:$src),
557            (VMOVUPSYrm addr:$src)>;
558  def : Pat<(loadv32i8 addr:$src),
559            (VMOVUPSYrm addr:$src)>;
560
  // 256-bit integer stores.
561  def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
562            (VMOVAPSYmr addr:$dst, VR256:$src)>;
563  def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
564            (VMOVAPSYmr addr:$dst, VR256:$src)>;
565  def : Pat<(alignedstore (v16i16 VR256:$src), addr:$dst),
566            (VMOVAPSYmr addr:$dst, VR256:$src)>;
567  def : Pat<(alignedstore (v32i8 VR256:$src), addr:$dst),
568            (VMOVAPSYmr addr:$dst, VR256:$src)>;
569  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
570            (VMOVUPSYmr addr:$dst, VR256:$src)>;
571  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
572            (VMOVUPSYmr addr:$dst, VR256:$src)>;
573  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
574            (VMOVUPSYmr addr:$dst, VR256:$src)>;
575  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
576            (VMOVUPSYmr addr:$dst, VR256:$src)>;
577
  // 128-bit half-precision (v8f16) and bfloat (v8bf16) loads and stores.
578  def : Pat<(alignedloadv8f16 addr:$src),
579            (VMOVAPSrm addr:$src)>;
580  def : Pat<(alignedloadv8bf16 addr:$src),
581            (VMOVAPSrm addr:$src)>;
582  def : Pat<(loadv8f16 addr:$src),
583            (VMOVUPSrm addr:$src)>;
584  def : Pat<(loadv8bf16 addr:$src),
585            (VMOVUPSrm addr:$src)>;
586  def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
587            (VMOVAPSmr addr:$dst, VR128:$src)>;
588  def : Pat<(alignedstore (v8bf16 VR128:$src), addr:$dst),
589            (VMOVAPSmr addr:$dst, VR128:$src)>;
590  def : Pat<(store (v8f16 VR128:$src), addr:$dst),
591            (VMOVUPSmr addr:$dst, VR128:$src)>;
592  def : Pat<(store (v8bf16 VR128:$src), addr:$dst),
593            (VMOVUPSmr addr:$dst, VR128:$src)>;
594
  // 256-bit half-precision (v16f16) and bfloat (v16bf16) loads and stores.
595  def : Pat<(alignedloadv16f16 addr:$src),
596            (VMOVAPSYrm addr:$src)>;
597  def : Pat<(alignedloadv16bf16 addr:$src),
598            (VMOVAPSYrm addr:$src)>;
599  def : Pat<(loadv16f16 addr:$src),
600            (VMOVUPSYrm addr:$src)>;
601  def : Pat<(loadv16bf16 addr:$src),
602            (VMOVUPSYrm addr:$src)>;
603  def : Pat<(alignedstore (v16f16 VR256:$src), addr:$dst),
604            (VMOVAPSYmr addr:$dst, VR256:$src)>;
605  def : Pat<(alignedstore (v16bf16 VR256:$src), addr:$dst),
606            (VMOVAPSYmr addr:$dst, VR256:$src)>;
607  def : Pat<(store (v16f16 VR256:$src), addr:$dst),
608            (VMOVUPSYmr addr:$dst, VR256:$src)>;
609  def : Pat<(store (v16bf16 VR256:$src), addr:$dst),
610            (VMOVUPSYmr addr:$dst, VR256:$src)>;
611}
612
613// Use movaps / movups for SSE integer load / store (one byte shorter).
614// The instructions selected below are then converted to MOVDQA/MOVDQU
615// during the SSE domain pass.
// Covers v2i64, v4i32, v8i16 and v16i8: aligned accesses select MOVAPS,
// all other accesses select MOVUPS.
616let Predicates = [UseSSE1] in {
617  def : Pat<(alignedloadv2i64 addr:$src),
618            (MOVAPSrm addr:$src)>;
619  def : Pat<(alignedloadv4i32 addr:$src),
620            (MOVAPSrm addr:$src)>;
621  def : Pat<(alignedloadv8i16 addr:$src),
622            (MOVAPSrm addr:$src)>;
623  def : Pat<(alignedloadv16i8 addr:$src),
624            (MOVAPSrm addr:$src)>;
625  def : Pat<(loadv2i64 addr:$src),
626            (MOVUPSrm addr:$src)>;
627  def : Pat<(loadv4i32 addr:$src),
628            (MOVUPSrm addr:$src)>;
629  def : Pat<(loadv8i16 addr:$src),
630            (MOVUPSrm addr:$src)>;
631  def : Pat<(loadv16i8 addr:$src),
632            (MOVUPSrm addr:$src)>;
633
634  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
635            (MOVAPSmr addr:$dst, VR128:$src)>;
636  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
637            (MOVAPSmr addr:$dst, VR128:$src)>;
638  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
639            (MOVAPSmr addr:$dst, VR128:$src)>;
640  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
641            (MOVAPSmr addr:$dst, VR128:$src)>;
642  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
643            (MOVUPSmr addr:$dst, VR128:$src)>;
644  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
645            (MOVUPSmr addr:$dst, VR128:$src)>;
646  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
647            (MOVUPSmr addr:$dst, VR128:$src)>;
648  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
649            (MOVUPSmr addr:$dst, VR128:$src)>;
650}
651
// v8f16 loads and stores, aligned and unaligned, for SSE2 targets.
let Predicates = [UseSSE2] in {
  def : Pat<(alignedloadv8f16 addr:$src), (MOVAPSrm addr:$src)>;
  def : Pat<(loadv8f16 addr:$src), (MOVUPSrm addr:$src)>;

  def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8f16 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
}
662
663//===----------------------------------------------------------------------===//
664// SSE 1 & 2 - Move Low packed FP Instructions
665//===----------------------------------------------------------------------===//
666
/// sse12_mov_hilo_packed_base - Memory-source PSrm / PDrm forms. The
/// mnemonic is base_opc followed by "s" or "d", then the operand string
/// asm_opr.
multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDPatternOperator pdnode,
                                      string base_opc, string asm_opr> {
  // The PS form has no pattern because it needs to be special cased between
  // high and low, so its load behaviour is spelled out explicitly.
  let hasSideEffects = 0, mayLoad = 1 in {
    def PSrm : PI<opc, MRMSrcMem,
                  (outs VR128:$dst),
                  (ins VR128:$src1, f64mem:$src2),
                  base_opc # "s" # asm_opr,
                  [], SSEPackedSingle>,
               PS,
               Sched<[SchedWriteFShuffle.XMM.Folded,
                      SchedWriteFShuffle.XMM.ReadAfterFold]>;
  }

  // The PD form matches pdnode applied to $src1 and an f64 load that has been
  // placed into a vector with scalar_to_vector.
  def PDrm : PI<opc, MRMSrcMem,
                (outs VR128:$dst),
                (ins VR128:$src1, f64mem:$src2),
                base_opc # "d" # asm_opr,
                [(set VR128:$dst,
                      (v2f64 (pdnode VR128:$src1,
                                     (scalar_to_vector
                                       (loadf64 addr:$src2)))))],
                SSEPackedDouble>,
             PD,
             Sched<[SchedWriteFShuffle.XMM.Folded,
                    SchedWriteFShuffle.XMM.ReadAfterFold]>;
}
685
/// sse12_mov_hilo_packed - Instantiates sse12_mov_hilo_packed_base twice:
/// a V-prefixed, three-operand form under UseAVX and a two-operand form whose
/// $src1 is tied to $dst.
multiclass sse12_mov_hilo_packed<bits<8>opc, SDPatternOperator pdnode,
                                 string base_opc> {
  // VEX form: separate $src1 and $dst operands.
  let Predicates = [UseAVX] in {
    defm V#NAME : sse12_mov_hilo_packed_base<
                      opc, pdnode, base_opc,
                      "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
                  VEX_4V, VEX_WIG;
  }

  // Legacy form: destination doubles as the first source.
  let Constraints = "$src1 = $dst" in {
    defm NAME : sse12_mov_hilo_packed_base<
                    opc, pdnode, base_opc,
                    "\t{$src2, $dst|$dst, $src2}">;
  }
}
697
// Load forms (opcode 0x12); the PD variant is matched through X86Movsd.
defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;

// Store forms (opcode 0x13), all scheduled as WriteFStore.
let SchedRW = [WriteFStore] in {
  let Predicates = [UseAVX] in {
    // The PS store has no pattern, so mayStore / hasSideEffects are explicit.
    let mayStore = 1, hasSideEffects = 0 in {
      def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs),
                           (ins f64mem:$dst, VR128:$src),
                           "movlps\t{$src, $dst|$dst, $src}",
                           []>,
                      VEX, VEX_WIG;
    }
    // The PD store matches a store of element 0 of a v2f64.
    def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs),
                         (ins f64mem:$dst, VR128:$src),
                         "movlpd\t{$src, $dst|$dst, $src}",
                         [(store (f64 (extractelt (v2f64 VR128:$src),
                                                  (iPTR 0))),
                                 addr:$dst)]>,
                    VEX, VEX_WIG;
  } // Predicates = [UseAVX]

  let mayStore = 1, hasSideEffects = 0 in {
    def MOVLPSmr : PSI<0x13, MRMDestMem, (outs),
                       (ins f64mem:$dst, VR128:$src),
                       "movlps\t{$src, $dst|$dst, $src}",
                       []>;
  }
  def MOVLPDmr : PDI<0x13, MRMDestMem, (outs),
                     (ins f64mem:$dst, VR128:$src),
                     "movlpd\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt (v2f64 VR128:$src),
                                              (iPTR 0))),
                             addr:$dst)]>;
} // SchedRW = [WriteFStore]
722
let Predicates = [UseSSE1] in {
  // These shufp patterns let SSE1-only targets select MOVLPS; with SSE2 a
  // movsd or blend is produced instead of shufp. Only 64 bits are loaded, so
  // an aligned load is not required. (i8 -28) is the immediate 0xE4.
  def : Pat<(X86Shufp (v4f32 (simple_load addr:$src2)),
                      VR128:$src1,
                      (i8 -28)),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)),
                      VR128:$src1,
                      (i8 -28)),
            (MOVLPSrm VR128:$src1, addr:$src2)>;

  // A zero-extending 64-bit load is a MOVLPS into a zeroed register.
  def : Pat<(v4f32 (X86vzload64 addr:$src)),
            (MOVLPSrm (v4f32 (V_SET0)), addr:$src)>;

  // 64-bit extract-and-store of a v4f32.
  def : Pat<(X86vextractstore64 (v4f32 VR128:$src), addr:$dst),
            (MOVLPSmr addr:$dst, VR128:$src)>;
}
738
739//===----------------------------------------------------------------------===//
740// SSE 1 & 2 - Move Hi packed FP Instructions
741//===----------------------------------------------------------------------===//
742
// Load forms (opcode 0x16); the PD variant is matched through X86Unpckl.
defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">;

// Store forms (opcode 0x17), all scheduled as WriteFStore.
let SchedRW = [WriteFStore] in {
  // v2f64 extract element 1 is always custom lowered to unpack high to low
  // and extract element 0 so the non-store version isn't too horrible.
  let Predicates = [UseAVX] in {
    // The PS store has no pattern, so mayStore / hasSideEffects are explicit.
    let mayStore = 1, hasSideEffects = 0 in {
      def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs),
                           (ins f64mem:$dst, VR128:$src),
                           "movhps\t{$src, $dst|$dst, $src}",
                           []>,
                      VEX, VEX_WIG;
    }
    def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs),
                         (ins f64mem:$dst, VR128:$src),
                         "movhpd\t{$src, $dst|$dst, $src}",
                         [(store (f64 (extractelt
                                        (v2f64 (X86Unpckh VR128:$src,
                                                          VR128:$src)),
                                        (iPTR 0))),
                                 addr:$dst)]>,
                    VEX, VEX_WIG;
  } // Predicates = [UseAVX]

  let mayStore = 1, hasSideEffects = 0 in {
    def MOVHPSmr : PSI<0x17, MRMDestMem, (outs),
                       (ins f64mem:$dst, VR128:$src),
                       "movhps\t{$src, $dst|$dst, $src}",
                       []>;
  }
  def MOVHPDmr : PDI<0x17, MRMDestMem, (outs),
                     (ins f64mem:$dst, VR128:$src),
                     "movhpd\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt
                                    (v2f64 (X86Unpckh VR128:$src,
                                                      VR128:$src)),
                                    (iPTR 0))),
                             addr:$dst)]>;
} // SchedRW = [WriteFStore]
769
let Predicates = [UseAVX] in {
  // MOVHPD: unpack-low with a zero-extending 64-bit load folds into the
  // load form.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                              (X86vzload64 addr:$src2))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;

  // MOVHPD: storing element 0 of a VPERMILPI with immediate 1 selects the
  // store form.
  def : Pat<(store (f64 (extractelt (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
                                    (iPTR 0))),
                   addr:$dst),
            (VMOVHPDmr addr:$dst, VR128:$src)>;

  // MOVLPD: movsd with a zero-extending 64-bit load folds into the load form.
  def : Pat<(v2f64 (X86Movsd VR128:$src1,
                             (X86vzload64 addr:$src2))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;
}
784
let Predicates = [UseSSE1] in {
  // These patterns let SSE1-only targets select MOVHPS; with SSE2 a movsd or
  // blend is produced instead of shufp. Only 64 bits are loaded, so an
  // aligned load is not required.
  def : Pat<(X86Movlhps VR128:$src1,
                        (v4f32 (simple_load addr:$src2))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Movlhps VR128:$src1,
                        (v4f32 (X86vzload64 addr:$src2))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;

  // 64-bit extract-and-store of movhlps(src, src) selects the store form.
  def : Pat<(X86vextractstore64
              (v4f32 (X86Movhlps VR128:$src, VR128:$src)),
              addr:$dst),
            (MOVHPSmr addr:$dst, VR128:$src)>;
}
798
let Predicates = [UseSSE2] in {
  // MOVHPD: unpack-low with a zero-extending 64-bit load folds into the
  // load form.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                              (X86vzload64 addr:$src2))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;

  // MOVHPD: storing element 0 of shufp(src, src, 1) selects the store form.
  def : Pat<(store (f64 (extractelt
                          (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
                          (iPTR 0))),
                   addr:$dst),
            (MOVHPDmr addr:$dst, VR128:$src)>;

  // MOVLPD: movsd with a zero-extending 64-bit load folds into the load form.
  def : Pat<(v2f64 (X86Movsd VR128:$src1,
                             (X86vzload64 addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
}
813
// Load into the low bits from a full vector with MOVLPD, unless BLENDPD can
// be used instead.
let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in {
  def : Pat<(X86Movsd VR128:$src1,
                      (v2f64 (simple_load addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
}
820
821//===----------------------------------------------------------------------===//
822// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
823//===----------------------------------------------------------------------===//
824
// VEX register-register MOVLHPS / MOVHLPS (three-operand forms).
let Predicates = [UseAVX] in {
  def VMOVLHPSrr : VPSI<0x16, MRMSrcReg,
                        (outs VR128:$dst),
                        (ins VR128:$src1, VR128:$src2),
                        "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                              (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
                   VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG;

  // Only the HLPS form is marked commutable and not memory-foldable.
  let isCommutable = 1 in {
    def VMOVHLPSrr : VPSI<0x12, MRMSrcReg,
                          (outs VR128:$dst),
                          (ins VR128:$src1, VR128:$src2),
                          "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                          [(set VR128:$dst,
                                (v4f32 (X86Movhlps VR128:$src1,
                                                   VR128:$src2)))]>,
                     VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG,
                     NotMemoryFoldable;
  }
}
// Legacy register-register MOVLHPS / MOVHLPS; $src1 is tied to $dst.
let Constraints = "$src1 = $dst" in {
  def MOVLHPSrr : PSI<0x16, MRMSrcReg,
                      (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                            (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
                  Sched<[SchedWriteFShuffle.XMM]>;

  // Only the HLPS form is marked commutable and not memory-foldable.
  let isCommutable = 1 in {
    def MOVHLPSrr : PSI<0x12, MRMSrcReg,
                        (outs VR128:$dst),
                        (ins VR128:$src1, VR128:$src2),
                        "movhlps\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst,
                              (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
                    Sched<[SchedWriteFShuffle.XMM]>, NotMemoryFoldable;
  }
}
856
857//===----------------------------------------------------------------------===//
858// SSE 1 & 2 - Conversion Instructions
859//===----------------------------------------------------------------------===//
860
/// sse12_cvt_s - Scalar conversion with a register (rr) and a memory (rm)
/// form. OpNode is applied to the source; the rr form prints with `asm`, the
/// rm form with `mem`. Int2Fpu is the extra SchedRead attached to the rr
/// form and defaults to ReadDefault.
multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                     SDPatternOperator OpNode, X86MemOperand x86memop, PatFrag ld_frag,
                     string asm, string mem, X86FoldableSchedWrite sched,
                     Domain d,
                     SchedRead Int2Fpu = ReadDefault> {
  let ExeDomain = d in {
    // Register source.
    def rr : SI<opc, MRMSrcReg,
                (outs DstRC:$dst), (ins SrcRC:$src),
                asm # "\t{$src, $dst|$dst, $src}",
                [(set DstRC:$dst, (OpNode SrcRC:$src))]>,
             Sched<[sched, Int2Fpu]>;

    // Memory source, loaded through ld_frag.
    def rm : SI<opc, MRMSrcMem,
                (outs DstRC:$dst), (ins x86memop:$src),
                !strconcat(mem, "\t{$src, $dst|$dst, $src}"),
                [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
             Sched<[sched.Folded]>;
  }
}
877
/// sse12_cvt_p - Packed signed-integer to floating-point conversion
/// (any_sint_to_fp from SrcTy to DstTy) with register and memory forms.
/// Both forms use MXCSR and may raise an FP exception.
multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
                       ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
                       string asm, Domain d, X86FoldableSchedWrite sched> {
  let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in {
    // Register source.
    def rr : I<opc, MRMSrcReg,
               (outs RC:$dst), (ins RC:$src),
               asm,
               [(set RC:$dst,
                     (DstTy (any_sint_to_fp (SrcTy RC:$src))))],
               d>,
             Sched<[sched]>;

    // Memory source, loaded through ld_frag.
    let mayLoad = 1 in {
      def rm : I<opc, MRMSrcMem,
                 (outs RC:$dst), (ins x86memop:$src),
                 asm,
                 [(set RC:$dst,
                       (DstTy (any_sint_to_fp (SrcTy (ld_frag addr:$src)))))],
                 d>,
               Sched<[sched.Folded]>;
    }
  }
}
892
/// sse12_vcvt_avx - Pattern-less three-operand AVX scalar conversion with
/// register and memory forms. The memory form's mnemonic is `asm` followed
/// by the suffix `mem` wrapped in braces.
multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          X86MemOperand x86memop, string asm, string mem,
                          X86FoldableSchedWrite sched, Domain d> {
  let hasSideEffects = 0, Predicates = [UseAVX], ExeDomain = d in {
    // Register source.
    def rr : SI<opc, MRMSrcReg,
                (outs DstRC:$dst),
                (ins DstRC:$src1, SrcRC:$src),
                asm # "\t{$src, $src1, $dst|$dst, $src1, $src}",
                []>,
             Sched<[sched, ReadDefault, ReadInt2Fpu]>;

    // Memory source; mayLoad is explicit because there is no pattern.
    let mayLoad = 1 in {
      def rm : SI<opc, MRMSrcMem,
                  (outs DstRC:$dst),
                  (ins DstRC:$src1, x86memop:$src),
                  !strconcat(asm, "{", mem, "}",
                             "\t{$src, $src1, $dst|$dst, $src1, $src}"),
                  []>,
               Sched<[sched.Folded, sched.ReadAfterFold]>;
    }
  } // hasSideEffects = 0, Predicates = [UseAVX], ExeDomain = d
}
907
// Codegen-only VEX scalar fp -> signed integer conversions. Opcode 0x2C
// (cvtt*) matches any_fp_to_sint; opcode 0x2D (cvt*) matches lrint / llrint.
let isCodeGenOnly = 1, Predicates = [UseAVX] in {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  // cvtt* forms.
  defm VCVTTSS2SI
      : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
                    "cvttss2si", "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
        XS, VEX, VEX_LIG;
  defm VCVTTSS2SI64
      : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
                    "cvttss2si", "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
        XS, VEX, VEX_W, VEX_LIG;
  defm VCVTTSD2SI
      : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
                    "cvttsd2si", "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>,
        XD, VEX, VEX_LIG;
  defm VCVTTSD2SI64
      : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
                    "cvttsd2si", "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>,
        XD, VEX, VEX_W, VEX_LIG;

  // cvt* forms.
  defm VCVTSS2SI
      : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
                    "cvtss2si", "cvtss2si", WriteCvtSS2I, SSEPackedSingle>,
        XS, VEX, VEX_LIG;
  defm VCVTSS2SI64
      : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
                    "cvtss2si", "cvtss2si", WriteCvtSS2I, SSEPackedSingle>,
        XS, VEX, VEX_W, VEX_LIG;
  defm VCVTSD2SI
      : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
                    "cvtsd2si", "cvtsd2si", WriteCvtSD2I, SSEPackedDouble>,
        XD, VEX, VEX_LIG;
  defm VCVTSD2SI64
      : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
                    "cvtsd2si", "cvtsd2si", WriteCvtSD2I, SSEPackedDouble>,
        XD, VEX, VEX_W, VEX_LIG;
} // Uses = [MXCSR], mayRaiseFPException = 1
} // isCodeGenOnly = 1, Predicates = [UseAVX]
943
// For register-register forms the assembler can tell a 64-bit instruction
// from its rxx register operand. That does not work when the only operand is
// in memory, so explicit "l" and "q" assembly forms are provided where
// appropriate.
let isCodeGenOnly = 1 in {
  // Note: the GR32 -> FR64 form is the only one without SIMD_EXC.
  defm VCVTSI2SS
      : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss", "l",
                       WriteCvtI2SS, SSEPackedSingle>,
        XS, VEX_4V, VEX_LIG, SIMD_EXC;
  defm VCVTSI642SS
      : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss", "q",
                       WriteCvtI2SS, SSEPackedSingle>,
        XS, VEX_4V, VEX_W, VEX_LIG, SIMD_EXC;
  defm VCVTSI2SD
      : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd", "l",
                       WriteCvtI2SD, SSEPackedDouble>,
        XD, VEX_4V, VEX_LIG;
  defm VCVTSI642SD
      : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd", "q",
                       WriteCvtI2SD, SSEPackedDouble>,
        XD, VEX_4V, VEX_W, VEX_LIG, SIMD_EXC;
} // isCodeGenOnly = 1
962
let Predicates = [UseAVX] in {
  // Signed int -> fp from memory. The pass-through operand of the
  // three-operand instruction is an IMPLICIT_DEF.
  def : Pat<(f32 (any_sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SSrm   (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f32 (any_sint_to_fp (loadi64 addr:$src))),
            (VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (any_sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SDrm   (f64 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (any_sint_to_fp (loadi64 addr:$src))),
            (VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;

  // Signed int -> fp from a general-purpose register.
  def : Pat<(f32 (any_sint_to_fp GR32:$src)),
            (VCVTSI2SSrr   (f32 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f32 (any_sint_to_fp GR64:$src)),
            (VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
  def : Pat<(f64 (any_sint_to_fp GR32:$src)),
            (VCVTSI2SDrr   (f64 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f64 (any_sint_to_fp GR64:$src)),
            (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;

  // lrint with an i64 result uses the 64-bit conversion instructions.
  def : Pat<(i64 (lrint FR32:$src)),
            (VCVTSS2SI64rr FR32:$src)>;
  def : Pat<(i64 (lrint (loadf32 addr:$src))),
            (VCVTSS2SI64rm addr:$src)>;

  def : Pat<(i64 (lrint FR64:$src)),
            (VCVTSD2SI64rr FR64:$src)>;
  def : Pat<(i64 (lrint (loadf64 addr:$src))),
            (VCVTSD2SI64rm addr:$src)>;
}
988
// Codegen-only legacy SSE scalar conversions.
let isCodeGenOnly = 1 in {
  // fp -> signed integer, cvtt* forms (opcode 0x2C, any_fp_to_sint).
  defm CVTTSS2SI
      : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
                    "cvttss2si", "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
        XS, SIMD_EXC;
  defm CVTTSS2SI64
      : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
                    "cvttss2si", "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
        XS, REX_W, SIMD_EXC;
  defm CVTTSD2SI
      : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
                    "cvttsd2si", "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>,
        XD, SIMD_EXC;
  defm CVTTSD2SI64
      : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
                    "cvttsd2si", "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>,
        XD, REX_W, SIMD_EXC;

  // fp -> signed integer, cvt* forms (opcode 0x2D, lrint / llrint).
  defm CVTSS2SI
      : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
                    "cvtss2si", "cvtss2si", WriteCvtSS2I, SSEPackedSingle>,
        XS, SIMD_EXC;
  defm CVTSS2SI64
      : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
                    "cvtss2si", "cvtss2si", WriteCvtSS2I, SSEPackedSingle>,
        XS, REX_W, SIMD_EXC;
  defm CVTSD2SI
      : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
                    "cvtsd2si", "cvtsd2si", WriteCvtSD2I, SSEPackedDouble>,
        XD, SIMD_EXC;
  defm CVTSD2SI64
      : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
                    "cvtsd2si", "cvtsd2si", WriteCvtSD2I, SSEPackedDouble>,
        XD, REX_W, SIMD_EXC;

  // Signed integer -> fp (opcode 0x2A, any_sint_to_fp). The memory forms
  // carry an explicit {l} / {q} suffix. The GR32 -> FR64 form is the only
  // one without SIMD_EXC.
  defm CVTSI2SS
      : sse12_cvt_s<0x2A, GR32, FR32, any_sint_to_fp, i32mem, loadi32,
                    "cvtsi2ss", "cvtsi2ss{l}",
                    WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>,
        XS, SIMD_EXC;
  defm CVTSI642SS
      : sse12_cvt_s<0x2A, GR64, FR32, any_sint_to_fp, i64mem, loadi64,
                    "cvtsi2ss", "cvtsi2ss{q}",
                    WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>,
        XS, REX_W, SIMD_EXC;
  defm CVTSI2SD
      : sse12_cvt_s<0x2A, GR32, FR64, any_sint_to_fp, i32mem, loadi32,
                    "cvtsi2sd", "cvtsi2sd{l}",
                    WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>,
        XD;
  defm CVTSI642SD
      : sse12_cvt_s<0x2A, GR64, FR64, any_sint_to_fp, i64mem, loadi64,
                    "cvtsi2sd", "cvtsi2sd{q}",
                    WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>,
        XD, REX_W, SIMD_EXC;
} // isCodeGenOnly = 1
1029
// lrint from f32 with an i64 result uses the 64-bit cvtss2si forms.
let Predicates = [UseSSE1] in {
  def : Pat<(i64 (lrint FR32:$src)),
            (CVTSS2SI64rr FR32:$src)>;
  def : Pat<(i64 (lrint (loadf32 addr:$src))),
            (CVTSS2SI64rm addr:$src)>;
}
1034
// lrint from f64 with an i64 result uses the 64-bit cvtsd2si forms.
let Predicates = [UseSSE2] in {
  def : Pat<(i64 (lrint FR64:$src)),
            (CVTSD2SI64rr FR64:$src)>;
  def : Pat<(i64 (lrint (loadf64 addr:$src))),
            (CVTSD2SI64rm addr:$src)>;
}
1039
// Conversion Instructions Intrinsics - Match intrinsics which expect MM
// and/or XMM operand(s).

/// sse12_cvt_sint - Intrinsic (_Int) conversion with register and memory
/// forms. OpNode is applied to a SrcVT value and produces a DstVT value; the
/// memory form obtains its operand through mem_frags.
multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          ValueType DstVT, ValueType SrcVT, SDNode OpNode,
                          Operand memop, PatFrags mem_frags, string asm,
                          X86FoldableSchedWrite sched, Domain d> {
  let ExeDomain = d in {
    // Register source.
    def rr_Int : SI<opc, MRMSrcReg,
                    (outs DstRC:$dst), (ins SrcRC:$src),
                    asm # "\t{$src, $dst|$dst, $src}",
                    [(set DstRC:$dst,
                          (DstVT (OpNode (SrcVT SrcRC:$src))))]>,
                 Sched<[sched]>;

    // Memory source.
    def rm_Int : SI<opc, MRMSrcMem,
                    (outs DstRC:$dst), (ins memop:$src),
                    asm # "\t{$src, $dst|$dst, $src}",
                    [(set DstRC:$dst,
                          (DstVT (OpNode (SrcVT (mem_frags addr:$src)))))]>,
                 Sched<[sched.Folded]>;
  }
}
1058
/// sse12_cvt_sint_3addr - Pattern-less intrinsic (_Int) conversion taking a
/// pass-through $src1 plus a converted $src2. Is2Addr chooses between the
/// two-operand and the three-operand assembly string; the memory form's
/// mnemonic is `asm` followed by the suffix `mem` wrapped in braces.
multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
                    RegisterClass DstRC, X86MemOperand x86memop,
                    string asm, string mem, X86FoldableSchedWrite sched,
                    Domain d, bit Is2Addr = 1> {
  let hasSideEffects = 0, ExeDomain = d in {
    // Register source.
    def rr_Int : SI<opc, MRMSrcReg,
                    (outs DstRC:$dst),
                    (ins DstRC:$src1, SrcRC:$src2),
                    !if(Is2Addr,
                        asm # "\t{$src2, $dst|$dst, $src2}",
                        asm # "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    []>,
                 Sched<[sched, ReadDefault, ReadInt2Fpu]>;

    // Memory source; mayLoad is explicit because there is no pattern.
    let mayLoad = 1 in {
      def rm_Int : SI<opc, MRMSrcMem,
                      (outs DstRC:$dst),
                      (ins DstRC:$src1, x86memop:$src2),
                      !if(Is2Addr,
                          !strconcat(asm, "{", mem, "}",
                                     "\t{$src2, $dst|$dst, $src2}"),
                          !strconcat(asm, "{", mem, "}",
                                     "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                      []>,
                   Sched<[sched.Folded, sched.ReadAfterFold]>;
    }
  }
}
1078
// Intrinsic cvtsd2si forms (X86cvts2si on v2f64), 32- and 64-bit results.
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  // VEX-encoded forms.
  let Predicates = [UseAVX] in {
    defm VCVTSD2SI
        : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, X86cvts2si,
                         sdmem, sse_load_f64, "cvtsd2si",
                         WriteCvtSD2I, SSEPackedDouble>,
          XD, VEX, VEX_LIG;
    defm VCVTSD2SI64
        : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si,
                         sdmem, sse_load_f64, "cvtsd2si",
                         WriteCvtSD2I, SSEPackedDouble>,
          XD, VEX, VEX_W, VEX_LIG;
  }

  // Legacy SSE forms.
  defm CVTSD2SI
      : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, X86cvts2si,
                       sdmem, sse_load_f64, "cvtsd2si",
                       WriteCvtSD2I, SSEPackedDouble>,
        XD;
  defm CVTSD2SI64
      : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si,
                       sdmem, sse_load_f64, "cvtsd2si",
                       WriteCvtSD2I, SSEPackedDouble>,
        XD, REX_W;
}
1095
// VEX intrinsic cvtsi2ss / cvtsi2sd forms. Is2Addr is 0, so the
// three-operand assembly strings are used. The GR32 cvtsi2sd form is the
// only one without SIMD_EXC.
let Predicates = [UseAVX] in {
  defm VCVTSI2SS
      : sse12_cvt_sint_3addr<0x2A, GR32, VR128, i32mem, "cvtsi2ss", "l",
                             WriteCvtI2SS, SSEPackedSingle, 0>,
        XS, VEX_4V, VEX_LIG, SIMD_EXC;
  defm VCVTSI642SS
      : sse12_cvt_sint_3addr<0x2A, GR64, VR128, i64mem, "cvtsi2ss", "q",
                             WriteCvtI2SS, SSEPackedSingle, 0>,
        XS, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC;
  defm VCVTSI2SD
      : sse12_cvt_sint_3addr<0x2A, GR32, VR128, i32mem, "cvtsi2sd", "l",
                             WriteCvtI2SD, SSEPackedDouble, 0>,
        XD, VEX_4V, VEX_LIG;
  defm VCVTSI642SD
      : sse12_cvt_sint_3addr<0x2A, GR64, VR128, i64mem, "cvtsi2sd", "q",
                             WriteCvtI2SD, SSEPackedDouble, 0>,
        XD, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC;
}
// Legacy intrinsic cvtsi2ss / cvtsi2sd forms: $src1 is tied to $dst and the
// default Is2Addr = 1 assembly strings are used. The GR32 cvtsi2sd form is
// the only one without SIMD_EXC.
let Constraints = "$src1 = $dst" in {
  defm CVTSI2SS
      : sse12_cvt_sint_3addr<0x2A, GR32, VR128, i32mem, "cvtsi2ss", "l",
                             WriteCvtI2SS, SSEPackedSingle>,
        XS, SIMD_EXC;
  defm CVTSI642SS
      : sse12_cvt_sint_3addr<0x2A, GR64, VR128, i64mem, "cvtsi2ss", "q",
                             WriteCvtI2SS, SSEPackedSingle>,
        XS, REX_W, SIMD_EXC;
  defm CVTSI2SD
      : sse12_cvt_sint_3addr<0x2A, GR32, VR128, i32mem, "cvtsi2sd", "l",
                             WriteCvtI2SD, SSEPackedDouble>,
        XD;
  defm CVTSI642SD
      : sse12_cvt_sint_3addr<0x2A, GR64, VR128, i64mem, "cvtsi2sd", "q",
                             WriteCvtI2SD, SSEPackedDouble>,
        XD, REX_W, SIMD_EXC;
}
1124
// Assembly aliases for the cvtsi2ss / cvtsi2sd intrinsic instructions. Every
// alias passes 0 and "att" as its trailing arguments.
// NOTE(review): presumably 0 disables printing of the alias and "att"
// restricts it to AT&T syntax -- confirm against the InstAlias class.

// VEX register forms with an explicit {l} / {q} suffix.
def : InstAlias<
    "vcvtsi2ss{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
    (VCVTSI2SSrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
def : InstAlias<
    "vcvtsi2ss{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
    (VCVTSI642SSrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
def : InstAlias<
    "vcvtsi2sd{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
    (VCVTSI2SDrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
def : InstAlias<
    "vcvtsi2sd{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
    (VCVTSI642SDrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;

// VEX memory forms without a suffix map to the i32mem instructions.
def : InstAlias<
    "vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
    (VCVTSI2SSrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
def : InstAlias<
    "vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
    (VCVTSI2SDrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;

// Legacy register forms with an explicit {l} / {q} suffix.
def : InstAlias<
    "cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
    (CVTSI2SSrr_Int VR128:$dst, GR32:$src), 0, "att">;
def : InstAlias<
    "cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
    (CVTSI642SSrr_Int VR128:$dst, GR64:$src), 0, "att">;
def : InstAlias<
    "cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
    (CVTSI2SDrr_Int VR128:$dst, GR32:$src), 0, "att">;
def : InstAlias<
    "cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
    (CVTSI642SDrr_Int VR128:$dst, GR64:$src), 0, "att">;

// Legacy memory forms without a suffix map to the i32mem instructions.
def : InstAlias<
    "cvtsi2ss\t{$src, $dst|$dst, $src}",
    (CVTSI2SSrm_Int VR128:$dst, i32mem:$src), 0, "att">;
def : InstAlias<
    "cvtsi2sd\t{$src, $dst|$dst, $src}",
    (CVTSI2SDrm_Int VR128:$dst, i32mem:$src), 0, "att">;
1152
1153/// SSE 1 Only
1154
// Aliases for intrinsics
// VEX intrinsic truncating conversions (X86cvtts2Int), 32- and 64-bit
// results. The cvttsd2si forms are scheduled as WriteCvtSD2I, the same class
// the codegen-only VCVTTSD2SI forms and the legacy CVTTSD2SI intrinsic forms
// use for this instruction; they previously used the single-precision class
// WriteCvtSS2I.
let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
                                ssmem, sse_load_f32, "cvttss2si",
                                WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
                               X86cvtts2Int, ssmem, sse_load_f32,
                               "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
                               XS, VEX, VEX_LIG, VEX_W;
defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
                                sdmem, sse_load_f64, "cvttsd2si",
                                WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
                              X86cvtts2Int, sdmem, sse_load_f64,
                              "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>,
                              XD, VEX, VEX_LIG, VEX_W;
}
// Legacy intrinsic truncating conversions (X86cvtts2Int), 32- and 64-bit
// results.
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  // From single precision.
  defm CVTTSS2SI
      : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
                       ssmem, sse_load_f32, "cvttss2si",
                       WriteCvtSS2I, SSEPackedSingle>,
        XS;
  defm CVTTSS2SI64
      : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32, X86cvtts2Int,
                       ssmem, sse_load_f32, "cvttss2si",
                       WriteCvtSS2I, SSEPackedSingle>,
        XS, REX_W;

  // From double precision.
  defm CVTTSD2SI
      : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
                       sdmem, sse_load_f64, "cvttsd2si",
                       WriteCvtSD2I, SSEPackedDouble>,
        XD;
  defm CVTTSD2SI64
      : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64, X86cvtts2Int,
                       sdmem, sse_load_f64, "cvttsd2si",
                       WriteCvtSD2I, SSEPackedDouble>,
        XD, REX_W;
}
1188
// Assembly aliases giving the truncating conversions explicit {l} / {q}
// suffixes, for register and memory sources. Every alias passes 0 and "att"
// as its trailing arguments.
// NOTE(review): presumably 0 disables printing of the alias and "att"
// restricts it to AT&T syntax -- confirm against the InstAlias class.

// VEX forms, 32-bit result.
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
    (VCVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
    (VCVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
    (VCVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
    (VCVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;

// VEX forms, 64-bit result.
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
    (VCVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
    (VCVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
    (VCVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
    (VCVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;

// Legacy forms, 32-bit result.
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
    (CVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
    (CVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
    (CVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
    (CVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;

// Legacy forms, 64-bit result.
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
    (CVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
    (CVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
    (CVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
    (CVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;
1222
// VEX-encoded scalar single-precision to signed-integer conversions
// (opcode 0x2D, XS prefix), instantiated from sse12_cvt_sint with the
// X86cvts2si node for a 32-bit (GR32/i32) and a 64-bit (GR64/i64, VEX_W)
// destination. Everything in this scope is predicated on UseAVX and is marked
// as using MXCSR and as able to raise FP exceptions.
1223let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1224defm VCVTSS2SI   : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
1225                                  ssmem, sse_load_f32, "cvtss2si",
1226                                  WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
1227defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
1228                                  ssmem, sse_load_f32, "cvtss2si",
1229                                  WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_W, VEX_LIG;
1230}
// Definitions sharing one "uses MXCSR / may raise FP exception" scope:
//  * CVTSS2SI / CVTSS2SI64: non-VEX scalar single -> signed integer
//    (opcode 0x2D, XS; the 64-bit form adds REX_W).
//  * VCVTDQ2PS / VCVTDQ2PSY / CVTDQ2PS: packed doubleword integer -> packed
//    single (opcode 0x5B, PS), built from sse12_cvt_p for 128-bit AVX,
//    256-bit AVX (VEX_L) and SSE2 respectively.
// Note that no Predicates are set by this let; the packed forms carry their
// own Requires<> lists.
1231let Uses = [MXCSR], mayRaiseFPException = 1 in {
1232defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
1233                               ssmem, sse_load_f32, "cvtss2si",
1234                               WriteCvtSS2I, SSEPackedSingle>, XS;
1235defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
1236                                 ssmem, sse_load_f32, "cvtss2si",
1237                                 WriteCvtSS2I, SSEPackedSingle>, XS, REX_W;
1238
// AVX forms pass `load` as the memory fragment; the SSE2 form below passes
// `memop`. NOTE(review): presumably because only the legacy encoding has an
// alignment requirement on folded loads -- confirm.
1239defm VCVTDQ2PS   : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load,
1240                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
1241                               SSEPackedSingle, WriteCvtI2PS>,
1242                               PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
1243defm VCVTDQ2PSY  : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load,
1244                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
1245                               SSEPackedSingle, WriteCvtI2PSY>,
1246                               PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;
1247
1248defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop,
1249                            "cvtdq2ps\t{$src, $dst|$dst, $src}",
1250                            SSEPackedSingle, WriteCvtI2PS>,
1251                            PS, Requires<[UseSSE2]>;
1252}
1253
1254// AVX aliases
// Explicit {l}/{q}-suffixed spellings of vcvtss2si / vcvtsd2si. {l} maps to
// the GR32-destination *_Int instruction, {q} to the GR64-destination one;
// memory forms take an ssmem (single) or sdmem (double) operand.
1255def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
1256                (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1257def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
1258                (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
1259def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
1260                (VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1261def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
1262                (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
1263def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
1264                (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1265def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
1266                (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
1267def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
1268                (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1269def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
1270                (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
1271
1272// SSE aliases
// Non-VEX equivalents of the AVX aliases: {l}/{q}-suffixed cvtss2si /
// cvtsd2si mapped onto the CVTSS2SI*/CVTSD2SI* *_Int instructions, with the
// same GR32/GR64 destination split and ssmem/sdmem memory operands.
1273def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
1274                (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1275def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
1276                (CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
1277def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
1278                (CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1279def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
1280                (CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
1281def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
1282                (CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1283def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
1284                (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
1285def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
1286                (CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1287def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
1288                (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
1289
1290/// SSE 2 Only
1291
1292// Convert scalar double to scalar single
// AVX three-operand forms on scalar register classes (FR32 result, FR32
// pass-through $src1, FR64 or f64mem $src2). Both are isCodeGenOnly and carry
// an empty pattern list; the register form is selected by the explicit Pat
// that follows this block.
// NOTE(review): the rr form uses VSDI with an un-prefixed "cvtsd2ss" string
// while the rm form uses I with an explicit XD and a "vcvtsd2ss" string --
// presumably VSDI supplies both the "v" and the XD prefix; confirm in the
// instruction format definitions.
1293let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX],
1294    ExeDomain = SSEPackedSingle in {
1295def VCVTSD2SSrr  : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
1296                        (ins FR32:$src1, FR64:$src2),
1297                        "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1298                        VEX_4V, VEX_LIG, VEX_WIG,
1299                        Sched<[WriteCvtSD2SS]>, SIMD_EXC;
// No pattern on the memory form, so mayLoad has to be stated explicitly.
1300let mayLoad = 1 in
1301def VCVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst),
1302                     (ins FR32:$src1, f64mem:$src2),
1303                     "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1304                     XD, VEX_4V, VEX_LIG, VEX_WIG,
1305                     Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC;
1306}
1307
// Select f64 -> f32 rounding (any_fpround) to VCVTSD2SSrr under UseAVX.
// The pass-through $src1 operand is fed an IMPLICIT_DEF because the scalar
// result does not depend on it.
1308def : Pat<(f32 (any_fpround FR64:$src)),
1309            (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
1310          Requires<[UseAVX]>;
1311
// Non-VEX scalar double -> single on scalar register classes (FR64 or f64mem
// source, FR32 result), matched directly from any_fpround. The load-folding
// form additionally requires OptForSize.
// NOTE(review): the rr form states no Requires<>; presumably the SDI format
// class supplies the SSE2 predicate and XD prefix -- confirm.
1312let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in {
1313def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
1314                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
1315                      [(set FR32:$dst, (any_fpround FR64:$src))]>,
1316                      Sched<[WriteCvtSD2SS]>, SIMD_EXC;
1317def CVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
1318                    "cvtsd2ss\t{$src, $dst|$dst, $src}",
1319                    [(set FR32:$dst, (any_fpround (loadf64 addr:$src)))]>,
1320                    XD, Requires<[UseSSE2, OptForSize]>,
1321                    Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC;
1322}
1323
// Full-vector (VR128) "_Int" forms of (v)cvtsd2ss, matched from the
// X86frounds node: $src1 is the v4f32 vector passed through to the result,
// $src2 supplies the double (register, or sdmem via sse_load_f64).
// These are not isCodeGenOnly, unlike the FR32/FR64 forms above.
1324let Uses = [MXCSR], mayRaiseFPException = 1, ExeDomain = SSEPackedSingle in {
1325def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
1326                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1327                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1328                       [(set VR128:$dst,
1329                         (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
1330                       XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
1331                       Sched<[WriteCvtSD2SS]>;
1332def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
1333                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
1334                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1335                       [(set VR128:$dst,
1336                         (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
1337                       XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
1338                       Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
// Non-VEX forms are two-address: $src1 is tied to $dst, so the assembly
// string only mentions $src2 and $dst.
1339let Constraints = "$src1 = $dst" in {
1340def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
1341                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1342                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
1343                       [(set VR128:$dst,
1344                         (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
1345                       XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;
1346def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
1347                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
1348                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
1349                       [(set VR128:$dst,
1350                         (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
1351                       XD, Requires<[UseSSE2]>,
1352                       Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
1353} // Constraints = "$src1 = $dst"
1354} // Uses = [MXCSR], mayRaiseFPException = 1
1355
1356// Convert scalar single to scalar double
1357// SSE2 instructions with XS prefix
// AVX three-operand forms on scalar register classes (FR64 result, FR64
// pass-through $src1, FR32 or f32mem $src2). Both have empty pattern lists
// and are selected through the explicit Pats that follow this block; the
// memory form is additionally gated on OptForSize.
1358let isCodeGenOnly = 1, hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
1359def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
1360                    (ins FR64:$src1, FR32:$src2),
1361                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1362                    XS, VEX_4V, VEX_LIG, VEX_WIG,
1363                    Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>, SIMD_EXC;
// No pattern on the memory form, so mayLoad has to be stated explicitly.
1364let mayLoad = 1 in
1365def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
1366                    (ins FR64:$src1, f32mem:$src2),
1367                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1368                    XS, VEX_4V, VEX_LIG, VEX_WIG,
1369                    Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>,
1370                    Requires<[UseAVX, OptForSize]>, SIMD_EXC;
1371} // isCodeGenOnly = 1, hasSideEffects = 0
1372
// Select f32 -> f64 extension (any_fpextend) to VCVTSS2SDrr / VCVTSS2SDrm
// under UseAVX, feeding IMPLICIT_DEF to the pass-through $src1 operand.
// The load-folding pattern is only enabled together with OptForSize.
1373def : Pat<(f64 (any_fpextend FR32:$src)),
1374    (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
1375def : Pat<(any_fpextend (loadf32 addr:$src)),
1376    (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;
1377
// Non-VEX scalar single -> double on scalar register classes (FR32 or f32mem
// source, FR64 result), matched directly from any_fpextend. Opcode 0x5A with
// XS prefix; the load-folding form additionally requires OptForSize.
1378let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in {
1379def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
1380                   "cvtss2sd\t{$src, $dst|$dst, $src}",
1381                   [(set FR64:$dst, (any_fpextend FR32:$src))]>,
1382                   XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>, SIMD_EXC;
1383def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
1384                   "cvtss2sd\t{$src, $dst|$dst, $src}",
1385                   [(set FR64:$dst, (any_fpextend (loadf32 addr:$src)))]>,
1386                   XS, Requires<[UseSSE2, OptForSize]>,
1387                   Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>, SIMD_EXC;
1388} // isCodeGenOnly = 1
1389
// Full-vector (VR128) "_Int" forms of (v)cvtss2sd. All four have empty
// pattern lists, hence the explicit hasSideEffects = 0 and, on the memory
// forms, mayLoad = 1. The register forms are referenced by the
// X86Movsd/any_fpextend folding patterns later in this file.
// NOTE(review): the AVX forms here use Requires<[HasAVX]>, whereas the
// VCVTSD2SS*_Int forms above use Requires<[UseAVX]> -- confirm the difference
// is intentional.
1390let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1,
1391    ExeDomain = SSEPackedSingle in {
1392def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
1393                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1394                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1395                    []>, XS, VEX_4V, VEX_LIG, VEX_WIG,
1396                    Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;
1397let mayLoad = 1 in
1398def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
1399                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
1400                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1401                    []>, XS, VEX_4V, VEX_LIG, VEX_WIG, Requires<[HasAVX]>,
1402                    Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
// Non-VEX forms are two-address: $src1 is tied to $dst.
1403let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
1404def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
1405                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1406                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
1407                    []>, XS, Requires<[UseSSE2]>,
1408                    Sched<[WriteCvtSS2SD]>;
1409let mayLoad = 1 in
1410def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
1411                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
1412                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
1413                    []>, XS, Requires<[UseSSE2]>,
1414                    Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
1415} // Constraints = "$src1 = $dst"
1416} // hasSideEffects = 0
1417
1418// Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and
1419// (v)cvtss2sd intrinsic sequences from clang which produce unnecessary
1420// vmovs{s,d} instructions
// Shape shared by every pattern in this block: an X86Movss/X86Movsd that
// inserts a freshly converted scalar (wrapped in scalar_to_vector) into
// $dst is replaced by the single *_Int conversion instruction, which takes
// $dst as its pass-through vector operand. This is the UseAVX set.
1421let Predicates = [UseAVX] in {
// f64 element 0 of $src rounded to f32 -> vcvtsd2ss.
1422def : Pat<(v4f32 (X86Movss
1423                   (v4f32 VR128:$dst),
1424                   (v4f32 (scalar_to_vector
1425                     (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
1426          (VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
1427
// f32 element 0 of $src extended to f64 -> vcvtss2sd.
1428def : Pat<(v2f64 (X86Movsd
1429                   (v2f64 VR128:$dst),
1430                   (v2f64 (scalar_to_vector
1431                     (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
1432          (VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
1433
// Signed integer -> f32: 64-bit then 32-bit sources, register then load.
1434def : Pat<(v4f32 (X86Movss
1435                   (v4f32 VR128:$dst),
1436                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
1437          (VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
1438
1439def : Pat<(v4f32 (X86Movss
1440                   (v4f32 VR128:$dst),
1441                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
1442          (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>;
1443
1444def : Pat<(v4f32 (X86Movss
1445                   (v4f32 VR128:$dst),
1446                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
1447          (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
1448
1449def : Pat<(v4f32 (X86Movss
1450                   (v4f32 VR128:$dst),
1451                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
1452          (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>;
1453
// Signed integer -> f64: 64-bit then 32-bit sources, register then load.
1454def : Pat<(v2f64 (X86Movsd
1455                   (v2f64 VR128:$dst),
1456                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
1457          (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
1458
1459def : Pat<(v2f64 (X86Movsd
1460                   (v2f64 VR128:$dst),
1461                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
1462          (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>;
1463
1464def : Pat<(v2f64 (X86Movsd
1465                   (v2f64 VR128:$dst),
1466                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
1467          (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
1468
1469def : Pat<(v2f64 (X86Movsd
1470                   (v2f64 VR128:$dst),
1471                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
1472          (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>;
1473} // Predicates = [UseAVX]
1474
// UseSSE2 counterparts of the UseAVX movs{s,d}-folding patterns: the
// f64<->f32 scalar conversions and the integer -> f64 conversions, selecting
// the non-VEX *_Int instructions. The integer -> f32 cases live in the
// UseSSE1 block that follows.
1475let Predicates = [UseSSE2] in {
1476def : Pat<(v4f32 (X86Movss
1477                   (v4f32 VR128:$dst),
1478                   (v4f32 (scalar_to_vector
1479                     (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
1480          (CVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
1481
1482def : Pat<(v2f64 (X86Movsd
1483                   (v2f64 VR128:$dst),
1484                   (v2f64 (scalar_to_vector
1485                     (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
1486          (CVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
1487
1488def : Pat<(v2f64 (X86Movsd
1489                   (v2f64 VR128:$dst),
1490                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
1491          (CVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
1492
1493def : Pat<(v2f64 (X86Movsd
1494                   (v2f64 VR128:$dst),
1495                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
1496          (CVTSI642SDrm_Int VR128:$dst, addr:$src)>;
1497
1498def : Pat<(v2f64 (X86Movsd
1499                   (v2f64 VR128:$dst),
1500                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
1501          (CVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
1502
1503def : Pat<(v2f64 (X86Movsd
1504                   (v2f64 VR128:$dst),
1505                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
1506          (CVTSI2SDrm_Int VR128:$dst, addr:$src)>;
1507} // Predicates = [UseSSE2]
1508
// UseSSE1 movss-folding patterns: signed integer (GR64/GR32 register, or
// loadi64/loadi32 memory) -> f32 inserted into $dst via X86Movss is selected
// as a single non-VEX CVTSI(64)2SS*_Int instruction.
1509let Predicates = [UseSSE1] in {
1510def : Pat<(v4f32 (X86Movss
1511                   (v4f32 VR128:$dst),
1512                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
1513          (CVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
1514
1515def : Pat<(v4f32 (X86Movss
1516                   (v4f32 VR128:$dst),
1517                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
1518          (CVTSI642SSrm_Int VR128:$dst, addr:$src)>;
1519
1520def : Pat<(v4f32 (X86Movss
1521                   (v4f32 VR128:$dst),
1522                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
1523          (CVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
1524
1525def : Pat<(v4f32 (X86Movss
1526                   (v4f32 VR128:$dst),
1527                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
1528          (CVTSI2SSrm_Int VR128:$dst, addr:$src)>;
1529} // Predicates = [UseSSE1]
1530
// AVX packed single -> packed doubleword integer (X86cvtp2Int, opcode 0x5B):
// 128-bit (VR128) and 256-bit (VR256, VEX_L) variants, each with a register
// and a load-folding form. Enabled only with HasAVX and NoVLX.
// NOTE(review): the asm strings omit the "v" prefix; presumably the VPDI
// format class prepends it -- confirm in the instruction format definitions.
1531let Predicates = [HasAVX, NoVLX] in {
1532// Convert packed single/double fp to doubleword
1533def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1534                       "cvtps2dq\t{$src, $dst|$dst, $src}",
1535                       [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
1536                       VEX, Sched<[WriteCvtPS2I]>, VEX_WIG, SIMD_EXC;
1537def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1538                       "cvtps2dq\t{$src, $dst|$dst, $src}",
1539                       [(set VR128:$dst,
1540                         (v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>,
1541                       VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG, SIMD_EXC;
1542def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
1543                        "cvtps2dq\t{$src, $dst|$dst, $src}",
1544                        [(set VR256:$dst,
1545                          (v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>,
1546                        VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG, SIMD_EXC;
1547def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
1548                        "cvtps2dq\t{$src, $dst|$dst, $src}",
1549                        [(set VR256:$dst,
1550                          (v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>,
1551                        VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, VEX_WIG, SIMD_EXC;
1552}
// Non-VEX packed single -> packed doubleword integer (X86cvtp2Int, opcode
// 0x5B), register and memory forms. The memory form folds a memopv4f32 load
// rather than the loadv4f32 used by the AVX form.
// NOTE(review): no explicit predicate is given here; presumably the PDI
// format class supplies the SSE2 requirement -- confirm.
1553def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1554                     "cvtps2dq\t{$src, $dst|$dst, $src}",
1555                     [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
1556                     Sched<[WriteCvtPS2I]>, SIMD_EXC;
1557def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1558                     "cvtps2dq\t{$src, $dst|$dst, $src}",
1559                     [(set VR128:$dst,
1560                       (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>,
1561                     Sched<[WriteCvtPS2ILd]>, SIMD_EXC;
1562
1563
1564// Convert Packed Double FP to Packed DW Integers
// AVX forms (opcode 0xE6 via SDI). The result is always VR128/v4i32; the
// source is either two doubles (VR128 / f128mem) or four doubles
// (VR256 / f256mem, VEX_L). Because a memory operand does not reveal the
// source width, the memory forms spell an {x} / {y} mnemonic suffix.
1565let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1566// The assembler can recognize rr 256-bit instructions by seeing a ymm
1567// register, but the same isn't true when using memory operands instead.
1568// Provide other assembly rr and rm forms to address this explicitly.
1569def VCVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1570                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
1571                       [(set VR128:$dst,
1572                         (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
1573                       VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
1574
1575// XMM only
1576def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1577                      "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
1578                      [(set VR128:$dst,
1579                        (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
1580                      Sched<[WriteCvtPD2ILd]>, VEX_WIG;
1581
1582// YMM only
1583def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1584                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
1585                       [(set VR128:$dst,
1586                         (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
1587                       VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
1588def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1589                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
1590                       [(set VR128:$dst,
1591                         (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
1592                       VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
1593}
1594
// Accept the explicitly suffixed vcvtpd2dqx / vcvtpd2dqy spellings for the
// register forms too, mapping them to the VR128-source and VR256-source
// instructions respectively.
1595def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
1596                (VCVTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
1597def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
1598                (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
1599
// Non-VEX packed double -> packed doubleword integer (X86cvtp2Int, opcode
// 0xE6 via SDI): memory form (folding memopv2f64) followed by register form.
// Both produce a v4i32 in VR128 from a v2f64 source.
1600def CVTPD2DQrm  : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1601                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
1602                      [(set VR128:$dst,
1603                        (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>,
1604                      Sched<[WriteCvtPD2ILd]>, SIMD_EXC;
1605def CVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1606                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
1607                      [(set VR128:$dst,
1608                        (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
1609                      Sched<[WriteCvtPD2I]>, SIMD_EXC;
1610
1611// Convert with truncation packed single/double fp to doubleword
1612// SSE2 packed instructions with XS prefix
// Packed single -> packed doubleword integer with truncation, matched from
// X86any_cvttp2si (opcode 0x5B). The outer let marks every definition as
// using MXCSR and possibly raising FP exceptions; the inner let restricts the
// VEX-encoded 128/256-bit forms to HasAVX + NoVLX. The two non-VEX forms at
// the end sit outside the inner let and use S2SI / memopv4f32.
1613let Uses = [MXCSR], mayRaiseFPException = 1 in {
1614let Predicates = [HasAVX, NoVLX] in {
1615def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1616                         "cvttps2dq\t{$src, $dst|$dst, $src}",
1617                         [(set VR128:$dst,
1618                           (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
1619                         VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
1620def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1621                         "cvttps2dq\t{$src, $dst|$dst, $src}",
1622                         [(set VR128:$dst,
1623                           (v4i32 (X86any_cvttp2si (loadv4f32 addr:$src))))]>,
1624                         VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
1625def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
1626                          "cvttps2dq\t{$src, $dst|$dst, $src}",
1627                          [(set VR256:$dst,
1628                            (v8i32 (X86any_cvttp2si (v8f32 VR256:$src))))]>,
1629                          VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
1630def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
1631                          "cvttps2dq\t{$src, $dst|$dst, $src}",
1632                          [(set VR256:$dst,
1633                            (v8i32 (X86any_cvttp2si (loadv8f32 addr:$src))))]>,
1634                          VEX, VEX_L,
1635                          Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
1636} // Predicates = [HasAVX, NoVLX]
1637
1638def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1639                       "cvttps2dq\t{$src, $dst|$dst, $src}",
1640                       [(set VR128:$dst,
1641                         (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
1642                       Sched<[WriteCvtPS2I]>;
1643def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1644                       "cvttps2dq\t{$src, $dst|$dst, $src}",
1645                       [(set VR128:$dst,
1646                         (v4i32 (X86any_cvttp2si (memopv4f32 addr:$src))))]>,
1647                       Sched<[WriteCvtPS2ILd]>;
1648} // Uses = [MXCSR], mayRaiseFPException = 1
1649
1650// The assembler can recognize rr 256-bit instructions by seeing a ymm
1651// register, but the same isn't true when using memory operands instead.
1652// Provide other assembly rr and rm forms to address this explicitly.
// AVX packed double -> packed doubleword integer with truncation
// (X86any_cvttp2si, opcode 0xE6 via VPDI). The result is always VR128/v4i32;
// memory forms carry an {x} / {y} suffix to identify the 128-bit or 256-bit
// source width.
1653let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1654// XMM only
1655def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1656                        "cvttpd2dq\t{$src, $dst|$dst, $src}",
1657                        [(set VR128:$dst,
1658                          (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
1659                        VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
1660def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1661                        "cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
1662                        [(set VR128:$dst,
1663                          (v4i32 (X86any_cvttp2si (loadv2f64 addr:$src))))]>,
1664                        VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG;
1665
1666// YMM only
1667def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1668                         "cvttpd2dq\t{$src, $dst|$dst, $src}",
1669                         [(set VR128:$dst,
1670                           (v4i32 (X86any_cvttp2si (v4f64 VR256:$src))))]>,
1671                         VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
1672def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1673                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
1674                         [(set VR128:$dst,
1675                           (v4i32 (X86any_cvttp2si (loadv4f64 addr:$src))))]>,
1676                         VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
1677} // Predicates = [HasAVX, NoVLX]
1678
// Accept the explicitly suffixed vcvttpd2dqx / vcvttpd2dqy spellings for the
// register forms, mapping them to the VR128-source and VR256-source
// instructions respectively.
1679def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
1680                (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
1681def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
1682                (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
1683
// Select the generic v4f64 -> v4i32 any_fp_to_sint node (register or folded
// loadv4f64) to the 256-bit-source truncating conversion, in addition to the
// X86any_cvttp2si patterns attached to the instructions themselves.
1684let Predicates = [HasAVX, NoVLX] in {
1685  def : Pat<(v4i32 (any_fp_to_sint (v4f64 VR256:$src))),
1686            (VCVTTPD2DQYrr VR256:$src)>;
1687  def : Pat<(v4i32 (any_fp_to_sint (loadv4f64 addr:$src))),
1688            (VCVTTPD2DQYrm addr:$src)>;
1689}
1690
// Non-VEX packed double -> packed doubleword integer with truncation
// (X86any_cvttp2si, opcode 0xE6 via PDI): register form, then the memory
// form folding a memopv2f64 load.
1691def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1692                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
1693                      [(set VR128:$dst,
1694                        (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
1695                      Sched<[WriteCvtPD2I]>, SIMD_EXC;
1696def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
1697                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
1698                      [(set VR128:$dst,
1699                        (v4i32 (X86any_cvttp2si (memopv2f64 addr:$src))))]>,
1700                      Sched<[WriteCvtPD2ILd]>, SIMD_EXC;
1701
1702// Convert packed single to packed double
// AVX forms (opcode 0x5A, PS). The source is half the width of the result:
// the 128-bit result form reads VR128 / f64mem (two floats) and the 256-bit
// result form (VEX_L) reads VR128 / f128mem (four floats). Register forms
// match X86any_vfpext (128-bit) or any_fpextend (256-bit); memory forms match
// the extloadv2f32 / extloadv4f32 fragments.
1703let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1704                  // SSE2 instructions without OpSize prefix
1705def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1706                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
1707                    [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
1708                    PS, VEX, Sched<[WriteCvtPS2PD]>, VEX_WIG;
1709def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
1710                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
1711                    [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
1712                    PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, VEX_WIG;
1713def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
1714                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
1715                     [(set VR256:$dst, (v4f64 (any_fpextend (v4f32 VR128:$src))))]>,
1716                     PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, VEX_WIG;
1717def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
1718                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
1719                     [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>,
1720                     PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, VEX_WIG;
1721}
1722
// Non-VEX packed single -> packed double (opcode 0x5A, PS) under UseSSE2.
// The register form converts via X86any_vfpext from a v4f32 source; the
// memory form reads an f64mem operand and matches extloadv2f32.
1723let Predicates = [UseSSE2], Uses = [MXCSR], mayRaiseFPException = 1 in {
1724def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1725                   "cvtps2pd\t{$src, $dst|$dst, $src}",
1726                   [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
1727                   PS, Sched<[WriteCvtPS2PD]>;
1728def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
1729                   "cvtps2pd\t{$src, $dst|$dst, $src}",
1730                   [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
1731                   PS, Sched<[WriteCvtPS2PD.Folded]>;
1732}
1733
1734// Convert Packed DW Integers to Packed Double FP
// AVX forms (opcode 0xE6 via S2SI). The 128-bit result form reads two
// doublewords (VR128 / i64mem); the 256-bit result form (VEX_L) reads four
// (VR128 / i128mem). Unlike the surrounding conversions, this scope sets
// neither Uses = [MXCSR] nor mayRaiseFPException.
1735let Predicates = [HasAVX, NoVLX] in {
// This brace-less let applies to VCVTDQ2PDrm only. Its pattern expresses the
// 64-bit load as loadi64 -> scalar_to_vector -> bitcast to v4i32.
1736let hasSideEffects = 0, mayLoad = 1 in
1737def VCVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
1738                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1739                        [(set VR128:$dst,
1740                          (v2f64 (X86any_VSintToFP
1741                                  (bc_v4i32
1742                                   (v2i64 (scalar_to_vector
1743                                           (loadi64 addr:$src)))))))]>,
1744                        VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG;
1745def VCVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1746                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1747                        [(set VR128:$dst,
1748                          (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
1749                        VEX, Sched<[WriteCvtI2PD]>, VEX_WIG;
1750def VCVTDQ2PDYrm  : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
1751                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1752                         [(set VR256:$dst,
1753                           (v4f64 (any_sint_to_fp (loadv4i32 addr:$src))))]>,
1754                         VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>,
1755                         VEX_WIG;
1756def VCVTDQ2PDYrr  : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
1757                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1758                         [(set VR256:$dst,
1759                           (v4f64 (any_sint_to_fp (v4i32 VR128:$src))))]>,
1760                         VEX, VEX_L, Sched<[WriteCvtI2PDY]>, VEX_WIG;
1761}
1762
// Non-VEX packed doubleword integer -> packed double (opcode 0xE6 via S2SI).
// The brace-less let applies to the memory form only; that form reads an
// i64mem operand, expressed as loadi64 -> scalar_to_vector -> bitcast to
// v4i32. The register form converts from a v4i32 in VR128.
1763let hasSideEffects = 0, mayLoad = 1 in
1764def CVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
1765                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
1766                       [(set VR128:$dst,
1767                         (v2f64 (X86any_VSintToFP
1768                                 (bc_v4i32
1769                                  (v2i64 (scalar_to_vector
1770                                          (loadi64 addr:$src)))))))]>,
1771                       Sched<[WriteCvtI2PDLd]>;
1772def CVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1773                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
1774                       [(set VR128:$dst,
1775                         (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
1776                       Sched<[WriteCvtI2PD]>;
1777
1778// AVX register conversion intrinsics
// Despite the heading, the pattern below has a memory source: a 64-bit
// X86vzload64 feeding X86any_VSintToFP is folded into the load form
// VCVTDQ2PDrm, complementing the loadi64/scalar_to_vector pattern on the
// instruction itself.
1779let Predicates = [HasAVX, NoVLX] in {
1780  def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
1781            (VCVTDQ2PDrm addr:$src)>;
1782} // Predicates = [HasAVX, NoVLX]
1783
1784// SSE2 register conversion intrinsics
// Despite the heading, the pattern below has a memory source: a 64-bit
// X86vzload64 feeding X86any_VSintToFP is folded into the load form
// CVTDQ2PDrm under UseSSE2.
1785let Predicates = [UseSSE2] in {
1786  def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
1787            (CVTDQ2PDrm addr:$src)>;
1788} // Predicates = [UseSSE2]
1789
1790// Convert packed double to packed single
1791// The assembler can recognize rr 256-bit instructions by seeing a ymm
1792// register, but the same isn't true when using memory operands instead.
1793// Provide other assembly rr and rm forms to address this explicitly.
//
// All four defs below use opcode 0x5A, define a VR128 result of type v4f32 via
// X86any_vfpround, and inherit Uses = [MXCSR] and mayRaiseFPException = 1 from
// the enclosing `let`. The Y forms take a 256-bit source (VR256 / f256mem) and
// add VEX_L.
// NOTE(review): the `{x}` / `{y}` in the rm asm strings look like suffixes
// emitted for one asm variant only, to disambiguate the memory operand width
// -- confirm against the asm-string variant syntax.
1794let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1795// XMM only
1796def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1797                       "cvtpd2ps\t{$src, $dst|$dst, $src}",
1798                       [(set VR128:$dst, (v4f32 (X86any_vfpround (v2f64 VR128:$src))))]>,
1799                       VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG;
1800def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1801                       "cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
1802                       [(set VR128:$dst, (v4f32 (X86any_vfpround (loadv2f64 addr:$src))))]>,
1803                       VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG;
1804
// YMM source, XMM result
1805def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1806                        "cvtpd2ps\t{$src, $dst|$dst, $src}",
1807                        [(set VR128:$dst, (v4f32 (X86any_vfpround (v4f64 VR256:$src))))]>,
1808                        VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG;
1809def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1810                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
1811                        [(set VR128:$dst, (v4f32 (X86any_vfpround (loadv4f64 addr:$src))))]>,
1812                        VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG;
1813} // Predicates = [HasAVX, NoVLX]
1814
// Explicitly-suffixed mnemonics for the register forms: vcvtpd2psx maps to
// VCVTPD2PSrr (VR128 source) and vcvtpd2psy maps to VCVTPD2PSYrr (VR256
// source). Both aliases are restricted to the "att" variant.
// NOTE(review): the `0` argument presumably keeps the printer from emitting
// the alias -- confirm against the InstAlias class definition.
1815def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
1816                (VCVTPD2PSrr VR128:$dst, VR128:$src), 0, "att">;
1817def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
1818                (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0, "att">;
1819
// Non-VEX cvtpd2ps (opcode 0x5A): v2f64 -> v4f32 via X86any_vfpround, tagged
// SIMD_EXC. The memory form folds a memopv2f64 load (the VEX form above uses
// loadv2f64 instead).
1820def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1821                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
1822                     [(set VR128:$dst, (v4f32 (X86any_vfpround (v2f64 VR128:$src))))]>,
1823                     Sched<[WriteCvtPD2PS]>, SIMD_EXC;
1824def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1825                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
1826                     [(set VR128:$dst, (v4f32 (X86any_vfpround (memopv2f64 addr:$src))))]>,
1827                     Sched<[WriteCvtPD2PS.Folded]>, SIMD_EXC;
1828
1829//===----------------------------------------------------------------------===//
1830// SSE 1 & 2 - Compare Instructions
1831//===----------------------------------------------------------------------===//
1832
1833// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
//
// Emits four defs, all with opcode 0xC2 and a u8imm condition code $cc:
//   rr_Int / rm_Int - operate on VR128 with the first operand typed as VT;
//                     the memory form uses operand class `memop` and folds
//                     the load through `mem_frags`.
//   rr / rm         - isCodeGenOnly forms on register class RC; the memory
//                     form uses `x86memop` and folds the load via `ld_frag`.
// Parameters:
//   RC, x86memop - register class / memory operand of the rr and rm forms.
//   memop        - memory operand class of rm_Int.
//   OpNode       - compare node used in every pattern.
//   VT           - vector type applied to $src1 in the _Int patterns.
//   ld_frag      - load fragment for rm.
//   asm          - complete asm string (mnemonic and operands).
//   sched        - scheduling info; memory forms use sched.Folded plus
//                  sched.ReadAfterFold.
//   mem_frags    - load fragments for rm_Int.
1834multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
1835                            Operand memop, SDNode OpNode, ValueType VT,
1836                            PatFrag ld_frag, string asm,
1837                            X86FoldableSchedWrite sched,
1838                            PatFrags mem_frags> {
1839  def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
1840                    (ins VR128:$src1, VR128:$src2, u8imm:$cc), asm,
1841                    [(set VR128:$dst, (OpNode (VT VR128:$src1),
1842                                              VR128:$src2, timm:$cc))]>,
1843           Sched<[sched]>, SIMD_EXC;
1844  let mayLoad = 1 in
1845  def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
1846                    (ins VR128:$src1, memop:$src2, u8imm:$cc), asm,
1847                    [(set VR128:$dst, (OpNode (VT VR128:$src1),
1848                                              (mem_frags addr:$src2), timm:$cc))]>,
1849           Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
1850
  // Scalar register-class forms; only the rr form is marked commutable.
1851  let isCodeGenOnly = 1 in {
1852    let isCommutable = 1 in
1853    def rr : SIi8<0xC2, MRMSrcReg,
1854                  (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
1855                  [(set RC:$dst, (OpNode RC:$src1, RC:$src2, timm:$cc))]>,
1856                  Sched<[sched]>, SIMD_EXC;
1857    def rm : SIi8<0xC2, MRMSrcMem,
1858                  (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
1859                  [(set RC:$dst, (OpNode RC:$src1,
1860                                         (ld_frag addr:$src2), timm:$cc))]>,
1861                  Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
1862  }
1863}
1864
// VEX-encoded scalar compares with a separate destination in the asm string
// ($cc, $src2, $src1, $dst). VCMPSS uses FR32/f32mem/ssmem with the XS prefix;
// VCMPSD uses FR64/f64mem/sdmem with the XD prefix. Both instantiate
// sse12_cmp_scalar with X86cmps and are tagged VEX_4V, VEX_LIG, VEX_WIG.
1865let ExeDomain = SSEPackedSingle in
1866defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
1867                 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1868                 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>,
1869                 XS, VEX_4V, VEX_LIG, VEX_WIG;
1870let ExeDomain = SSEPackedDouble in
1871defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
1872                 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1873                 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
1874                 XD, VEX_4V, VEX_LIG, VEX_WIG;
1875
// Non-VEX scalar compares. $src1 is tied to $dst by the constraint, so the
// asm strings print only $cc, $src2 and $dst. Same sse12_cmp_scalar
// instantiation as the VEX forms, minus the VEX attributes.
1876let Constraints = "$src1 = $dst" in {
1877  let ExeDomain = SSEPackedSingle in
1878  defm CMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
1879                  "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1880                  SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
1881  let ExeDomain = SSEPackedDouble in
1882  defm CMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
1883                  "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1884                  SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
1885}
1886
1887// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
//
// Emits rr and rm defs with no register outputs; the patterns set EFLAGS from
// OpNode applied to $src1 (typed as vt) and $src2.
// Parameters:
//   opc       - opcode byte.
//   RC        - register class of both register operands.
//   OpNode    - compare node producing EFLAGS.
//   vt        - value type applied to $src1 in the patterns.
//   x86memop  - memory operand class of the rm form.
//   ld_frag   - load fragment folded by the rm form.
//   OpcodeStr - mnemonic; the operand string is appended here.
//   d         - execution domain assigned to both defs.
//   sched     - scheduling info (defaults to WriteFComX).
1888multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDPatternOperator OpNode,
1889                         ValueType vt, X86MemOperand x86memop,
1890                         PatFrag ld_frag, string OpcodeStr, Domain d,
1891                         X86FoldableSchedWrite sched = WriteFComX> {
1892  let ExeDomain = d in {
1893  def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
1894                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1895                     [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
1896          Sched<[sched]>, SIMD_EXC;
1897  let mayLoad = 1 in
1898  def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
1899                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1900                     [(set EFLAGS, (OpNode (vt RC:$src1),
1901                                           (ld_frag addr:$src2)))]>,
1902          Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
1903}
1904}
1905
1906// sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
//
// Same shape as sse12_ord_cmp, but the defs are named rr_Int / rm_Int, the
// memory form takes an `Operand` class (memop) and folds the load through a
// PatFrags (mem_frags) instead of a single load fragment.
// Parameters:
//   opc       - opcode byte.
//   RC        - register class of the register operands.
//   OpNode    - compare node producing EFLAGS.
//   vt        - value type applied to $src1 in the patterns.
//   memop     - memory operand class of rm_Int.
//   mem_frags - load fragments folded by rm_Int.
//   OpcodeStr - mnemonic; the operand string is appended here.
//   d         - execution domain assigned to both defs.
//   sched     - scheduling info (defaults to WriteFComX).
1907multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
1908                             ValueType vt, Operand memop,
1909                             PatFrags mem_frags, string OpcodeStr,
1910                             Domain d,
1911                             X86FoldableSchedWrite sched = WriteFComX> {
1912let ExeDomain = d in {
1913  def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
1914                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1915                     [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
1916          Sched<[sched]>, SIMD_EXC;
1917let mayLoad = 1 in
1918  def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
1919                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1920                     [(set EFLAGS, (OpNode (vt RC:$src1),
1921                                           (mem_frags addr:$src2)))]>,
1922          Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
1923}
1924}
1925
// Scalar compare instructions that define EFLAGS.
// Opcode 0x2E is used for the ucomis* mnemonics and 0x2F for comis*.
// Each name is instantiated twice: once through sse12_ord_cmp on FR32/FR64
// (X86any_fcmp for ucomis*, X86strict_fcmps for comis*), and once through
// sse12_ord_cmp_int on VR128 (X86ucomi / X86comi) under isCodeGenOnly = 1.
// The two multiclasses use different def suffixes (rr/rm vs rr_Int/rm_Int),
// so reusing the same defm name does not collide.
// The V-prefixed defms add VEX, VEX_LIG, VEX_WIG.
1926let Defs = [EFLAGS] in {
1927  defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
1928                               "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
1929  defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
1930                               "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
1931  defm VCOMISS  : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
1932                               "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
1933  defm VCOMISD  : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
1934                               "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
1935
  // VEX intrinsic (VR128) forms.
1936  let isCodeGenOnly = 1 in {
1937    defm VUCOMISS  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
1938                      sse_load_f32, "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
1939    defm VUCOMISD  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
1940                      sse_load_f64, "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
1941
1942    defm VCOMISS  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
1943                       sse_load_f32, "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
1944    defm VCOMISD  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
1945                       sse_load_f64, "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
1946  }
  // Non-VEX scalar forms.
1947  defm UCOMISS  : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
1948                                  "ucomiss", SSEPackedSingle>, PS;
1949  defm UCOMISD  : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
1950                                  "ucomisd", SSEPackedDouble>, PD;
1951  defm COMISS   : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
1952                                  "comiss", SSEPackedSingle>, PS;
1953  defm COMISD   : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
1954                                  "comisd", SSEPackedDouble>, PD;
1955
  // Non-VEX intrinsic (VR128) forms.
1956  let isCodeGenOnly = 1 in {
1957    defm UCOMISS  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
1958                            sse_load_f32, "ucomiss", SSEPackedSingle>, PS;
1959    defm UCOMISD  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
1960                            sse_load_f64, "ucomisd", SSEPackedDouble>, PD;
1961
1962    defm COMISS  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
1963                                sse_load_f32, "comiss", SSEPackedSingle>, PS;
1964    defm COMISD  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
1965                                    sse_load_f64, "comisd", SSEPackedDouble>, PD;
1966  }
1967} // Defs = [EFLAGS]
1968
1969// sse12_cmp_packed - sse 1 & 2 compare packed instructions
//
// Emits rri and rmi defs (opcode 0xC2, u8imm condition code $cc) whose
// patterns match X86any_cmpp with a result of type VT. Only the rri form is
// marked commutable.
// Parameters:
//   RC       - register class of the destination and register sources.
//   x86memop - memory operand class of the rmi form.
//   VT       - result vector type of the compare.
//   asm      - complete asm string (mnemonic and operands).
//   sched    - scheduling info; rmi uses sched.Folded + sched.ReadAfterFold.
//   d        - execution domain passed to PIi8.
//   ld_frag  - load fragment folded by the rmi form.
1970multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
1971                            ValueType VT, string asm,
1972                            X86FoldableSchedWrite sched,
1973                            Domain d, PatFrag ld_frag> {
1974  let isCommutable = 1 in
1975  def rri : PIi8<0xC2, MRMSrcReg,
1976             (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
1977             [(set RC:$dst, (VT (X86any_cmpp RC:$src1, RC:$src2, timm:$cc)))], d>,
1978            Sched<[sched]>, SIMD_EXC;
1979  def rmi : PIi8<0xC2, MRMSrcMem,
1980             (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
1981             [(set RC:$dst,
1982               (VT (X86any_cmpp RC:$src1, (ld_frag addr:$src2), timm:$cc)))], d>,
1983            Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
1984}
1985
// VEX-encoded packed compares. The 128-bit forms use VR128/f128mem; the Y
// forms use VR256/f256mem and add VEX_L. Memory forms fold loadv* fragments.
1986defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
1987               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1988               SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG;
1989defm VCMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
1990               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1991               SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG;
1992defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, v8f32,
1993               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1994               SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, VEX_WIG;
1995defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, v4f64,
1996               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1997               SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, VEX_WIG;
// Non-VEX packed compares: $src1 is tied to $dst, and the memory forms fold
// memopv* fragments rather than the loadv* fragments used by the VEX forms.
1998let Constraints = "$src1 = $dst" in {
1999  defm CMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
2000                 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
2001                 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS;
2002  defm CMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
2003                 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
2004                 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD;
2005}
2006
// CommutableCMPCC - matches a target-immediate condition code whose low three
// bits are 0x0, 0x3, 0x4 or 0x7; higher bits are masked off before the test.
// NOTE(review): these presumably correspond to the EQ / UNORD / NEQ / ORD
// predicates, which do not depend on operand order -- confirm against the
// CMPPS predicate table.
2007def CommutableCMPCC : PatLeaf<(timm), [{
2008  uint64_t Imm = N->getZExtValue() & 0x7;
2009  return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07);
2010}]>;
2011
2012// Patterns to select compares with loads in first operand.
// Each pattern requires the condition code to satisfy CommutableCMPCC, then
// swaps the operands so the load becomes the memory operand ($src2) of the
// rmi/rm instruction and the register becomes $src1.
2013let Predicates = [HasAVX] in {
2014  def : Pat<(v4f64 (X86any_cmpp (loadv4f64 addr:$src2), VR256:$src1,
2015                                CommutableCMPCC:$cc)),
2016            (VCMPPDYrmi VR256:$src1, addr:$src2, timm:$cc)>;
2017
2018  def : Pat<(v8f32 (X86any_cmpp (loadv8f32 addr:$src2), VR256:$src1,
2019                                CommutableCMPCC:$cc)),
2020            (VCMPPSYrmi VR256:$src1, addr:$src2, timm:$cc)>;
2021
2022  def : Pat<(v2f64 (X86any_cmpp (loadv2f64 addr:$src2), VR128:$src1,
2023                                CommutableCMPCC:$cc)),
2024            (VCMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;
2025
2026  def : Pat<(v4f32 (X86any_cmpp (loadv4f32 addr:$src2), VR128:$src1,
2027                                CommutableCMPCC:$cc)),
2028            (VCMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;
2029
  // Scalar forms use X86cmps and the FR64/FR32 rm instructions.
2030  def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
2031                          CommutableCMPCC:$cc)),
2032            (VCMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
2033
2034  def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
2035                          CommutableCMPCC:$cc)),
2036            (VCMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
2037}
2038
// SSE2 counterparts of the load-in-first-operand compare patterns: for a
// CommutableCMPCC condition code, swap operands so the load is folded as
// $src2. The packed pattern matches memopv2f64 rather than loadv2f64.
2039let Predicates = [UseSSE2] in {
2040  def : Pat<(v2f64 (X86any_cmpp (memopv2f64 addr:$src2), VR128:$src1,
2041                                CommutableCMPCC:$cc)),
2042            (CMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;
2043
2044  def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
2045                          CommutableCMPCC:$cc)),
2046            (CMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
2047}
2048
// SSE1 counterparts of the load-in-first-operand compare patterns: for a
// CommutableCMPCC condition code, swap operands so the load is folded as
// $src2. The packed pattern matches memopv4f32 rather than loadv4f32.
2049let Predicates = [UseSSE1] in {
2050  def : Pat<(v4f32 (X86any_cmpp (memopv4f32 addr:$src2), VR128:$src1,
2051                                CommutableCMPCC:$cc)),
2052            (CMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;
2053
2054  def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
2055                          CommutableCMPCC:$cc)),
2056            (CMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
2057}
2058
2059//===----------------------------------------------------------------------===//
2060// SSE 1 & 2 - Shuffle Instructions
2061//===----------------------------------------------------------------------===//
2062
2063/// sse12_shuffle - sse 1 & 2 fp shuffle instructions
///
/// Emits rmi and rri defs (opcode 0xC6) whose patterns match X86Shufp with an
/// i8 target-immediate selector in $src3 and a result of type vt.
/// Parameters:
///   RC           - register class of the destination and register sources.
///   x86memop     - memory operand class of the rmi form.
///   vt           - result vector type.
///   asm          - complete asm string (mnemonic and operands).
///   mem_frag     - load fragment folded by the rmi form.
///   sched        - scheduling info; rmi uses sched.Folded +
///                  sched.ReadAfterFold.
///   d            - execution domain passed to PIi8.
///   IsCommutable - value assigned to isCommutable on the rri form only
///                  (default 0).
2064multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
2065                         ValueType vt, string asm, PatFrag mem_frag,
2066                         X86FoldableSchedWrite sched, Domain d,
2067                         bit IsCommutable = 0> {
2068  def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
2069                   (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
2070                   [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
2071                                       (i8 timm:$src3))))], d>,
2072            Sched<[sched.Folded, sched.ReadAfterFold]>;
2073  let isCommutable = IsCommutable in
2074  def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
2075                 (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
2076                 [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
2077                                     (i8 timm:$src3))))], d>,
2078            Sched<[sched]>;
2079}
2080
// VEX-encoded shufps/shufpd for 128-bit (VR128) and 256-bit (VR256, VEX_L)
// operands, selected under HasAVX + NoVLX. All four leave IsCommutable at its
// default of 0.
2081let Predicates = [HasAVX, NoVLX] in {
2082  defm VSHUFPS  : sse12_shuffle<VR128, f128mem, v4f32,
2083           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2084           loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>,
2085           PS, VEX_4V, VEX_WIG;
2086  defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
2087           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2088           loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>,
2089           PS, VEX_4V, VEX_L, VEX_WIG;
2090  defm VSHUFPD  : sse12_shuffle<VR128, f128mem, v2f64,
2091           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2092           loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>,
2093           PD, VEX_4V, VEX_WIG;
2094  defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
2095           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2096           loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>,
2097           PD, VEX_4V, VEX_L, VEX_WIG;
2098}
// Non-VEX shufps/shufpd with $src1 tied to $dst; memory forms fold memopv*
// fragments. SHUFPD passes IsCommutable = 1 for its rri form, SHUFPS leaves it
// at the default.
2099let Constraints = "$src1 = $dst" in {
2100  defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
2101                    "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
2102                    memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2103  defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
2104                    "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
2105                    memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
2106}
2107
2108//===----------------------------------------------------------------------===//
2109// SSE 1 & 2 - Unpack FP Instructions
2110//===----------------------------------------------------------------------===//
2111
2112/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
///
/// Emits rr and rm defs whose patterns apply OpNode to $src1 and $src2 with a
/// result of type vt.
/// Parameters:
///   opc          - opcode byte.
///   OpNode       - unpack node matched by both patterns.
///   vt           - result vector type.
///   mem_frag     - load fragment folded by the rm form.
///   RC           - register class of the destination and register sources.
///   x86memop     - memory operand class of the rm form.
///   asm          - complete asm string (mnemonic and operands).
///   sched        - scheduling info; rm uses sched.Folded +
///                  sched.ReadAfterFold.
///   d            - execution domain passed to PI.
///   IsCommutable - value assigned to isCommutable on the rr form only
///                  (default 0).
2113multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
2114                                   PatFrag mem_frag, RegisterClass RC,
2115                                   X86MemOperand x86memop, string asm,
2116                                   X86FoldableSchedWrite sched, Domain d,
2117                                   bit IsCommutable = 0> {
2118    let isCommutable = IsCommutable in
2119    def rr : PI<opc, MRMSrcReg,
2120                (outs RC:$dst), (ins RC:$src1, RC:$src2),
2121                asm, [(set RC:$dst,
2122                           (vt (OpNode RC:$src1, RC:$src2)))], d>,
2123                Sched<[sched]>;
2124    def rm : PI<opc, MRMSrcMem,
2125                (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
2126                asm, [(set RC:$dst,
2127                           (vt (OpNode RC:$src1,
2128                                       (mem_frag addr:$src2))))], d>,
2129             Sched<[sched.Folded, sched.ReadAfterFold]>;
2130}
2131
// VEX-encoded unpck{h,l}p{s,d}: opcode 0x15 for the high variants (X86Unpckh)
// and 0x14 for the low variants (X86Unpckl). The first four defms are 128-bit
// (VR128/f128mem); the Y defms are 256-bit (VR256/f256mem, VEX_L). All fold
// the generic `load` fragment. Only VUNPCKHPD passes IsCommutable = 1.
2132let Predicates = [HasAVX, NoVLX] in {
2133defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load,
2134      VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2135                     SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
2136defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load,
2137      VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2138                     SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG;
2139defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load,
2140      VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2141                     SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
2142defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load,
2143      VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2144                     SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
2145
2146defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load,
2147      VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2148                     SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
2149defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load,
2150      VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2151                     SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
2152defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load,
2153      VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2154                     SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
2155defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load,
2156      VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2157                     SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
2158}// Predicates = [HasAVX, NoVLX]
2159
// Non-VEX unpck{h,l}p{s,d} with $src1 tied to $dst. Memory forms fold the
// `memop` fragment (the VEX forms use `load`). Only UNPCKHPD passes
// IsCommutable = 1.
2160let Constraints = "$src1 = $dst" in {
2161  defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memop,
2162        VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
2163                       SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2164  defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memop,
2165        VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
2166                       SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
2167  defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop,
2168        VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
2169                       SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2170  defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop,
2171        VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
2172                       SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
2173} // Constraints = "$src1 = $dst"
2174
// Under HasAVX1Only, select 256-bit integer unpacks with the floating-point
// unpack instructions: v8i32 maps to VUNPCK{L,H}PSY and v4i64 maps to
// VUNPCK{L,H}PDY, in both register and folded-load forms.
2175let Predicates = [HasAVX1Only] in {
2176  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (loadv8i32 addr:$src2))),
2177            (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
2178  def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
2179            (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
2180  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (loadv8i32 addr:$src2))),
2181            (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
2182  def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
2183            (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
2184
2185  def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
2186            (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
2187  def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
2188            (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
2189  def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
2190            (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
2191  def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
2192            (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
2193}
2194
// Under UseSSE2, a v2f64 X86Unpckl whose second operand is a simple_load is
// selected as MOVHPDrm instead of the UNPCKLPD memory form.
2195let Predicates = [UseSSE2] in {
2196  // Use MOVHPD if the load isn't aligned enough for UNPCKLPD.
2197  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
2198                              (v2f64 (simple_load addr:$src2)))),
2199            (MOVHPDrm VR128:$src1, addr:$src2)>;
2200}
2201
2202//===----------------------------------------------------------------------===//
2203// SSE 1 & 2 - Extract Floating-Point Sign mask
2204//===----------------------------------------------------------------------===//
2205
2206/// sse12_extr_sign_mask - sse 1 & 2 extract floating-point sign mask
///
/// Emits a single rr def (opcode 0x50) that reads RC:$src and writes a
/// GR32orGR64 destination; the pattern matches X86movmsk on $src typed as vt.
/// Parameters:
///   RC  - source register class.
///   vt  - vector type applied to the source in the pattern.
///   asm - mnemonic; the operand string is appended here.
///   d   - execution domain passed to PI.
2207multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
2208                                string asm, Domain d> {
2209  def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
2210              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
2211              [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], d>,
2212              Sched<[WriteFMOVMSK]>;
2213}
2214
// VEX-encoded movmskps/movmskpd for VR128 and VR256 (VEX_L) sources, plus
// patterns that map X86movmsk on the same-width integer vector types onto
// those instructions.
2215let Predicates = [HasAVX] in {
2216  defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
2217                                        SSEPackedSingle>, PS, VEX, VEX_WIG;
2218  defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
2219                                        SSEPackedDouble>, PD, VEX, VEX_WIG;
2220  defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
2221                                         SSEPackedSingle>, PS, VEX, VEX_L, VEX_WIG;
2222  defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
2223                                         SSEPackedDouble>, PD, VEX, VEX_L, VEX_WIG;
2224
2225  // Also support integer VTs to avoid a int->fp bitcast in the DAG.
2226  def : Pat<(X86movmsk (v4i32 VR128:$src)),
2227            (VMOVMSKPSrr VR128:$src)>;
2228  def : Pat<(X86movmsk (v2i64 VR128:$src)),
2229            (VMOVMSKPDrr VR128:$src)>;
2230  def : Pat<(X86movmsk (v8i32 VR256:$src)),
2231            (VMOVMSKPSYrr VR256:$src)>;
2232  def : Pat<(X86movmsk (v4i64 VR256:$src)),
2233            (VMOVMSKPDYrr VR256:$src)>;
2234}
2235
// Non-VEX movmskps / movmskpd on VR128 sources. Unlike the VEX forms above,
// no Predicates list is set at this site.
2236defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
2237                                     SSEPackedSingle>, PS;
2238defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
2239                                     SSEPackedDouble>, PD;
2240
// Under UseSSE2, map X86movmsk on v4i32 / v2i64 onto the non-VEX
// MOVMSKPSrr / MOVMSKPDrr instructions.
2241let Predicates = [UseSSE2] in {
2242  // Also support integer VTs to avoid a int->fp bitcast in the DAG.
2243  def : Pat<(X86movmsk (v4i32 VR128:$src)),
2244            (MOVMSKPSrr VR128:$src)>;
2245  def : Pat<(X86movmsk (v2i64 VR128:$src)),
2246            (MOVMSKPDrr VR128:$src)>;
2247}
2248
2249//===---------------------------------------------------------------------===//
2250// SSE2 - Packed Integer Logical Instructions
2251//===---------------------------------------------------------------------===//
2252
2253let ExeDomain = SSEPackedInt in { // SSE integer instructions
2254
2255/// PDI_binop_rm - Simple SSE2 binary operator.
///
/// Emits rr and rm defs whose patterns apply OpNode to $src1 and $src2 with a
/// result of type OpVT.
/// Parameters:
///   opc          - opcode byte.
///   OpcodeStr    - mnemonic; the operand string is appended here.
///   OpNode       - binary node matched by both patterns.
///   OpVT         - result vector type.
///   RC           - register class of the destination and register sources.
///   memop_frag   - load fragment folded by the rm form.
///   x86memop     - memory operand class of the rm form.
///   sched        - scheduling info; rm uses sched.Folded +
///                  sched.ReadAfterFold.
///   IsCommutable - value assigned to isCommutable on the rr form only.
///   Is2Addr      - when set, the asm string prints two operands
///                  ($src2, $dst); otherwise three ($src2, $src1, $dst).
2256multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
2257                        ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
2258                        X86MemOperand x86memop, X86FoldableSchedWrite sched,
2259                        bit IsCommutable, bit Is2Addr> {
2260  let isCommutable = IsCommutable in
2261  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
2262       (ins RC:$src1, RC:$src2),
2263       !if(Is2Addr,
2264           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2265           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
2266       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
2267       Sched<[sched]>;
2268  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
2269       (ins RC:$src1, x86memop:$src2),
2270       !if(Is2Addr,
2271           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2272           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
2273       [(set RC:$dst, (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
2274       Sched<[sched.Folded, sched.ReadAfterFold]>;
2275}
2276} // ExeDomain = SSEPackedInt
2277
// PDI_binop_all - instantiate PDI_binop_rm three ways for one operation:
//   V#NAME   - VEX 128-bit form (VR128, `load`, three-operand asm) under
//              [HasAVX, prd].
//   NAME     - non-VEX 128-bit form (VR128, `memop`, two-operand asm) with
//              $src1 tied to $dst.
//   V#NAME#Y - VEX 256-bit form (VR256, `load`, VEX_L) under [HasAVX2, prd].
// Parameters:
//   opc, OpcodeStr, Opcode - opcode byte, base mnemonic ("v" is prepended for
//                            the VEX forms), and node to match.
//   OpVT128 / OpVT256      - result types of the 128- and 256-bit forms.
//   sched                  - per-width scheduling info (.XMM / .YMM are used).
//   IsCommutable           - forwarded to PDI_binop_rm.
//   prd                    - extra predicate added to the VEX forms.
2278multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
2279                         ValueType OpVT128, ValueType OpVT256,
2280                         X86SchedWriteWidths sched, bit IsCommutable,
2281                         Predicate prd> {
2282let Predicates = [HasAVX, prd] in
2283  defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
2284                             VR128, load, i128mem, sched.XMM,
2285                             IsCommutable, 0>, VEX_4V, VEX_WIG;
2286
2287let Constraints = "$src1 = $dst" in
2288  defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
2289                           memop, i128mem, sched.XMM, IsCommutable, 1>;
2290
2291let Predicates = [HasAVX2, prd] in
2292  defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
2293                               OpVT256, VR256, load, i256mem, sched.YMM,
2294                               IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG;
2295}
2296
2297// These are ordered here for pattern ordering requirements with the fp versions
// Packed integer logic ops on v2i64 / v4i64, with NoVLX as the extra predicate
// on the VEX forms. and/or/xor are instantiated as commutable; PANDN
// (X86andnp) is not.
2298
2299defm PAND  : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
2300                           SchedWriteVecLogic, 1, NoVLX>;
2301defm POR   : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
2302                           SchedWriteVecLogic, 1, NoVLX>;
2303defm PXOR  : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
2304                           SchedWriteVecLogic, 1, NoVLX>;
2305defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
2306                           SchedWriteVecLogic, 0, NoVLX>;
2307
2308//===----------------------------------------------------------------------===//
2309// SSE 1 & 2 - Logical Instructions
2310//===----------------------------------------------------------------------===//
2311
2312/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
2313///
2314/// There are no patterns here because isel prefers integer versions for SSE2
2315/// and later. There are SSE1 v4f32 patterns later.
///
/// For one opcode and base mnemonic, instantiates sse12_fp_packed_logical_rm
/// six times: VEX 256-bit and 128-bit "ps"/"pd" forms under HasAVX + NoVLX,
/// and non-VEX 128-bit "ps"/"pd" forms with $src1 tied to $dst. Both pattern
/// lists are passed as empty (`[], []`).
/// NOTE(review): the trailing `0` on the VEX instantiations is presumably the
/// Is2Addr flag of sse12_fp_packed_logical_rm -- confirm at its definition.
/// Parameters:
///   opc       - opcode byte shared by all six forms.
///   OpcodeStr - base mnemonic; "ps" or "pd" is appended.
///   sched     - per-width scheduling info (.XMM / .YMM are used).
2316multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
2317                                   X86SchedWriteWidths sched> {
2318  let Predicates = [HasAVX, NoVLX] in {
2319  defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
2320        !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM,
2321        [], [], 0>, PS, VEX_4V, VEX_L, VEX_WIG;
2322
2323  defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
2324        !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM,
2325        [], [], 0>, PD, VEX_4V, VEX_L, VEX_WIG;
2326
2327  defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
2328       !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
2329       [], [], 0>, PS, VEX_4V, VEX_WIG;
2330
2331  defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
2332       !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
2333       [], [], 0>, PD, VEX_4V, VEX_WIG;
2334  }
2335
2336  let Constraints = "$src1 = $dst" in {
2337    defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
2338         !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
2339         [], []>, PS;
2340
2341    defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
2342         !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
2343         [], []>, PD;
2344  }
2345}
2346
// Packed FP logical instructions: opcodes 0x54 (and), 0x56 (or), 0x57 (xor)
// and 0x55 (andn), all scheduled with SchedWriteFLogic. ANDN is wrapped in
// `let isCommutable = 0` so its defs are explicitly non-commutable.
2347defm AND  : sse12_fp_packed_logical<0x54, "and", SchedWriteFLogic>;
2348defm OR   : sse12_fp_packed_logical<0x56, "or", SchedWriteFLogic>;
2349defm XOR  : sse12_fp_packed_logical<0x57, "xor", SchedWriteFLogic>;
2350let isCommutable = 0 in
2351  defm ANDN : sse12_fp_packed_logical<0x55, "andn", SchedWriteFLogic>;
2352
// 256-bit integer and/or/xor/andnp for byte, word and dword element vectors,
// selected onto the VP*Y instructions when AVX2 is available and VLX is not.
// v4i64 is not listed in this group.
let Predicates = [HasAVX2, NoVLX] in {
  // Register-register forms: one pattern per element type.
  foreach VT = [v32i8, v16i16, v8i32] in
    def : Pat<(VT (and VR256:$src1, VR256:$src2)),
              (VPANDYrr VR256:$src1, VR256:$src2)>;

  foreach VT = [v32i8, v16i16, v8i32] in
    def : Pat<(VT (or VR256:$src1, VR256:$src2)),
              (VPORYrr VR256:$src1, VR256:$src2)>;

  foreach VT = [v32i8, v16i16, v8i32] in
    def : Pat<(VT (xor VR256:$src1, VR256:$src2)),
              (VPXORYrr VR256:$src1, VR256:$src2)>;

  foreach VT = [v32i8, v16i16, v8i32] in
    def : Pat<(VT (X86andnp VR256:$src1, VR256:$src2)),
              (VPANDNYrr VR256:$src1, VR256:$src2)>;

  // Register-memory forms: the second operand is a folded vector load; the
  // result type is left to be inferred from the load fragment.
  foreach LdFrag = [loadv32i8, loadv16i16, loadv8i32] in
    def : Pat<(and VR256:$src1, (LdFrag addr:$src2)),
              (VPANDYrm VR256:$src1, addr:$src2)>;

  foreach LdFrag = [loadv32i8, loadv16i16, loadv8i32] in
    def : Pat<(or VR256:$src1, (LdFrag addr:$src2)),
              (VPORYrm VR256:$src1, addr:$src2)>;

  foreach LdFrag = [loadv32i8, loadv16i16, loadv8i32] in
    def : Pat<(xor VR256:$src1, (LdFrag addr:$src2)),
              (VPXORYrm VR256:$src1, addr:$src2)>;

  foreach LdFrag = [loadv32i8, loadv16i16, loadv8i32] in
    def : Pat<(X86andnp VR256:$src1, (LdFrag addr:$src2)),
              (VPANDNYrm VR256:$src1, addr:$src2)>;
}
2410
// If only AVX1 is supported, we need to handle integer operations with
// floating point instructions since the integer versions aren't available.
// All four 256-bit integer element types are mapped onto the V*PSY
// instructions.
let Predicates = [HasAVX1Only] in {
  // Register-register forms: one pattern per element type.
  foreach VT = [v32i8, v16i16, v8i32, v4i64] in
    def : Pat<(VT (and VR256:$src1, VR256:$src2)),
              (VANDPSYrr VR256:$src1, VR256:$src2)>;

  foreach VT = [v32i8, v16i16, v8i32, v4i64] in
    def : Pat<(VT (or VR256:$src1, VR256:$src2)),
              (VORPSYrr VR256:$src1, VR256:$src2)>;

  foreach VT = [v32i8, v16i16, v8i32, v4i64] in
    def : Pat<(VT (xor VR256:$src1, VR256:$src2)),
              (VXORPSYrr VR256:$src1, VR256:$src2)>;

  foreach VT = [v32i8, v16i16, v8i32, v4i64] in
    def : Pat<(VT (X86andnp VR256:$src1, VR256:$src2)),
              (VANDNPSYrr VR256:$src1, VR256:$src2)>;

  // Register-memory forms: the second operand is a folded vector load.
  foreach LdFrag = [loadv32i8, loadv16i16, loadv8i32, loadv4i64] in
    def : Pat<(and VR256:$src1, (LdFrag addr:$src2)),
              (VANDPSYrm VR256:$src1, addr:$src2)>;

  foreach LdFrag = [loadv32i8, loadv16i16, loadv8i32, loadv4i64] in
    def : Pat<(or VR256:$src1, (LdFrag addr:$src2)),
              (VORPSYrm VR256:$src1, addr:$src2)>;

  foreach LdFrag = [loadv32i8, loadv16i16, loadv8i32, loadv4i64] in
    def : Pat<(xor VR256:$src1, (LdFrag addr:$src2)),
              (VXORPSYrm VR256:$src1, addr:$src2)>;

  foreach LdFrag = [loadv32i8, loadv16i16, loadv8i32, loadv4i64] in
    def : Pat<(X86andnp VR256:$src1, (LdFrag addr:$src2)),
              (VANDNPSYrm VR256:$src1, addr:$src2)>;
}
2486
// 128-bit integer and/or/xor/andnp for byte, word and dword element vectors,
// selected onto the VEX-encoded VP* instructions when AVX is available and
// VLX is not.  v2i64 is not listed in this group.
let Predicates = [HasAVX, NoVLX] in {
  // Register-register forms: one pattern per element type.
  foreach VT = [v16i8, v8i16, v4i32] in
    def : Pat<(VT (and VR128:$src1, VR128:$src2)),
              (VPANDrr VR128:$src1, VR128:$src2)>;

  foreach VT = [v16i8, v8i16, v4i32] in
    def : Pat<(VT (or VR128:$src1, VR128:$src2)),
              (VPORrr VR128:$src1, VR128:$src2)>;

  foreach VT = [v16i8, v8i16, v4i32] in
    def : Pat<(VT (xor VR128:$src1, VR128:$src2)),
              (VPXORrr VR128:$src1, VR128:$src2)>;

  foreach VT = [v16i8, v8i16, v4i32] in
    def : Pat<(VT (X86andnp VR128:$src1, VR128:$src2)),
              (VPANDNrr VR128:$src1, VR128:$src2)>;

  // Register-memory forms: the second operand is a folded vector load.
  foreach LdFrag = [loadv16i8, loadv8i16, loadv4i32] in
    def : Pat<(and VR128:$src1, (LdFrag addr:$src2)),
              (VPANDrm VR128:$src1, addr:$src2)>;

  foreach LdFrag = [loadv16i8, loadv8i16, loadv4i32] in
    def : Pat<(or VR128:$src1, (LdFrag addr:$src2)),
              (VPORrm VR128:$src1, addr:$src2)>;

  foreach LdFrag = [loadv16i8, loadv8i16, loadv4i32] in
    def : Pat<(xor VR128:$src1, (LdFrag addr:$src2)),
              (VPXORrm VR128:$src1, addr:$src2)>;

  foreach LdFrag = [loadv16i8, loadv8i16, loadv4i32] in
    def : Pat<(X86andnp VR128:$src1, (LdFrag addr:$src2)),
              (VPANDNrm VR128:$src1, addr:$src2)>;
}
2544
// 128-bit integer and/or/xor/andnp for byte, word and dword element vectors,
// selected onto the legacy-encoded P* instructions under UseSSE2.
// Unlike the AVX groups, the memory forms use the memop* fragments rather
// than load*.
// NOTE(review): presumably because legacy SSE memory operands have alignment
// requirements -- confirm against the memop fragment definition.
let Predicates = [UseSSE2] in {
  // Register-register forms: one pattern per element type.
  foreach VT = [v16i8, v8i16, v4i32] in
    def : Pat<(VT (and VR128:$src1, VR128:$src2)),
              (PANDrr VR128:$src1, VR128:$src2)>;

  foreach VT = [v16i8, v8i16, v4i32] in
    def : Pat<(VT (or VR128:$src1, VR128:$src2)),
              (PORrr VR128:$src1, VR128:$src2)>;

  foreach VT = [v16i8, v8i16, v4i32] in
    def : Pat<(VT (xor VR128:$src1, VR128:$src2)),
              (PXORrr VR128:$src1, VR128:$src2)>;

  foreach VT = [v16i8, v8i16, v4i32] in
    def : Pat<(VT (X86andnp VR128:$src1, VR128:$src2)),
              (PANDNrr VR128:$src1, VR128:$src2)>;

  // Register-memory forms: the second operand is a folded memop.
  foreach MemFrag = [memopv16i8, memopv8i16, memopv4i32] in
    def : Pat<(and VR128:$src1, (MemFrag addr:$src2)),
              (PANDrm VR128:$src1, addr:$src2)>;

  foreach MemFrag = [memopv16i8, memopv8i16, memopv4i32] in
    def : Pat<(or VR128:$src1, (MemFrag addr:$src2)),
              (PORrm VR128:$src1, addr:$src2)>;

  foreach MemFrag = [memopv16i8, memopv8i16, memopv4i32] in
    def : Pat<(xor VR128:$src1, (MemFrag addr:$src2)),
              (PXORrm VR128:$src1, addr:$src2)>;

  foreach MemFrag = [memopv16i8, memopv8i16, memopv4i32] in
    def : Pat<(X86andnp VR128:$src1, (MemFrag addr:$src2)),
              (PANDNrm VR128:$src1, addr:$src2)>;
}
2602
// Patterns for packed operations when we don't have integer type available.
// The X86fand/X86for/X86fxor/X86fandn nodes on v4f32 are selected onto the
// legacy-encoded *PS instructions.
// NOTE(review): these patterns are not wrapped in a Predicates list; confirm
// that is intentional.

// Register-register forms.
def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
          (ANDPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)),
          (ORPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)),
          (XORPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)),
          (ANDNPSrr VR128:$src1, VR128:$src2)>;

// Register-memory forms: the second operand is a folded memopv4f32.
def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)),
          (ANDPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)),
          (ORPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)),
          (XORPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
          (ANDNPSrm VR128:$src1, addr:$src2)>;
2621
2622//===----------------------------------------------------------------------===//
2623// SSE 1 & 2 - Arithmetic Instructions
2624//===----------------------------------------------------------------------===//
2625
2626/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
2627/// vector forms.
2628///
2629/// In addition, we also have a special variant of the scalar form here to
2630/// represent the associated intrinsic operation.  This form is unlike the
2631/// plain scalar form, in that it takes an entire vector (instead of a scalar)
2632/// and leaves the top elements unmodified (therefore these cannot be commuted).
2633///
2634/// These three forms can each be reg+reg or reg+mem.
2635///
2636
2637/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
2638/// classes below
2639multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
2640                                  SDPatternOperator OpNode, X86SchedWriteSizes sched> {
2641let Uses = [MXCSR], mayRaiseFPException = 1 in {
2642  let Predicates = [HasAVX, NoVLX] in {
2643  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
2644                               VR128, v4f32, f128mem, loadv4f32,
2645                               SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX_4V, VEX_WIG;
2646  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
2647                               VR128, v2f64, f128mem, loadv2f64,
2648                               SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX_4V, VEX_WIG;
2649
2650  defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
2651                        OpNode, VR256, v8f32, f256mem, loadv8f32,
2652                        SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX_4V, VEX_L, VEX_WIG;
2653  defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
2654                        OpNode, VR256, v4f64, f256mem, loadv4f64,
2655                        SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX_4V, VEX_L, VEX_WIG;
2656  }
2657
2658  let Constraints = "$src1 = $dst" in {
2659    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
2660                              v4f32, f128mem, memopv4f32, SSEPackedSingle,
2661                              sched.PS.XMM>, PS;
2662    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
2663                              v2f64, f128mem, memopv2f64, SSEPackedDouble,
2664                              sched.PD.XMM>, PD;
2665  }
2666}
2667}
2668
/// basic_sse12_fp_binop_s - plain scalar (FR32 / FR64) forms of a binop.
///   opc       - opcode byte shared by every variant.
///   OpcodeStr - mnemonic stem; "ss" / "sd" is appended here.
///   OpNode    - DAG operator forwarded to sse12_fp_scalar.
///   sched     - scheduling info; the .PS.Scl and .PD.Scl members are used.
multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr,
                                  SDPatternOperator OpNode,
                                  X86SchedWriteSizes sched> {
  // Every record below reads MXCSR and may raise an FP exception.
  let Uses = [MXCSR], mayRaiseFPException = 1 in {
    // VEX-encoded variants; the trailing 0 is sse12_fp_scalar's Is2Addr, which
    // selects the three-operand assembly string.
    defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), OpNode,
                                     FR32, f32mem, SSEPackedSingle,
                                     sched.PS.Scl, 0>,
                     XS, VEX_4V, VEX_LIG, VEX_WIG;
    defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), OpNode,
                                     FR64, f64mem, SSEPackedDouble,
                                     sched.PD.Scl, 0>,
                     XD, VEX_4V, VEX_LIG, VEX_WIG;

    // Legacy-encoded variants: Is2Addr keeps its default of 1 and $src1 is
    // tied to $dst.
    let Constraints = "$src1 = $dst" in {
      defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), OpNode,
                                FR32, f32mem, SSEPackedSingle, sched.PS.Scl>,
                XS;
      defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), OpNode,
                                FR64, f64mem, SSEPackedDouble, sched.PD.Scl>,
                XD;
    }
  }
}
2689
/// basic_sse12_fp_binop_s_int - scalar binop forms that take whole VR128
/// vectors (the intrinsic-style variant).
///   opc       - opcode byte shared by every variant.
///   OpcodeStr - mnemonic stem; "ss" / "sd" is appended here.
///   OpNode    - DAG operator forwarded to sse12_fp_scalar_int.
///   sched     - scheduling info; the .PS.Scl and .PD.Scl members are used.
multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
                                      SDPatternOperator OpNode,
                                      X86SchedWriteSizes sched> {
  // Every record below reads MXCSR and may raise an FP exception.
  let Uses = [MXCSR], mayRaiseFPException = 1 in {
    // VEX-encoded variants.
    // NOTE(review): the trailing 0 is presumably Is2Addr = 0 -- confirm
    // against sse12_fp_scalar_int.
    defm V#NAME#SS : sse12_fp_scalar_int<opc, OpNode, VR128, v4f32,
                                         !strconcat(OpcodeStr, "ss"), ssmem,
                                         sse_load_f32, SSEPackedSingle,
                                         sched.PS.Scl, 0>,
                     XS, VEX_4V, VEX_LIG, VEX_WIG;
    defm V#NAME#SD : sse12_fp_scalar_int<opc, OpNode, VR128, v2f64,
                                         !strconcat(OpcodeStr, "sd"), sdmem,
                                         sse_load_f64, SSEPackedDouble,
                                         sched.PD.Scl, 0>,
                     XD, VEX_4V, VEX_LIG, VEX_WIG;

    // Legacy-encoded variants: $src1 is tied to $dst.
    let Constraints = "$src1 = $dst" in {
      defm SS : sse12_fp_scalar_int<opc, OpNode, VR128, v4f32,
                                    !strconcat(OpcodeStr, "ss"), ssmem,
                                    sse_load_f32, SSEPackedSingle,
                                    sched.PS.Scl>,
                XS;
      defm SD : sse12_fp_scalar_int<opc, OpNode, VR128, v2f64,
                                    !strconcat(OpcodeStr, "sd"), sdmem,
                                    sse_load_f64, SSEPackedDouble,
                                    sched.PD.Scl>,
                XD;
    }
  }
}
2711
// Binary Arithmetic instructions
//
// Each family combines the packed, plain-scalar and vector-scalar ("_int")
// multiclasses under one name.  For add/mul/sub/div the "_int" variant is
// given null_frag as its operator; max/min pass X86fmaxs / X86fmins instead.
defm ADD : basic_sse12_fp_binop_p<0x58, "add", any_fadd, SchedWriteFAddSizes>,
           basic_sse12_fp_binop_s<0x58, "add", any_fadd, SchedWriteFAddSizes>,
           basic_sse12_fp_binop_s_int<0x58, "add", null_frag,
                                      SchedWriteFAddSizes>;
defm MUL : basic_sse12_fp_binop_p<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
           basic_sse12_fp_binop_s<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
           basic_sse12_fp_binop_s_int<0x59, "mul", null_frag,
                                      SchedWriteFMulSizes>;

// These families force isCommutable = 0 on every record they instantiate.
let isCommutable = 0 in {
  defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
             basic_sse12_fp_binop_s<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
             basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag,
                                        SchedWriteFAddSizes>;
  defm DIV : basic_sse12_fp_binop_p<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
             basic_sse12_fp_binop_s<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
             basic_sse12_fp_binop_s_int<0x5E, "div", null_frag,
                                        SchedWriteFDivSizes>;
  defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs,
                                        SchedWriteFCmpSizes>;
  defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins,
                                        SchedWriteFCmpSizes>;
}
2733
// Code-generation-only twins of MAX / MIN.  They reuse the same opcode bytes
// (0x5F / 0x5D) and mnemonic stems but are selected from X86fmaxc / X86fminc.
// Unlike MAX / MIN they are not wrapped in "isCommutable = 0", and they have
// no "_int" variant.
let isCodeGenOnly = 1 in {
  defm MAXC : basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc,
                                     SchedWriteFCmpSizes>,
              basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc,
                                     SchedWriteFCmpSizes>;
  defm MINC : basic_sse12_fp_binop_p<0x5D, "min", X86fminc,
                                     SchedWriteFCmpSizes>,
              basic_sse12_fp_binop_s<0x5D, "min", X86fminc,
                                     SchedWriteFCmpSizes>;
}
2740
// Patterns used to select SSE scalar fp arithmetic instructions from
// either:
//
// (1) a scalar fp operation followed by a blend
//
// The effect is that the backend no longer emits unnecessary vector
// insert instructions immediately after SSE scalar fp instructions
// like addss or mulss.
//
// For example, given the following code:
//   __m128 foo(__m128 A, __m128 B) {
//     A[0] += B[0];
//     return A;
//   }
//
// Previously we generated:
//   addss %xmm0, %xmm1
//   movss %xmm1, %xmm0
//
// We now generate:
//   addss %xmm1, %xmm0
//
// (2) a vector packed single/double fp operation followed by a vector insert
//
// The effect is that the backend converts the packed fp instruction
// followed by a vector insert into a single SSE scalar fp instruction.
//
// For example, given the following code:
//   __m128 foo(__m128 A, __m128 B) {
//     __m128 C = A + B;
//     return (__m128) {C[0], A[1], A[2], A[3]};
//   }
//
// Previously we generated:
//   addps %xmm0, %xmm1
//   movss %xmm1, %xmm0
//
// We now generate:
//   addss %xmm1, %xmm0

// TODO: Some canonicalization in lowering would simplify the number of
// patterns we have to try to match.

/// scalar_math_patterns - match "Op on element 0 of $dst, re-inserted into
/// $dst through Move" and select the corresponding *_Int instruction.
///   Op            - scalar operator applied to the extracted element.
///   OpcPrefix     - instruction name stem, e.g. "ADDSS"; "rr_Int" / "rm_Int"
///                   is appended, plus a leading "V" for the AVX group.
///   Move          - node that merges the scalar result back into $dst.
///   VT / EltTy    - vector type and its element type.
///   RC            - scalar register class of the second operand.
///   ld_frag       - scalar load fragment for the memory form.
///   BasePredicate - predicate guarding the non-VEX patterns.
multiclass scalar_math_patterns<SDPatternOperator Op, string OpcPrefix,
                                SDNode Move, ValueType VT, ValueType EltTy,
                                RegisterClass RC, PatFrag ld_frag,
                                Predicate BasePredicate> {
  let Predicates = [BasePredicate] in {
    // extracted scalar math op with insert via movss/movsd
    // Register form: the scalar operand is copied into VR128 first.
    def : Pat<(VT (Move (VT VR128:$dst),
                        (VT (scalar_to_vector
                             (Op (EltTy (extractelt (VT VR128:$dst),
                                                    (iPTR 0))),
                                 RC:$src))))),
              (!cast<Instruction>(OpcPrefix#rr_Int)
                 VT:$dst, (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
    // Memory form: the scalar operand is a folded load.
    def : Pat<(VT (Move (VT VR128:$dst),
                        (VT (scalar_to_vector
                             (Op (EltTy (extractelt (VT VR128:$dst),
                                                    (iPTR 0))),
                                 (ld_frag addr:$src)))))),
              (!cast<Instruction>(OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
  }

  // Repeat for AVX versions of the instructions.
  let Predicates = [UseAVX] in {
    // extracted scalar math op with insert via movss/movsd
    def : Pat<(VT (Move (VT VR128:$dst),
                        (VT (scalar_to_vector
                             (Op (EltTy (extractelt (VT VR128:$dst),
                                                    (iPTR 0))),
                                 RC:$src))))),
              (!cast<Instruction>("V"#OpcPrefix#rr_Int)
                 VT:$dst, (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
    def : Pat<(VT (Move (VT VR128:$dst),
                        (VT (scalar_to_vector
                             (Op (EltTy (extractelt (VT VR128:$dst),
                                                    (iPTR 0))),
                                 (ld_frag addr:$src)))))),
              (!cast<Instruction>("V"#OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
  }
}
2818
// Single-precision scalar math folded through X86Movss on v4f32.
defm : scalar_math_patterns<any_fadd, "ADDSS", X86Movss, v4f32, f32, FR32,
                            loadf32, UseSSE1>;
defm : scalar_math_patterns<any_fsub, "SUBSS", X86Movss, v4f32, f32, FR32,
                            loadf32, UseSSE1>;
defm : scalar_math_patterns<any_fmul, "MULSS", X86Movss, v4f32, f32, FR32,
                            loadf32, UseSSE1>;
defm : scalar_math_patterns<any_fdiv, "DIVSS", X86Movss, v4f32, f32, FR32,
                            loadf32, UseSSE1>;

// Double-precision scalar math folded through X86Movsd on v2f64.
defm : scalar_math_patterns<any_fadd, "ADDSD", X86Movsd, v2f64, f64, FR64,
                            loadf64, UseSSE2>;
defm : scalar_math_patterns<any_fsub, "SUBSD", X86Movsd, v2f64, f64, FR64,
                            loadf64, UseSSE2>;
defm : scalar_math_patterns<any_fmul, "MULSD", X86Movsd, v2f64, f64, FR64,
                            loadf64, UseSSE2>;
defm : scalar_math_patterns<any_fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64,
                            loadf64, UseSSE2>;
2828
/// Unop Arithmetic
/// In addition, we also have a special variant of the scalar form here to
/// represent the associated intrinsic operation.  This form is unlike the
/// plain scalar form, in that it takes an entire vector (instead of a
/// scalar) and leaves the top elements undefined.
///
/// And, we have a special variant form for a full-vector intrinsic form.

/// sse_fp_unop_s - SSE1 unops in scalar form
/// For the non-AVX defs, we need $src1 to be tied to $dst because
/// the HW instructions are 2 operand / destructive.
///   opc / OpcodeStr - opcode byte and full mnemonic.
///   RC / x86memop   - scalar register class and memory operand of r / m.
///   intmemop        - memory operand type of m_Int.
///   OpNode          - DAG operator matched by r / m.
///   d               - execution domain.
///   sched           - scheduling info (.Folded / .ReadAfterFold for loads).
///   target          - predicate required by r / m.
multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
                         X86MemOperand x86memop, Operand intmemop,
                         SDPatternOperator OpNode, Domain d,
                         X86FoldableSchedWrite sched, Predicate target> {
  // Plain scalar forms, selected directly from OpNode.
  let isCodeGenOnly = 1, hasSideEffects = 0 in {
    def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
              !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
              [(set RC:$dst, (OpNode RC:$src1))], d>,
            Sched<[sched]>, Requires<[target]>;
    // The load-folding form additionally requires OptForSize.
    let mayLoad = 1 in
    def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1),
              !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
              [(set RC:$dst, (OpNode (load addr:$src1)))], d>,
            Sched<[sched.Folded]>, Requires<[target, OptForSize]>;
  }

  // Whole-vector ("_Int") forms.  They carry no selection pattern of their
  // own, take an extra $src1 operand tied to $dst, and print only $src2.
  let hasSideEffects = 0, Constraints = "$src1 = $dst", ExeDomain = d in {
    def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
                  (ins VR128:$src1, VR128:$src2),
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
                Sched<[sched]>;
    let mayLoad = 1 in
    def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
                  (ins VR128:$src1, intmemop:$src2),
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
                Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}
2868
/// sse_fp_unop_s_intr - select the intrinsic Intr onto NAME#r_Int /
/// NAME#m_Int.
///   vt        - vector type of the IMPLICIT_DEF passed as $src1 of m_Int.
///   mem_frags - load fragments accepted for the memory form.
///   Intr      - the intrinsic being matched.
///   target    - predicate guarding both patterns.
multiclass sse_fp_unop_s_intr<ValueType vt, PatFrags mem_frags,
                              Intrinsic Intr, Predicate target> {
  let Predicates = [target] in {
    // These are unary operations, but they are modeled as having 2 source
    // operands because the high elements of the destination are unchanged in
    // SSE.
    def : Pat<(Intr VR128:$src),
              (!cast<Instruction>(NAME#r_Int) VR128:$src, VR128:$src)>;
  }

  // We don't want to fold scalar loads into these instructions unless
  // optimizing for size. This is because the folded instruction will have a
  // partial register update, while the unfolded sequence will not, e.g.
  // movss mem, %xmm0
  // rcpss %xmm0, %xmm0
  // which has a clobber before the rcp, vs.
  // rcpss mem, %xmm0
  let Predicates = [target, OptForSize] in {
    def : Pat<(Intr (mem_frags addr:$src2)),
              (!cast<Instruction>(NAME#m_Int) (vt (IMPLICIT_DEF)),
                                              addr:$src2)>;
  }
}
2890
/// avx_fp_unop_s_intr - AVX counterpart of sse_fp_unop_s_intr: select the
/// intrinsic Intr onto NAME#r_Int / NAME#m_Int.
///   vt        - vector type of the IMPLICIT_DEF passed as $src1 of m_Int.
///   mem_frags - load fragments accepted for the memory form.
///   Intr      - the intrinsic being matched.
///   target    - predicate guarding both patterns.
multiclass avx_fp_unop_s_intr<ValueType vt, PatFrags mem_frags,
                              Intrinsic Intr, Predicate target> {
  // Register form: the single source is passed as both instruction operands.
  let Predicates = [target] in {
    def : Pat<(Intr VR128:$src),
              (!cast<Instruction>(NAME#r_Int) VR128:$src, VR128:$src)>;
  }

  // Memory form: only selected under OptForSize; $src1 is left undefined.
  let Predicates = [target, OptForSize] in {
    def : Pat<(Intr (mem_frags addr:$src2)),
              (!cast<Instruction>(NAME#m_Int) (vt (IMPLICIT_DEF)),
                                              addr:$src2)>;
  }
}
2904
/// avx_fp_unop_s - scalar unops in their three-operand (AVX-style) form.
/// The instruction records carry no patterns; selection is done by the
/// separate Pat records at the end, which pass IMPLICIT_DEF as $src1.
///   opc / OpcodeStr - opcode byte and full mnemonic.
///   RC / ScalarVT   - scalar register class and its value type.
///   x86memop        - memory operand type of m.
///   intmemop        - memory operand type of m_Int.
///   OpNode          - DAG operator matched by the trailing patterns.
///   d               - execution domain.
///   sched           - scheduling info (.Folded / .ReadAfterFold for loads).
///   target          - predicate guarding the trailing patterns.
multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
                         ValueType ScalarVT, X86MemOperand x86memop,
                         Operand intmemop, SDPatternOperator OpNode, Domain d,
                         X86FoldableSchedWrite sched, Predicate target> {
  // Plain scalar forms.
  let isCodeGenOnly = 1, hasSideEffects = 0 in {
    def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
              !strconcat(OpcodeStr,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [], d>,
            Sched<[sched]>;
    let mayLoad = 1 in
    def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
              !strconcat(OpcodeStr,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [], d>,
            Sched<[sched.Folded, sched.ReadAfterFold]>;
  }

  // Whole-vector ("_Int") forms.
  let hasSideEffects = 0, ExeDomain = d in {
    def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
                  (ins VR128:$src1, VR128:$src2),
                  !strconcat(OpcodeStr,
                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  []>,
                Sched<[sched]>;
    let mayLoad = 1 in
    def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
                  (ins VR128:$src1, intmemop:$src2),
                  !strconcat(OpcodeStr,
                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  []>,
                Sched<[sched.Folded, sched.ReadAfterFold]>;
  }

  // We don't want to fold scalar loads into these instructions unless
  // optimizing for size. This is because the folded instruction will have a
  // partial register update, while the unfolded sequence will not, e.g.
  // vmovss mem, %xmm0
  // vrcpss %xmm0, %xmm0, %xmm0
  // which has a clobber before the rcp, vs.
  // vrcpss mem, %xmm0, %xmm0
  // TODO: In theory, we could fold the load, and avoid the stall caused by
  // the partial register store, either in BreakFalseDeps or with smarter RA.
  let Predicates = [target] in {
    def : Pat<(OpNode RC:$src),
              (!cast<Instruction>(NAME#r) (ScalarVT (IMPLICIT_DEF)), RC:$src)>;
  }
  let Predicates = [target, OptForSize] in {
    def : Pat<(ScalarVT (OpNode (load addr:$src))),
              (!cast<Instruction>(NAME#m) (ScalarVT (IMPLICIT_DEF)),
                                          addr:$src)>;
  }
}
2949
/// sse1_fp_unop_p - SSE1 unops in packed form.
///   opc / OpcodeStr - opcode byte and mnemonic stem ("ps" is appended; the
///                     VEX forms also get a leading "v").
///   OpNode          - DAG operator matched by every record.
///   sched           - scheduling info; .XMM / .YMM (and .Folded) are used.
///   prds            - predicate list applied to the VEX records only.
multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr,
                          SDPatternOperator OpNode, X86SchedWriteWidths sched,
                          list<Predicate> prds> {
  // VEX-encoded 128-bit and 256-bit forms; memory forms use load* fragments.
  let Predicates = prds in {
    def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         !strconcat("v", OpcodeStr,
                                    "ps\t{$src, $dst|$dst, $src}"),
                         [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
                     VEX, Sched<[sched.XMM]>, VEX_WIG;
    def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         !strconcat("v", OpcodeStr,
                                    "ps\t{$src, $dst|$dst, $src}"),
                         [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>,
                     VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
    def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                          !strconcat("v", OpcodeStr,
                                     "ps\t{$src, $dst|$dst, $src}"),
                          [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>,
                      VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
    def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst),
                          (ins f256mem:$src),
                          !strconcat("v", OpcodeStr,
                                     "ps\t{$src, $dst|$dst, $src}"),
                          [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))]>,
                      VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
  }

  // Legacy-encoded 128-bit forms; the memory form uses memopv4f32.  These are
  // outside the "Predicates = prds" scope.
  def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
            Sched<[sched.XMM]>;
  def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>,
            Sched<[sched.XMM.Folded]>;
}
2985
/// sse2_fp_unop_p - SSE2 unops in vector forms.
///   opc / OpcodeStr - opcode byte and mnemonic stem ("pd" is appended; the
///                     VEX forms also get a leading "v").
///   OpNode          - DAG operator matched by every record.
///   sched           - scheduling info; .XMM / .YMM (and .Folded) are used.
multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
                          SDPatternOperator OpNode,
                          X86SchedWriteWidths sched> {
  // VEX-encoded 128-bit and 256-bit forms; memory forms use load* fragments.
  let Predicates = [HasAVX, NoVLX] in {
    def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         !strconcat("v", OpcodeStr,
                                    "pd\t{$src, $dst|$dst, $src}"),
                         [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
                     VEX, Sched<[sched.XMM]>, VEX_WIG;
    def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         !strconcat("v", OpcodeStr,
                                    "pd\t{$src, $dst|$dst, $src}"),
                         [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>,
                     VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
    def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                          !strconcat("v", OpcodeStr,
                                     "pd\t{$src, $dst|$dst, $src}"),
                          [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>,
                      VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
    def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst),
                          (ins f256mem:$src),
                          !strconcat("v", OpcodeStr,
                                     "pd\t{$src, $dst|$dst, $src}"),
                          [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))]>,
                      VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
  }

  // Legacy-encoded 128-bit forms; the memory form uses memopv2f64.  These are
  // outside the HasAVX/NoVLX predicate scope.
  def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
            Sched<[sched.XMM]>;
  def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>,
            Sched<[sched.XMM.Folded]>;
}
3021
// sse1_fp_unop_s_intr - Scalar single-precision ("ss") intrinsic forms of a
// unary operation.
//
// Instantiates sse_fp_unop_s_intr as SS (under UseSSE1, with XS) and
// avx_fp_unop_s_intr as V#NAME#SS (under AVXTarget, with XS, VEX_4V, VEX_LIG,
// VEX_WIG and NotMemoryFoldable). Both are given v4f32, sse_load_f32 and the
// intrinsic record whose name is "int_x86_sse_" + OpcodeStr + "_ss", resolved
// with !cast<Intrinsic>.
//   OpcodeStr - intrinsic name stem ("rsqrt" and "rcp" at the uses in this file).
//   AVXTarget - predicate forwarded to the AVX instantiation.
3022multiclass sse1_fp_unop_s_intr<string OpcodeStr, Predicate AVXTarget> {
3023  defm SS        :  sse_fp_unop_s_intr<v4f32, sse_load_f32,
3024                      !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss),
3025                      UseSSE1>, XS;
3026  defm V#NAME#SS  : avx_fp_unop_s_intr<v4f32, sse_load_f32,
3027                      !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss),
3028                      AVXTarget>,
3029                      XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
3030}
3031
// sse1_fp_unop_s - Scalar single-precision ("ss") forms of a unary operation.
//
// Instantiates sse_fp_unop_s as SS (FR32 / f32mem / ssmem, SSEPackedSingle,
// UseSSE1, XS) and avx_fp_unop_s as V#NAME#SS (same operand classes plus the
// f32 value type, under AVXTarget, with XS, VEX_4V, VEX_LIG, VEX_WIG).
//   opc       - opcode byte forwarded to both instantiations.
//   OpcodeStr - mnemonic stem; "ss" is appended, and "v" is prepended for AVX.
//   OpNode    - pattern operator forwarded to both instantiations.
//   sched     - scheduling widths; only the .Scl member is used here.
//   AVXTarget - predicate forwarded to the AVX instantiation.
3032multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
3033                          X86SchedWriteWidths sched, Predicate AVXTarget> {
3034  defm SS        :  sse_fp_unop_s<opc, OpcodeStr#ss, FR32, f32mem,
3035                      ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS;
3036  defm V#NAME#SS  : avx_fp_unop_s<opc, "v"#OpcodeStr#ss, FR32, f32,
3037                      f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>,
3038                       XS, VEX_4V, VEX_LIG, VEX_WIG;
3039}
3040
// sse2_fp_unop_s - Scalar double-precision ("sd") forms of a unary operation.
//
// Instantiates sse_fp_unop_s as SD (FR64 / f64mem / sdmem, SSEPackedDouble,
// UseSSE2, XD) and avx_fp_unop_s as V#NAME#SD (same operand classes plus the
// f64 value type, under AVXTarget, with XD, VEX_4V, VEX_LIG, VEX_WIG).
//   opc       - opcode byte forwarded to both instantiations.
//   OpcodeStr - mnemonic stem; "sd" is appended, and "v" is prepended for AVX.
//   OpNode    - pattern operator forwarded to both instantiations.
//   sched     - scheduling widths; only the .Scl member is used here.
//   AVXTarget - predicate forwarded to the AVX instantiation.
3041multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
3042                          X86SchedWriteWidths sched, Predicate AVXTarget> {
3043  defm SD         : sse_fp_unop_s<opc, OpcodeStr#sd, FR64, f64mem,
3044                         sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD;
3045  defm V#NAME#SD  : avx_fp_unop_s<opc, "v"#OpcodeStr#sd, FR64, f64,
3046                         f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>,
3047                         XD, VEX_4V, VEX_LIG, VEX_WIG;
3048}
3049
3050// Square root.
// Opcode 0x51, selected from any_fsqrt. All four multiclasses are combined:
// scalar and packed, single (SchedWriteFSqrt) and double (SchedWriteFSqrt64).
// SIMD_EXC is applied to the whole defm. The scalar AVX forms use UseAVX and
// the packed single AVX forms use [HasAVX, NoVLX].
3051defm SQRT  : sse1_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, UseAVX>,
3052             sse1_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>,
3053             sse2_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64, UseAVX>,
3054             sse2_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64>, SIMD_EXC;
3055
3056// Reciprocal approximations. Note that these typically require refinement
3057// in order to obtain suitable precision.
// RSQRT (0x52, X86frsqrt) and RCP (0x53, X86frcp) instantiate only the
// single-precision multiclasses, plus the scalar intrinsic forms. Unlike SQRT,
// their AVX predicate is HasAVX (no NoVLX / UseAVX) and SIMD_EXC is not applied.
3058defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
3059             sse1_fp_unop_s_intr<"rsqrt", HasAVX>,
3060             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>;
3061defm RCP   : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
3062             sse1_fp_unop_s_intr<"rcp", HasAVX>,
3063             sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>;
3064
3065// There is no f64 version of the reciprocal approximation instructions.
3066
// scalar_unary_math_patterns - Selection patterns that map
//   (Move $dst, (scalar_to_vector (OpNode (extractelt $src, 0))))
// onto the "r_Int" form of a scalar unary instruction, with operands
// ($dst, $src).
//   OpNode        - the scalar operation applied to element 0 of $src.
//   OpcPrefix     - instruction name stem; "r_Int" is appended, and "V" is
//                   prepended for the AVX pattern.
//   Move          - node that combines the result with $dst (X86Movss and
//                   X86Movsd at the uses below).
//   VT            - vector type of $dst, $src and the result.
//   BasePredicate - predicate guarding the non-AVX pattern; the AVX pattern is
//                   always guarded by UseAVX.
3067multiclass scalar_unary_math_patterns<SDPatternOperator OpNode, string OpcPrefix, SDNode Move,
3068                                      ValueType VT, Predicate BasePredicate> {
3069  let Predicates = [BasePredicate] in {
3070    def : Pat<(VT (Move VT:$dst, (scalar_to_vector
3071                                  (OpNode (extractelt VT:$src, 0))))),
3072              (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3073  }
3074
3075  // Repeat for AVX versions of the instructions.
3076  let Predicates = [UseAVX] in {
3077    def : Pat<(VT (Move VT:$dst, (scalar_to_vector
3078                                  (OpNode (extractelt VT:$src, 0))))),
3079              (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3080  }
3081}
3082
// Scalar square root: SQRTSS on v4f32 via X86Movss, SQRTSD on v2f64 via X86Movsd.
3083defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>;
3084defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>;
3085
// scalar_unary_math_intr_patterns - Selection patterns that map
//   (Move $dst, (Intr $src))
// onto the "r_Int" form of a scalar unary instruction, with operands
// ($dst, $src).
//   Intr          - intrinsic applied to the whole vector $src.
//   OpcPrefix     - instruction name stem; "r_Int" is appended, and "V" is
//                   prepended for the AVX pattern.
//   Move          - node that combines the intrinsic result with $dst.
//   VT            - vector type of $dst, $src and the result.
//   BasePredicate - predicate guarding the non-AVX pattern; the AVX pattern is
//                   guarded by HasAVX (scalar_unary_math_patterns uses UseAVX).
3086multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix,
3087                                           SDNode Move, ValueType VT,
3088                                           Predicate BasePredicate> {
3089  let Predicates = [BasePredicate] in {
3090    def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
3091              (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3092  }
3093
3094  // Repeat for AVX versions of the instructions.
3095  let Predicates = [HasAVX] in {
3096    def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
3097              (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3098  }
3099}
3100
// Scalar reciprocal / reciprocal-square-root intrinsics, both on v4f32 via
// X86Movss.
3101defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss,
3102                                       v4f32, UseSSE1>;
3103defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss,
3104                                       v4f32, UseSSE1>;
3105
3106
3107//===----------------------------------------------------------------------===//
3108// SSE 1 & 2 - Non-temporal stores
3109//===----------------------------------------------------------------------===//
3110
3111let AddedComplexity = 400 in { // Prefer non-temporal versions
3112let Predicates = [HasAVX, NoVLX] in {
3113let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
// VEX non-temporal stores of a VR128 register to f128mem, opcode 0x2B.
// VMOVNTPSmr (VPSI) is selected for an alignednontemporalstore of v4f32 and
// VMOVNTPDmr (VPDI) for one of v2f64. Neither has outputs.
3114def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
3115                     (ins f128mem:$dst, VR128:$src),
3116                     "movntps\t{$src, $dst|$dst, $src}",
3117                     [(alignednontemporalstore (v4f32 VR128:$src),
3118                                               addr:$dst)]>, VEX, VEX_WIG;
3119def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
3120                     (ins f128mem:$dst, VR128:$src),
3121                     "movntpd\t{$src, $dst|$dst, $src}",
3122                     [(alignednontemporalstore (v2f64 VR128:$src),
3123                                               addr:$dst)]>, VEX, VEX_WIG;
3124} // SchedRW
3125
3126let SchedRW = [SchedWriteFMoveLSNT.YMM.MR] in {
// VEX + VEX_L non-temporal stores of a VR256 register to f256mem, opcode 0x2B.
// VMOVNTPSYmr is selected for an alignednontemporalstore of v8f32 and
// VMOVNTPDYmr for one of v4f64.
3127def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
3128                     (ins f256mem:$dst, VR256:$src),
3129                     "movntps\t{$src, $dst|$dst, $src}",
3130                     [(alignednontemporalstore (v8f32 VR256:$src),
3131                                               addr:$dst)]>, VEX, VEX_L, VEX_WIG;
3132def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
3133                     (ins f256mem:$dst, VR256:$src),
3134                     "movntpd\t{$src, $dst|$dst, $src}",
3135                     [(alignednontemporalstore (v4f64 VR256:$src),
3136                                               addr:$dst)]>, VEX, VEX_L, VEX_WIG;
3137} // SchedRW
3138
3139let ExeDomain = SSEPackedInt in {
// VEX non-temporal integer stores, opcode 0xE7, in the SSEPackedInt domain.
// VMOVNTDQmr stores VR128 to i128mem (pattern type v2i64); VMOVNTDQYmr adds
// VEX_L and stores VR256 to i256mem (pattern type v4i64). Each carries its own
// Sched<> from SchedWriteVecMoveLSNT.
3140def VMOVNTDQmr    : VPDI<0xE7, MRMDestMem, (outs),
3141                         (ins i128mem:$dst, VR128:$src),
3142                         "movntdq\t{$src, $dst|$dst, $src}",
3143                         [(alignednontemporalstore (v2i64 VR128:$src),
3144                                                   addr:$dst)]>, VEX, VEX_WIG,
3145                         Sched<[SchedWriteVecMoveLSNT.XMM.MR]>;
3146def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
3147                    (ins i256mem:$dst, VR256:$src),
3148                    "movntdq\t{$src, $dst|$dst, $src}",
3149                    [(alignednontemporalstore (v4i64 VR256:$src),
3150                                              addr:$dst)]>, VEX, VEX_L, VEX_WIG,
3151                    Sched<[SchedWriteVecMoveLSNT.YMM.MR]>;
3152} // ExeDomain
3153} // Predicates
3154
3155let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
// Non-temporal stores without VEX, opcode 0x2B, VR128 to f128mem.
// MOVNTPSmr (PSI) is selected for an alignednontemporalstore of v4f32 and
// MOVNTPDmr (PDI) for one of v2f64. No Predicates are set explicitly here.
3156def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3157                    "movntps\t{$src, $dst|$dst, $src}",
3158                    [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
3159def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3160                    "movntpd\t{$src, $dst|$dst, $src}",
3161                    [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>;
3162} // SchedRW
3163
// Non-temporal integer store without VEX, opcode 0xE7, VR128 to memory, in the
// SSEPackedInt domain; selected for an alignednontemporalstore of v2i64.
// NOTE(review): the memory operand is f128mem here, while the VEX counterpart
// VMOVNTDQmr and the MOVDQA/MOVDQU stores use i128mem - confirm whether the
// difference is intentional.
3164let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLSNT.XMM.MR] in
3165def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3166                    "movntdq\t{$src, $dst|$dst, $src}",
3167                    [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)]>;
3168
3169let SchedRW = [WriteStoreNT] in {
3170// There is no AVX form for instructions below this point
// Non-temporal stores from general-purpose registers, opcode 0xC3 with PS,
// both requiring HasSSE2. MOVNTImr stores GR32 to i32mem; MOVNTI_64mr uses the
// RI class and stores GR64 to i64mem. Their patterns match nontemporalstore,
// not the aligned variant used by the vector stores in this section.
3171def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
3172                 "movnti{l}\t{$src, $dst|$dst, $src}",
3173                 [(nontemporalstore (i32 GR32:$src), addr:$dst)]>,
3174               PS, Requires<[HasSSE2]>;
3175def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
3176                     "movnti{q}\t{$src, $dst|$dst, $src}",
3177                     [(nontemporalstore (i64 GR64:$src), addr:$dst)]>,
3178                  PS, Requires<[HasSSE2]>;
3179} // SchedRW = [WriteStoreNT]
3180
3181let Predicates = [HasAVX, NoVLX] in {
  // Aligned non-temporal stores of the other 256-bit vector types
  // (v8i32, v16i16, v16f16, v32i8) all select VMOVNTDQYmr, whose own pattern
  // covers v4i64.
3182  def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst),
3183            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3184  def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
3185            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3186  def : Pat<(alignednontemporalstore (v16f16 VR256:$src), addr:$dst),
3187            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3188  def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
3189            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3190
  // Likewise for the 128-bit types (v4i32, v8i16, v8f16, v16i8), which select
  // VMOVNTDQmr (its own pattern covers v2i64).
3191  def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
3192            (VMOVNTDQmr addr:$dst, VR128:$src)>;
3193  def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
3194            (VMOVNTDQmr addr:$dst, VR128:$src)>;
3195  def : Pat<(alignednontemporalstore (v8f16 VR128:$src), addr:$dst),
3196            (VMOVNTDQmr addr:$dst, VR128:$src)>;
3197  def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
3198            (VMOVNTDQmr addr:$dst, VR128:$src)>;
3199}
3200
3201let Predicates = [UseSSE2] in {
  // Under UseSSE2, aligned non-temporal stores of v4i32, v8i16, v8f16 and
  // v16i8 select MOVNTDQmr (its own pattern covers v2i64).
3202  def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
3203            (MOVNTDQmr addr:$dst, VR128:$src)>;
3204  def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
3205            (MOVNTDQmr addr:$dst, VR128:$src)>;
3206  def : Pat<(alignednontemporalstore (v8f16 VR128:$src), addr:$dst),
3207            (MOVNTDQmr addr:$dst, VR128:$src)>;
3208  def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
3209            (MOVNTDQmr addr:$dst, VR128:$src)>;
3210}
3211
3212} // AddedComplexity
3213
3214//===----------------------------------------------------------------------===//
3215// SSE 1 & 2 - Prefetch and memory fence
3216//===----------------------------------------------------------------------===//
3217
3218// Prefetch intrinsic.
// Four prefetch instructions sharing opcode 0x18 (TB) and an i8mem operand,
// guarded by HasSSEPrefetch and scheduled as WriteLoad. They differ in the
// Format and in the third operand of the matched prefetch node:
//   PREFETCHT0  - MRM1m, (i32 3)
//   PREFETCHT1  - MRM2m, (i32 2)
//   PREFETCHT2  - MRM3m, (i32 1)
//   PREFETCHNTA - MRM0m, (i32 0)
// The second operand is left as an unconstrained imm and the fourth is always
// (i32 1).
// NOTE(review): presumably the third operand is the locality hint and the
// fourth the cache type of the prefetch node - confirm against its definition.
3219let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in {
3220def PREFETCHT0   : I<0x18, MRM1m, (outs), (ins i8mem:$src),
3221    "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>, TB;
3222def PREFETCHT1   : I<0x18, MRM2m, (outs), (ins i8mem:$src),
3223    "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>, TB;
3224def PREFETCHT2   : I<0x18, MRM3m, (outs), (ins i8mem:$src),
3225    "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>, TB;
3226def PREFETCHNTA  : I<0x18, MRM0m, (outs), (ins i8mem:$src),
3227    "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>, TB;
3228}
3229
3230// FIXME: How should flush instruction be modeled?
// For now the flush is scheduled as WriteLoad. CLFLUSH is opcode 0xAE with
// Format MRM7m and PS, takes an i8mem operand, is selected from
// int_x86_sse2_clflush and requires HasCLFLUSH.
3231let SchedRW = [WriteLoad] in {
3232// Flush cache
3233def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
3234               "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
3235               PS, Requires<[HasCLFLUSH]>;
3236}
3237
3238let SchedRW = [WriteNop] in {
3239// Pause. This "instruction" is encoded as "rep; nop", so even though it
3240// was introduced with SSE2, it's backward compatible.
// Defined as opcode 0x90 in RawFrm with OBXS and no operands; selected from
// int_x86_sse2_pause. No Requires<> predicate is attached.
3241def PAUSE : I<0x90, RawFrm, (outs), (ins),
3242              "pause", [(int_x86_sse2_pause)]>, OBXS;
3243}
3244
3245let SchedRW = [WriteFence] in {
3246// Load, store, and memory fence
3247// TODO: As with mfence, we may want to ease the availability of sfence/lfence
3248// to include any 64-bit target.
// All three share opcode 0xAE with PS, take no operands, and differ in Format
// and predicate:
//   SFENCE - MRM7X, int_x86_sse_sfence,  HasSSE1
//   LFENCE - MRM5X, int_x86_sse2_lfence, HasSSE2
//   MFENCE - MRM6X, int_x86_sse2_mfence, HasMFence
3249def SFENCE : I<0xAE, MRM7X, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
3250               PS, Requires<[HasSSE1]>;
3251def LFENCE : I<0xAE, MRM5X, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>,
3252               PS, Requires<[HasSSE2]>;
3253def MFENCE : I<0xAE, MRM6X, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>,
3254               PS, Requires<[HasMFence]>;
3255} // SchedRW
3256
// Select the X86MFence node to the MFENCE instruction; no predicate is set on
// this pattern itself.
3257def : Pat<(X86MFence), (MFENCE)>;
3258
3259//===----------------------------------------------------------------------===//
3260// SSE 1 & 2 - Load/Store MXCSR register
3261//===----------------------------------------------------------------------===//
3262
// VEX forms of the MXCSR load/store, opcode 0xAE on an i32mem operand.
// VLDMXCSR (MRM2m) is selected from int_x86_sse_ldmxcsr and is marked mayLoad
// with Defs=[MXCSR]; VSTMXCSR (MRM3m) is selected from int_x86_sse_stmxcsr and
// is marked mayStore with Uses=[MXCSR]. Both set hasSideEffects.
3263let mayLoad=1, hasSideEffects=1, Defs=[MXCSR] in
3264def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
3265               "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
3266               VEX, Sched<[WriteLDMXCSR]>, VEX_WIG;
3267let mayStore=1, hasSideEffects=1, Uses=[MXCSR] in
3268def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
3269               "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
3270               VEX, Sched<[WriteSTMXCSR]>, VEX_WIG;
3271
// Forms of the MXCSR load/store without VEX: same opcode (0xAE), Formats
// (MRM2m / MRM3m), intrinsics and flags as the VEX forms, but built on class I
// with PS.
3272let mayLoad=1, hasSideEffects=1, Defs=[MXCSR] in
3273def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
3274              "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
3275              PS, Sched<[WriteLDMXCSR]>;
3276let mayStore=1, hasSideEffects=1, Uses=[MXCSR] in
3277def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
3278              "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
3279              PS, Sched<[WriteSTMXCSR]>;
3280
3281//===---------------------------------------------------------------------===//
3282// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
3283//===---------------------------------------------------------------------===//
3284
3285let ExeDomain = SSEPackedInt in { // SSE integer instructions
3286
3287let hasSideEffects = 0 in {
// VEX register-to-register integer moves, opcode 0x6F with MRMSrcReg.
// The movdqa forms use VPDI and the movdqu forms use VSSI; the Y variants add
// VEX_L and operate on VR256. None has a selection pattern.
3288def VMOVDQArr  : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3289                      "movdqa\t{$src, $dst|$dst, $src}", []>,
3290                      Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
3291def VMOVDQUrr  : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3292                      "movdqu\t{$src, $dst|$dst, $src}", []>,
3293                      Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
3294def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3295                      "movdqa\t{$src, $dst|$dst, $src}", []>,
3296                      Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
3297def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3298                      "movdqu\t{$src, $dst|$dst, $src}", []>,
3299                      Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
3300}
3301
3302// For Disassembler
// Alternate encodings of the register-to-register moves using the store
// opcode 0x7F with MRMDestReg. They are isCodeGenOnly with ForceDisassemble
// set, have no patterns, and each names its 0x6F counterpart through
// FoldGenData<>.
3303let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
3304def VMOVDQArr_REV  : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3305                          "movdqa\t{$src, $dst|$dst, $src}", []>,
3306                          Sched<[SchedWriteVecMoveLS.XMM.RR]>,
3307                          VEX, VEX_WIG, FoldGenData<"VMOVDQArr">;
3308def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
3309                          "movdqa\t{$src, $dst|$dst, $src}", []>,
3310                          Sched<[SchedWriteVecMoveLS.YMM.RR]>,
3311                          VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQAYrr">;
3312def VMOVDQUrr_REV  : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3313                          "movdqu\t{$src, $dst|$dst, $src}", []>,
3314                          Sched<[SchedWriteVecMoveLS.XMM.RR]>,
3315                          VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">;
3316def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
3317                          "movdqu\t{$src, $dst|$dst, $src}", []>,
3318                          Sched<[SchedWriteVecMoveLS.YMM.RR]>,
3319                          VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQUYrr">;
3320}
3321
// VEX integer vector loads, opcode 0x6F with MRMSrcMem, under HasAVX + NoVLX.
// All are marked canFoldAsLoad, mayLoad and isReMaterializable.
// Only the 128-bit forms carry patterns here: VMOVDQArm for alignedloadv2i64
// and VMOVDQUrm for loadv2i64; the 256-bit forms have empty pattern lists.
// The movdqu forms are built on class I with an explicit "vmovdqu" mnemonic
// and XS, whereas the movdqa forms use VPDI with "movdqa".
3322let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
3323    hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
3324def VMOVDQArm  : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3325                      "movdqa\t{$src, $dst|$dst, $src}",
3326                      [(set VR128:$dst, (alignedloadv2i64 addr:$src))]>,
3327                      Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
3328def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
3329                      "movdqa\t{$src, $dst|$dst, $src}", []>,
3330                      Sched<[SchedWriteVecMoveLS.YMM.RM]>,
3331                      VEX, VEX_L, VEX_WIG;
3332def VMOVDQUrm  : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3333                   "vmovdqu\t{$src, $dst|$dst, $src}",
3334                   [(set VR128:$dst, (loadv2i64 addr:$src))]>,
3335                   Sched<[SchedWriteVecMoveLS.XMM.RM]>,
3336                   XS, VEX, VEX_WIG;
3337def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
3338                   "vmovdqu\t{$src, $dst|$dst, $src}", []>,
3339                   Sched<[SchedWriteVecMoveLS.YMM.RM]>,
3340                   XS, VEX, VEX_L, VEX_WIG;
3341}
3342
// VEX integer vector stores, opcode 0x7F with MRMDestMem, under HasAVX + NoVLX.
// Only the 128-bit forms carry patterns here: VMOVDQAmr for an alignedstore of
// v2i64 and VMOVDQUmr for a store of v2i64; the 256-bit forms have empty
// pattern lists. As with the loads, the movdqu forms use class I with an
// explicit "vmovdqu" mnemonic and XS.
3343let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
3344def VMOVDQAmr  : VPDI<0x7F, MRMDestMem, (outs),
3345                      (ins i128mem:$dst, VR128:$src),
3346                      "movdqa\t{$src, $dst|$dst, $src}",
3347                      [(alignedstore (v2i64 VR128:$src), addr:$dst)]>,
3348                      Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_WIG;
3349def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
3350                      (ins i256mem:$dst, VR256:$src),
3351                      "movdqa\t{$src, $dst|$dst, $src}", []>,
3352                     Sched<[SchedWriteVecMoveLS.YMM.MR]>, VEX, VEX_L, VEX_WIG;
3353def VMOVDQUmr  : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3354                   "vmovdqu\t{$src, $dst|$dst, $src}",
3355                   [(store (v2i64 VR128:$src), addr:$dst)]>,
3356                   Sched<[SchedWriteVecMoveLS.XMM.MR]>, XS, VEX, VEX_WIG;
3357def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
3358                   "vmovdqu\t{$src, $dst|$dst, $src}",[]>,
3359                   Sched<[SchedWriteVecMoveLS.YMM.MR]>, XS, VEX, VEX_L, VEX_WIG;
3360}
3361
3362let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in {
// Register-to-register integer moves without VEX on VR128, opcode 0x6F with
// MRMSrcReg and no patterns. MOVDQArr uses PDI; MOVDQUrr uses class I with XS
// and an explicit Requires<[UseSSE2]>.
3363let hasSideEffects = 0 in {
3364def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3365                   "movdqa\t{$src, $dst|$dst, $src}", []>;
3366
3367def MOVDQUrr :   I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3368                   "movdqu\t{$src, $dst|$dst, $src}", []>,
3369                   XS, Requires<[UseSSE2]>;
3370}
3371
3372// For Disassembler
// Alternate encodings using opcode 0x7F with MRMDestReg; isCodeGenOnly with
// ForceDisassemble set, each naming its 0x6F counterpart through FoldGenData<>.
3373let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
3374def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3375                       "movdqa\t{$src, $dst|$dst, $src}", []>,
3376                       FoldGenData<"MOVDQArr">;
3377
3378def MOVDQUrr_REV :   I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3379                       "movdqu\t{$src, $dst|$dst, $src}", []>,
3380                       XS, Requires<[UseSSE2]>, FoldGenData<"MOVDQUrr">;
3381}
3382} // SchedRW
3383
// Integer vector loads without VEX, opcode 0x6F with MRMSrcMem, VR128 from
// i128mem. Marked canFoldAsLoad, mayLoad and isReMaterializable.
// Both pattern lists are effectively empty: the alignedloadv2i64 / loadv2i64
// patterns are present only inside /* */ comments.
3384let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
3385    hasSideEffects = 0, SchedRW = [SchedWriteVecMoveLS.XMM.RM] in {
3386def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3387                   "movdqa\t{$src, $dst|$dst, $src}",
3388                   [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>;
3389def MOVDQUrm :   I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3390                   "movdqu\t{$src, $dst|$dst, $src}",
3391                   [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>,
3392                 XS, Requires<[UseSSE2]>;
3393}
3394
// Integer vector stores without VEX, opcode 0x7F with MRMDestMem, VR128 to
// i128mem. Both pattern lists are effectively empty: the alignedstore / store
// patterns are present only inside /* */ comments.
3395let mayStore = 1, hasSideEffects = 0,
3396    SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
3397def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3398                   "movdqa\t{$src, $dst|$dst, $src}",
3399                   [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>;
3400def MOVDQUmr :   I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3401                   "movdqu\t{$src, $dst|$dst, $src}",
3402                   [/*(store (v2i64 VR128:$src), addr:$dst)*/]>,
3403                 XS, Requires<[UseSSE2]>;
3404}
3405
3406} // ExeDomain = SSEPackedInt
3407
3408// Reversed version with ".s" suffix for GAS compatibility.
// Each "vmovdqa.s" / "vmovdqu.s" spelling maps to the matching _REV record,
// once for VR128 and once for VR256.
// NOTE(review): the trailing 0 is presumably InstAlias' emit flag, making
// these parse-only - confirm against the InstAlias class.
3409def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
3410                (VMOVDQArr_REV VR128:$dst, VR128:$src), 0>;
3411def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
3412                (VMOVDQAYrr_REV VR256:$dst, VR256:$src), 0>;
3413def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
3414                (VMOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
3415def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
3416                (VMOVDQUYrr_REV VR256:$dst, VR256:$src), 0>;
3417
3418// Reversed version with ".s" suffix for GAS compatibility.
// "movdqa.s" / "movdqu.s" map to MOVDQArr_REV / MOVDQUrr_REV on VR128.
// NOTE(review): the trailing 0 is presumably InstAlias' emit flag, making
// these parse-only - confirm against the InstAlias class.
3419def : InstAlias<"movdqa.s\t{$src, $dst|$dst, $src}",
3420                (MOVDQArr_REV VR128:$dst, VR128:$src), 0>;
3421def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}",
3422                (MOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
3423
3424let Predicates = [HasAVX, NoVLX] in {
3425  // Additional patterns for other integer sizes.
  // The instruction definitions carry the v2i64 patterns; these extend the
  // same four instructions to v4i32, v8i16, v8f16 and v16i8.
  // Loads: alignedload* fragments select VMOVDQArm, plain load* fragments
  // select VMOVDQUrm.
3426  def : Pat<(alignedloadv4i32 addr:$src),
3427            (VMOVDQArm addr:$src)>;
3428  def : Pat<(alignedloadv8i16 addr:$src),
3429            (VMOVDQArm addr:$src)>;
3430  def : Pat<(alignedloadv8f16 addr:$src),
3431            (VMOVDQArm addr:$src)>;
3432  def : Pat<(alignedloadv16i8 addr:$src),
3433            (VMOVDQArm addr:$src)>;
3434  def : Pat<(loadv4i32 addr:$src),
3435            (VMOVDQUrm addr:$src)>;
3436  def : Pat<(loadv8i16 addr:$src),
3437            (VMOVDQUrm addr:$src)>;
3438  def : Pat<(loadv8f16 addr:$src),
3439            (VMOVDQUrm addr:$src)>;
3440  def : Pat<(loadv16i8 addr:$src),
3441            (VMOVDQUrm addr:$src)>;
3442
  // Stores: alignedstore selects VMOVDQAmr, plain store selects VMOVDQUmr.
3443  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
3444            (VMOVDQAmr addr:$dst, VR128:$src)>;
3445  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
3446            (VMOVDQAmr addr:$dst, VR128:$src)>;
3447  def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
3448            (VMOVDQAmr addr:$dst, VR128:$src)>;
3449  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
3450            (VMOVDQAmr addr:$dst, VR128:$src)>;
3451  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
3452            (VMOVDQUmr addr:$dst, VR128:$src)>;
3453  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
3454            (VMOVDQUmr addr:$dst, VR128:$src)>;
3455  def : Pat<(store (v8f16 VR128:$src), addr:$dst),
3456            (VMOVDQUmr addr:$dst, VR128:$src)>;
3457  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
3458            (VMOVDQUmr addr:$dst, VR128:$src)>;
3459}
3460
3461//===---------------------------------------------------------------------===//
3462// SSE2 - Packed Integer Arithmetic Instructions
3463//===---------------------------------------------------------------------===//
3464
3465let ExeDomain = SSEPackedInt in { // SSE integer instructions
3466
3467/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
///
/// Emits two PDI records: "rr" (MRMSrcReg, both sources in RC, marked
/// isCommutable) and "rm" (MRMSrcMem, second source loaded through memop_frag
/// from x86memop).
/// Parameters:
///   opc        - opcode byte for both forms.
///   OpcodeStr  - full mnemonic (nothing is appended or prepended here).
///   OpNode     - DAG node matched by both patterns.
///   DstVT      - value type of the result.
///   SrcVT      - value type applied to $src1 in the patterns.
///   RC         - register class of $dst, $src1 and the register $src2.
///   memop_frag - load fragment used for the memory operand.
///   x86memop   - memory operand class of $src2 in the rm form.
///   sched      - scheduling info; rm uses sched.Folded and sched.ReadAfterFold.
///   Is2Addr    - selects the two-operand asm string when 1 (the default) and
///                the three-operand string (with $src1) when 0.
3468multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
3469                         ValueType DstVT, ValueType SrcVT, RegisterClass RC,
3470                         PatFrag memop_frag, X86MemOperand x86memop,
3471                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
3472  let isCommutable = 1 in
3473  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
3474       (ins RC:$src1, RC:$src2),
3475       !if(Is2Addr,
3476           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3477           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3478       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
3479       Sched<[sched]>;
3480  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
3481       (ins RC:$src1, x86memop:$src2),
3482       !if(Is2Addr,
3483           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3484           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3485       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
3486                                     (memop_frag addr:$src2))))]>,
3487       Sched<[sched.Folded, sched.ReadAfterFold]>;
3488}
3489} // ExeDomain = SSEPackedInt
3490
// SSE2 packed integer arithmetic, all instantiated through PDI_binop_all
// (not defined in this part of the file). The arguments, in the order used
// here, are: opcode byte, mnemonic, DAG node, two vector value types
// (e.g. v16i8 and v32i8), a scheduling class, a 0/1 flag, and a predicate.
// Byte and word operations pass NoVLX_Or_NoBWI; dword/qword operations
// (PADDD, PADDQ, PSUBD, PSUBQ, PMULUDQ) pass NoVLX.
// NOTE(review): the 0/1 flag is 1 for every add/mul/min/max/avg entry and 0
// for every sub entry, so it is presumably the commutativity bit - confirm
// against PDI_binop_all.
3491defm PADDB   : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
3492                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3493defm PADDW   : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
3494                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3495defm PADDD   : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
3496                             SchedWriteVecALU, 1, NoVLX>;
3497defm PADDQ   : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
3498                             SchedWriteVecALU, 1, NoVLX>;
3499defm PADDSB  : PDI_binop_all<0xEC, "paddsb", saddsat, v16i8, v32i8,
3500                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3501defm PADDSW  : PDI_binop_all<0xED, "paddsw", saddsat, v8i16, v16i16,
3502                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3503defm PADDUSB : PDI_binop_all<0xDC, "paddusb", uaddsat, v16i8, v32i8,
3504                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3505defm PADDUSW : PDI_binop_all<0xDD, "paddusw", uaddsat, v8i16, v16i16,
3506                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3507defm PMULLW  : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
3508                             SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
3509defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
3510                             SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
3511defm PMULHW  : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
3512                             SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
3513defm PSUBB   : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
3514                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3515defm PSUBW   : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
3516                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3517defm PSUBD   : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
3518                             SchedWriteVecALU, 0, NoVLX>;
3519defm PSUBQ   : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
3520                             SchedWriteVecALU, 0, NoVLX>;
3521defm PSUBSB  : PDI_binop_all<0xE8, "psubsb", ssubsat, v16i8, v32i8,
3522                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3523defm PSUBSW  : PDI_binop_all<0xE9, "psubsw", ssubsat, v8i16, v16i16,
3524                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3525defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", usubsat, v16i8, v32i8,
3526                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3527defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", usubsat, v8i16, v16i16,
3528                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3529defm PMINUB  : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8,
3530                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3531defm PMINSW  : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16,
3532                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3533defm PMAXUB  : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8,
3534                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3535defm PMAXSW  : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
3536                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3537defm PAVGB   : PDI_binop_all<0xE0, "pavgb", avgceilu, v16i8, v32i8,
3538                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3539defm PAVGW   : PDI_binop_all<0xE3, "pavgw", avgceilu, v8i16, v16i16,
3540                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3541defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64,
3542                             SchedWriteVecIMul, 1, NoVLX>;
3543
// PMADDWD family (opcode 0xF5, node X86vpmaddwd), built from PDI_binop_rm2
// because source and result element types differ:
//   VPMADDWD  - v8i16 -> v4i32 on VR128, VEX_4V, HasAVX + NoVLX_Or_NoBWI.
//   VPMADDWDY - v16i16 -> v8i32 on VR256, VEX_4V + VEX_L, HasAVX2 +
//               NoVLX_Or_NoBWI.
//   PMADDWD   - v8i16 -> v4i32 on VR128 without VEX, with $src1 tied to $dst
//               and memop instead of load for the memory operand.
// The VEX forms pass Is2Addr = 0; PMADDWD keeps the default of 1.
3544let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
3545defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
3546                              load, i128mem, SchedWriteVecIMul.XMM, 0>,
3547                              VEX_4V, VEX_WIG;
3548
3549let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
3550defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
3551                               VR256, load, i256mem, SchedWriteVecIMul.YMM,
3552                               0>, VEX_4V, VEX_L, VEX_WIG;
3553let Constraints = "$src1 = $dst" in
3554defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
3555                             memop, i128mem, SchedWriteVecIMul.XMM>;
3556
// PSADBW family (opcode 0xF6, node X86psadbw), built from PDI_binop_rm2:
//   VPSADBW  - v16i8 -> v2i64 on VR128, VEX_4V, HasAVX + NoVLX_Or_NoBWI.
//   VPSADBWY - v32i8 -> v4i64 on VR256, VEX_4V + VEX_L, HasAVX2 +
//              NoVLX_Or_NoBWI.
//   PSADBW   - v16i8 -> v2i64 on VR128 without VEX, with $src1 tied to $dst
//              and memop instead of load for the memory operand.
// All three use SchedWritePSADBW; the VEX forms pass Is2Addr = 0.
3557let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
3558defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
3559                             load, i128mem, SchedWritePSADBW.XMM, 0>,
3560                             VEX_4V, VEX_WIG;
3561let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
3562defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
3563                             load, i256mem, SchedWritePSADBW.YMM, 0>,
3564                             VEX_4V, VEX_L, VEX_WIG;
3565let Constraints = "$src1 = $dst" in
3566defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
3567                            memop, i128mem, SchedWritePSADBW.XMM>;
3568
3569//===---------------------------------------------------------------------===//
3570// SSE2 - Packed Integer Logical Instructions
3571//===---------------------------------------------------------------------===//
3572
/// PDI_binop_rmi - Packed integer binary operation with three operand forms:
///   rr - $src2 is a VR128 register            (opcode `opc`,  MRMSrcReg)
///   rm - $src2 is an i128mem memory operand   (opcode `opc`,  MRMSrcMem)
///   ri - $src2 is a u8imm immediate           (opcode `opc2`, format ImmForm)
/// The rr/rm patterns match OpNode; the ri pattern matches OpNode2 with a
/// `timm` immediate. $src1 and $dst use register class RC; the result is typed
/// DstVT and the register/memory $src2 is typed SrcVT.
/// `sched` is used for rr (and its Folded / ReadAfterFold members for rm);
/// `schedImm` is used for ri.
/// Is2Addr selects the two-operand assembly string; when 0 the three-operand
/// string (with $src1 printed) is used.
3573multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
3574                         string OpcodeStr, SDNode OpNode,
3575                         SDNode OpNode2, RegisterClass RC,
3576                         X86FoldableSchedWrite sched,
3577                         X86FoldableSchedWrite schedImm,
3578                         ValueType DstVT, ValueType SrcVT,
3579                         PatFrag ld_frag, bit Is2Addr = 1> {
3580  // src2 is always 128-bit
3581  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
3582       (ins RC:$src1, VR128:$src2),
3583       !if(Is2Addr,
3584           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3585           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3586       [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))]>,
3587       Sched<[sched]>;
  // Memory form: $src2 is read through ld_frag from an i128mem operand.
3588  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
3589       (ins RC:$src1, i128mem:$src2),
3590       !if(Is2Addr,
3591           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3592           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3593       [(set RC:$dst, (DstVT (OpNode RC:$src1,
3594                       (SrcVT (ld_frag addr:$src2)))))]>,
3595       Sched<[sched.Folded, sched.ReadAfterFold]>;
  // Immediate form: separate opcode (opc2) and ModRM format (ImmForm), matched
  // through OpNode2 rather than OpNode.
3596  def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
3597       (ins RC:$src1, u8imm:$src2),
3598       !if(Is2Addr,
3599           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3600           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3601       [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 timm:$src2))))]>,
3602       Sched<[schedImm]>;
3603}
3604
/// PDI_binop_rmi_all - Instantiates PDI_binop_rmi three times for one mnemonic:
///   V#NAME   - VEX 128-bit: VR128, "v"-prefixed mnemonic, `load`, Is2Addr = 0,
///              predicates [HasAVX, prd], VEX_4V + VEX_WIG.
///   V#NAME#Y - VEX 256-bit: VR256, same as above plus VEX_L,
///              predicates [HasAVX2, prd], result type DstVT256.
///   NAME     - legacy SSE: VR128, `memop`, default Is2Addr, with $src1 tied
///              to $dst.
/// All three share the same SrcVT for the register/memory $src2 operand.
/// `sched` / `schedImm` supply the per-width (XMM / YMM) scheduling classes.
3605multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
3606                             string OpcodeStr, SDNode OpNode,
3607                             SDNode OpNode2, ValueType DstVT128,
3608                             ValueType DstVT256, ValueType SrcVT,
3609                             X86SchedWriteWidths sched,
3610                             X86SchedWriteWidths schedImm, Predicate prd> {
3611let Predicates = [HasAVX, prd] in
3612  defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
3613                              OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM,
3614                              DstVT128, SrcVT, load, 0>, VEX_4V, VEX_WIG;
3615let Predicates = [HasAVX2, prd] in
3616  defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
3617                                OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM,
3618                                DstVT256, SrcVT, load, 0>, VEX_4V, VEX_L,
3619                                VEX_WIG;
3620let Constraints = "$src1 = $dst" in
3621  defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
3622                            VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT,
3623                            memop>;
3624}
3625
/// PDI_binop_ri - Packed integer operation that only has a register +
/// 8-bit-immediate form. Defines a single `ri` record (PDIi8, opcode `opc`,
/// ModRM format ImmForm) whose pattern applies OpNode to RC:$src1 and a `timm`
/// i8 immediate and types the result as VT.
/// Is2Addr selects the two-operand assembly string; when 0 the three-operand
/// string is used.
3626multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
3627                        SDNode OpNode, RegisterClass RC, ValueType VT,
3628                        X86FoldableSchedWrite sched, bit Is2Addr = 1> {
3629  def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
3630       !if(Is2Addr,
3631           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3632           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3633       [(set RC:$dst, (VT (OpNode RC:$src1, (i8 timm:$src2))))]>,
3634       Sched<[sched]>;
3635}
3636
/// PDI_binop_ri_all - Instantiates PDI_binop_ri three times for one mnemonic:
///   V#NAME   - VEX 128-bit: VR128, v16i8, "v"-prefixed mnemonic, Is2Addr = 0,
///              predicates [HasAVX, NoVLX_Or_NoBWI].
///   V#NAME#Y - VEX 256-bit: VR256, v32i8, adds VEX_L,
///              predicates [HasAVX2, NoVLX_Or_NoBWI].
///   NAME     - legacy SSE: VR128, v16i8, $src1 tied to $dst.
/// The value types and the NoVLX_Or_NoBWI predicate are fixed here rather than
/// being parameters.
3637multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr,
3638                            SDNode OpNode, X86SchedWriteWidths sched> {
3639let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
3640  defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
3641                             VR128, v16i8, sched.XMM, 0>, VEX_4V, VEX_WIG;
3642let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
3643  defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
3644                               VR256, v32i8, sched.YMM, 0>,
3645                               VEX_4V, VEX_L, VEX_WIG;
3646let Constraints = "$src1 = $dst" in
3647  defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8,
3648                           sched.XMM>;
3649}
3650
// Packed shift instantiations. For each PDI_binop_rmi_all use, the first
// opcode is the register/memory-count form and the second opcode plus the
// MRMnr format is the immediate-count form. The count operand type (SrcVT) is
// always the 128-bit vector type, also for the 256-bit VEX variants.
// Word-sized shifts are gated on NoVLX_Or_NoBWI, dword/qword ones on NoVLX.
3651let ExeDomain = SSEPackedInt in {
  // Shift left: X86vshl (vector count) / X86vshli (immediate), MRM6r.
3652  defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
3653                                 v8i16, v16i16, v8i16, SchedWriteVecShift,
3654                                 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
3655  defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
3656                                 v4i32, v8i32, v4i32, SchedWriteVecShift,
3657                                 SchedWriteVecShiftImm, NoVLX>;
3658  defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
3659                                 v2i64, v4i64, v2i64, SchedWriteVecShift,
3660                                 SchedWriteVecShiftImm, NoVLX>;
3661
  // Shift right: X86vsrl (vector count) / X86vsrli (immediate), MRM2r.
3662  defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
3663                                 v8i16, v16i16, v8i16, SchedWriteVecShift,
3664                                 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
3665  defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
3666                                 v4i32, v8i32, v4i32, SchedWriteVecShift,
3667                                 SchedWriteVecShiftImm, NoVLX>;
3668  defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
3669                                 v2i64, v4i64, v2i64, SchedWriteVecShift,
3670                                 SchedWriteVecShiftImm, NoVLX>;
3671
  // Shift right: X86vsra (vector count) / X86vsrai (immediate), MRM4r.
  // Only word and dword element sizes are instantiated here.
3672  defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
3673                                 v8i16, v16i16, v8i16, SchedWriteVecShift,
3674                                 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
3675  defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
3676                                 v4i32, v8i32, v4i32, SchedWriteVecShift,
3677                                 SchedWriteVecShiftImm, NoVLX>;
3678
  // Immediate-only shifts (PDI_binop_ri_all). Both share opcode 0x73 and are
  // told apart by the ModRM format: MRM7r for PSLLDQ, MRM3r for PSRLDQ.
  // They are scheduled as shuffles (SchedWriteShuffle).
3679  defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq,
3680                                 SchedWriteShuffle>;
3681  defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq,
3682                                 SchedWriteShuffle>;
3683} // ExeDomain = SSEPackedInt
3684
3685//===---------------------------------------------------------------------===//
3686// SSE2 - Packed Integer Comparison Instructions
3687//===---------------------------------------------------------------------===//
3688
// Packed integer compares for byte / word / dword elements. All use
// SchedWriteVecALU and TruePredicate as the extra predicate.
// NOTE(review): PDI_binop_all is defined outside this chunk; the literal
// passed before the predicate (1 for PCMPEQ*, 0 for PCMPGT*) is presumably
// its commutable flag -- confirm against the multiclass.
//
// Equality compares: X86pcmpeq, opcodes 0x74 - 0x76.
3689defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
3690                             SchedWriteVecALU, 1, TruePredicate>;
3691defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
3692                             SchedWriteVecALU, 1, TruePredicate>;
3693defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
3694                             SchedWriteVecALU, 1, TruePredicate>;
// Greater-than compares: X86pcmpgt, opcodes 0x64 - 0x66.
3695defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
3696                             SchedWriteVecALU, 0, TruePredicate>;
3697defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
3698                             SchedWriteVecALU, 0, TruePredicate>;
3699defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
3700                             SchedWriteVecALU, 0, TruePredicate>;
3701
3702//===---------------------------------------------------------------------===//
3703// SSE2 - Packed Integer Shuffle Instructions
3704//===---------------------------------------------------------------------===//
3705
3706let ExeDomain = SSEPackedInt in {
/// sse2_pshuffle - Immediate-controlled shuffle with a single vector source.
/// Defines six records, all with opcode 0x70 and an 8-bit immediate ($src2):
///   V#NAME#ri / V#NAME#mi   - VEX 128-bit, predicates [HasAVX, prd]
///   V#NAME#Yri / V#NAME#Ymi - VEX 256-bit (VEX_L), predicates [HasAVX2, prd]
///   ri / mi                 - legacy SSE, predicate [UseSSE2]
/// The `ri` forms take the vector in a register; the `mi` forms take it from
/// memory ($src1 is the memory operand) and use the `.Folded` scheduling class.
/// VEX memory forms match `load`, the legacy SSE form matches `memop`.
/// No PD/XS/XD prefix class is applied inside the multiclass itself.
3707multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
3708                         SDNode OpNode, X86SchedWriteWidths sched,
3709                         Predicate prd> {
// VEX-encoded 128-bit forms.
3710let Predicates = [HasAVX, prd] in {
3711  def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
3712                      (ins VR128:$src1, u8imm:$src2),
3713                      !strconcat("v", OpcodeStr,
3714                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3715                      [(set VR128:$dst,
3716                        (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
3717                      VEX, Sched<[sched.XMM]>, VEX_WIG;
3718  def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
3719                      (ins i128mem:$src1, u8imm:$src2),
3720                      !strconcat("v", OpcodeStr,
3721                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3722                     [(set VR128:$dst,
3723                       (vt128 (OpNode (load addr:$src1),
3724                        (i8 timm:$src2))))]>, VEX,
3725                  Sched<[sched.XMM.Folded]>, VEX_WIG;
3726}
3727
// VEX-encoded 256-bit forms (same mnemonic, VR256 / i256mem, VEX_L).
3728let Predicates = [HasAVX2, prd] in {
3729  def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
3730                       (ins VR256:$src1, u8imm:$src2),
3731                       !strconcat("v", OpcodeStr,
3732                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3733                       [(set VR256:$dst,
3734                         (vt256 (OpNode VR256:$src1, (i8 timm:$src2))))]>,
3735                       VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
3736  def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
3737                       (ins i256mem:$src1, u8imm:$src2),
3738                       !strconcat("v", OpcodeStr,
3739                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3740                      [(set VR256:$dst,
3741                        (vt256 (OpNode (load addr:$src1),
3742                         (i8 timm:$src2))))]>, VEX, VEX_L,
3743                   Sched<[sched.YMM.Folded]>, VEX_WIG;
3744}
3745
// Legacy SSE2 forms. $dst is a separate output (no "$src1 = $dst" constraint
// is applied here) and the memory form uses `memop` rather than `load`.
3746let Predicates = [UseSSE2] in {
3747  def ri : Ii8<0x70, MRMSrcReg,
3748               (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
3749               !strconcat(OpcodeStr,
3750                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3751               [(set VR128:$dst,
3752                 (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
3753               Sched<[sched.XMM]>;
3754  def mi : Ii8<0x70, MRMSrcMem,
3755               (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
3756               !strconcat(OpcodeStr,
3757                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3758               [(set VR128:$dst,
3759                 (vt128 (OpNode (memop addr:$src1),
3760                        (i8 timm:$src2))))]>,
3761               Sched<[sched.XMM.Folded]>;
3762}
3763}
3764} // ExeDomain = SSEPackedInt
3765
// The three sse2_pshuffle instantiations differ in the prefix class appended
// after the multiclass (PD, XS, XD), in the matched node and in the value
// types. PSHUFD is gated on NoVLX; the word shuffles on NoVLX_Or_NoBWI.
3766defm PSHUFD  : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd,
3767                             SchedWriteShuffle, NoVLX>, PD;
3768defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw,
3769                             SchedWriteShuffle, NoVLX_Or_NoBWI>, XS;
3770defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw,
3771                             SchedWriteShuffle, NoVLX_Or_NoBWI>, XD;
3772
3773//===---------------------------------------------------------------------===//
3774// Packed Integer Pack Instructions (SSE & AVX)
3775//===---------------------------------------------------------------------===//
3776
3777let ExeDomain = SSEPackedInt in {
/// sse2_pack - Two-source pack instruction built on the PDI class.
/// Defines `rr` (register $src2) and `rm` (memory $src2 read via ld_frag).
/// The pattern types $src1 as ArgVT and the result as OutVT.
/// Is2Addr selects the two-operand assembly string; when 0 the three-operand
/// string is used.
3778multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
3779                     ValueType ArgVT, SDNode OpNode, RegisterClass RC,
3780                     X86MemOperand x86memop, X86FoldableSchedWrite sched,
3781                     PatFrag ld_frag, bit Is2Addr = 1> {
3782  def rr : PDI<opc, MRMSrcReg,
3783               (outs RC:$dst), (ins RC:$src1, RC:$src2),
3784               !if(Is2Addr,
3785                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3786                   !strconcat(OpcodeStr,
3787                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3788               [(set RC:$dst,
3789                     (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
3790               Sched<[sched]>;
3791  def rm : PDI<opc, MRMSrcMem,
3792               (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
3793               !if(Is2Addr,
3794                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3795                   !strconcat(OpcodeStr,
3796                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3797               [(set RC:$dst,
3798                     (OutVT (OpNode (ArgVT RC:$src1),
3799                                    (ld_frag addr:$src2))))]>,
3800               Sched<[sched.Folded, sched.ReadAfterFold]>;
3801}
3802
/// sse4_pack - Same shape as sse2_pack (identical parameters, operands and
/// patterns) but the records derive from SS48I instead of PDI.
3803multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
3804                     ValueType ArgVT, SDNode OpNode, RegisterClass RC,
3805                     X86MemOperand x86memop, X86FoldableSchedWrite sched,
3806                     PatFrag ld_frag, bit Is2Addr = 1> {
3807  def rr : SS48I<opc, MRMSrcReg,
3808                 (outs RC:$dst), (ins RC:$src1, RC:$src2),
3809                 !if(Is2Addr,
3810                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3811                     !strconcat(OpcodeStr,
3812                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3813                 [(set RC:$dst,
3814                       (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
3815                 Sched<[sched]>;
3816  def rm : SS48I<opc, MRMSrcMem,
3817                 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
3818                 !if(Is2Addr,
3819                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3820                     !strconcat(OpcodeStr,
3821                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3822                 [(set RC:$dst,
3823                       (OutVT (OpNode (ArgVT RC:$src1),
3824                                      (ld_frag addr:$src2))))]>,
3825                 Sched<[sched.Folded, sched.ReadAfterFold]>;
3826}
3827
// VEX 128-bit pack instructions: three-operand syntax (Is2Addr = 0), `load`.
// X86Packss is used for the "ss" mnemonics, X86Packus for the "us" ones.
// VPACKUSDW (0x2B) is the only one built from sse4_pack.
3828let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
3829  defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128,
3830                             i128mem, SchedWriteShuffle.XMM, load, 0>,
3831                             VEX_4V, VEX_WIG;
3832  defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128,
3833                             i128mem, SchedWriteShuffle.XMM, load, 0>,
3834                             VEX_4V, VEX_WIG;
3835
3836  defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128,
3837                             i128mem, SchedWriteShuffle.XMM, load, 0>,
3838                             VEX_4V, VEX_WIG;
3839  defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128,
3840                             i128mem, SchedWriteShuffle.XMM, load, 0>,
3841                             VEX_4V, VEX_WIG;
3842}
3843
// VEX 256-bit pack instructions: same opcodes and mnemonics on VR256 /
// i256mem with doubled element counts, plus VEX_L; require HasAVX2.
3844let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
3845  defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256,
3846                              i256mem, SchedWriteShuffle.YMM, load, 0>,
3847                              VEX_4V, VEX_L, VEX_WIG;
3848  defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256,
3849                              i256mem, SchedWriteShuffle.YMM, load, 0>,
3850                              VEX_4V, VEX_L, VEX_WIG;
3851
3852  defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256,
3853                              i256mem, SchedWriteShuffle.YMM, load, 0>,
3854                              VEX_4V, VEX_L, VEX_WIG;
3855  defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256,
3856                              i256mem, SchedWriteShuffle.YMM, load, 0>,
3857                              VEX_4V, VEX_L, VEX_WIG;
3858}
3859
// Legacy SSE pack instructions: $src1 tied to $dst, `memop`, default
// two-operand syntax. No Predicates list is set by this `let`.
3860let Constraints = "$src1 = $dst" in {
3861  defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128,
3862                            i128mem, SchedWriteShuffle.XMM, memop>;
3863  defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128,
3864                            i128mem, SchedWriteShuffle.XMM, memop>;
3865
3866  defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128,
3867                            i128mem, SchedWriteShuffle.XMM, memop>;
3868
3869  defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128,
3870                            i128mem, SchedWriteShuffle.XMM, memop>;
3871}
3872} // ExeDomain = SSEPackedInt
3873
3874//===---------------------------------------------------------------------===//
3875// SSE2 - Packed Integer Unpack Instructions
3876//===---------------------------------------------------------------------===//
3877
3878let ExeDomain = SSEPackedInt in {
/// sse2_unpack - Two-source unpack (interleave) instruction built on PDI.
/// Defines `rr` (register $src2) and `rm` (memory $src2 read via ld_frag);
/// both patterns apply OpNode to $src1/$src2 and type the result as `vt`.
/// Is2Addr selects the two-operand assembly string; when 0 the three-operand
/// string is used.
3879multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
3880                       SDNode OpNode, RegisterClass RC, X86MemOperand x86memop,
3881                       X86FoldableSchedWrite sched, PatFrag ld_frag,
3882                       bit Is2Addr = 1> {
3883  def rr : PDI<opc, MRMSrcReg,
3884      (outs RC:$dst), (ins RC:$src1, RC:$src2),
3885      !if(Is2Addr,
3886          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
3887          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3888      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
3889      Sched<[sched]>;
3890  def rm : PDI<opc, MRMSrcMem,
3891      (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
3892      !if(Is2Addr,
3893          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
3894          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3895      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
3896      Sched<[sched.Folded, sched.ReadAfterFold]>;
3897}
3898
// VEX 128-bit byte/word unpacks (gated on NoVLX_Or_NoBWI).
// X86Unpckl is used for the "punpckl*" mnemonics, X86Unpckh for "punpckh*".
3899let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
3900  defm VPUNPCKLBW  : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128,
3901                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
3902                                 VEX_4V, VEX_WIG;
3903  defm VPUNPCKLWD  : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128,
3904                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
3905                                 VEX_4V, VEX_WIG;
3906  defm VPUNPCKHBW  : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128,
3907                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
3908                                 VEX_4V, VEX_WIG;
3909  defm VPUNPCKHWD  : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128,
3910                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
3911                                 VEX_4V, VEX_WIG;
3912}
3913
// VEX 128-bit dword/qword unpacks (gated on NoVLX only).
3914let Predicates = [HasAVX, NoVLX] in {
3915  defm VPUNPCKLDQ  : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128,
3916                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
3917                                 VEX_4V, VEX_WIG;
3918  defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128,
3919                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
3920                                 VEX_4V, VEX_WIG;
3921  defm VPUNPCKHDQ  : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128,
3922                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
3923                                 VEX_4V, VEX_WIG;
3924  defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128,
3925                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
3926                                 VEX_4V, VEX_WIG;
3927}
3928
// VEX 256-bit byte/word unpacks: VR256 / i256mem, VEX_L, require HasAVX2.
3929let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
3930  defm VPUNPCKLBWY  : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256,
3931                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
3932                                  VEX_4V, VEX_L, VEX_WIG;
3933  defm VPUNPCKLWDY  : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256,
3934                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
3935                                  VEX_4V, VEX_L, VEX_WIG;
3936  defm VPUNPCKHBWY  : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256,
3937                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
3938                                  VEX_4V, VEX_L, VEX_WIG;
3939  defm VPUNPCKHWDY  : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256,
3940                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
3941                                  VEX_4V, VEX_L, VEX_WIG;
3942}
3943
// VEX 256-bit dword/qword unpacks.
3944let Predicates = [HasAVX2, NoVLX] in {
3945  defm VPUNPCKLDQY  : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256,
3946                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
3947                                  VEX_4V, VEX_L, VEX_WIG;
3948  defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256,
3949                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
3950                                  VEX_4V, VEX_L, VEX_WIG;
3951  defm VPUNPCKHDQY  : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256,
3952                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
3953                                  VEX_4V, VEX_L, VEX_WIG;
3954  defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256,
3955                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
3956                                  VEX_4V, VEX_L, VEX_WIG;
3957}
3958
// Legacy SSE unpacks: $src1 tied to $dst, `memop`, default two-operand syntax.
3959let Constraints = "$src1 = $dst" in {
3960  defm PUNPCKLBW  : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128,
3961                                i128mem, SchedWriteShuffle.XMM, memop>;
3962  defm PUNPCKLWD  : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128,
3963                                i128mem, SchedWriteShuffle.XMM, memop>;
3964  defm PUNPCKLDQ  : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128,
3965                                i128mem, SchedWriteShuffle.XMM, memop>;
3966  defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128,
3967                                i128mem, SchedWriteShuffle.XMM, memop>;
3968
3969  defm PUNPCKHBW  : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128,
3970                                i128mem, SchedWriteShuffle.XMM, memop>;
3971  defm PUNPCKHWD  : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128,
3972                                i128mem, SchedWriteShuffle.XMM, memop>;
3973  defm PUNPCKHDQ  : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128,
3974                                i128mem, SchedWriteShuffle.XMM, memop>;
3975  defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128,
3976                                i128mem, SchedWriteShuffle.XMM, memop>;
3977}
3978} // ExeDomain = SSEPackedInt
3979
3980//===---------------------------------------------------------------------===//
3981// SSE2 - Packed Integer Extract and Insert
3982//===---------------------------------------------------------------------===//
3983
3984let ExeDomain = SSEPackedInt in {
/// sse2_pinsrw - Word insert, opcode 0xC4 with an 8-bit immediate ($src3).
///   rr - the inserted value comes from a GR32orGR64 register
///   rm - the inserted value is read from an i16mem operand via extloadi16
/// Unlike the other multiclasses in this area the mnemonic is not a
/// parameter: Is2Addr picks between the literal "pinsrw" (two-operand) and
/// "vpinsrw" (three-operand) assembly strings.
3985multiclass sse2_pinsrw<bit Is2Addr = 1> {
3986  def rr : Ii8<0xC4, MRMSrcReg,
3987       (outs VR128:$dst), (ins VR128:$src1,
3988        GR32orGR64:$src2, u8imm:$src3),
3989       !if(Is2Addr,
3990           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
3991           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
3992       [(set VR128:$dst,
3993         (X86pinsrw VR128:$src1, GR32orGR64:$src2, timm:$src3))]>,
3994       Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
3995  def rm : Ii8<0xC4, MRMSrcMem,
3996                      (outs VR128:$dst), (ins VR128:$src1,
3997                       i16mem:$src2, u8imm:$src3),
3998       !if(Is2Addr,
3999           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
4000           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
4001       [(set VR128:$dst,
4002         (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
4003                    timm:$src3))]>,
4004       Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
4005}
4006
4007// Extract
// Word extract, opcode 0xC5: matches X86pextrw on a v8i16 source with a
// `timm` index and writes a GR32orGR64 register. Only register-destination
// forms are defined here. The VEX form is gated on [HasAVX, NoBWI]; the
// legacy form is built on PDIi8 and sets no Predicates list of its own.
4008let Predicates = [HasAVX, NoBWI] in
4009def VPEXTRWrr : Ii8<0xC5, MRMSrcReg,
4010                    (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
4011                    "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
4012                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
4013                                            timm:$src2))]>,
4014                PD, VEX, VEX_WIG, Sched<[WriteVecExtract]>;
4015def PEXTRWrr : PDIi8<0xC5, MRMSrcReg,
4016                    (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
4017                    "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
4018                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
4019                                            timm:$src2))]>,
4020               Sched<[WriteVecExtract]>;
4021
4022// Insert
// VEX form: Is2Addr = 0 selects the "vpinsrw" three-operand string.
4023let Predicates = [HasAVX, NoBWI] in
4024defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V, VEX_WIG;
4025
// Legacy form: default Is2Addr = 1, with $src1 tied to $dst.
4026let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
4027defm PINSRW : sse2_pinsrw, PD;
4028
4029} // ExeDomain = SSEPackedInt
4030
4031// Always select FP16 instructions if available.
// f16 lowering through PINSRW / PEXTRW for plain SSE2 targets.
// AddedComplexity = -10 lowers the priority of these patterns relative to
// competing ones. In order:
//   - f16 load:        PINSRWrm into lane 0 of an IMPLICIT_DEF v8i16, then
//                      copy to FR16.
//   - f16 store:       PEXTRWrr of lane 0, take the sub_16bit subregister and
//                      store it with MOV16mr.
//   - f16 -> i16 cast: PEXTRWrr of lane 0, take sub_16bit.
//   - i16 -> f16 cast: widen GR16 with INSERT_SUBREG, PINSRWrr into lane 0,
//                      then copy to FR16.
4032let Predicates = [UseSSE2], AddedComplexity = -10 in {
4033  def : Pat<(f16 (load addr:$src)), (COPY_TO_REGCLASS (PINSRWrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0), FR16)>;
4034  def : Pat<(store f16:$src, addr:$dst), (MOV16mr addr:$dst, (EXTRACT_SUBREG (PEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit))>;
4035  def : Pat<(i16 (bitconvert f16:$src)), (EXTRACT_SUBREG (PEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit)>;
4036  def : Pat<(f16 (bitconvert i16:$src)), (COPY_TO_REGCLASS (PINSRWrr (v8i16 (IMPLICIT_DEF)), (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit), 0), FR16)>;
4037}
4038
// f16 lowering through VPINSRW / VPEXTRW for AVX targets without BWI.
// Covers the f16 load and both f16 <-> i16 bitconverts.
// NOTE(review): unlike the UseSSE2 group there is no f16 store pattern in
// this group -- presumably it is provided elsewhere in the file; confirm.
4039let Predicates = [HasAVX, NoBWI] in {
4040  def : Pat<(f16 (load addr:$src)), (COPY_TO_REGCLASS (VPINSRWrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0), FR16)>;
4041  def : Pat<(i16 (bitconvert f16:$src)), (EXTRACT_SUBREG (VPEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit)>;
4042  def : Pat<(f16 (bitconvert i16:$src)), (COPY_TO_REGCLASS (VPINSRWrr (v8i16 (IMPLICIT_DEF)), (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit), 0), FR16)>;
4043}
4044
4045//===---------------------------------------------------------------------===//
4046// SSE2 - Packed Mask Creation
4047//===---------------------------------------------------------------------===//
4048
// Byte mask extraction, opcode 0xD7: each record matches X86movmsk on a byte
// vector and writes a GR32orGR64 register. Register-source forms only.
4049let ExeDomain = SSEPackedInt in {
4050
// VEX 128-bit form (v16i8 source).
// NOTE(review): the asm string is spelled "pmovmskb" without the "v"; the
// VPDI class (defined outside this chunk) presumably prepends it and supplies
// the AVX predicate, since no Predicates list is set here -- confirm.
4051def VPMOVMSKBrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
4052           (ins VR128:$src),
4053           "pmovmskb\t{$src, $dst|$dst, $src}",
4054           [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
4055           Sched<[WriteVecMOVMSK]>, VEX, VEX_WIG;
4056
// VEX 256-bit form (v32i8 source): adds VEX_L, uses WriteVecMOVMSKY and
// requires HasAVX2.
4057let Predicates = [HasAVX2] in {
4058def VPMOVMSKBYrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
4059           (ins VR256:$src),
4060           "pmovmskb\t{$src, $dst|$dst, $src}",
4061           [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>,
4062           Sched<[WriteVecMOVMSKY]>, VEX, VEX_L, VEX_WIG;
4063}
4064
// Legacy SSE form (v16i8 source), built on PDI.
4065def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
4066           "pmovmskb\t{$src, $dst|$dst, $src}",
4067           [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
4068           Sched<[WriteVecMOVMSK]>;
4069
4070} // ExeDomain = SSEPackedInt
4071
4072//===---------------------------------------------------------------------===//
4073// SSE2 - Conditional Store
4074//===---------------------------------------------------------------------===//
4075
// Masked store, opcode 0xF7. Each record has no outputs, takes the data
// ($src) and mask ($mask) in VR128 registers and matches the
// int_x86_sse2_maskmov_dqu intrinsic with an implicit address register:
// EDI for the plain records, RDI for the *64 records (which also require
// In64BitMode). The implicit register is declared through `Uses`.
4076let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
4077// As VEX does not have separate instruction contexts for address size
4078// overrides, VMASKMOVDQU and VMASKMOVDQU64 would have a decode conflict.
4079// Prefer VMASKMOVDQU64.
// Hence the EDI variant is marked isAsmParserOnly = 1.
4080let Uses = [EDI], Predicates = [HasAVX], isAsmParserOnly = 1 in
4081def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
4082           (ins VR128:$src, VR128:$mask),
4083           "maskmovdqu\t{$mask, $src|$src, $mask}",
4084           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
4085           VEX, VEX_WIG;
4086let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
4087def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
4088           (ins VR128:$src, VR128:$mask),
4089           "maskmovdqu\t{$mask, $src|$src, $mask}",
4090           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>,
4091           VEX, VEX_WIG;
4092
// Legacy SSE2 variants; neither is marked isAsmParserOnly.
4093let Uses = [EDI], Predicates = [UseSSE2] in
4094def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
4095           "maskmovdqu\t{$mask, $src|$src, $mask}",
4096           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>;
4097let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
4098def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
4099           "maskmovdqu\t{$mask, $src|$src, $mask}",
4100           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;
4101
4102} // ExeDomain = SSEPackedInt
4103
4104//===---------------------------------------------------------------------===//
4105// SSE2 - Move Doubleword/Quadword
4106//===---------------------------------------------------------------------===//
4107
4108//===---------------------------------------------------------------------===//
4109// Move Int Doubleword to Packed Double Int
4110//
// Integer -> XMM moves, all opcode 0x6E. The VEX records come first (VS2I /
// VRS2I classes, with VEX appended), followed by the legacy records with the
// same shapes (S2I / RS2I classes). Register-source forms use
// WriteVecMoveFromGpr; memory-source forms use WriteVecLoad.
4111let ExeDomain = SSEPackedInt in {
// 32-bit GPR -> VR128 as v4i32 scalar_to_vector.
4112def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
4113                        "movd\t{$src, $dst|$dst, $src}",
4114                        [(set VR128:$dst,
4115                          (v4i32 (scalar_to_vector GR32:$src)))]>,
4116                          VEX, Sched<[WriteVecMoveFromGpr]>;
// 32-bit memory (loadi32) -> VR128 as v4i32 scalar_to_vector.
4117def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
4118                        "movd\t{$src, $dst|$dst, $src}",
4119                        [(set VR128:$dst,
4120                          (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
4121                        VEX, Sched<[WriteVecLoad]>;
// 64-bit GPR -> VR128 as v2i64 scalar_to_vector.
4122def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
4123                          "movq\t{$src, $dst|$dst, $src}",
4124                          [(set VR128:$dst,
4125                            (v2i64 (scalar_to_vector GR64:$src)))]>,
4126                          VEX, Sched<[WriteVecMoveFromGpr]>;
// 64-bit memory form with an empty pattern list; hasSideEffects / mayLoad are
// therefore given explicitly. Marked isCodeGenOnly but ForceDisassemble.
4127let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
4128def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4129                          "movq\t{$src, $dst|$dst, $src}", []>,
4130                          VEX, Sched<[WriteVecLoad]>;
// 64-bit GPR -> FR64 bitconvert (codegen-only record).
4131let isCodeGenOnly = 1 in
4132def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
4133                         "movq\t{$src, $dst|$dst, $src}",
4134                         [(set FR64:$dst, (bitconvert GR64:$src))]>,
4135                         VEX, Sched<[WriteVecMoveFromGpr]>;
4136
// Legacy (non-VEX) counterparts of the five records above.
4137def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
4138                      "movd\t{$src, $dst|$dst, $src}",
4139                      [(set VR128:$dst,
4140                        (v4i32 (scalar_to_vector GR32:$src)))]>,
4141                      Sched<[WriteVecMoveFromGpr]>;
4142def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
4143                      "movd\t{$src, $dst|$dst, $src}",
4144                      [(set VR128:$dst,
4145                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
4146                      Sched<[WriteVecLoad]>;
4147def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
4148                        "movq\t{$src, $dst|$dst, $src}",
4149                        [(set VR128:$dst,
4150                          (v2i64 (scalar_to_vector GR64:$src)))]>,
4151                        Sched<[WriteVecMoveFromGpr]>;
4152let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
4153def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4154                        "movq\t{$src, $dst|$dst, $src}", []>,
4155                        Sched<[WriteVecLoad]>;
4156let isCodeGenOnly = 1 in
4157def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
4158                       "movq\t{$src, $dst|$dst, $src}",
4159                       [(set FR64:$dst, (bitconvert GR64:$src))]>,
4160                       Sched<[WriteVecMoveFromGpr]>;
4161} // ExeDomain = SSEPackedInt
4162
4163//===---------------------------------------------------------------------===//
4164// Move Int Doubleword to Single Scalar
4165//
// GR32 -> FR32 bitconvert on opcode 0x6E, printed as "movd". Both records are
// isCodeGenOnly; they differ only in the VS2I + VEX versus S2I class.
4166let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
4167  def VMOVDI2SSrr  : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
4168                        "movd\t{$src, $dst|$dst, $src}",
4169                        [(set FR32:$dst, (bitconvert GR32:$src))]>,
4170                        VEX, Sched<[WriteVecMoveFromGpr]>;
4171
4172  def MOVDI2SSrr  : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
4173                        "movd\t{$src, $dst|$dst, $src}",
4174                        [(set FR32:$dst, (bitconvert GR32:$src))]>,
4175                        Sched<[WriteVecMoveFromGpr]>;
4176
4177} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4178
4179//===---------------------------------------------------------------------===//
4180// Move Packed Doubleword Int first element to Doubleword Int (GR32 or memory)
4181//
// Element 0 of a v4i32 VR128 source on opcode 0x7E: the rr forms write it to
// GR32, the mr forms store it to i32mem. VS2I + VEX records come first,
// followed by their S2I twins.
4182let ExeDomain = SSEPackedInt in {
4183def VMOVPDI2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
4184                         "movd\t{$src, $dst|$dst, $src}",
4185                         [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
4186                                          (iPTR 0)))]>, VEX,
4187                         Sched<[WriteVecMoveToGpr]>;
// Store form: no outs; $dst is the i32mem operand listed first in ins.
4188def VMOVPDI2DImr  : VS2I<0x7E, MRMDestMem, (outs),
4189                         (ins i32mem:$dst, VR128:$src),
4190                         "movd\t{$src, $dst|$dst, $src}",
4191                         [(store (i32 (extractelt (v4i32 VR128:$src),
4192                                       (iPTR 0))), addr:$dst)]>,
4193                         VEX, Sched<[WriteVecStore]>;
4194def MOVPDI2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
4195                       "movd\t{$src, $dst|$dst, $src}",
4196                       [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
4197                                        (iPTR 0)))]>,
4198                   Sched<[WriteVecMoveToGpr]>;
4199def MOVPDI2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
4200                       "movd\t{$src, $dst|$dst, $src}",
4201                       [(store (i32 (extractelt (v4i32 VR128:$src),
4202                                     (iPTR 0))), addr:$dst)]>,
4203                       Sched<[WriteVecStore]>;
4204} // ExeDomain = SSEPackedInt
4205
4206//===---------------------------------------------------------------------===//
4207// Move Packed Quadword Int first element to Quadword Int (GR64 or memory)
4208//
// Element 0 of a v2i64 VR128 source on opcode 0x7E via the VRS2I/RS2I classes,
// printed as "movq".
4209let ExeDomain = SSEPackedInt in {
// The two rr records take their scheduling info from this let instead of a
// per-record Sched<> superclass.
4210let SchedRW = [WriteVecMoveToGpr] in {
4211def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
4212                          "movq\t{$src, $dst|$dst, $src}",
4213                          [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
4214                                                        (iPTR 0)))]>,
4215                      VEX;
4216
4217def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
4218                        "movq\t{$src, $dst|$dst, $src}",
4219                        [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
4220                                                         (iPTR 0)))]>;
4221} //SchedRW
4222
// Store forms have empty pattern lists, so mayStore/hasSideEffects are stated
// explicitly. NOTE(review): ForceDisassemble presumably keeps these
// isCodeGenOnly records visible to the disassembler - confirm.
4223let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
4224def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs),
4225                          (ins i64mem:$dst, VR128:$src),
4226                          "movq\t{$src, $dst|$dst, $src}", []>,
4227                          VEX, Sched<[WriteVecStore]>;
4228let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
4229def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4230                        "movq\t{$src, $dst|$dst, $src}", []>,
4231                        Sched<[WriteVecStore]>;
4232} // ExeDomain = SSEPackedInt
4233
4234//===---------------------------------------------------------------------===//
4235// Bitcast FR64 <-> GR64
4236//
// FR64 -> GR64 bitconvert on opcode 0x7E, printed as "movq". Both records are
// isCodeGenOnly and differ only in the VRS2I + VEX versus RS2I class.
4237let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
4238  def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
4239                           "movq\t{$src, $dst|$dst, $src}",
4240                           [(set GR64:$dst, (bitconvert FR64:$src))]>,
4241                           VEX, Sched<[WriteVecMoveToGpr]>;
4242
4243  def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
4244                         "movq\t{$src, $dst|$dst, $src}",
4245                         [(set GR64:$dst, (bitconvert FR64:$src))]>,
4246                         Sched<[WriteVecMoveToGpr]>;
4247} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4248
4249//===---------------------------------------------------------------------===//
4250// Move Scalar Single to Doubleword Int
4251//
// FR32 -> GR32 bitconvert on opcode 0x7E, printed as "movd". Both records are
// isCodeGenOnly and differ only in the VS2I + VEX versus S2I class.
4252let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
4253  def VMOVSS2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
4254                        "movd\t{$src, $dst|$dst, $src}",
4255                        [(set GR32:$dst, (bitconvert FR32:$src))]>,
4256                        VEX, Sched<[WriteVecMoveToGpr]>;
4257  def MOVSS2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
4258                        "movd\t{$src, $dst|$dst, $src}",
4259                        [(set GR32:$dst, (bitconvert FR32:$src))]>,
4260                        Sched<[WriteVecMoveToGpr]>;
4261} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4262
// Extra selection patterns (UseAVX) that reuse the VEX movd/movq records.
4263let Predicates = [UseAVX] in {
  // An anyext'd GR8 is inserted into the sub_8bit lane of an IMPLICIT_DEF i32
  // and then moved with the GR32 form.
4264  def : Pat<(v4i32 (scalar_to_vector (i32 (anyext GR8:$src)))),
4265            (VMOVDI2PDIrr (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
4266                                              GR8:$src, sub_8bit)))>;
  // X86vzmovl wrapped around scalar_to_vector selects the same register move
  // as the bare scalar_to_vector pattern on the instruction itself.
4267  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
4268            (VMOVDI2PDIrr GR32:$src)>;
4269
4270  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
4271            (VMOV64toPQIrr GR64:$src)>;
4272
4273  // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
4274  // These instructions also write zeros in the high part of a 256-bit register.
4275  def : Pat<(v4i32 (X86vzload32 addr:$src)),
4276            (VMOVDI2PDIrm addr:$src)>;
  // 256-bit result: the 128-bit load is placed in sub_xmm via SUBREG_TO_REG.
4277  def : Pat<(v8i32 (X86vzload32 addr:$src)),
4278            (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
4279}
4280
// UseSSE2 versions of the X86vzmovl / X86vzload32 patterns, selecting the
// untagged MOVDI2PDI / MOV64toPQI records.
4281let Predicates = [UseSSE2] in {
4282  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
4283            (MOVDI2PDIrr GR32:$src)>;
4284
4285  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
4286            (MOV64toPQIrr GR64:$src)>;
4287  def : Pat<(v4i32 (X86vzload32 addr:$src)),
4288            (MOVDI2PDIrm addr:$src)>;
4289}
4290
4291// Before the MC layer of LLVM existed, clang emitted "movd" assembly instead of
4292// "movq" due to MacOS parsing limitation. In order to parse old assembly, we add
4293// these aliases.
// Accept the "movd"/"vmovd" spelling for the GR64 <-> VR128 records, which are
// themselves printed as "movq".
// NOTE(review): the trailing 0 is presumably InstAlias' emit flag (parse-only
// alias) - confirm against the InstAlias class definition.
4294def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
4295                (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
4296def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
4297                (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
4298// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
4299def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
4300                (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
4301def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
4302                (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;
4303
4304//===---------------------------------------------------------------------===//
4305// SSE2 - Move Quadword
4306//===---------------------------------------------------------------------===//
4307
4308//===---------------------------------------------------------------------===//
4309// Move Quadword Int to Packed Quadword Int
4310//
4311
// i64mem -> VR128 load on opcode 0x7E with the XS prefix class, matched as
// v2i64 scalar_to_vector of a loadi64. Both records derive from the bare I
// class, so the mnemonic is spelled out in full ("vmovq"/"movq") and the
// predicate is attached with Requires<>.
4312let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in {
4313def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4314                    "vmovq\t{$src, $dst|$dst, $src}",
4315                    [(set VR128:$dst,
4316                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
4317                    VEX, Requires<[UseAVX]>, VEX_WIG;
4318def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4319                    "movq\t{$src, $dst|$dst, $src}",
4320                    [(set VR128:$dst,
4321                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
4322                    XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
4323} // ExeDomain, SchedRW
4324
4325//===---------------------------------------------------------------------===//
4326// Move Packed Quadword Int to Quadword Int
4327//
// Store element 0 of a v2i64 VR128 to i64mem on opcode 0xD6. Scheduling comes
// from the enclosing SchedRW = [WriteVecStore].
4328let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in {
4329def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4330                        "movq\t{$src, $dst|$dst, $src}",
4331                        [(store (i64 (extractelt (v2i64 VR128:$src),
4332                                      (iPTR 0))), addr:$dst)]>,
4333                        VEX, VEX_WIG;
4334def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4335                      "movq\t{$src, $dst|$dst, $src}",
4336                      [(store (i64 (extractelt (v2i64 VR128:$src),
4337                                    (iPTR 0))), addr:$dst)]>;
4338} // ExeDomain, SchedRW
4339
4340// For disassembler only
// Register-to-register 0xD6 (MRMDestReg) forms with empty pattern lists;
// hasSideEffects is therefore stated explicitly. These are the records targeted
// by the "vmovq.s"/"movq.s" aliases.
4341let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
4342    SchedRW = [SchedWriteVecLogic.XMM] in {
4343def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
4344                     "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_WIG;
4345def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
4346                      "movq\t{$src, $dst|$dst, $src}", []>;
4347}
4348
// The ".s" mnemonic suffix maps to the 0xD6 MRMDestReg register forms.
// NOTE(review): the trailing 0 is presumably InstAlias' emit flag (parse-only
// alias) - confirm against the InstAlias class definition.
4349def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
4350                (VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
4351def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
4352                (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
4353
// UseAVX patterns mapping X86vzload64 and X86vextractstore64 onto the VEX movq
// load/store records.
4354let Predicates = [UseAVX] in {
4355  def : Pat<(v2i64 (X86vzload64 addr:$src)),
4356            (VMOVQI2PQIrm addr:$src)>;
  // 256-bit result: the 128-bit load is placed in sub_xmm via SUBREG_TO_REG.
4357  def : Pat<(v4i64 (X86vzload64 addr:$src)),
4358            (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
4359
4360  def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
4361            (VMOVPQI2QImr addr:$dst, VR128:$src)>;
4362}
4363
// UseSSE2 patterns mapping X86vzload64 and X86vextractstore64 onto the
// untagged movq load/store records.
4364let Predicates = [UseSSE2] in {
4365  def : Pat<(v2i64 (X86vzload64 addr:$src)), (MOVQI2PQIrm addr:$src)>;
4366
4367  def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
4368            (MOVPQI2QImr addr:$dst, VR128:$src)>;
4369}
4370
4371//===---------------------------------------------------------------------===//
4372// Moving from XMM to XMM and clear upper 64 bits. Note: the IA32
4373// documentation is wrong here; movq xmm1, xmm2 does clear the high bits.
4374//
// VR128 -> VR128 on opcode 0x7E (MRMSrcReg, XS), matched as X86vzmovl of a
// v2i64. Both records derive from the bare I class, so the full mnemonic and
// the Requires<> predicate are given per record.
4375let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
4376def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4377                        "vmovq\t{$src, $dst|$dst, $src}",
4378                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
4379                         XS, VEX, Requires<[UseAVX]>, VEX_WIG;
4380def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4381                        "movq\t{$src, $dst|$dst, $src}",
4382                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
4383                        XS, Requires<[UseSSE2]>;
4384} // ExeDomain, SchedRW
4385
// v2f64 X86vzmovl reuses the integer-domain MOVZPQILo2PQI records; one pattern
// per predicate (UseAVX, then UseSSE2).
4386let Predicates = [UseAVX] in {
4387  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
4388            (VMOVZPQILo2PQIrr VR128:$src)>;
4389}
4390let Predicates = [UseSSE2] in {
4391  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
4392            (MOVZPQILo2PQIrr VR128:$src)>;
4393}
4394
// 256-bit X86vzmovl (v4f64 / v4i64): take sub_xmm of the VR256 source, run it
// through VMOVZPQILo2PQIrr, and place the 128-bit result back into sub_xmm
// with SUBREG_TO_REG.
4395let Predicates = [UseAVX] in {
4396  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
4397            (SUBREG_TO_REG (i32 0),
4398             (v2f64 (VMOVZPQILo2PQIrr
4399                     (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
4400             sub_xmm)>;
4401  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
4402            (SUBREG_TO_REG (i32 0),
4403             (v2i64 (VMOVZPQILo2PQIrr
4404                     (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
4405             sub_xmm)>;
4406}
4407
4408//===---------------------------------------------------------------------===//
4409// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
4410//===---------------------------------------------------------------------===//
4411
// sse3_replicate_sfp - register (rr) and memory (rm) forms of a one-source
// S3SI instruction.
//   op        - opcode byte passed to S3SI.
//   OpNode    - SDNode matched by both patterns.
//   OpcodeStr - mnemonic; the operand string is appended with !strconcat.
//   vt        - result type the rr pattern casts OpNode to.
//   RC        - register class of $dst and of the rr $src.
//   mem_frag  - load fragment applied to addr:$src in the rm pattern.
//   x86memop  - memory operand type of the rm $src.
//   sched     - scheduling class for rr; rm uses sched.Folded.
4412multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
4413                              ValueType vt, RegisterClass RC, PatFrag mem_frag,
4414                              X86MemOperand x86memop, X86FoldableSchedWrite sched> {
4415def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
4416                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4417                      [(set RC:$dst, (vt (OpNode RC:$src)))]>,
4418                      Sched<[sched]>;
4419def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
4420                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4421                      [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>,
4422                      Sched<[sched.Folded]>;
4423}
4424
// VEX instantiations of sse3_replicate_sfp under [HasAVX, NoVLX]: 0x16 pairs
// with X86Movshdup and 0x12 with X86Movsldup. The Y variants use VR256 /
// v8f32 / f256mem and add VEX_L.
4425let Predicates = [HasAVX, NoVLX] in {
4426  defm VMOVSHDUP  : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
4427                                       v4f32, VR128, loadv4f32, f128mem,
4428                                       SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
4429  defm VMOVSLDUP  : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
4430                                       v4f32, VR128, loadv4f32, f128mem,
4431                                       SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
4432  defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
4433                                       v8f32, VR256, loadv8f32, f256mem,
4434                                       SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
4435  defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
4436                                       v8f32, VR256, loadv8f32, f256mem,
4437                                       SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
4438}
// Untagged instantiations; these use the memopv4f32 fragment where the VEX
// ones use loadv4f32.
4439defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
4440                                   memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
4441defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
4442                                   memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
4443
// The multiclass patterns are typed v4f32/v8f32; these add the integer-typed
// (v4i32 / v8i32) X86Movshdup / X86Movsldup selections for the VEX records,
// in register and folded-load form.
4444let Predicates = [HasAVX, NoVLX] in {
4445  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
4446            (VMOVSHDUPrr VR128:$src)>;
4447  def : Pat<(v4i32 (X86Movshdup (load addr:$src))),
4448            (VMOVSHDUPrm addr:$src)>;
4449  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
4450            (VMOVSLDUPrr VR128:$src)>;
4451  def : Pat<(v4i32 (X86Movsldup (load addr:$src))),
4452            (VMOVSLDUPrm addr:$src)>;
4453  def : Pat<(v8i32 (X86Movshdup VR256:$src)),
4454            (VMOVSHDUPYrr VR256:$src)>;
4455  def : Pat<(v8i32 (X86Movshdup (load addr:$src))),
4456            (VMOVSHDUPYrm addr:$src)>;
4457  def : Pat<(v8i32 (X86Movsldup VR256:$src)),
4458            (VMOVSLDUPYrr VR256:$src)>;
4459  def : Pat<(v8i32 (X86Movsldup (load addr:$src))),
4460            (VMOVSLDUPYrm addr:$src)>;
4461}
4462
// UseSSE3 integer-typed (v4i32) selections for the untagged records; folded
// loads use the memop fragment here rather than load.
4463let Predicates = [UseSSE3] in {
4464  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
4465            (MOVSHDUPrr VR128:$src)>;
4466  def : Pat<(v4i32 (X86Movshdup (memop addr:$src))),
4467            (MOVSHDUPrm addr:$src)>;
4468  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
4469            (MOVSLDUPrr VR128:$src)>;
4470  def : Pat<(v4i32 (X86Movsldup (memop addr:$src))),
4471            (MOVSLDUPrm addr:$src)>;
4472}
4473
4474//===---------------------------------------------------------------------===//
4475// SSE3 - Replicate Double FP - MOVDDUP
4476//===---------------------------------------------------------------------===//
4477
// sse3_replicate_dfp - 128-bit rr/rm forms on opcode 0x12 (S3DI), matched as
// v2f64 X86Movddup.
//   OpcodeStr - mnemonic; the operand string is appended with !strconcat.
//   sched     - width bundle; rr uses sched.XMM and rm sched.XMM.Folded.
4478multiclass sse3_replicate_dfp<string OpcodeStr, X86SchedWriteWidths sched> {
4479def rr  : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4480                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4481                    [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))]>,
4482                    Sched<[sched.XMM]>;
// The memory operand is f64mem: the pattern loads a single f64 and wraps it in
// scalar_to_vector before X86Movddup.
4483def rm  : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
4484                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4485                    [(set VR128:$dst,
4486                      (v2f64 (X86Movddup
4487                              (scalar_to_vector (loadf64 addr:$src)))))]>,
4488                    Sched<[sched.XMM.Folded]>;
4489}
4490
4491// FIXME: Merge with above classes when there are patterns for the ymm version
// sse3_replicate_dfp_y - 256-bit rr/rm forms on opcode 0x12 (S3DI), matched as
// v4f64 X86Movddup. Unlike the 128-bit multiclass, rm takes a full f256mem
// operand and folds a loadv4f64.
//   OpcodeStr - mnemonic; the operand string is appended with !strconcat.
//   sched     - width bundle; rr uses sched.YMM and rm sched.YMM.Folded.
4492multiclass sse3_replicate_dfp_y<string OpcodeStr, X86SchedWriteWidths sched> {
4493def rr  : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
4494                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4495                    [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
4496                    Sched<[sched.YMM]>;
4497def rm  : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
4498                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4499                    [(set VR256:$dst,
4500                      (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
4501                    Sched<[sched.YMM.Folded]>;
4502}
4503
// Instantiate MOVDDUP: VEX 128-bit and 256-bit (VEX_L) forms under
// [HasAVX, NoVLX], then the untagged 128-bit form with no let-supplied
// predicate list.
4504let Predicates = [HasAVX, NoVLX] in {
4505  defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup", SchedWriteFShuffle>,
4506                                      VEX, VEX_WIG;
4507  defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup", SchedWriteFShuffle>,
4508                                        VEX, VEX_L, VEX_WIG;
4509}
4510
4511defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;
4512
4513
// Select VMOVDDUPrm for X86Movddup applied to a v2f64 X86vzload64 of addr:$src.
// The enclosing let supplies the complete predicate list. No Requires<> is
// attached to the pattern: top-level let bindings are applied after the
// superclass list, so a Requires<[HasAVX]> here would be overwritten and would
// only misstate the effective predicates.
4514let Predicates = [HasAVX, NoVLX] in {
4515  def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
4516            (VMOVDDUPrm addr:$src)>;
4517}
4518
// UseSSE3: select MOVDDUPrm for X86Movddup applied to a v2f64 X86vzload64 of
// addr:$src.
4519let Predicates = [UseSSE3] in {
4520  def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
4521            (MOVDDUPrm addr:$src)>;
4522}
4523
4524//===---------------------------------------------------------------------===//
4525// SSE3 - Move Unaligned Integer
4526//===---------------------------------------------------------------------===//
4527
// VEX load-only records on opcode 0xF0 (S3DI, MRMSrcMem), each matched
// directly from an intrinsic: int_x86_sse3_ldu_dq for the 128-bit form and
// int_x86_avx_ldu_dq_256 for the 256-bit (VEX_L) form.
4528let Predicates = [HasAVX] in {
4529  def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
4530                      "vlddqu\t{$src, $dst|$dst, $src}",
4531                      [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
4532                      Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
4533  def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
4534                       "vlddqu\t{$src, $dst|$dst, $src}",
4535                       [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
4536                       Sched<[SchedWriteVecMoveLS.YMM.RM]>, VEX, VEX_L, VEX_WIG;
4537} // Predicates
4538
// Untagged 128-bit form: same opcode, operands and intrinsic pattern as
// VLDDQUrm, without the VEX classes.
4539def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
4540                   "lddqu\t{$src, $dst|$dst, $src}",
4541                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
4542                   Sched<[SchedWriteVecMoveLS.XMM.RM]>;
4543
4544//===---------------------------------------------------------------------===//
4545// SSE3 - Arithmetic
4546//===---------------------------------------------------------------------===//
4547
// sse3_addsub - rr/rm forms on opcode 0xD0, matched as X86Addsub. Built on the
// bare I class, so each instantiation supplies its own prefix and predicates.
//   OpcodeStr - mnemonic; the operand string is appended with !strconcat.
//   vt        - vector type of the result.
//   RC        - register class of $dst, $src1 and the rr $src2.
//   x86memop  - memory operand type of the rm $src2.
//   sched     - scheduling class; rm uses sched.Folded + sched.ReadAfterFold.
//   ld_frag   - load fragment applied to addr:$src2 in the rm pattern.
//   Is2Addr   - 1 (default) prints the two-operand asm string, 0 the
//               three-operand one that also names $src1.
4548multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC,
4549                       X86MemOperand x86memop, X86FoldableSchedWrite sched,
4550                       PatFrag ld_frag, bit Is2Addr = 1> {
// Both forms are marked as reading MXCSR and possibly raising FP exceptions.
4551let Uses = [MXCSR], mayRaiseFPException = 1 in {
4552  def rr : I<0xD0, MRMSrcReg,
4553       (outs RC:$dst), (ins RC:$src1, RC:$src2),
4554       !if(Is2Addr,
4555           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4556           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4557       [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))]>,
4558       Sched<[sched]>;
4559  def rm : I<0xD0, MRMSrcMem,
4560       (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4561       !if(Is2Addr,
4562           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4563           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4564       [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>,
4565       Sched<[sched.Folded, sched.ReadAfterFold]>;
4566}
4567}
4568
// VEX instantiations of sse3_addsub under [HasAVX]. All pass Is2Addr = 0
// (three-operand asm string) and add VEX_4V. The PS forms use the XD prefix
// class and the PD forms use PD; the Y forms add VEX_L.
4569let Predicates = [HasAVX] in {
4570  let ExeDomain = SSEPackedSingle in {
4571    defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem,
4572                                 SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>,
4573                                 XD, VEX_4V, VEX_WIG;
4574    defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem,
4575                                  SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>,
4576                                  XD, VEX_4V, VEX_L, VEX_WIG;
4577  }
4578  let ExeDomain = SSEPackedDouble in {
4579    defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem,
4580                                 SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>,
4581                                 PD, VEX_4V, VEX_WIG;
4582    defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem,
4583                                  SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>,
4584                                  PD, VEX_4V, VEX_L, VEX_WIG;
4585  }
4586}
// Untagged instantiations under [UseSSE3]. Is2Addr keeps its default of 1, and
// the Constraints string ties $src1 to $dst. Folded loads use the memop*
// fragments.
4587let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
4588  let ExeDomain = SSEPackedSingle in
4589  defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem,
4590                              SchedWriteFAddSizes.PS.XMM, memopv4f32>, XD;
4591  let ExeDomain = SSEPackedDouble in
4592  defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem,
4593                              SchedWriteFAddSizes.PD.XMM, memopv2f64>, PD;
4594}
4595
4596//===---------------------------------------------------------------------===//
4597// SSE3 Instructions
4598//===---------------------------------------------------------------------===//
4599
4600// Horizontal ops
// S3D_Int - two-source rr/rm forms built on the S3DI class.
//   o         - opcode byte.
//   OpcodeStr - mnemonic; the operand string is appended with !strconcat.
//   vt        - vector type of the result.
//   RC        - register class of $dst, $src1 and the rr $src2.
//   x86memop  - memory operand type of the rm $src2.
//   OpNode    - SDNode matched by both patterns.
//   sched     - scheduling class; rm uses sched.Folded + sched.ReadAfterFold.
//   ld_frag   - load fragment applied to addr:$src2 in the rm pattern.
//   Is2Addr   - 1 (default) prints the two-operand asm string, 0 the
//               three-operand one that also names $src1.
4601multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
4602                   X86MemOperand x86memop, SDNode OpNode,
4603                   X86FoldableSchedWrite sched, PatFrag ld_frag,
4604                   bit Is2Addr = 1> {
// Both forms are marked as reading MXCSR and possibly raising FP exceptions.
4605let Uses = [MXCSR], mayRaiseFPException = 1 in {
4606  def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
4607       !if(Is2Addr,
4608         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4609         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4610      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
4611      Sched<[sched]>;
4612
4613  def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4614       !if(Is2Addr,
4615         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4616         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4617      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
4618      Sched<[sched.Folded, sched.ReadAfterFold]>;
4619}
4620}
// S3_Int - same shape and parameters as S3D_Int, but the rr/rm records derive
// from S3I instead of S3DI.
//   o         - opcode byte.
//   OpcodeStr - mnemonic; the operand string is appended with !strconcat.
//   vt        - vector type of the result.
//   RC        - register class of $dst, $src1 and the rr $src2.
//   x86memop  - memory operand type of the rm $src2.
//   OpNode    - SDNode matched by both patterns.
//   sched     - scheduling class; rm uses sched.Folded + sched.ReadAfterFold.
//   ld_frag   - load fragment applied to addr:$src2 in the rm pattern.
//   Is2Addr   - 1 (default) prints the two-operand asm string, 0 the
//               three-operand one that also names $src1.
4621multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
4622                  X86MemOperand x86memop, SDNode OpNode,
4623                  X86FoldableSchedWrite sched, PatFrag ld_frag,
4624                  bit Is2Addr = 1> {
// Both forms are marked as reading MXCSR and possibly raising FP exceptions.
4625let Uses = [MXCSR], mayRaiseFPException = 1 in {
4626  def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
4627       !if(Is2Addr,
4628         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4629         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4630      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
4631        Sched<[sched]>;
4632
4633  def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4634       !if(Is2Addr,
4635         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4636         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4637      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
4638        Sched<[sched.Folded, sched.ReadAfterFold]>;
4639}
4640}
4641
// VEX horizontal add/sub under [HasAVX]: opcode 0x7C pairs with X86fhadd and
// 0x7D with X86fhsub. PS forms instantiate S3D_Int and PD forms S3_Int. All
// pass Is2Addr = 0 and add VEX_4V; the Y forms use WriteFHAddY and VEX_L.
4642let Predicates = [HasAVX] in {
4643  let ExeDomain = SSEPackedSingle in {
4644    defm VHADDPS  : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
4645                            X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
4646    defm VHSUBPS  : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
4647                            X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
4648    defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
4649                            X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
4650    defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
4651                            X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
4652  }
4653  let ExeDomain = SSEPackedDouble in {
4654    defm VHADDPD  : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem,
4655                           X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
4656    defm VHSUBPD  : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem,
4657                           X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
4658    defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem,
4659                           X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
4660    defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem,
4661                           X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
4662  }
4663}
4664
// Untagged horizontal add/sub. Is2Addr keeps its default of 1, and the
// Constraints string ties $src1 to $dst. Folded loads use the memop*
// fragments. No Predicates list is set by this let.
4665let Constraints = "$src1 = $dst" in {
4666  let ExeDomain = SSEPackedSingle in {
4667    defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
4668                          WriteFHAdd, memopv4f32>;
4669    defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
4670                          WriteFHAdd, memopv4f32>;
4671  }
4672  let ExeDomain = SSEPackedDouble in {
4673    defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
4674                         WriteFHAdd, memopv2f64>;
4675    defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
4676                         WriteFHAdd, memopv2f64>;
4677  }
4678}
4679
4680//===---------------------------------------------------------------------===//
4681// SSSE3 - Packed Absolute Instructions
4682//===---------------------------------------------------------------------===//
4683
4684/// SS3I_unop_rm - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
// 128-bit rr/rm forms of a one-source SS38I instruction on VR128 / i128mem.
//   opc       - opcode byte.
//   OpcodeStr - mnemonic; the operand string is appended with !strconcat.
//   vt        - vector type of the result.
//   OpNode    - SDNode matched by both patterns.
//   sched     - width bundle; rr uses sched.XMM and rm sched.XMM.Folded.
//   ld_frag   - load fragment applied to addr:$src in the rm pattern.
4685multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
4686                        SDNode OpNode, X86SchedWriteWidths sched, PatFrag ld_frag> {
4687  def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
4688                 (ins VR128:$src),
4689                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4690                 [(set VR128:$dst, (vt (OpNode VR128:$src)))]>,
4691                 Sched<[sched.XMM]>;
4692
4693  def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
4694                 (ins i128mem:$src),
4695                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4696                 [(set VR128:$dst,
4697                   (vt (OpNode (ld_frag addr:$src))))]>,
4698                 Sched<[sched.XMM.Folded]>;
4699}
4700
4701/// SS3I_unop_rm_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
// 256-bit Yrr/Yrm forms of a one-source SS38I instruction on VR256 / i256mem.
// There is no ld_frag parameter: the Yrm pattern always uses `load`.
//   opc       - opcode byte.
//   OpcodeStr - mnemonic; the operand string is appended with !strconcat.
//   vt        - vector type of the result.
//   OpNode    - SDNode matched by both patterns.
//   sched     - width bundle; Yrr uses sched.YMM and Yrm sched.YMM.Folded.
4702multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
4703                          SDNode OpNode, X86SchedWriteWidths sched> {
4704  def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
4705                  (ins VR256:$src),
4706                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4707                  [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
4708                  Sched<[sched.YMM]>;
4709
4710  def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
4711                  (ins i256mem:$src),
4712                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4713                  [(set VR256:$dst,
4714                    (vt (OpNode (load addr:$src))))]>,
4715                  Sched<[sched.YMM.Folded]>;
4716}
4717
// 128-bit VEX vpabs{b,w,d}, matched from the generic `abs` node. The byte and
// word forms are gated on NoVLX_Or_NoBWI, the dword form on NoVLX.
4718let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
4719  defm VPABSB  : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU,
4720                              load>, VEX, VEX_WIG;
4721  defm VPABSW  : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU,
4722                              load>, VEX, VEX_WIG;
4723}
4724let Predicates = [HasAVX, NoVLX] in {
4725  defm VPABSD  : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU,
4726                              load>, VEX, VEX_WIG;
4727}
// 256-bit VEX vpabs{b,w,d} under HasAVX2. The defm names repeat VPABSB/W/D;
// the multiclass record names (Yrr/Yrm) carry the Y that tells them apart
// from the 128-bit records.
4728let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4729  defm VPABSB  : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>,
4730                                VEX, VEX_L, VEX_WIG;
4731  defm VPABSW  : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SchedWriteVecALU>,
4732                                VEX, VEX_L, VEX_WIG;
4733}
4734let Predicates = [HasAVX2, NoVLX] in {
4735  defm VPABSD  : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SchedWriteVecALU>,
4736                                VEX, VEX_L, VEX_WIG;
4737}
4738
// Untagged pabs{b,w,d}: same opcodes and `abs` node as the VEX forms, with the
// memop fragment for folded loads and no let-supplied predicate list.
4739defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU,
4740                          memop>;
4741defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU,
4742                          memop>;
4743defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU,
4744                          memop>;
4745
4746//===---------------------------------------------------------------------===//
4747// SSSE3 - Packed Binary Operator Instructions
4748//===---------------------------------------------------------------------===//
4749
4750/// SS3I_binop_rm - Simple SSSE3 bin op
// Two-source rr/rm forms built on SS38I, with separate result and operand
// vector types.
//   opc        - opcode byte.
//   OpcodeStr  - mnemonic; the operand string is appended with !strconcat.
//   OpNode     - SDNode matched by both patterns.
//   DstVT      - vector type of the result.
//   OpVT       - vector type applied to $src1 in the patterns.
//   RC         - register class of $dst, $src1 and the rr $src2.
//   memop_frag - load fragment applied to addr:$src2 in the rm pattern.
//   x86memop   - memory operand type of the rm $src2.
//   sched      - scheduling class; rm uses sched.Folded + sched.ReadAfterFold.
//   Is2Addr    - 1 (default) prints the two-operand asm string, 0 the
//                three-operand one that also names $src1.
4751multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
4752                         ValueType DstVT, ValueType OpVT, RegisterClass RC,
4753                         PatFrag memop_frag, X86MemOperand x86memop,
4754                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
// This brace-less let covers only the rr record that follows; rm is not
// marked commutable here.
4755  let isCommutable = 1 in
4756  def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
4757       (ins RC:$src1, RC:$src2),
4758       !if(Is2Addr,
4759         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4760         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4761       [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))]>,
4762       Sched<[sched]>;
4763  def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
4764       (ins RC:$src1, x86memop:$src2),
4765       !if(Is2Addr,
4766         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4767         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4768       [(set RC:$dst,
4769         (DstVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))]>,
4770       Sched<[sched.Folded, sched.ReadAfterFold]>;
4771}
4772
4773/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
///
/// Same rr/rm shape as SS3I_binop_rm, but fixed to VR128 / i128mem and
/// selected from an intrinsic (IntId128) instead of an SDNode, so the
/// patterns carry no explicit value types.
///
/// Parameters:
///   opc       - opcode byte shared by both forms.
///   OpcodeStr - mnemonic used to build the asm string.
///   IntId128  - 128-bit intrinsic matched by both patterns.
///   sched     - scheduling info; rm uses sched.Folded / sched.ReadAfterFold.
///   ld_frag   - load fragment used for the folded $src2 in the rm form.
///   Is2Addr   - 1 prints two-operand syntax, 0 prints three-operand syntax.
4774multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
4775                             Intrinsic IntId128, X86FoldableSchedWrite sched,
4776                             PatFrag ld_frag, bit Is2Addr = 1> {
4777  let isCommutable = 1 in
4778  def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
4779       (ins VR128:$src1, VR128:$src2),
4780       !if(Is2Addr,
4781         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4782         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4783       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
4784       Sched<[sched]>;
4785  def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
4786       (ins VR128:$src1, i128mem:$src2),
4787       !if(Is2Addr,
4788         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4789         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4790       [(set VR128:$dst,
4791         (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
4792       Sched<[sched.Folded, sched.ReadAfterFold]>;
4793}
4794
/// SS3I_binop_rm_int_y - 256-bit counterpart of SS3I_binop_rm_int.
///
/// Emits Yrr / Yrm records over VR256 / i256mem selected from IntId256.
/// Unlike the 128-bit multiclass there is no Is2Addr switch (the asm string
/// is always the three-operand form) and the folded operand always uses the
/// `load` fragment.
///
/// Parameters:
///   opc       - opcode byte shared by both forms.
///   OpcodeStr - mnemonic used to build the asm string.
///   IntId256  - 256-bit intrinsic matched by both patterns.
///   sched     - scheduling info; Yrm uses sched.Folded / sched.ReadAfterFold.
4795multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
4796                               Intrinsic IntId256,
4797                               X86FoldableSchedWrite sched> {
4798  let isCommutable = 1 in
4799  def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
4800       (ins VR256:$src1, VR256:$src2),
4801       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4802       [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
4803       Sched<[sched]>;
4804  def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
4805       (ins VR256:$src1, i256mem:$src2),
4806       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4807       [(set VR256:$dst,
4808         (IntId256 VR256:$src1, (load addr:$src2)))]>,
4809       Sched<[sched.Folded, sched.ReadAfterFold]>;
4810}
4811
// VEX-encoded 128-bit SSSE3 binary ops that are gated on NoVLX_Or_NoBWI.
// All pass Is2Addr = 0 (three-operand syntax), the `load` fragment, and
// ImmT = NoImm. VPSHUFB and VPMADDUBSW sit inside `let isCommutable = 0`;
// VPMULHRSW is outside it and keeps the multiclass's commutable rr form.
// NOTE(review): the inner `let isCommutable = 0` appears intended to override
// the `isCommutable = 1` set on rr inside SS3I_binop_rm -- confirm.
4812let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
4813let isCommutable = 0 in {
4814  defm VPSHUFB    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
4815                                  VR128, load, i128mem,
4816                                  SchedWriteVarShuffle.XMM, 0>, VEX_4V, VEX_WIG;
4817  defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
4818                                  v16i8, VR128, load, i128mem,
4819                                  SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
4820}
4821defm VPMULHRSW    : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
4822                                  VR128, load, i128mem,
4823                                  SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
4824}
4825
// VEX-encoded 128-bit horizontal add/sub and sign ops, gated on HasAVX only.
// VPHADDW/D and VPHSUBW/D are selected from the X86hadd / X86hsub nodes via
// SS3I_binop_rm; VPSIGN*, VPHADDSW and VPHSUBSW are selected from the
// int_x86_ssse3_* intrinsics via SS3I_binop_rm_int. Every entry passes
// Is2Addr = 0 and the `load` fragment, and all are under isCommutable = 0.
4826let ImmT = NoImm, Predicates = [HasAVX] in {
4827let isCommutable = 0 in {
4828  defm VPHADDW    : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
4829                                  load, i128mem,
4830                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4831  defm VPHADDD    : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
4832                                  load, i128mem,
4833                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4834  defm VPHSUBW    : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
4835                                  load, i128mem,
4836                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4837  defm VPHSUBD    : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
4838                                  load, i128mem,
4839                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4840  defm VPSIGNB    : SS3I_binop_rm_int<0x08, "vpsignb",
4841                                      int_x86_ssse3_psign_b_128,
4842                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
4843  defm VPSIGNW    : SS3I_binop_rm_int<0x09, "vpsignw",
4844                                      int_x86_ssse3_psign_w_128,
4845                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
4846  defm VPSIGND    : SS3I_binop_rm_int<0x0A, "vpsignd",
4847                                      int_x86_ssse3_psign_d_128,
4848                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
4849  defm VPHADDSW   : SS3I_binop_rm_int<0x03, "vphaddsw",
4850                                      int_x86_ssse3_phadd_sw_128,
4851                                      SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
4852  defm VPHSUBSW   : SS3I_binop_rm_int<0x07, "vphsubsw",
4853                                      int_x86_ssse3_phsub_sw_128,
4854                                      SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
4855}
4856}
4857
// 256-bit (VEX_L) versions of VPSHUFB / VPMADDUBSW / VPMULHRSW, gated on
// HasAVX2 and NoVLX_Or_NoBWI. They reuse SS3I_binop_rm with VR256 / i256mem,
// the .YMM scheduling classes and Is2Addr = 0. As in the 128-bit group,
// only VPMULHRSWY is left outside `let isCommutable = 0`.
4858let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4859let isCommutable = 0 in {
4860  defm VPSHUFBY   : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
4861                                  VR256, load, i256mem,
4862                                  SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4863  defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
4864                                   v32i8, VR256, load, i256mem,
4865                                   SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4866}
4867defm VPMULHRSWY   : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
4868                                  VR256, load, i256mem,
4869                                  SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4870}
4871
// 256-bit (VEX_L) horizontal add/sub and sign ops, gated on HasAVX2 only.
// The node-based entries carry an explicit Y suffix in the defm name
// (VPHADDWY, ...). The intrinsic-based entries reuse the 128-bit defm names
// (VPSIGNB, ...) because SS3I_binop_rm_int_y itself names its records
// Yrr / Yrm.
4872let ImmT = NoImm, Predicates = [HasAVX2] in {
4873let isCommutable = 0 in {
4874  defm VPHADDWY   : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
4875                                  VR256, load, i256mem,
4876                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4877  defm VPHADDDY   : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
4878                                  load, i256mem,
4879                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4880  defm VPHSUBWY   : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
4881                                  VR256, load, i256mem,
4882                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4883  defm VPHSUBDY   : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
4884                                  load, i256mem,
4885                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4886  defm VPSIGNB   : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
4887                                       SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
4888  defm VPSIGNW   : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
4889                                       SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
4890  defm VPSIGND   : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
4891                                       SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
4892  defm VPHADDSW  : SS3I_binop_rm_int_y<0x03, "vphaddsw",
4893                                       int_x86_avx2_phadd_sw,
4894                                       SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
4895  defm VPHSUBSW  : SS3I_binop_rm_int_y<0x07, "vphsubsw",
4896                                       int_x86_avx2_phsub_sw,
4897                                       SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
4898}
4899}
4900
4901// None of these have i8 immediate fields.
// Non-VEX SSSE3 binary ops. The destination is tied to the first source
// ("$src1 = $dst") and Is2Addr is left at its default of 1, so the
// two-operand asm syntax is printed. Memory forms use the `memop` fragment.
// Every entry except PMULHRSW is under `let isCommutable = 0`.
4902let ImmT = NoImm, Constraints = "$src1 = $dst" in {
4903let isCommutable = 0 in {
4904  defm PHADDW    : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
4905                                 memop, i128mem, SchedWritePHAdd.XMM>;
4906  defm PHADDD    : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
4907                                 memop, i128mem, SchedWritePHAdd.XMM>;
4908  defm PHSUBW    : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
4909                                 memop, i128mem, SchedWritePHAdd.XMM>;
4910  defm PHSUBD    : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
4911                                 memop, i128mem, SchedWritePHAdd.XMM>;
4912  defm PSIGNB    : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
4913                                     SchedWriteVecALU.XMM, memop>;
4914  defm PSIGNW    : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
4915                                     SchedWriteVecALU.XMM, memop>;
4916  defm PSIGND    : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
4917                                     SchedWriteVecALU.XMM, memop>;
4918  defm PSHUFB    : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
4919                                 memop, i128mem, SchedWriteVarShuffle.XMM>;
4920  defm PHADDSW   : SS3I_binop_rm_int<0x03, "phaddsw",
4921                                     int_x86_ssse3_phadd_sw_128,
4922                                     SchedWritePHAdd.XMM, memop>;
4923  defm PHSUBSW   : SS3I_binop_rm_int<0x07, "phsubsw",
4924                                     int_x86_ssse3_phsub_sw_128,
4925                                     SchedWritePHAdd.XMM, memop>;
4926  defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
4927                                 v16i8, VR128, memop, i128mem,
4928                                 SchedWriteVecIMul.XMM>;
4929}
4930defm PMULHRSW    : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
4931                                 VR128, memop, i128mem, SchedWriteVecIMul.XMM>;
4932}
4933
4934//===---------------------------------------------------------------------===//
4935// SSSE3 - Packed Align Instruction Patterns
4936//===---------------------------------------------------------------------===//
4937
/// ssse3_palignr - Packed align right with an 8-bit immediate.
///
/// Emits two SS3AI records with opcode 0x0F, both selected from X86PAlignr
/// with a target-constant (timm) immediate and both marked hasSideEffects = 0:
///   rri - register second source (MRMSrcReg).
///   rmi - memory second source (MRMSrcMem), marked mayLoad = 1, folded via
///         memop_frag.
///
/// Parameters:
///   asm        - mnemonic used to build the asm string.
///   VT         - vector type of the result.
///   RC         - register class of the result and register inputs.
///   memop_frag - load fragment used for the folded $src2 in rmi.
///   x86memop   - memory operand type of $src2 in rmi.
///   sched      - scheduling info; rmi uses sched.Folded / sched.ReadAfterFold.
///   Is2Addr    - 1 prints "$src3, $src2, $dst", 0 also prints $src1.
4938multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
4939                         PatFrag memop_frag, X86MemOperand x86memop,
4940                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
4941  let hasSideEffects = 0 in {
4942  def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst),
4943      (ins RC:$src1, RC:$src2, u8imm:$src3),
4944      !if(Is2Addr,
4945        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
4946        !strconcat(asm,
4947                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
4948      [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 timm:$src3))))]>,
4949      Sched<[sched]>;
4950  let mayLoad = 1 in
4951  def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst),
4952      (ins RC:$src1, x86memop:$src2, u8imm:$src3),
4953      !if(Is2Addr,
4954        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
4955        !strconcat(asm,
4956                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
4957      [(set RC:$dst, (VT (X86PAlignr RC:$src1,
4958                                     (memop_frag addr:$src2),
4959                                     (i8 timm:$src3))))]>,
4960      Sched<[sched.Folded, sched.ReadAfterFold]>;
4961  }
4962}
4963
// PALIGNR instantiations: VEX 128-bit (HasAVX) and VEX 256-bit (HasAVX2,
// VEX_L) use three-operand syntax and the `load` fragment, both gated on
// NoVLX_Or_NoBWI. The non-VEX form requires UseSSSE3, ties $src1 to $dst and
// uses `memop` with the default two-operand syntax.
4964let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
4965  defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem,
4966                                SchedWriteShuffle.XMM, 0>, VEX_4V, VEX_WIG;
4967let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
4968  defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem,
4969                                 SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4970let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
4971  defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem,
4972                               SchedWriteShuffle.XMM>;
4973
4974//===---------------------------------------------------------------------===//
4975// SSE3 - Thread synchronization (MONITOR / MWAIT)
4976//===---------------------------------------------------------------------===//
4977
// MONITOR and MWAIT take no explicit operands; their inputs are modelled as
// implicit register uses. All three records are scheduled as WriteSystem and
// require HasSSE3.
4978let SchedRW = [WriteSystem] in {
// MONITOR has one record per mode: the two share opcode 0x01 / MRM_C8 and
// differ only in using EAX (Not64BitMode) versus RAX (In64BitMode). Neither
// has a selection pattern.
4979let Uses = [EAX, ECX, EDX] in
4980def MONITOR32rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
4981                     TB, Requires<[HasSSE3, Not64BitMode]>;
4982let Uses = [RAX, ECX, EDX] in
4983def MONITOR64rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
4984                     TB, Requires<[HasSSE3, In64BitMode]>;
4985
// MWAIT is a single record for both modes, selected from int_x86_sse3_mwait
// with ECX and EAX as its arguments.
4986let Uses = [ECX, EAX] in
4987def MWAITrr   : I<0x01, MRM_C9, (outs), (ins), "mwait",
4988                  [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
4989} // SchedRW
4990
// Assembler aliases that accept the implicit registers spelled out
// explicitly, with 32-bit names outside 64-bit mode and 64-bit names in it.
4991def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
4992def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;
4993
4994def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITOR32rrr)>,
4995      Requires<[Not64BitMode]>;
4996def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITOR64rrr)>,
4997      Requires<[In64BitMode]>;
4998
4999//===----------------------------------------------------------------------===//
5000// SSE4.1 - Packed Move with Sign/Zero Extend
5001// NOTE: Any Extend is promoted to Zero Extend in X86ISelDAGToDAG.cpp
5002//===----------------------------------------------------------------------===//
5003
/// SS41I_pmovx_rrrm - Register and memory forms of one packed-extend move.
///
/// Emits rr (MRMSrcReg, source in InRC) and rm (MRMSrcMem, source MemOp)
/// SS48I records writing OutRC. Both have empty pattern lists, so instruction
/// selection must come from separate Pat records.
///
/// Parameters:
///   opc       - opcode byte shared by both forms.
///   OpcodeStr - mnemonic used to build the asm string.
///   MemOp     - memory operand type of the rm form's source.
///   OutRC     - register class of the destination.
///   InRC      - register class of the rr form's source.
///   sched     - scheduling info; rm uses sched.Folded.
5004multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
5005                            RegisterClass OutRC, RegisterClass InRC,
5006                            X86FoldableSchedWrite sched> {
5007  def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
5008                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
5009                 Sched<[sched]>;
5010
5011  def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
5012                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
5013                 Sched<[sched.Folded]>;
5014}
5015
/// SS41I_pmovx_rm_all - All encodings of one packed-extend move.
///
/// Instantiates SS41I_pmovx_rrrm three times:
///   NAME     - non-VEX form, VR128 -> VR128, memory operand MemOp.
///   V#NAME   - VEX 128-bit form under [HasAVX, prd], with a "v"-prefixed
///              mnemonic.
///   V#NAME#Y - VEX_L 256-bit form under [HasAVX2, prd], VR128 -> VR256,
///              memory operand MemYOp.
///
/// Parameters:
///   opc       - opcode byte shared by all forms.
///   OpcodeStr - non-VEX mnemonic; the VEX forms prepend "v".
///   MemOp     - memory operand type for the 128-bit forms.
///   MemYOp    - memory operand type for the 256-bit form.
///   prd       - extra predicate added to the VEX forms.
///
/// NOTE(review): the non-VEX form is scheduled with SchedWriteShuffle.XMM
/// while both VEX forms use SchedWriteVecExtend -- confirm this asymmetry is
/// intended.
5016multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
5017                              X86MemOperand MemOp, X86MemOperand MemYOp,
5018                              Predicate prd> {
5019  defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128,
5020                               SchedWriteShuffle.XMM>;
5021  let Predicates = [HasAVX, prd] in
5022    defm V#NAME   : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
5023                                     VR128, VR128, SchedWriteVecExtend.XMM>,
5024                                     VEX, VEX_WIG;
5025  let Predicates = [HasAVX2, prd] in
5026    defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
5027                                     VR256, VR128, SchedWriteVecExtend.YMM>,
5028                                     VEX, VEX_L, VEX_WIG;
5029}
5030
/// SS41I_pmovx_rm - Sign- and zero-extending variants for one element pair.
///
/// For a suffix NAME such as BW, instantiates SS41I_pmovx_rm_all twice:
///   PMOVSX#NAME with opcode opc and mnemonic "pmovsx" # OpcodeStr.
///   PMOVZX#NAME with opcode opc + 0x10 and mnemonic "pmovzx" # OpcodeStr.
/// MemOp, MemYOp and prd are forwarded unchanged.
5031multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
5032                          X86MemOperand MemYOp, Predicate prd> {
5033  defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
5034                                        MemOp, MemYOp, prd>;
5035  defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
5036                                        !strconcat("pmovzx", OpcodeStr),
5037                                        MemOp, MemYOp, prd>;
5038}
5039
// One instantiation per source/destination element pair, grouped by the
// memory operand the 128-bit form reads: 64 bits (BW, WD, DQ), 32 bits
// (BD, WQ) and 16 bits (BQ). The 256-bit form reads twice as much. Only BW
// uses NoVLX_Or_NoBWI; the others use NoVLX.
5040defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
5041defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>;
5042defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>;
5043
5044defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>;
5045defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;
5046
5047defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;
5048
5049// AVX2 Patterns
//
// Selection patterns for the 256-bit packed-extend instructions.
//
// Parameters:
//   OpcPrefix - instruction name prefix; the element suffix plus "Yrr" or
//               "Yrm" is appended and resolved with !cast<I>.
//   ExtTy     - prefix of the extending-load fragment name, completed as
//               ExtTy # "extloadvi8" / "extloadvi16" / "extloadvi32".
//   ExtOp     - extension node used where the 128-bit source has as many
//               elements as the 256-bit result (BW, WD, DQ).
//   InVecOp   - extension node used where the source has more elements than
//               the result (BD, BQ, WQ).
//
// The BW patterns are gated on NoVLX_Or_NoBWI; all others on NoVLX.
5050multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy,
5051                                     SDNode ExtOp, SDNode InVecOp> {
5052  // Register-Register patterns
5053  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
5054  def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
5055            (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
5056  }
5057  let Predicates = [HasAVX2, NoVLX] in {
5058  def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))),
5059            (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
5060  def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))),
5061            (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;
5062
5063  def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
5064            (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
5065  def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))),
5066            (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;
5067
5068  def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
5069            (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
5070  }
5071
5072  // Simple Register-Memory patterns
5073  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
5074  def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5075            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
5076
5077  def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
5078            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
5079  }
5080
5081  let Predicates = [HasAVX2, NoVLX] in {
5082  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5083            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5084  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5085            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5086
5087  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5088            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
5089  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5090            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5091
5092  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
5093            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
5094  }
5095
5096  // AVX2 Register-Memory patterns
  // Fold the different DAG shapes of a narrow load feeding the extend:
  // a full vector load, a scalar_to_vector of an integer or f64 load, and
  // an X86vzload of the same width.
5097  let Predicates = [HasAVX2, NoVLX] in {
5098  def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
5099            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
5100
5101  def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5102            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5103  def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5104            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5105  def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
5106            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5107
5108  def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
5109            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
5110
5111  def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5112            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5113  def : Pat<(v4i64 (InVecOp (bc_v16i8 (v2i64 (X86vzload32 addr:$src))))),
5114            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5115
5116  def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5117            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5118  def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5119            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5120  def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
5121            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5122  }
5123}
5124
// Sign-extending patterns use sext / sext_invec with the "s" load-fragment
// prefix; zero-extending patterns use zext / zext_invec with "z".
5125defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>;
5126defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>;
5127
5128// SSE4.1/AVX patterns.
//
// Selection patterns for the 128-bit packed-extend instructions.
//
// Parameters:
//   OpcPrefix - instruction name prefix; the element suffix plus "rr" or
//               "rm" is appended and resolved with !cast<I>.
//   ExtTy     - prefix of the extending-load fragment name, completed as
//               ExtTy # "extloadvi8" / "extloadvi16" / "extloadvi32".
//   ExtOp     - extension node matched by every non-extload pattern.
//
// The body sets Predicates to [HasAVX, NoVLX_Or_NoBWI] for the BW patterns
// and [HasAVX, NoVLX] for the rest.
5129multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
5130                                SDNode ExtOp> {
  // Register-register patterns.
5131  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5132  def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
5133            (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
5134  }
5135  let Predicates = [HasAVX, NoVLX] in {
5136  def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
5137            (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
5138  def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
5139            (!cast<I>(OpcPrefix#BQrr) VR128:$src)>;
5140
5141  def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
5142            (!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
5143  def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
5144            (!cast<I>(OpcPrefix#WQrr) VR128:$src)>;
5145
5146  def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
5147            (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;
5148  }
  // Extending-load patterns.
5149  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5150  def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5151            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5152  }
5153  let Predicates = [HasAVX, NoVLX] in {
5154  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5155            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5156  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5157            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5158
5159  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5160            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5161  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5162            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5163
5164  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
5165            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5166  }
  // Load-folding patterns: scalar_to_vector of a scalar load, X86vzload,
  // and a full-width vector load feeding the extend.
5167  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5168  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5169            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5170  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5171            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5172  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
5173            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5174  def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))),
5175            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5176  }
5177  let Predicates = [HasAVX, NoVLX] in {
5178  def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5179            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5180  def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
5181            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5182  def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))),
5183            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5184
5185  def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
5186            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5187  def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))),
5188            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5189
5190  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5191            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5192  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5193            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5194  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
5195            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5196  def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))),
5197            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5198
5199  def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5200            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5201  def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))),
5202            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5203  def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))),
5204            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5205
5206  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5207            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5208  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5209            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5210  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
5211            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5212  def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))),
5213            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5214  }
5215}
5216
// 128-bit pattern instantiations. Both the VEX and non-VEX sets pass the
// *_invec extension nodes as ExtOp.
5217defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec>;
5218defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec>;
5219
// NOTE(review): the multiclass body sets HasAVX-based Predicates itself, so
// the non-VEX instantiations rely on this enclosing `let` taking precedence
// over the inner one -- confirm against TableGen's let-override rules.
5220let Predicates = [UseSSE41] in {
5221  defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec>;
5222  defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec>;
5223}
5224
5225//===----------------------------------------------------------------------===//
5226// SSE4.1 - Extract Instructions
5227//===----------------------------------------------------------------------===//
5228
5229/// SS41I_extract8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
///
/// Emits two SS4AIi8 records selected from X86pextrb on a v16i8 source with
/// a target-constant (timm) lane index:
///   rr - writes the result to a GR32orGR64 register (MRMDestReg).
///   mr - truncates the result to i8 and stores it to i8mem (MRMDestMem);
///        marked hasSideEffects = 0, mayStore = 1.
///
/// Parameters:
///   opc       - opcode byte shared by both forms.
///   OpcodeStr - mnemonic used to build the asm string.
5230multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
5231  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5232                 (ins VR128:$src1, u8imm:$src2),
5233                 !strconcat(OpcodeStr,
5234                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5235                 [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
5236                                         timm:$src2))]>,
5237                  Sched<[WriteVecExtract]>;
5238  let hasSideEffects = 0, mayStore = 1 in
5239  def mr : SS4AIi8<opc, MRMDestMem, (outs),
5240                 (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
5241                 !strconcat(OpcodeStr,
5242                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5243                 [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), timm:$src2))),
5244                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5245}
5246
// Byte extract: the VEX form is gated on HasAVX and NoBWI; the non-VEX form
// sets no predicate at this site.
5247let Predicates = [HasAVX, NoBWI] in
5248  defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX, VEX_WIG;
5249
5250defm PEXTRB      : SS41I_extract8<0x14, "pextrb">;
5251
5252
5253/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
///
/// Emits two SS4AIi8 records:
///   rr_REV - register-destination encoding (MRMDestReg) with no selection
///            pattern. It is isCodeGenOnly with ForceDisassemble = 1 and is
///            linked to NAME#rr through FoldGenData.
///   mr     - stores the i16-truncated X86pextrw result of a v8i16 source to
///            i16mem (MRMDestMem); marked hasSideEffects = 0, mayStore = 1.
///
/// Parameters:
///   opc       - opcode byte shared by both forms.
///   OpcodeStr - mnemonic used to build the asm string.
5254multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
5255  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
5256  def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5257                   (ins VR128:$src1, u8imm:$src2),
5258                   !strconcat(OpcodeStr,
5259                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
5260                   Sched<[WriteVecExtract]>, FoldGenData<NAME#rr>;
5261
5262  let hasSideEffects = 0, mayStore = 1 in
5263  def mr : SS4AIi8<opc, MRMDestMem, (outs),
5264                 (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
5265                 !strconcat(OpcodeStr,
5266                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5267                 [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), timm:$src2))),
5268                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5269}
5270
// Word extract: the VEX form is gated on HasAVX and NoBWI; the non-VEX form
// sets no predicate at this site.
5271let Predicates = [HasAVX, NoBWI] in
5272  defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX, VEX_WIG;
5273
5274defm PEXTRW      : SS41I_extract16<0x15, "pextrw">;
5275
// Store an f16 value by copying the FR16 register into VR128 and extracting
// word lane 0 straight to memory with the mr form.
5276let Predicates = [UseSSE41] in
5277  def : Pat<(store f16:$src, addr:$dst), (PEXTRWmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>;
5278
5279let Predicates = [HasAVX, NoBWI] in
5280  def : Pat<(store f16:$src, addr:$dst), (VPEXTRWmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>;
5281
5282
5283/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
///
/// Emits two SS4AIi8 records selected from extractelt on a v4i32 source with
/// an `imm` lane index:
///   rr - writes the element to a GR32 register (MRMDestReg).
///   mr - stores the element to i32mem (MRMDestMem).
///
/// Parameters:
///   opc       - opcode byte shared by both forms.
///   OpcodeStr - mnemonic used to build the asm string.
5284multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
5285  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
5286                 (ins VR128:$src1, u8imm:$src2),
5287                 !strconcat(OpcodeStr,
5288                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5289                 [(set GR32:$dst,
5290                  (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
5291                  Sched<[WriteVecExtract]>;
5292  def mr : SS4AIi8<opc, MRMDestMem, (outs),
5293                 (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
5294                 !strconcat(OpcodeStr,
5295                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5296                 [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
5297                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5298}
5299
// Dword extract: the VEX form is gated on HasAVX and NoDQI and, unlike the
// byte/word extracts, carries no VEX_WIG modifier.
5300let Predicates = [HasAVX, NoDQI] in
5301  defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;
5302
5303defm PEXTRD      : SS41I_extract32<0x16, "pextrd">;
5304
5305/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
///
/// Emits two SS4AIi8 records selected from extractelt on a v2i64 source with
/// an `imm` lane index:
///   rr - writes the element to a GR64 register (MRMDestReg).
///   mr - stores the element to i64mem (MRMDestMem).
///
/// Parameters:
///   opc       - opcode byte shared by both forms.
///   OpcodeStr - mnemonic used to build the asm string.
5306multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
5307  def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
5308                 (ins VR128:$src1, u8imm:$src2),
5309                 !strconcat(OpcodeStr,
5310                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5311                 [(set GR64:$dst,
5312                  (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
5313                  Sched<[WriteVecExtract]>;
5314  def mr : SS4AIi8<opc, MRMDestMem, (outs),
5315                 (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
5316                 !strconcat(OpcodeStr,
5317                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5318                 [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
5319                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5320}
5321
// Qword extract shares opcode 0x16 with the dword extract; the 64-bit
// variants are distinguished by VEX_W on the VEX form and REX_W on the
// non-VEX form.
5322let Predicates = [HasAVX, NoDQI] in
5323  defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;
5324
5325defm PEXTRQ      : SS41I_extract64<0x16, "pextrq">, REX_W;
5326
5327/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
5328/// destination
///
/// Both patterns bitcast the v4f32 source to v4i32 (bc_v4i32) and match
/// extractelt with an `imm` lane index, so the selected value is the raw
/// 32-bit integer image of the float lane:
///   rr - writes it to a GR32orGR64 register (MRMDestReg).
///   mr - stores it to f32mem (MRMDestMem).
///
/// Parameters:
///   opc       - opcode byte shared by both forms.
///   OpcodeStr - mnemonic used to build the asm string.
5329multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
5330  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5331                   (ins VR128:$src1, u8imm:$src2),
5332                   !strconcat(OpcodeStr,
5333                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5334                   [(set GR32orGR64:$dst,
5335                      (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
5336                   Sched<[WriteVecExtract]>;
5337  def mr : SS4AIi8<opc, MRMDestMem, (outs),
5338                   (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
5339                   !strconcat(OpcodeStr,
5340                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5341                   [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
5342                            addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5343}
5344
// EXTRACTPS instantiations, both placed in the SSEPackedSingle execution
// domain. The VEX form is gated on UseAVX; the non-VEX form sets no
// predicate at this site.
5345let ExeDomain = SSEPackedSingle in {
5346  let Predicates = [UseAVX] in
5347    defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, VEX_WIG;
5348  defm EXTRACTPS   : SS41I_extractf32<0x17, "extractps">;
5349}
5350
5351//===----------------------------------------------------------------------===//
5352// SSE4.1 - Insert Instructions
5353//===----------------------------------------------------------------------===//
5354
/// SS41I_insert8 - SSE 4.1 insert 8 bits from int reg or memory into a vector.
///
/// Emits two SS4AIi8 records selected from X86pinsrb with a target-constant
/// (timm) lane index:
///   rr - inserted value comes from a GR32orGR64 register (MRMSrcReg).
///   rm - inserted value comes from an extloadi8 of i8mem (MRMSrcMem).
///
/// Parameters:
///   opc     - opcode byte shared by both forms.
///   asm     - mnemonic used to build the asm string.
///   Is2Addr - 1 prints "$src3, $src2, $dst", 0 also prints $src1.
5355multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
5356  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5357      (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
5358      !if(Is2Addr,
5359        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5360        !strconcat(asm,
5361                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5362      [(set VR128:$dst,
5363        (X86pinsrb VR128:$src1, GR32orGR64:$src2, timm:$src3))]>,
5364      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
5365  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5366      (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
5367      !if(Is2Addr,
5368        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5369        !strconcat(asm,
5370                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5371      [(set VR128:$dst,
5372        (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), timm:$src3))]>,
5373                   Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
5374}
5375
// Instantiations of SS41I_insert8 with opcode 0x20.
// The VEX variant is gated on [HasAVX, NoBWI] and passes Is2Addr = 0 so all
// three source operands are printed.
5376let Predicates = [HasAVX, NoBWI] in {
5377  defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V, VEX_WIG;
  // Extra pattern for an any-extended GR8 source: the 8-bit register is placed
  // in the sub_8bit subregister of an IMPLICIT_DEF i32 and fed to VPINSRBrr.
5378  def : Pat<(X86pinsrb VR128:$src1, (i32 (anyext (i8 GR8:$src2))), timm:$src3),
5379            (VPINSRBrr VR128:$src1, (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
5380                       GR8:$src2, sub_8bit), timm:$src3)>;
5381}
5382
// Legacy variant: default Is2Addr = 1, with $src1 tied to $dst.
5383let Constraints = "$src1 = $dst" in
5384  defm PINSRB  : SS41I_insert8<0x20, "pinsrb">;
5385
/// SS41I_insert32 - insert a 32-bit value into a v4i32 vector at the index
/// given by an 8-bit immediate, matched from insertelt.
///
/// opc is the opcode byte and asm the mnemonic. Is2Addr selects the assembly
/// operand string: when set, $src1 is left out of the printed operands; when
/// clear, $src1, $src2 and $src3 are all printed.
///   rr - the inserted value comes from a GR32 register.
///   rm - the inserted value comes from an i32mem operand through loadi32.
5386multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
5387  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5388      (ins VR128:$src1, GR32:$src2, u8imm:$src3),
5389      !if(Is2Addr,
5390        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5391        !strconcat(asm,
5392                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5393      [(set VR128:$dst,
5394        (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
5395      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  // Memory-source form; scheduling uses the folded-load variants.
5396  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5397      (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
5398      !if(Is2Addr,
5399        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5400        !strconcat(asm,
5401                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5402      [(set VR128:$dst,
5403        (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), imm:$src3)))]>,
5404                   Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
5405}
5406
// Instantiations of SS41I_insert32 with opcode 0x22.
// VEX variant: gated on [HasAVX, NoDQI], Is2Addr = 0.
// Legacy variant: default Is2Addr = 1, with $src1 tied to $dst.
5407let Predicates = [HasAVX, NoDQI] in
5408  defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
5409let Constraints = "$src1 = $dst" in
5410  defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
5411
/// SS41I_insert64 - insert a 64-bit value into a v2i64 vector at the index
/// given by an 8-bit immediate, matched from insertelt.
///
/// opc is the opcode byte and asm the mnemonic. Is2Addr selects the assembly
/// operand string: when set, $src1 is left out of the printed operands; when
/// clear, $src1, $src2 and $src3 are all printed.
///   rr - the inserted value comes from a GR64 register.
///   rm - the inserted value comes from an i64mem operand through loadi64.
5412multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
5413  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5414      (ins VR128:$src1, GR64:$src2, u8imm:$src3),
5415      !if(Is2Addr,
5416        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5417        !strconcat(asm,
5418                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5419      [(set VR128:$dst,
5420        (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
5421      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  // Memory-source form; scheduling uses the folded-load variants.
5422  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5423      (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
5424      !if(Is2Addr,
5425        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5426        !strconcat(asm,
5427                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5428      [(set VR128:$dst,
5429        (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), imm:$src3)))]>,
5430                   Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
5431}
5432
// Instantiations of SS41I_insert64. They reuse opcode 0x22 (the same byte as
// the 32-bit inserts above) and differ by the VEX_W / REX_W tags.
// VEX variant: gated on [HasAVX, NoDQI], Is2Addr = 0.
// Legacy variant: default Is2Addr = 1, with $src1 tied to $dst.
5433let Predicates = [HasAVX, NoDQI] in
5434  defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
5435let Constraints = "$src1 = $dst" in
5436  defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
5437
5438// insertps has a few different modes, there's the first two here below which
5439// are optimized inserts that won't zero arbitrary elements in the destination
5440// vector. The next one matches the intrinsic and could zero arbitrary elements
5441// in the target vector.
//
// NOTE(review): only the rr/rm pair below is defined in this multiclass, and
// both match X86insertps with the immediate passed through unchanged. The
// "first two" / "next one" wording above may predate the current layout --
// confirm and reword if so.
//
// opc is the opcode byte and asm the mnemonic. Is2Addr selects the assembly
// operand string: when set, $src1 is left out of the printed operands; when
// clear, $src1, $src2 and $src3 are all printed.
5442multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
  // Register-source form. Only this record is marked isCommutable.
5443  let isCommutable = 1 in
5444  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5445      (ins VR128:$src1, VR128:$src2, u8imm:$src3),
5446      !if(Is2Addr,
5447        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5448        !strconcat(asm,
5449                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5450      [(set VR128:$dst,
5451        (X86insertps VR128:$src1, VR128:$src2, timm:$src3))]>,
5452      Sched<[SchedWriteFShuffle.XMM]>;
  // Memory-source form: the second operand is a scalar f32 load wrapped in
  // scalar_to_vector and typed as v4f32.
5453  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5454      (ins VR128:$src1, f32mem:$src2, u8imm:$src3),
5455      !if(Is2Addr,
5456        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5457        !strconcat(asm,
5458                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5459      [(set VR128:$dst,
5460        (X86insertps VR128:$src1,
5461                   (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
5462                    timm:$src3))]>,
5463      Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
5464}
5465
// Instantiations of SS41I_insertf32 with opcode 0x21, both in the
// SSEPackedSingle execution domain.
// VEX variant: gated on UseAVX, Is2Addr = 0.
// Legacy variant: Is2Addr = 1 (passed explicitly), with $src1 tied to $dst.
5466let ExeDomain = SSEPackedSingle in {
5467  let Predicates = [UseAVX] in
5468    defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>,
5469                     VEX_4V, VEX_WIG;
5470  let Constraints = "$src1 = $dst" in
5471    defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>;
5472}
5473
5474//===----------------------------------------------------------------------===//
5475// SSE4.1 - Round Instructions
5476//===----------------------------------------------------------------------===//
5477
/// sse41_fp_unop_p - unary operation with an immediate operand, applied to
/// register class RC with result value type VT.
///
/// opc is the opcode byte and OpcodeStr the mnemonic. x86memop and mem_frag
/// give the memory operand and the load fragment used by the m form. OpNode is
/// the DAG operator matched by both patterns. sched supplies the scheduling
/// class (sched.Folded for the m form).
5478multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
5479                           X86MemOperand x86memop, RegisterClass RC,
5480                           ValueType VT, PatFrag mem_frag, SDPatternOperator OpNode,
5481                           X86FoldableSchedWrite sched> {
5482  // Both forms are defined with Uses = [MXCSR] and mayRaiseFPException = 1.
5483  // Vector intrinsic operation, reg
5484let Uses = [MXCSR], mayRaiseFPException = 1 in {
5485  def r : SS4AIi8<opc, MRMSrcReg,
5486                  (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
5487                  !strconcat(OpcodeStr,
5488                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5489                  [(set RC:$dst, (VT (OpNode RC:$src1, timm:$src2)))]>,
5490                  Sched<[sched]>;
5491
5492  // Vector intrinsic operation, mem
5493  def m : SS4AIi8<opc, MRMSrcMem,
5494                  (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
5495                  !strconcat(OpcodeStr,
5496                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5497                  [(set RC:$dst,
5498                        (VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>,
5499                  Sched<[sched.Folded]>;
5500}
5501}
5502
/// avx_fp_unop_rm - scalar single/double records with three source operands
/// ($src1, $src2 and an immediate $src3) operating on FR32 / FR64.
///
/// opcss / opcsd are the opcode bytes for the "ss" and "sd" records, and
/// OpcodeStr is the mnemonic stem the "ss" / "sd" suffix is appended to.
/// All four records have an empty pattern list and are marked
/// isCodeGenOnly = 1 and hasSideEffects = 0; the memory forms additionally set
/// mayLoad = 1, since there is no pattern to infer it from.
5503multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
5504                          string OpcodeStr, X86FoldableSchedWrite sched> {
5505let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
5506  def SSr : SS4AIi8<opcss, MRMSrcReg,
5507        (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
5508        !strconcat(OpcodeStr,
5509            "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5510      []>, Sched<[sched]>;
5511
5512  let mayLoad = 1 in
5513  def SSm : SS4AIi8<opcss, MRMSrcMem,
5514        (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
5515        !strconcat(OpcodeStr,
5516             "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5517        []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5518} // ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1
5519
5520let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
5521  def SDr : SS4AIi8<opcsd, MRMSrcReg,
5522        (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
5523        !strconcat(OpcodeStr,
5524              "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5525        []>, Sched<[sched]>;
5526
5527  let mayLoad = 1 in
5528  def SDm : SS4AIi8<opcsd, MRMSrcMem,
5529        (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
5530        !strconcat(OpcodeStr,
5531             "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5532        []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5533} // ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1
5534}
5535
/// sse41_fp_unop_s - scalar single/double records with two source operands
/// ($src1 and an immediate $src2) operating on FR32 / FR64.
///
/// opcss / opcsd are the opcode bytes for the "ss" and "sd" records, and
/// OpcodeStr is the mnemonic stem the "ss" / "sd" suffix is appended to.
/// All four records are defined with Uses = [MXCSR] and
/// mayRaiseFPException = 1, have an empty pattern list, and are marked
/// isCodeGenOnly = 1 and hasSideEffects = 0; the memory forms additionally set
/// mayLoad = 1, since there is no pattern to infer it from.
5536multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
5537                           string OpcodeStr, X86FoldableSchedWrite sched> {
5538let Uses = [MXCSR], mayRaiseFPException = 1 in {
5539let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
5540  def SSr : SS4AIi8<opcss, MRMSrcReg,
5541                    (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
5542                    !strconcat(OpcodeStr,
5543                               "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5544                    []>, Sched<[sched]>;
5545
5546  let mayLoad = 1 in
5547  def SSm : SS4AIi8<opcss, MRMSrcMem,
5548                    (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
5549                    !strconcat(OpcodeStr,
5550                               "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5551                    []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5552} // ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1
5553
5554let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
5555  def SDr : SS4AIi8<opcsd, MRMSrcReg,
5556                    (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
5557                    !strconcat(OpcodeStr,
5558                               "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5559                    []>, Sched<[sched]>;
5560
5561  let mayLoad = 1 in
5562  def SDm : SS4AIi8<opcsd, MRMSrcMem,
5563                    (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
5564                    !strconcat(OpcodeStr,
5565                               "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5566                    []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5567} // ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1
5568}
5569}
5570
/// sse41_fp_binop_s - scalar single/double "_Int" records on VR128 with two
/// vector sources and an immediate, matched from OpNode.
///
/// opcss / opcsd are the opcode bytes for the "ss" and "sd" records, and
/// OpcodeStr is the mnemonic stem the "ss" / "sd" suffix is appended to.
/// VT32 / VT64 are the result value types wrapped around the register-form
/// patterns. Is2Addr selects the assembly operand string: when set, $src1 is
/// left out of the printed operands; when clear, all three sources are
/// printed. All four records are defined with Uses = [MXCSR] and
/// mayRaiseFPException = 1. The memory forms take ssmem / sdmem operands and
/// match sse_load_f32 / sse_load_f64.
5571multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
5572                            string OpcodeStr, X86FoldableSchedWrite sched,
5573                            ValueType VT32, ValueType VT64,
5574                            SDNode OpNode, bit Is2Addr = 1> {
5575let Uses = [MXCSR], mayRaiseFPException = 1 in {
5576let ExeDomain = SSEPackedSingle in {
5577  def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
5578        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
5579        !if(Is2Addr,
5580            !strconcat(OpcodeStr,
5581                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5582            !strconcat(OpcodeStr,
5583                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5584        [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
5585        Sched<[sched]>;
5586
5587  def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
5588        (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
5589        !if(Is2Addr,
5590            !strconcat(OpcodeStr,
5591                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5592            !strconcat(OpcodeStr,
5593                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5594        [(set VR128:$dst,
5595             (OpNode VR128:$src1, (sse_load_f32 addr:$src2), timm:$src3))]>,
5596        Sched<[sched.Folded, sched.ReadAfterFold]>;
5597} // ExeDomain = SSEPackedSingle
5598
5599let ExeDomain = SSEPackedDouble in {
5600  def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
5601        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
5602        !if(Is2Addr,
5603            !strconcat(OpcodeStr,
5604                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5605            !strconcat(OpcodeStr,
5606                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5607        [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
5608        Sched<[sched]>;
5609
5610  def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
5611        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
5612        !if(Is2Addr,
5613            !strconcat(OpcodeStr,
5614                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5615            !strconcat(OpcodeStr,
5616                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5617        [(set VR128:$dst,
5618              (OpNode VR128:$src1, (sse_load_f64 addr:$src2), timm:$src3))]>,
5619        Sched<[sched.Folded, sched.ReadAfterFold]>;
5620} // ExeDomain = SSEPackedDouble
5621}
5622}
5623
5624// FP round - roundss, roundps, roundsd, roundpd
//
// VEX-encoded packed forms (opcodes 0x08 / 0x09) are gated on
// [HasAVX, NoVLX]; the 256-bit "Y" variants add VEX_L and use the YMM
// scheduling class. All four match X86any_VRndScale.
5625let Predicates = [HasAVX, NoVLX] in {
5626  let ExeDomain = SSEPackedSingle, Uses = [MXCSR], mayRaiseFPException = 1 in {
5627    // Intrinsic form
5628    defm VROUNDPS  : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32,
5629                                     loadv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>,
5630                                   VEX, VEX_WIG;
5631    defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32,
5632                                     loadv8f32, X86any_VRndScale, SchedWriteFRnd.YMM>,
5633                                   VEX, VEX_L, VEX_WIG;
5634  }
5635
5636  let ExeDomain = SSEPackedDouble, Uses = [MXCSR], mayRaiseFPException = 1 in {
5637    defm VROUNDPD  : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64,
5638                                     loadv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>,
5639                                   VEX, VEX_WIG;
5640    defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64,
5641                                     loadv4f64, X86any_VRndScale, SchedWriteFRnd.YMM>,
5642                                   VEX, VEX_L, VEX_WIG;
5643  }
5644}
// VEX-encoded scalar forms (opcodes 0x0A / 0x0B), gated on UseAVX. The same
// defm prefix is used twice: sse41_fp_binop_s contributes the "*_Int" records
// on VR128 (matching X86RndScales), and avx_fp_unop_rm contributes the
// pattern-less FR32 / FR64 records.
5645let Predicates = [UseAVX] in {
5646  defm VROUND  : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
5647                                  v4f32, v2f64, X86RndScales, 0>,
5648                                  VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC;
5649  defm VROUND  : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>,
5650                                VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC;
5651}
5652
// Selection patterns mapping scalar X86any_VRndScale onto the pattern-less
// VROUNDSS / VROUNDSD records. The first source operand of those records is
// filled with IMPLICIT_DEF; the value being rounded goes in the second.
5653let Predicates = [UseAVX] in {
5654  def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
5655            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>;
5656  def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
5657            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>;
5658}
5659
// Load-folding variants of the patterns above; these additionally require
// OptForSize.
5660let Predicates = [UseAVX, OptForSize] in {
5661  def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
5662            (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
5663  def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
5664            (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
5665}
5666
// Legacy-encoded round instantiations.
// Packed forms (opcodes 0x08 / 0x09) use the memop* fragments for the memory
// operand and match X86any_VRndScale.
5667let ExeDomain = SSEPackedSingle in
5668defm ROUNDPS  : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32,
5669                                memopv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>;
5670let ExeDomain = SSEPackedDouble in
5671defm ROUNDPD  : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64,
5672                                memopv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>;
5673
// Scalar forms (opcodes 0x0A / 0x0B). The same defm prefix is used twice:
// sse41_fp_unop_s contributes the pattern-less FR32 / FR64 records, and
// sse41_fp_binop_s contributes the "*_Int" records on VR128 with $src1 tied
// to $dst.
5674defm ROUND  : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>;
5675
5676let Constraints = "$src1 = $dst" in
5677defm ROUND  : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
5678                               v4f32, v2f64, X86RndScales>;
5679
// Selection patterns mapping scalar X86any_VRndScale onto the pattern-less
// ROUNDSS / ROUNDSD records (two-operand: source and immediate).
5680let Predicates = [UseSSE41] in {
5681  def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
5682            (ROUNDSSr FR32:$src1, timm:$src2)>;
5683  def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
5684            (ROUNDSDr FR64:$src1, timm:$src2)>;
5685}
5686
// Load-folding variants of the patterns above; these additionally require
// OptForSize.
5687let Predicates = [UseSSE41, OptForSize] in {
5688  def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
5689            (ROUNDSSm addr:$src1, timm:$src2)>;
5690  def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
5691            (ROUNDSDm addr:$src1, timm:$src2)>;
5692}
5693
5694//===----------------------------------------------------------------------===//
5695// SSE4.1 - Packed Bit Test
5696//===----------------------------------------------------------------------===//
5697
5698// ptest instruction: X86ISelLowering lowers to this, primarily from
5699// the Intel intrinsic that corresponds to it.
//
// VEX-encoded forms (opcode 0x17), gated on HasAVX. Each record has no
// register outputs and sets EFLAGS from X86ptest of its two sources. The
// 256-bit "Y" variants add VEX_L and use the YMM scheduling class.
5700let Defs = [EFLAGS], Predicates = [HasAVX] in {
5701def VPTESTrr  : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
5702                "vptest\t{$src2, $src1|$src1, $src2}",
5703                [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
5704                Sched<[SchedWriteVecTest.XMM]>, VEX, VEX_WIG;
5705def VPTESTrm  : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
5706                "vptest\t{$src2, $src1|$src1, $src2}",
5707                [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
5708                Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>,
5709                VEX, VEX_WIG;
5710
5711def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
5712                "vptest\t{$src2, $src1|$src1, $src2}",
5713                [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
5714                Sched<[SchedWriteVecTest.YMM]>, VEX, VEX_L, VEX_WIG;
5715def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
5716                "vptest\t{$src2, $src1|$src1, $src2}",
5717                [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
5718                Sched<[SchedWriteVecTest.YMM.Folded, SchedWriteVecTest.YMM.ReadAfterFold]>,
5719                VEX, VEX_L, VEX_WIG;
5720}
5721
// Legacy-encoded ptest (opcode 0x17). Each record has no register outputs and
// sets EFLAGS from X86ptest of its two sources; the memory form uses the
// memopv2i64 fragment.
5722let Defs = [EFLAGS] in {
5723def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
5724              "ptest\t{$src2, $src1|$src1, $src2}",
5725              [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
5726              Sched<[SchedWriteVecTest.XMM]>;
5727def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
5728              "ptest\t{$src2, $src1|$src1, $src2}",
5729              [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
5730              Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>;
5731}
5732
5733// The bit test instructions below are AVX only
//
// avx_bittest - defines an rr and an rm record, both VEX-tagged, with no
// register outputs; each sets EFLAGS from X86testp of its two sources.
// opc is the opcode byte, OpcodeStr the mnemonic, RC the register class of
// both sources, x86memop / mem_frag the memory operand and load fragment for
// the rm form, vt the value type applied to the second register source, and
// sched the scheduling class.
5734multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
5735                       X86MemOperand x86memop, PatFrag mem_frag, ValueType vt,
5736                       X86FoldableSchedWrite sched> {
5737  def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
5738            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
5739            [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
5740            Sched<[sched]>, VEX;
5741  def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
5742            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
5743            [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
5744            Sched<[sched.Folded, sched.ReadAfterFold]>, VEX;
5745}
5746
// Instantiations of avx_bittest, gated on HasAVX and declared as defining
// EFLAGS. Opcode 0x0E is used for the single-precision forms and 0x0F for the
// double-precision forms; the 256-bit "Y" variants add VEX_L and use the YMM
// scheduling class.
5747let Defs = [EFLAGS], Predicates = [HasAVX] in {
5748let ExeDomain = SSEPackedSingle in {
5749defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32,
5750                            SchedWriteFTest.XMM>;
5751defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32,
5752                            SchedWriteFTest.YMM>, VEX_L;
5753}
5754let ExeDomain = SSEPackedDouble in {
5755defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64,
5756                            SchedWriteFTest.XMM>;
5757defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64,
5758                            SchedWriteFTest.YMM>, VEX_L;
5759}
5760}
5761
5762//===----------------------------------------------------------------------===//
5763// SSE4.1 - Misc Instructions
5764//===----------------------------------------------------------------------===//
5765
// popcnt for 16-, 32- and 64-bit general-purpose registers, gated on
// HasPOPCNT. All six records share opcode 0xB8 and the XS tag; each sets its
// destination to ctpop of the source and lists EFLAGS as an implicit result.
// The 16- and 32-bit records use class I with OpSize16 / OpSize32, while the
// 64-bit records use class RI. The rm forms fold a load of the matching width.
5766let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
5767  def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
5768                     "popcnt{w}\t{$src, $dst|$dst, $src}",
5769                     [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
5770                     Sched<[WritePOPCNT]>, OpSize16, XS;
5771  def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
5772                     "popcnt{w}\t{$src, $dst|$dst, $src}",
5773                     [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
5774                      (implicit EFLAGS)]>,
5775                      Sched<[WritePOPCNT.Folded]>, OpSize16, XS;
5776
5777  def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
5778                     "popcnt{l}\t{$src, $dst|$dst, $src}",
5779                     [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>,
5780                     Sched<[WritePOPCNT]>, OpSize32, XS;
5781
5782  def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
5783                     "popcnt{l}\t{$src, $dst|$dst, $src}",
5784                     [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
5785                      (implicit EFLAGS)]>,
5786                      Sched<[WritePOPCNT.Folded]>, OpSize32, XS;
5787
5788  def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
5789                      "popcnt{q}\t{$src, $dst|$dst, $src}",
5790                      [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>,
5791                      Sched<[WritePOPCNT]>, XS;
5792  def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
5793                      "popcnt{q}\t{$src, $dst|$dst, $src}",
5794                      [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
5795                       (implicit EFLAGS)]>,
5796                       Sched<[WritePOPCNT.Folded]>, XS;
5797}
5798
5799// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
//
// opc is the opcode byte, OpcodeStr the mnemonic, OpNode the DAG node matched
// by both patterns, ld_frag the load fragment used by the rm form, and Sched
// the scheduling class (Sched.Folded for the rm form).
//   rr - VR128 source, v8i16 result.
//   rm - i128mem source loaded through ld_frag, v8i16 result.
5800multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
5801                                 SDNode OpNode, PatFrag ld_frag,
5802                                 X86FoldableSchedWrite Sched> {
5803  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
5804                 (ins VR128:$src),
5805                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5806                 [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>,
5807                 Sched<[Sched]>;
5808  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
5809                  (ins i128mem:$src),
5810                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5811                  [(set VR128:$dst,
5812                    (v8i16 (OpNode (ld_frag addr:$src))))]>,
5813                 Sched<[Sched.Folded]>;
5814}
5815
5816// PHMIN has the same profile as PSAD, thus we use the same scheduling
5817// model, although the naming is misleading.
// NOTE(review): both instantiations below pass WritePHMINPOS, not a
// PSAD-named scheduling class, so the comment above looks stale -- confirm
// and update or remove it.
//
// Instantiations of SS41I_unop_rm_int_v16 with opcode 0x41 and X86phminpos.
// The VEX variant is gated on HasAVX and uses the `load` fragment; the legacy
// variant uses `memop`.
5818let Predicates = [HasAVX] in
5819defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw",
5820                                         X86phminpos, load,
5821                                         WritePHMINPOS>, VEX, VEX_WIG;
5822defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
5823                                         X86phminpos, memop,
5824                                         WritePHMINPOS>;
5825
5826/// SS48I_binop_rm - Simple SSE41 binary operator.
///
/// opc is the opcode byte, OpcodeStr the mnemonic, OpNode the DAG node
/// matched, OpVT the result value type, RC the register class of the
/// destination and register sources, memop_frag / x86memop the load fragment
/// and memory operand for the rm form, and sched the scheduling class.
/// Is2Addr selects the assembly operand string: when set, $src1 is left out
/// of the printed operands; when clear, both sources are printed.
5827multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
5828                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
5829                          X86MemOperand x86memop, X86FoldableSchedWrite sched,
5830                          bit Is2Addr = 1> {
  // Register-register form. Only this record is marked isCommutable.
5831  let isCommutable = 1 in
5832  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
5833       (ins RC:$src1, RC:$src2),
5834       !if(Is2Addr,
5835           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5836           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5837       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
5838       Sched<[sched]>;
  // Register-memory form: the second operand is loaded through memop_frag.
5839  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
5840       (ins RC:$src1, x86memop:$src2),
5841       !if(Is2Addr,
5842           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5843           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5844       [(set RC:$dst,
5845         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
5846       Sched<[sched.Folded, sched.ReadAfterFold]>;
5847}
5848
// 128-bit VEX-encoded instantiations of SS48I_binop_rm (Is2Addr = 0, `load`
// fragment, i128mem operand). The 32-bit-element min/max forms and vpmuldq
// are gated on [HasAVX, NoVLX].
5849let Predicates = [HasAVX, NoVLX] in {
5850  defm VPMINSD   : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
5851                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5852                                  VEX_4V, VEX_WIG;
5853  defm VPMINUD   : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
5854                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5855                                  VEX_4V, VEX_WIG;
5856  defm VPMAXSD   : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
5857                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5858                                  VEX_4V, VEX_WIG;
5859  defm VPMAXUD   : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
5860                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5861                                  VEX_4V, VEX_WIG;
5862  defm VPMULDQ   : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128,
5863                                  load, i128mem, SchedWriteVecIMul.XMM, 0>,
5864                                  VEX_4V, VEX_WIG;
5865}
// The 8- and 16-bit-element min/max forms are gated on
// [HasAVX, NoVLX_Or_NoBWI] instead.
5866let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5867  defm VPMINSB   : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
5868                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5869                                  VEX_4V, VEX_WIG;
5870  defm VPMINUW   : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
5871                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5872                                  VEX_4V, VEX_WIG;
5873  defm VPMAXSB   : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
5874                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5875                                  VEX_4V, VEX_WIG;
5876  defm VPMAXUW   : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
5877                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5878                                  VEX_4V, VEX_WIG;
5879}
5880
// 256-bit VEX-encoded instantiations of SS48I_binop_rm (Is2Addr = 0, `load`
// fragment, i256mem operand, VR256, VEX_L, YMM scheduling classes). The
// 32-bit-element min/max forms and vpmuldq are gated on [HasAVX2, NoVLX].
5881let Predicates = [HasAVX2, NoVLX] in {
5882  defm VPMINSDY  : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
5883                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5884                                  VEX_4V, VEX_L, VEX_WIG;
5885  defm VPMINUDY  : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
5886                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5887                                  VEX_4V, VEX_L, VEX_WIG;
5888  defm VPMAXSDY  : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
5889                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5890                                  VEX_4V, VEX_L, VEX_WIG;
5891  defm VPMAXUDY  : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
5892                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5893                                  VEX_4V, VEX_L, VEX_WIG;
5894  defm VPMULDQY  : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256,
5895                                  load, i256mem, SchedWriteVecIMul.YMM, 0>,
5896                                  VEX_4V, VEX_L, VEX_WIG;
5897}
// The 8- and 16-bit-element min/max forms are gated on
// [HasAVX2, NoVLX_Or_NoBWI] instead.
5898let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
5899  defm VPMINSBY  : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
5900                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5901                                  VEX_4V, VEX_L, VEX_WIG;
5902  defm VPMINUWY  : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
5903                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5904                                  VEX_4V, VEX_L, VEX_WIG;
5905  defm VPMAXSBY  : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
5906                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5907                                  VEX_4V, VEX_L, VEX_WIG;
5908  defm VPMAXUWY  : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
5909                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5910                                  VEX_4V, VEX_L, VEX_WIG;
5911}
5912
// Legacy-encoded instantiations of SS48I_binop_rm: Is2Addr = 1, `memop`
// fragment, i128mem operand, with $src1 tied to $dst. No predicate is
// attached at this site.
5913let Constraints = "$src1 = $dst" in {
5914  defm PMINSB   : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
5915                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5916  defm PMINSD   : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
5917                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5918  defm PMINUD   : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
5919                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5920  defm PMINUW   : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
5921                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5922  defm PMAXSB   : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
5923                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5924  defm PMAXSD   : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
5925                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5926  defm PMAXUD   : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
5927                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5928  defm PMAXUW   : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
5929                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5930  defm PMULDQ   : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128,
5931                                 memop, i128mem, SchedWriteVecIMul.XMM, 1>;
5932}
5933
// pmulld (opcode 0x40, `mul`) and pcmpeqq (opcode 0x29, X86pcmpeq)
// instantiations of SS48I_binop_rm.
// The VEX pmulld forms carry a NoVLX predicate alongside HasAVX / HasAVX2,
// while the VEX pcmpeqq forms are gated on HasAVX / HasAVX2 alone.
5934let Predicates = [HasAVX, NoVLX] in
5935  defm VPMULLD  : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
5936                                 load, i128mem, SchedWritePMULLD.XMM, 0>,
5937                                 VEX_4V, VEX_WIG;
5938let Predicates = [HasAVX] in
5939  defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
5940                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
5941                                 VEX_4V, VEX_WIG;
5942
// 256-bit variants: VR256, i256mem, VEX_L, YMM scheduling classes.
5943let Predicates = [HasAVX2, NoVLX] in
5944  defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
5945                                  load, i256mem, SchedWritePMULLD.YMM, 0>,
5946                                  VEX_4V, VEX_L, VEX_WIG;
5947let Predicates = [HasAVX2] in
5948  defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
5949                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5950                                  VEX_4V, VEX_L, VEX_WIG;
5951
// Legacy variants: Is2Addr = 1, `memop` fragment, with $src1 tied to $dst.
5952let Constraints = "$src1 = $dst" in {
5953  defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
5954                                memop, i128mem, SchedWritePMULLD.XMM, 1>;
5955  defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
5956                                memop, i128mem, SchedWriteVecALU.XMM, 1>;
5957}
5958
5959/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
// Intrinsic-matching variant: the selection patterns match IntId directly.
// Each instantiation produces two records:
//   rri - both vector sources in registers (MRMSrcReg).
//   rmi - second source in memory (MRMSrcMem), matched through memop_frag.
// Template arguments:
//   opc        - opcode byte handed to SS4AIi8.
//   OpcodeStr  - mnemonic prepended to the operand string.
//   IntId      - intrinsic matched by both patterns.
//   RC         - register class of $dst and $src1 (and of $src2 in rri).
//   memop_frag - load fragment applied to addr:$src2 in rmi.
//   x86memop   - memory operand type of $src2 in rmi.
//   Is2Addr    - selects the assembly string: non-zero prints only $src2 and
//                the immediate as sources, zero also prints $src1.
//   sched      - scheduling info; rmi uses sched.Folded / sched.ReadAfterFold.
5960multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
5961                 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
5962                 X86MemOperand x86memop, bit Is2Addr,
5963                 X86FoldableSchedWrite sched> {
  // Only the register form is marked commutable here.
  // NOTE(review): several instantiations wrap the defm in
  // `let isCommutable = 0`; presumably that is meant to override this
  // setting -- confirm against TableGen's `let` ordering rules.
5964  let isCommutable = 1 in
5965  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
5966        (ins RC:$src1, RC:$src2, u8imm:$src3),
5967        !if(Is2Addr,
5968            !strconcat(OpcodeStr,
5969                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5970            !strconcat(OpcodeStr,
5971                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5972        [(set RC:$dst, (IntId RC:$src1, RC:$src2, timm:$src3))]>,
5973        Sched<[sched]>;
  // Memory form: identical assembly strings, second source loaded from $src2.
5974  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
5975        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
5976        !if(Is2Addr,
5977            !strconcat(OpcodeStr,
5978                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5979            !strconcat(OpcodeStr,
5980                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5981        [(set RC:$dst,
5982          (IntId RC:$src1, (memop_frag addr:$src2), timm:$src3))]>,
5983        Sched<[sched.Folded, sched.ReadAfterFold]>;
5984}
5985
5986/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
// SDNode-matching counterpart of SS41I_binop_rmi_int: the patterns match
// OpNode and wrap the result in OpVT instead of matching an intrinsic.
// Each instantiation produces:
//   rri - register/register/immediate form (MRMSrcReg), marked commutable.
//   rmi - register/memory/immediate form (MRMSrcMem); $src2 is matched
//         through memop_frag and scheduled with the folded-load classes.
// Is2Addr only selects between the two- and three-source assembly strings.
5987multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
5988                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
5989                           X86MemOperand x86memop, bit Is2Addr,
5990                           X86FoldableSchedWrite sched> {
5991  let isCommutable = 1 in
5992  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
5993        (ins RC:$src1, RC:$src2, u8imm:$src3),
5994        !if(Is2Addr,
5995            !strconcat(OpcodeStr,
5996                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5997            !strconcat(OpcodeStr,
5998                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5999        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
6000        Sched<[sched]>;
6001  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
6002        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
6003        !if(Is2Addr,
6004            !strconcat(OpcodeStr,
6005                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6006            !strconcat(OpcodeStr,
6007                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6008        [(set RC:$dst,
6009          (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
6010        Sched<[sched.Folded, sched.ReadAfterFold]>;
6011}
6012
// Immediate transform for a 2-bit blend mask: keeps only the low two bits of
// the incoming immediate and inverts them. Passed as commuteXForm where the
// two blend sources are swapped (see the commute pattern in SS41I_blend_rmi).
6013def BlendCommuteImm2 : SDNodeXForm<timm, [{
6014  uint8_t Imm = N->getZExtValue() & 0x03;
6015  return getI8Imm(Imm ^ 0x03, SDLoc(N));
6016}]>;
6017
// Immediate transform for a 4-bit blend mask: keeps only the low four bits of
// the incoming immediate and inverts them. Used where the two blend sources
// are swapped.
6018def BlendCommuteImm4 : SDNodeXForm<timm, [{
6019  uint8_t Imm = N->getZExtValue() & 0x0f;
6020  return getI8Imm(Imm ^ 0x0f, SDLoc(N));
6021}]>;
6022
// Immediate transform for an 8-bit blend mask: inverts all eight bits of the
// incoming immediate. Used where the two blend sources are swapped.
6023def BlendCommuteImm8 : SDNodeXForm<timm, [{
6024  uint8_t Imm = N->getZExtValue() & 0xff;
6025  return getI8Imm(Imm ^ 0xff, SDLoc(N));
6026}]>;
6027
6028// Turn a 4-bit blendi immediate to 8-bit for use with pblendw.
// Each set bit i (0..3) of the input becomes the bit pair 2*i, 2*i+1 in the
// result; input bits above bit 3 are ignored by the loop.
6029def BlendScaleImm4 : SDNodeXForm<timm, [{
6030  uint8_t Imm = N->getZExtValue();
6031  uint8_t NewImm = 0;
6032  for (unsigned i = 0; i != 4; ++i) {
6033    if (Imm & (1 << i))
6034      NewImm |= 0x3 << (i * 2);
6035  }
6036  return getI8Imm(NewImm, SDLoc(N));
6037}]>;
6038
6039// Turn a 2-bit blendi immediate to 8-bit for use with pblendw.
// Each set bit i (0..1) of the input becomes the nibble at bits 4*i..4*i+3 in
// the result; input bits above bit 1 are ignored by the loop.
6040def BlendScaleImm2 : SDNodeXForm<timm, [{
6041  uint8_t Imm = N->getZExtValue();
6042  uint8_t NewImm = 0;
6043  for (unsigned i = 0; i != 2; ++i) {
6044    if (Imm & (1 << i))
6045      NewImm |= 0xf << (i * 4);
6046  }
6047  return getI8Imm(NewImm, SDLoc(N));
6048}]>;
6049
6050// Turn a 2-bit blendi immediate to 4-bit for use with pblendd.
// Each set bit i (0..1) of the input becomes the bit pair 2*i, 2*i+1 in the
// result, so the output occupies at most the low four bits.
6051def BlendScaleImm2to4 : SDNodeXForm<timm, [{
6052  uint8_t Imm = N->getZExtValue();
6053  uint8_t NewImm = 0;
6054  for (unsigned i = 0; i != 2; ++i) {
6055    if (Imm & (1 << i))
6056      NewImm |= 0x3 << (i * 2);
6057  }
6058  return getI8Imm(NewImm, SDLoc(N));
6059}]>;
6060
6061// Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it.
// Same bit-pair expansion as BlendScaleImm4, followed by inverting all eight
// result bits (XOR with 0xff) for use when the blend sources are swapped.
6062def BlendScaleCommuteImm4 : SDNodeXForm<timm, [{
6063  uint8_t Imm = N->getZExtValue();
6064  uint8_t NewImm = 0;
6065  for (unsigned i = 0; i != 4; ++i) {
6066    if (Imm & (1 << i))
6067      NewImm |= 0x3 << (i * 2);
6068  }
6069  return getI8Imm(NewImm ^ 0xff, SDLoc(N));
6070}]>;
6071
6072// Turn a 2-bit blendi immediate to 8-bit for use with pblendw and invert it.
// Same nibble expansion as BlendScaleImm2, followed by inverting all eight
// result bits (XOR with 0xff) for use when the blend sources are swapped.
6073def BlendScaleCommuteImm2 : SDNodeXForm<timm, [{
6074  uint8_t Imm = N->getZExtValue();
6075  uint8_t NewImm = 0;
6076  for (unsigned i = 0; i != 2; ++i) {
6077    if (Imm & (1 << i))
6078      NewImm |= 0xf << (i * 4);
6079  }
6080  return getI8Imm(NewImm ^ 0xff, SDLoc(N));
6081}]>;
6082
6083// Turn a 2-bit blendi immediate to 4-bit for use with pblendd and invert it.
// Same bit-pair expansion as BlendScaleImm2to4, followed by inverting only the
// low four result bits (XOR with 0xf); the upper four bits stay zero.
6084def BlendScaleCommuteImm2to4 : SDNodeXForm<timm, [{
6085  uint8_t Imm = N->getZExtValue();
6086  uint8_t NewImm = 0;
6087  for (unsigned i = 0; i != 2; ++i) {
6088    if (Imm & (1 << i))
6089      NewImm |= 0x3 << (i * 2);
6090  }
6091  return getI8Imm(NewImm ^ 0xf, SDLoc(N));
6092}]>;
6093
// MPSADBW (0x42), DPPS (0x40) and DPPD (0x41) instantiations of
// SS41I_binop_rmi_int.
//   * VEX forms pass Is2Addr = 0, use `load`, and add VEX_4V (plus VEX_L for
//     the 256-bit variants).
//   * Legacy forms pass Is2Addr = 1, use `memop`, and tie $src1 to $dst.
// All MPSADBW variants are wrapped in `let isCommutable = 0`.
6094let Predicates = [HasAVX] in {
6095  let isCommutable = 0 in {
6096    defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
6097                                        VR128, load, i128mem, 0,
6098                                        SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG;
6099  }
6100
// The VEX dot-product forms declare their MXCSR use and FP-exception
// behaviour explicitly.
6101let Uses = [MXCSR], mayRaiseFPException = 1 in {
6102  let ExeDomain = SSEPackedSingle in
6103  defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
6104                                   VR128, load, f128mem, 0,
6105                                   SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG;
6106  let ExeDomain = SSEPackedDouble in
6107  defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
6108                                   VR128, load, f128mem, 0,
6109                                   SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG;
  // NOTE(review): this floating-point instruction uses i256mem while its
  // 128-bit siblings above use f128mem -- confirm whether f256mem was intended.
6110  let ExeDomain = SSEPackedSingle in
6111  defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
6112                                    VR256, load, i256mem, 0,
6113                                    SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG;
6114}
6115}
6116
6117let Predicates = [HasAVX2] in {
6118  let isCommutable = 0 in {
6119  defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
6120                                  VR256, load, i256mem, 0,
6121                                  SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, VEX_WIG;
6122  }
6123}
6124
// Legacy encodings. DPPS/DPPD append SIMD_EXC instead of the explicit
// `Uses = [MXCSR], mayRaiseFPException = 1` used for the VEX forms.
// NOTE(review): SIMD_EXC is defined outside this excerpt; presumably it sets
// the same two properties -- confirm.
6125let Constraints = "$src1 = $dst" in {
6126  let isCommutable = 0 in {
6127  defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
6128                                     VR128, memop, i128mem, 1,
6129                                     SchedWriteMPSAD.XMM>;
6130  }
6131
6132  let ExeDomain = SSEPackedSingle in
6133  defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
6134                                  VR128, memop, f128mem, 1,
6135                                  SchedWriteDPPS.XMM>, SIMD_EXC;
6136  let ExeDomain = SSEPackedDouble in
6137  defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
6138                                  VR128, memop, f128mem, 1,
6139                                  SchedWriteDPPD.XMM>, SIMD_EXC;
6140}
6141
6142/// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate
// Produces the rri / rmi instruction pair plus one anonymous pattern that
// handles a load appearing as the FIRST blend source.
// Differences from SS41I_binop_rmi:
//   * Is2Addr also controls the "$src1 = $dst" constraint, not just the
//     assembly string.
//   * d sets the execution domain of both records.
//   * commuteXForm is the immediate transform applied when the extra pattern
//     swaps the operands so the load can be folded into rmi.
6143multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
6144                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
6145                           X86MemOperand x86memop, bit Is2Addr, Domain d,
6146                           X86FoldableSchedWrite sched, SDNodeXForm commuteXForm> {
6147let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
6148  let isCommutable = 1 in
6149  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
6150        (ins RC:$src1, RC:$src2, u8imm:$src3),
6151        !if(Is2Addr,
6152            !strconcat(OpcodeStr,
6153                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6154            !strconcat(OpcodeStr,
6155                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6156        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
6157        Sched<[sched]>;
6158  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
6159        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
6160        !if(Is2Addr,
6161            !strconcat(OpcodeStr,
6162                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6163            !strconcat(OpcodeStr,
6164                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6165        [(set RC:$dst,
6166          (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
6167        Sched<[sched.Folded, sched.ReadAfterFold]>;
6168}
6169
6170  // Pattern to commute if load is in first source.
  // The register operand moves to the first instruction operand, the address
  // to the second, and the immediate is rewritten through commuteXForm.
6171  def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, timm:$src3)),
6172            (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
6173                                            (commuteXForm timm:$src3))>;
6174}
6175
// VEX-encoded immediate blends (Is2Addr = 0, `load` fragment). The commute
// transform width follows the element count of the value type:
//   v2f64 -> BlendCommuteImm2, v4f32 / v4f64 -> BlendCommuteImm4,
//   v8f32 / v8i16 / v16i16 -> BlendCommuteImm8.
6176let Predicates = [HasAVX] in {
6177  defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
6178                                  VR128, load, f128mem, 0, SSEPackedSingle,
6179                                  SchedWriteFBlend.XMM, BlendCommuteImm4>,
6180                                  VEX_4V, VEX_WIG;
6181  defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
6182                                   VR256, load, f256mem, 0, SSEPackedSingle,
6183                                   SchedWriteFBlend.YMM, BlendCommuteImm8>,
6184                                   VEX_4V, VEX_L, VEX_WIG;
6185  defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
6186                                  VR128, load, f128mem, 0, SSEPackedDouble,
6187                                  SchedWriteFBlend.XMM, BlendCommuteImm2>,
6188                                  VEX_4V, VEX_WIG;
6189  defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
6190                                   VR256, load, f256mem, 0, SSEPackedDouble,
6191                                   SchedWriteFBlend.YMM, BlendCommuteImm4>,
6192                                   VEX_4V, VEX_L, VEX_WIG;
6193  defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
6194                                  VR128, load, i128mem, 0, SSEPackedInt,
6195                                  SchedWriteBlend.XMM, BlendCommuteImm8>,
6196                                  VEX_4V, VEX_WIG;
6197}
6198
// The 256-bit word blend is gated on HasAVX2 rather than HasAVX.
6199let Predicates = [HasAVX2] in {
6200  defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
6201                                   VR256, load, i256mem, 0, SSEPackedInt,
6202                                   SchedWriteBlend.YMM, BlendCommuteImm8>,
6203                                   VEX_4V, VEX_L, VEX_WIG;
6204}
6205
6206// Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw.
6207// ExecutionDomainFixPass will cleanup domains later on.
// Each element type gets three patterns: register/register, load in the
// second source, and load in the first source (operands swapped, immediate
// passed through a commute transform).
// 256-bit types reuse the FP blend with the immediate unchanged; 128-bit
// types go through VPBLENDW with the immediate widened by BlendScaleImm*.
6208let Predicates = [HasAVX1Only] in {
6209def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
6210          (VBLENDPDYrri VR256:$src1, VR256:$src2, timm:$src3)>;
6211def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
6212          (VBLENDPDYrmi VR256:$src1, addr:$src2, timm:$src3)>;
6213def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
6214          (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 timm:$src3))>;
6215
6216// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6217// it from becoming movsd via commuting under optsize.
6218def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
6219          (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
6220def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
6221          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
6222def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
6223          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;
6224
6225def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), timm:$src3),
6226          (VBLENDPSYrri VR256:$src1, VR256:$src2, timm:$src3)>;
6227def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), timm:$src3),
6228          (VBLENDPSYrmi VR256:$src1, addr:$src2, timm:$src3)>;
6229def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, timm:$src3),
6230          (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 timm:$src3))>;
6231
6232// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6233// it from becoming movss via commuting under optsize.
6234def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
6235          (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
6236def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), timm:$src3),
6237          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
6238def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, timm:$src3),
6239          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
6240}
6241
// Legacy (non-VEX) immediate blends: Is2Addr = 1, so SS41I_blend_rmi ties
// $src1 to $dst, and memory sources are matched with `memop`.
6242defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
6243                               VR128, memop, f128mem, 1, SSEPackedSingle,
6244                               SchedWriteFBlend.XMM, BlendCommuteImm4>;
6245defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64,
6246                               VR128, memop, f128mem, 1, SSEPackedDouble,
6247                               SchedWriteFBlend.XMM, BlendCommuteImm2>;
6248defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
6249                               VR128, memop, i128mem, 1, SSEPackedInt,
6250                               SchedWriteBlend.XMM, BlendCommuteImm8>;
6251
// SSE4.1 counterpart of the 128-bit HasAVX1Only patterns: v2i64 and v4i32
// blends are selected as PBLENDW with the immediate widened to 8 bits
// (BlendScaleImm2 / BlendScaleImm4), using memopv* fragments for loads and the
// BlendScaleCommuteImm* transforms when the load is the first source.
6252let Predicates = [UseSSE41] in {
6253// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6254// it from becoming movss via commuting under optsize.
6255def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
6256          (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
6257def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), timm:$src3),
6258          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
6259def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, timm:$src3),
6260          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;
6261
6262def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
6263          (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
6264def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), timm:$src3),
6265          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
6266def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, timm:$src3),
6267          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
6268}
6269
6270// For insertion into the zero index (low half) of a 256-bit vector, it is
6271// more efficient to generate a blend with immediate instead of an insert*128.
// In every pattern the 128-bit value is first widened to 256 bits by
// INSERT_SUBREG into an IMPLICIT_DEF (upper half undefined).
6272let Predicates = [HasAVX] in {
// Register destination vector: the widened subvector is the second blend
// source and the immediate is 0x3 (f64) / 0xf (f32).
6273def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)),
6274          (VBLENDPDYrri VR256:$src1,
6275                        (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
6276                                       VR128:$src2, sub_xmm), 0x3)>;
6277def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
6278          (VBLENDPSYrri VR256:$src1,
6279                        (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
6280                                       VR128:$src2, sub_xmm), 0xf)>;
6281
// Loaded destination vector: the widened subvector becomes the first blend
// source, the load the second, and the complementary immediates 0xc / 0xf0
// are used.
6282def : Pat<(insert_subvector (loadv4f64 addr:$src2), (v2f64 VR128:$src1), (iPTR 0)),
6283          (VBLENDPDYrmi (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
6284                                       VR128:$src1, sub_xmm), addr:$src2, 0xc)>;
6285def : Pat<(insert_subvector (loadv8f32 addr:$src2), (v4f32 VR128:$src1), (iPTR 0)),
6286          (VBLENDPSYrmi (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
6287                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
6288}
6289
6290/// SS41I_quaternary_avx - AVX SSE 4.1 with 4 operands
// Four-operand VEX instruction whose last source ($src3) is a register, built
// on Ii8Reg. Produces:
//   rr - all three sources in registers.
//   rm - $src2 in memory, matched through mem_frag.
// Both patterns pass the operands to OpNode in reverse order
// ($src3, $src2, $src1) relative to the instruction's `ins` list.
// Both records are placed in the SSEPackedInt domain here; instantiations may
// wrap the defm in their own `let ExeDomain`.
6291multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC,
6292                                X86MemOperand x86memop, ValueType VT,
6293                                PatFrag mem_frag, SDNode OpNode,
6294                                X86FoldableSchedWrite sched> {
6295  def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
6296                  (ins RC:$src1, RC:$src2, RC:$src3),
6297                  !strconcat(OpcodeStr,
6298                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
6299                  [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))],
6300                  SSEPackedInt>, TAPD, VEX_4V,
6301                Sched<[sched]>;
6302
  // The Sched list for the memory form has one entry per operand: five
  // ReadDefault entries cover the components of the memory operand, and the
  // trailing register $src3 gets ReadAfterFold like $src1.
6303  def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
6304                  (ins RC:$src1, x86memop:$src2, RC:$src3),
6305                  !strconcat(OpcodeStr,
6306                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
6307                  [(set RC:$dst,
6308                        (OpNode RC:$src3, (mem_frag addr:$src2),
6309                                RC:$src1))], SSEPackedInt>, TAPD, VEX_4V,
6310                Sched<[sched.Folded, sched.ReadAfterFold,
6311                       // x86memop:$src2
6312                       ReadDefault, ReadDefault, ReadDefault, ReadDefault,
6313                       ReadDefault,
6314                       // RC::$src3
6315                       sched.ReadAfterFold]>;
6316}
6317
// VEX variable blends built from SS41I_quaternary_avx, all matching X86Blendv:
//   VBLENDVPD (0x4B), VBLENDVPS (0x4A), VPBLENDVB (0x4C).
// The FP variants are wrapped in their own ExeDomain; the byte blend is not.
// 256-bit variants append VEX_L.
6318let Predicates = [HasAVX] in {
6319let ExeDomain = SSEPackedDouble in {
6320defm VBLENDVPD  : SS41I_quaternary_avx<0x4B, "vblendvpd", VR128, f128mem,
6321                                       v2f64, loadv2f64, X86Blendv,
6322                                       SchedWriteFVarBlend.XMM>;
6323defm VBLENDVPDY : SS41I_quaternary_avx<0x4B, "vblendvpd", VR256, f256mem,
6324                                       v4f64, loadv4f64, X86Blendv,
6325                                       SchedWriteFVarBlend.YMM>, VEX_L;
6326} // ExeDomain = SSEPackedDouble
6327let ExeDomain = SSEPackedSingle in {
6328defm VBLENDVPS  : SS41I_quaternary_avx<0x4A, "vblendvps", VR128, f128mem,
6329                                       v4f32, loadv4f32, X86Blendv,
6330                                       SchedWriteFVarBlend.XMM>;
6331defm VBLENDVPSY : SS41I_quaternary_avx<0x4A, "vblendvps", VR256, f256mem,
6332                                       v8f32, loadv8f32, X86Blendv,
6333                                       SchedWriteFVarBlend.YMM>, VEX_L;
6334} // ExeDomain = SSEPackedSingle
6335defm VPBLENDVB  : SS41I_quaternary_avx<0x4C, "vpblendvb", VR128, i128mem,
6336                                       v16i8, loadv16i8, X86Blendv,
6337                                       SchedWriteVarBlend.XMM>;
6338}
6339
// The 256-bit byte blend is gated on HasAVX2 rather than HasAVX.
6340let Predicates = [HasAVX2] in {
6341defm VPBLENDVBY : SS41I_quaternary_avx<0x4C, "vpblendvb", VR256, i256mem,
6342                                       v32i8, loadv32i8, X86Blendv,
6343                                       SchedWriteVarBlend.YMM>, VEX_L;
6344}
6345
// Select 32- and 64-bit integer X86Blendv nodes onto the FP variable-blend
// instructions (PS for i32 elements, PD for i64 elements). Only
// register/register forms are provided. The node's (mask, src1, src2) operand
// order maps to the instruction's (src2, src1, mask) order.
6346let Predicates = [HasAVX] in {
6347  def : Pat<(v4i32 (X86Blendv (v4i32 VR128:$mask), (v4i32 VR128:$src1),
6348                              (v4i32 VR128:$src2))),
6349            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
6350  def : Pat<(v2i64 (X86Blendv (v2i64 VR128:$mask), (v2i64 VR128:$src1),
6351                              (v2i64 VR128:$src2))),
6352            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
6353  def : Pat<(v8i32 (X86Blendv (v8i32 VR256:$mask), (v8i32 VR256:$src1),
6354                              (v8i32 VR256:$src2))),
6355            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
6356  def : Pat<(v4i64 (X86Blendv (v4i64 VR256:$mask), (v4i64 VR256:$src1),
6357                              (v4i64 VR256:$src2))),
6358            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
6359}
6360
6361// Prefer a movss or movsd over a blendps when optimizing for size. These were
6362// changed to use blends because blends have better throughput on Sandy Bridge
6363// and Haswell, but movs[s/d] are 1-2 byte shorter instructions.
// Blend-based selections for X86vzmovl / X86Movss / X86Movsd, enabled only
// when both HasAVX and OptForSpeed hold.
6364let Predicates = [HasAVX, OptForSpeed] in {
  // Zero-extending low-element move: blend the source into an all-zero
  // register (V_SET0). The v4i32 case uses VPBLENDW with immediate 3.
6365  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
6366            (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
6367  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
6368            (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
6369
  // Movss: immediate 1 in the natural operand order; when the load is the
  // first source the operands are swapped and the immediate becomes 0xe.
6370  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
6371            (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
6372  def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
6373            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
6374  def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
6375            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
6376
  // Movsd: same scheme with a 2-bit mask (1, or 2 when operands are swapped).
6377  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
6378            (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
6379  def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
6380            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
6381  def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
6382            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
6383
6384  // Move low f32 and clear high bits.
  // The 256-bit forms operate on the low xmm subregister and wrap the 128-bit
  // result with SUBREG_TO_REG.
6385  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
6386            (SUBREG_TO_REG (i32 0),
6387             (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
6388                          (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
6389                          (i8 1))), sub_xmm)>;
6390  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
6391            (SUBREG_TO_REG (i32 0),
6392             (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
6393                          (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
6394                          (i8 3))), sub_xmm)>;
6395}
6396
6397// Prefer a movss or movsd over a blendps when optimizing for size. These were
6398// changed to use blends because blends have better throughput on Sandy Bridge
6399// and Haswell, but movs[s/d] are 1-2 byte shorter instructions.
// Legacy-encoding counterpart of the HasAVX/OptForSpeed patterns: same
// immediates, non-VEX instructions, and memopv* fragments for loads. No
// 256-bit patterns are provided here.
6400let Predicates = [UseSSE41, OptForSpeed] in {
6401  // With SSE41 we can use blends for these patterns.
6402  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
6403            (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
6404  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
6405            (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
6406
6407  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
6408            (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
6409  def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))),
6410            (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
  // Load in the first source: operands swapped, immediate 1 becomes 0xe.
6411  def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
6412            (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
6413
6414  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
6415            (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
6416  def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))),
6417            (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
  // Load in the first source: operands swapped, immediate 1 becomes 2.
6418  def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
6419            (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
6420}
6421
6422
6423/// SS41I_ternary - SSE 4.1 ternary operator
// Legacy variable-blend shape: the third operand is the implicit register
// XMM0 (declared via `Uses`), and $dst is tied to $src1.
// Produces:
//   rr0 - $src2 in a register.
//   rm0 - $src2 in memory, matched through mem_frag.
// The assembly string prints xmm0 explicitly; the patterns pass operands to
// OpNode as (XMM0, $src2, $src1).
6424let Uses = [XMM0], Constraints = "$src1 = $dst" in {
6425  multiclass SS41I_ternary<bits<8> opc, string OpcodeStr, ValueType VT,
6426                           PatFrag mem_frag, X86MemOperand x86memop,
6427                           SDNode OpNode, X86FoldableSchedWrite sched> {
6428    def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
6429                    (ins VR128:$src1, VR128:$src2),
6430                    !strconcat(OpcodeStr,
6431                     "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6432                    [(set VR128:$dst,
6433                      (VT (OpNode XMM0, VR128:$src2, VR128:$src1)))]>,
6434                    Sched<[sched]>;
6435
6436    def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
6437                    (ins VR128:$src1, x86memop:$src2),
6438                    !strconcat(OpcodeStr,
6439                     "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6440                    [(set VR128:$dst,
6441                      (OpNode XMM0, (mem_frag addr:$src2), VR128:$src1))]>,
6442                    Sched<[sched.Folded, sched.ReadAfterFold]>;
6443  }
6444}
6445
// Legacy variable blends (implicit XMM0 mask) instantiated from SS41I_ternary:
// BLENDVPD (0x15), BLENDVPS (0x14), PBLENDVB (0x10).
6446let ExeDomain = SSEPackedDouble in
6447defm BLENDVPD : SS41I_ternary<0x15, "blendvpd", v2f64, memopv2f64, f128mem,
6448                              X86Blendv, SchedWriteFVarBlend.XMM>;
6449let ExeDomain = SSEPackedSingle in
6450defm BLENDVPS : SS41I_ternary<0x14, "blendvps", v4f32, memopv4f32, f128mem,
6451                              X86Blendv, SchedWriteFVarBlend.XMM>;
6452defm PBLENDVB : SS41I_ternary<0x10, "pblendvb", v16i8, memopv16i8, i128mem,
6453                              X86Blendv, SchedWriteVarBlend.XMM>;
6454
6455// Aliases with the implicit xmm0 argument
// Each alias accepts the two-operand spelling (xmm0 omitted) for the rr0 and
// rm0 forms. The trailing 0 is the last InstAlias argument.
// NOTE(review): presumably the emit/print-priority flag -- confirm against the
// InstAlias definition.
6456def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
6457                (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>;
6458def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
6459                (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>;
6460def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
6461                (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>;
6462def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
6463                (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>;
6464def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
6465                (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>;
6466def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
6467                (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>;
6468
// Select 32- and 64-bit integer X86Blendv (mask in XMM0) onto the FP
// variable-blend instructions. Node sources (src1, src2) map to instruction
// operands (src2, src1).
6469let Predicates = [UseSSE41] in {
6470  def : Pat<(v4i32 (X86Blendv (v4i32 XMM0), (v4i32 VR128:$src1),
6471                              (v4i32 VR128:$src2))),
6472            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
6473  def : Pat<(v2i64 (X86Blendv (v2i64 XMM0), (v2i64 VR128:$src1),
6474                              (v2i64 VR128:$src2))),
6475            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
6476}
6477
// Non-temporal aligned vector loads (opcode 0x2A). The three instruction
// records carry empty pattern lists; selection is done by the separate Pat
// records below, one per vector value type, all matching
// alignednontemporalload. Everything in this block gets AddedComplexity = 400.
6478let AddedComplexity = 400 in { // Prefer non-temporal versions
6479
6480let Predicates = [HasAVX, NoVLX] in
6481def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
6482                        "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
6483                        Sched<[SchedWriteVecMoveLSNT.XMM.RM]>, VEX, VEX_WIG;
6484let Predicates = [HasAVX2, NoVLX] in
6485def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
6486                         "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
6487                         Sched<[SchedWriteVecMoveLSNT.YMM.RM]>, VEX, VEX_L, VEX_WIG;
// The legacy encoding has no Predicates attached to the instruction record
// itself; only its patterns below are gated (UseSSE41).
6488def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
6489                       "movntdqa\t{$src, $dst|$dst, $src}", []>,
6490                       Sched<[SchedWriteVecMoveLSNT.XMM.RM]>;
6491
// 256-bit value types -> VMOVNTDQAYrm.
6492let Predicates = [HasAVX2, NoVLX] in {
6493  def : Pat<(v8f32 (alignednontemporalload addr:$src)),
6494            (VMOVNTDQAYrm addr:$src)>;
6495  def : Pat<(v4f64 (alignednontemporalload addr:$src)),
6496            (VMOVNTDQAYrm addr:$src)>;
6497  def : Pat<(v4i64 (alignednontemporalload addr:$src)),
6498            (VMOVNTDQAYrm addr:$src)>;
6499  def : Pat<(v8i32 (alignednontemporalload addr:$src)),
6500            (VMOVNTDQAYrm addr:$src)>;
6501  def : Pat<(v16i16 (alignednontemporalload addr:$src)),
6502            (VMOVNTDQAYrm addr:$src)>;
6503  def : Pat<(v16f16 (alignednontemporalload addr:$src)),
6504            (VMOVNTDQAYrm addr:$src)>;
6505  def : Pat<(v32i8 (alignednontemporalload addr:$src)),
6506            (VMOVNTDQAYrm addr:$src)>;
6507}
6508
// 128-bit value types, VEX encoding -> VMOVNTDQArm.
6509let Predicates = [HasAVX, NoVLX] in {
6510  def : Pat<(v4f32 (alignednontemporalload addr:$src)),
6511            (VMOVNTDQArm addr:$src)>;
6512  def : Pat<(v2f64 (alignednontemporalload addr:$src)),
6513            (VMOVNTDQArm addr:$src)>;
6514  def : Pat<(v2i64 (alignednontemporalload addr:$src)),
6515            (VMOVNTDQArm addr:$src)>;
6516  def : Pat<(v4i32 (alignednontemporalload addr:$src)),
6517            (VMOVNTDQArm addr:$src)>;
6518  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
6519            (VMOVNTDQArm addr:$src)>;
6520  def : Pat<(v8f16 (alignednontemporalload addr:$src)),
6521            (VMOVNTDQArm addr:$src)>;
6522  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
6523            (VMOVNTDQArm addr:$src)>;
6524}
6525
// 128-bit value types, legacy encoding -> MOVNTDQArm.
6526let Predicates = [UseSSE41] in {
6527  def : Pat<(v4f32 (alignednontemporalload addr:$src)),
6528            (MOVNTDQArm addr:$src)>;
6529  def : Pat<(v2f64 (alignednontemporalload addr:$src)),
6530            (MOVNTDQArm addr:$src)>;
6531  def : Pat<(v2i64 (alignednontemporalload addr:$src)),
6532            (MOVNTDQArm addr:$src)>;
6533  def : Pat<(v4i32 (alignednontemporalload addr:$src)),
6534            (MOVNTDQArm addr:$src)>;
6535  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
6536            (MOVNTDQArm addr:$src)>;
6537  def : Pat<(v8f16 (alignednontemporalload addr:$src)),
6538            (MOVNTDQArm addr:$src)>;
6539  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
6540            (MOVNTDQArm addr:$src)>;
6541}
6542
6543} // AddedComplexity
6544
6545//===----------------------------------------------------------------------===//
6546// SSE4.2 - Compare Instructions
6547//===----------------------------------------------------------------------===//
6548
6549/// SS42I_binop_rm - Simple SSE 4.2 binary operator
// Produces a register/register (rr) and a register/memory (rm) record on top
// of SS428I. Patterns match (OpVT (OpNode $src1, $src2)); in rm the second
// source is loaded through memop_frag.
// Is2Addr defaults to 1 and only selects the assembly string (two-operand vs.
// three-operand syntax); any "$src1 = $dst" constraint must be applied by the
// instantiation site.
// Unlike the SSE4.1 immediate multiclasses in this file, rr is not marked
// commutable here.
6550multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
6551                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
6552                          X86MemOperand x86memop, X86FoldableSchedWrite sched,
6553                          bit Is2Addr = 1> {
6554  def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
6555       (ins RC:$src1, RC:$src2),
6556       !if(Is2Addr,
6557           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
6558           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
6559       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
6560       Sched<[sched]>;
6561  def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
6562       (ins RC:$src1, x86memop:$src2),
6563       !if(Is2Addr,
6564           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
6565           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
6566       [(set RC:$dst,
6567         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
6568       Sched<[sched.Folded, sched.ReadAfterFold]>;
6569}
6570
// Packed 64-bit signed greater-than compare (opcode 0x37) in VEX 128-bit,
// VEX 256-bit and legacy forms. The legacy instantiation omits the last
// template argument, so Is2Addr takes its default of 1, and ties $src1 to
// $dst via Constraints.
6571let Predicates = [HasAVX] in
6572  defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
6573                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
6574                                 VEX_4V, VEX_WIG;
6575
6576let Predicates = [HasAVX2] in
6577  defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
6578                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
6579                                  VEX_4V, VEX_L, VEX_WIG;
6580
6581let Constraints = "$src1 = $dst" in
6582  defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
6583                                memop, i128mem, SchedWriteVecALU.XMM>;
6584
6585//===----------------------------------------------------------------------===//
6586// SSE4.2 - String/text Processing Instructions
6587//===----------------------------------------------------------------------===//
6588
/// pcmpistrm_SS42AI - Register (rr) and memory (rm) forms of opcode 0x62.
/// Neither form declares explicit outputs or a selection pattern.
multiclass pcmpistrm_SS42AI<string asm> {
  def rr : SS42AI<0x62, MRMSrcReg, (outs),
                  (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                  asm # "\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>,
           Sched<[WritePCmpIStrM]>;

  // No pattern to infer the load from, so it is flagged explicitly.
  let mayLoad = 1 in {
    def rm : SS42AI<0x62, MRMSrcMem, (outs),
                    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
                    asm # "\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>,
             Sched<[WritePCmpIStrM.Folded, WritePCmpIStrM.ReadAfterFold]>;
  }
}

// XMM0 and EFLAGS are listed as implicit definitions.
let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in {
    defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX, VEX_WIG;
  }
  defm PCMPISTRM : pcmpistrm_SS42AI<"pcmpistrm">;
}
6606
/// SS42AI_pcmpestrm - Register (rr) and memory (rm) forms of opcode 0x60.
/// Neither form declares explicit outputs or a selection pattern.
multiclass SS42AI_pcmpestrm<string asm> {
  def rr : SS42AI<0x60, MRMSrcReg, (outs),
                  (ins VR128:$src1, VR128:$src3, u8imm:$src5),
                  asm # "\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>,
           Sched<[WritePCmpEStrM]>;

  // No pattern to infer the load from, so it is flagged explicitly.
  let mayLoad = 1 in {
    def rm : SS42AI<0x60, MRMSrcMem, (outs),
                    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
                    asm # "\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>,
             Sched<[WritePCmpEStrM.Folded, WritePCmpEStrM.ReadAfterFold]>;
  }
}

// XMM0 and EFLAGS are implicit definitions; EAX and EDX are implicit uses.
let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in {
    defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX, VEX_WIG;
  }
  defm PCMPESTRM : SS42AI_pcmpestrm<"pcmpestrm">;
}
6624
/// SS42AI_pcmpistri - Register (rr) and memory (rm) forms of opcode 0x63.
/// Neither form declares explicit outputs or a selection pattern.
multiclass SS42AI_pcmpistri<string asm> {
  def rr : SS42AI<0x63, MRMSrcReg, (outs),
                  (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                  asm # "\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>,
           Sched<[WritePCmpIStrI]>;

  // No pattern to infer the load from, so it is flagged explicitly.
  let mayLoad = 1 in {
    def rm : SS42AI<0x63, MRMSrcMem, (outs),
                    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
                    asm # "\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>,
             Sched<[WritePCmpIStrI.Folded, WritePCmpIStrI.ReadAfterFold]>;
  }
}

// ECX and EFLAGS are listed as implicit definitions.
let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in {
    defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX, VEX_WIG;
  }
  defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">;
}
6642
/// SS42AI_pcmpestri - Register (rr) and memory (rm) forms of opcode 0x61.
/// Neither form declares explicit outputs or a selection pattern.
multiclass SS42AI_pcmpestri<string asm> {
  def rr : SS42AI<0x61, MRMSrcReg, (outs),
                  (ins VR128:$src1, VR128:$src3, u8imm:$src5),
                  asm # "\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>,
           Sched<[WritePCmpEStrI]>;

  // No pattern to infer the load from, so it is flagged explicitly.
  let mayLoad = 1 in {
    def rm : SS42AI<0x61, MRMSrcMem, (outs),
                    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
                    asm # "\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>,
             Sched<[WritePCmpEStrI.Folded, WritePCmpEStrI.ReadAfterFold]>;
  }
}

// ECX and EFLAGS are implicit definitions; EAX and EDX are implicit uses.
let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in {
    defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX, VEX_WIG;
  }
  defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">;
}
6660
6661//===----------------------------------------------------------------------===//
6662// SSE4.2 - CRC Instructions
6663//===----------------------------------------------------------------------===//
6664
6665// No CRC instructions have AVX equivalents
6666
// CRC intrinsic instructions.
// These instructions exist only in r/m forms; the only difference between
// them is the size of r and m.

/// SS42I_crc32r - Register-source form. RCOut is the register class of $dst
/// and $src1, RCIn the register class of $src2; Int is the operator matched
/// by the selection pattern.
class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
                   RegisterClass RCIn, SDPatternOperator Int>
    : CRC32I<opc, MRMSrcReg, (outs RCOut:$dst),
             (ins RCOut:$src1, RCIn:$src2),
             asm # "\t{$src2, $src1|$src1, $src2}",
             [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))]>,
      Sched<[WriteCRC32]>;
6676
/// SS42I_crc32m - Memory-source form. RCOut is the register class of $dst and
/// $src1; $src2 is an x86memop operand read through `load`. Int is the
/// operator matched by the selection pattern.
class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
                   X86MemOperand x86memop, SDPatternOperator Int>
    : CRC32I<opc, MRMSrcMem, (outs RCOut:$dst),
             (ins RCOut:$src1, x86memop:$src2),
             asm # "\t{$src2, $src1|$src1, $src2}",
             [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))]>,
      Sched<[WriteCRC32.Folded, WriteCRC32.ReadAfterFold]>;
6683
// CRC32 instruction definitions. $dst is tied to $src1 for every form.
let Constraints = "$src1 = $dst" in {
  // 32-bit destination, 8-bit source.
  def CRC32r32m8 : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
                                int_x86_sse42_crc32_32_8>;
  def CRC32r32r8 : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
                                int_x86_sse42_crc32_32_8>;

  // 32-bit destination, 16-bit source.
  def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
                                 int_x86_sse42_crc32_32_16>,
                    OpSize16;
  def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
                                 int_x86_sse42_crc32_32_16>,
                    OpSize16;

  // 32-bit destination, 32-bit source.
  def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
                                 int_x86_sse42_crc32_32_32>,
                    OpSize32;
  def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
                                 int_x86_sse42_crc32_32_32>,
                    OpSize32;

  // 64-bit destination, 64-bit source.
  def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
                                 int_x86_sse42_crc32_64_64>,
                    REX_W;
  def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
                                 int_x86_sse42_crc32_64_64>,
                    REX_W;

  // 64-bit destination, 8-bit source. These use null_frag instead of an
  // intrinsic, so the side-effect and load flags are spelled out explicitly.
  let hasSideEffects = 0 in {
    let mayLoad = 1 in {
      def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem, null_frag>,
                       REX_W;
    }
    def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8, null_frag>,
                     REX_W;
  }
}
6709
6710//===----------------------------------------------------------------------===//
6711// SHA-NI Instructions
6712//===----------------------------------------------------------------------===//
6713
// FIXME: Is there a better scheduler class for SHA than WriteVecIMul?
/// SHAI_binop - Register (rr) and memory (rm) forms of a two-source SHA
/// operation selected from intrinsic IntId. When UsesXMM0 is set, XMM0 is
/// passed as a third operand to the intrinsic and is printed explicitly in
/// the assembly string.
multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
                      X86FoldableSchedWrite sched, bit UsesXMM0 = 0> {
  def rr : I<Opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
             OpcodeStr #
             !if(UsesXMM0,
                 "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                 "\t{$src2, $dst|$dst, $src2}"),
             [!if(UsesXMM0,
                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>,
           T8PS, Sched<[sched]>;

  def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             OpcodeStr #
             !if(UsesXMM0,
                 "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                 "\t{$src2, $dst|$dst, $src2}"),
             [!if(UsesXMM0,
                  (set VR128:$dst,
                   (IntId VR128:$src1, (memop addr:$src2), XMM0)),
                  (set VR128:$dst,
                   (IntId VR128:$src1, (memop addr:$src2))))]>,
           T8PS, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
6739
// SHA instruction definitions. $dst is tied to $src1 for every form.
let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
  // sha1rnds4 takes an extra 8-bit immediate, so it is defined directly
  // rather than through SHAI_binop.
  def SHA1RNDS4rri
      : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
            (ins VR128:$src1, VR128:$src2, u8imm:$src3),
            "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
            [(set VR128:$dst,
              (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
                                 (i8 timm:$src3)))]>,
        TAPS, Sched<[SchedWriteVecIMul.XMM]>;
  def SHA1RNDS4rmi
      : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
            (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
            "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
            [(set VR128:$dst,
              (int_x86_sha1rnds4 VR128:$src1, (memop addr:$src2),
                                 (i8 timm:$src3)))]>,
        TAPS, Sched<[SchedWriteVecIMul.XMM.Folded,
                     SchedWriteVecIMul.XMM.ReadAfterFold]>;

  defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte,
                              SchedWriteVecIMul.XMM>;
  defm SHA1MSG1 : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1,
                             SchedWriteVecIMul.XMM>;
  defm SHA1MSG2 : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2,
                             SchedWriteVecIMul.XMM>;

  // The only form instantiated with UsesXMM0 = 1; XMM0 is an implicit use.
  let Uses = [XMM0] in {
    defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2,
                                  SchedWriteVecIMul.XMM, 1>;
  }

  defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1,
                               SchedWriteVecIMul.XMM>;
  defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2,
                               SchedWriteVecIMul.XMM>;
}
6774
// Aliases for the sha256rnds2 forms whose assembly string names %xmm0
// explicitly: these accept the two-operand spelling that leaves it out.
def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
                (SHA256RNDS2rr VR128:$dst, VR128:$src2),
                0>;
def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
                (SHA256RNDS2rm VR128:$dst, i128mem:$src2),
                0>;
6780
6781//===----------------------------------------------------------------------===//
6782// AES-NI Instructions
6783//===----------------------------------------------------------------------===//
6784
/// AESI_binop_rm_int - Register (rr) and memory (rm) forms of a two-source
/// AES operation selected from intrinsic IntId. ld_frag reads the memory
/// source; RC and MemOp default to the 128-bit register class and memory
/// operand. Is2Addr selects the assembly string that omits $src1.
multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
                             Intrinsic IntId, PatFrag ld_frag,
                             bit Is2Addr = 0, RegisterClass RC = VR128,
                             X86MemOperand MemOp = i128mem> {
  // The assembly string is supplied through AsmString; both defs pass "" as
  // their own asm argument.
  let AsmString =
      !strconcat(OpcodeStr,
                 !if(Is2Addr,
                     "\t{$src2, $dst|$dst, $src2}",
                     "\t{$src2, $src1, $dst|$dst, $src1, $src2}")) in {
    def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
                   "", [(set RC:$dst, (IntId RC:$src1, RC:$src2))]>,
             Sched<[WriteAESDecEnc]>;
    def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, MemOp:$src2),
                   "",
                   [(set RC:$dst, (IntId RC:$src1, (ld_frag addr:$src2)))]>,
             Sched<[WriteAESDecEnc.Folded, WriteAESDecEnc.ReadAfterFold]>;
  }
}
6802
// Perform One Round of an AES Encryption/Decryption Flow
// 128-bit VEX forms (three-operand assembly, `load` fragment).
let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in {
  defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc", int_x86_aesni_aesenc, load>,
                 VEX_4V, VEX_WIG;
  defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast",
                                       int_x86_aesni_aesenclast, load>,
                     VEX_4V, VEX_WIG;
  defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec", int_x86_aesni_aesdec, load>,
                 VEX_4V, VEX_WIG;
  defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast",
                                       int_x86_aesni_aesdeclast, load>,
                     VEX_4V, VEX_WIG;
}
6814
// 256-bit VEX forms of the AES round instructions (VR256 / i256mem).
let Predicates = [NoVLX, HasVAES] in {
  defm VAESENCY : AESI_binop_rm_int<0xDC, "vaesenc", int_x86_aesni_aesenc_256,
                                    load, 0, VR256, i256mem>,
                  VEX_4V, VEX_L, VEX_WIG;
  defm VAESENCLASTY : AESI_binop_rm_int<0xDD, "vaesenclast",
                                        int_x86_aesni_aesenclast_256,
                                        load, 0, VR256, i256mem>,
                      VEX_4V, VEX_L, VEX_WIG;
  defm VAESDECY : AESI_binop_rm_int<0xDE, "vaesdec", int_x86_aesni_aesdec_256,
                                    load, 0, VR256, i256mem>,
                  VEX_4V, VEX_L, VEX_WIG;
  defm VAESDECLASTY : AESI_binop_rm_int<0xDF, "vaesdeclast",
                                        int_x86_aesni_aesdeclast_256,
                                        load, 0, VR256, i256mem>,
                      VEX_4V, VEX_L, VEX_WIG;
}
6829
// Two-operand forms of the AES round instructions: $dst is tied to $src1 and
// the `memop` fragment reads the memory source.
let Constraints = "$src1 = $dst" in {
  defm AESENC : AESI_binop_rm_int<0xDC, "aesenc", int_x86_aesni_aesenc,
                                  memop, 1>;
  defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast",
                                      int_x86_aesni_aesenclast, memop, 1>;
  defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec", int_x86_aesni_aesdec,
                                  memop, 1>;
  defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast",
                                      int_x86_aesni_aesdeclast, memop, 1>;
}
6840
// Perform the AES InvMixColumn Transformation
// VEX forms; the memory source is read through `load`.
let Predicates = [HasAVX, HasAES] in {
  def VAESIMCrr
      : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1),
              "vaesimc\t{$src1, $dst|$dst, $src1}",
              [(set VR128:$dst, (int_x86_aesni_aesimc VR128:$src1))]>,
        Sched<[WriteAESIMC]>, VEX, VEX_WIG;
  def VAESIMCrm
      : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1),
              "vaesimc\t{$src1, $dst|$dst, $src1}",
              [(set VR128:$dst, (int_x86_aesni_aesimc (load addr:$src1)))]>,
        Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG;
}

// Non-VEX forms; the memory source is read through `memop`.
def AESIMCrr
    : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1),
            "aesimc\t{$src1, $dst|$dst, $src1}",
            [(set VR128:$dst, (int_x86_aesni_aesimc VR128:$src1))]>,
      Sched<[WriteAESIMC]>;
def AESIMCrm
    : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1),
            "aesimc\t{$src1, $dst|$dst, $src1}",
            [(set VR128:$dst, (int_x86_aesni_aesimc (memop addr:$src1)))]>,
      Sched<[WriteAESIMC.Folded]>;
6865
// AES Round Key Generation Assist
// VEX forms; the memory source is read through `load`.
let Predicates = [HasAVX, HasAES] in {
  def VAESKEYGENASSIST128rr
      : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src1, u8imm:$src2),
              "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
              [(set VR128:$dst,
                (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
        Sched<[WriteAESKeyGen]>, VEX, VEX_WIG;
  def VAESKEYGENASSIST128rm
      : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
              (ins i128mem:$src1, u8imm:$src2),
              "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
              [(set VR128:$dst,
                (int_x86_aesni_aeskeygenassist (load addr:$src1),
                                               timm:$src2))]>,
        Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG;
}

// Non-VEX forms; the memory source is read through `memop`.
def AESKEYGENASSIST128rr
    : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
            (ins VR128:$src1, u8imm:$src2),
            "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
            [(set VR128:$dst,
              (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
      Sched<[WriteAESKeyGen]>;
def AESKEYGENASSIST128rm
    : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
            (ins i128mem:$src1, u8imm:$src2),
            "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
            [(set VR128:$dst,
              (int_x86_aesni_aeskeygenassist (memop addr:$src1),
                                             timm:$src2))]>,
      Sched<[WriteAESKeyGen.Folded]>;
6893
6894//===----------------------------------------------------------------------===//
6895// PCLMUL Instructions
6896//===----------------------------------------------------------------------===//
6897
// Immediate transform to help with commuting: exchanges the high and low
// nibbles of the 8-bit immediate.
def PCLMULCommuteImm : SDNodeXForm<timm, [{
  const uint8_t Sel = N->getZExtValue();
  // The OR is truncated back to 8 bits, leaving the two nibbles swapped.
  const uint8_t Swapped = (Sel >> 4) | (Sel << 4);
  return getI8Imm(Swapped, SDLoc(N));
}]>;
6903
// SSE carry-less Multiplication instructions
let Predicates = [NoAVX, HasPCLMUL] in {
  // Two-operand forms: $dst is tied to $src1.
  let Constraints = "$src1 = $dst" in {
    let isCommutable = 1 in {
      def PCLMULQDQrr
          : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                      "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                      [(set VR128:$dst,
                        (int_x86_pclmulqdq VR128:$src1, VR128:$src2,
                                           timm:$src3))]>,
            Sched<[WriteCLMul]>;
    }

    def PCLMULQDQrm
        : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
                    "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                    [(set VR128:$dst,
                      (int_x86_pclmulqdq VR128:$src1, (memop addr:$src2),
                                         timm:$src3))]>,
          Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
  } // Constraints = "$src1 = $dst"

  // A load in the first operand is folded by swapping the sources and
  // transforming the immediate with PCLMULCommuteImm.
  def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1,
                               (i8 timm:$src3)),
            (PCLMULQDQrm VR128:$src1, addr:$src2,
                         (PCLMULCommuteImm timm:$src3))>;
} // Predicates = [NoAVX, HasPCLMUL]
6929
// SSE aliases
// The mnemonic pclmul<HI><LO>dq carries the immediate: bit 4 is set when LO
// is "hq" and bit 0 is set when HI is "hq".
foreach HI = ["hq", "lq"] in {
  foreach LO = ["hq", "lq"] in {
    def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
                    (PCLMULQDQrr VR128:$dst, VR128:$src,
                     !add(!shl(!eq(LO, "hq"), 4), !eq(HI, "hq"))),
                    0>;
    def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
                    (PCLMULQDQrm VR128:$dst, i128mem:$src,
                     !add(!shl(!eq(LO, "hq"), 4), !eq(HI, "hq"))),
                    0>;
  }
}
6940
// AVX carry-less Multiplication instructions
/// vpclmulqdq - Three-operand register (rr) and memory (rm) forms selected
/// from intrinsic IntId, plus a pattern that folds a load appearing in the
/// first operand. RC/MemOp/LdFrag choose the vector width.
multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
                      PatFrag LdFrag, Intrinsic IntId> {
  let isCommutable = 1 in {
    def rr : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst),
                       (ins RC:$src1, RC:$src2, u8imm:$src3),
                       "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                       [(set RC:$dst, (IntId RC:$src1, RC:$src2, timm:$src3))]>,
             Sched<[WriteCLMul]>;
  }

  def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst),
                     (ins RC:$src1, MemOp:$src2, u8imm:$src3),
                     "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                     [(set RC:$dst,
                       (IntId RC:$src1, (LdFrag addr:$src2), timm:$src3))]>,
           Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;

  // We can commute a load in the first operand by swapping the sources and
  // rotating the immediate.
  def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 timm:$src3)),
            (!cast<Instruction>(NAME # "rm")
                RC:$src1, addr:$src2, (PCLMULCommuteImm timm:$src3))>;
}
6965
// 128-bit VEX form.
let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in {
  defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, load, int_x86_pclmulqdq>,
                    VEX_4V, VEX_WIG;
}

// 256-bit VEX form.
let Predicates = [NoVLX, HasVPCLMULQDQ] in {
  defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load, int_x86_pclmulqdq_256>,
                     VEX_4V, VEX_L, VEX_WIG;
}
6973
/// vpclmulqdq_aliases_impl - Aliases vpclmul<Hi><Lo>dq to the rr and rm
/// records named InstStr#"rr" / InstStr#"rm". The immediate has bit 4 set
/// when Lo is "hq" and bit 0 set when Hi is "hq".
multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC,
                                   X86MemOperand MemOp, string Hi, string Lo> {
  def : InstAlias<"vpclmul" # Hi # Lo #
                  "dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  (!cast<Instruction>(InstStr # "rr")
                      RC:$dst, RC:$src1, RC:$src2,
                      !add(!shl(!eq(Lo, "hq"), 4), !eq(Hi, "hq"))),
                  0>;
  def : InstAlias<"vpclmul" # Hi # Lo #
                  "dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  (!cast<Instruction>(InstStr # "rm")
                      RC:$dst, RC:$src1, MemOp:$src2,
                      !add(!shl(!eq(Lo, "hq"), 4), !eq(Hi, "hq"))),
                  0>;
}
6983
/// vpclmulqdq_aliases - Instantiates vpclmulqdq_aliases_impl for all four
/// Hi/Lo combinations of "hq" and "lq".
multiclass vpclmulqdq_aliases<string InstStr, RegisterClass RC,
                              X86MemOperand MemOp> {
  // Hi = "hq"
  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "hq">;
  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "lq">;
  // Hi = "lq"
  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "hq">;
  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "lq">;
}
6991
// AVX aliases for the 128-bit and 256-bit vpclmulqdq records.
defm : vpclmulqdq_aliases<"VPCLMULQDQ",  VR128, i128mem>;
defm : vpclmulqdq_aliases<"VPCLMULQDQY", VR256, i256mem>;
6995
6996//===----------------------------------------------------------------------===//
6997// SSE4A Instructions
6998//===----------------------------------------------------------------------===//
6999
let Predicates = [HasSSE4A] in {

let ExeDomain = SSEPackedInt in {
// extrq / insertq: $dst is tied to $src in all four forms.
let Constraints = "$src = $dst" in {
// Immediate-operand extrq ($len, $idx).
def EXTRQI
    : Ii8<0x78, MRMXr, (outs VR128:$dst),
          (ins VR128:$src, u8imm:$len, u8imm:$idx),
          "extrq\t{$idx, $len, $src|$src, $len, $idx}",
          [(set VR128:$dst,
            (X86extrqi VR128:$src, timm:$len, timm:$idx))]>,
      PD, Sched<[SchedWriteVecALU.XMM]>;
// Register-operand extrq ($mask).
def EXTRQ
    : I<0x79, MRMSrcReg, (outs VR128:$dst),
        (ins VR128:$src, VR128:$mask),
        "extrq\t{$mask, $src|$src, $mask}",
        [(set VR128:$dst,
          (int_x86_sse4a_extrq VR128:$src, VR128:$mask))]>,
      PD, Sched<[SchedWriteVecALU.XMM]>;

// Immediate-operand insertq ($len, $idx).
def INSERTQI
    : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
          (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
          "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
          [(set VR128:$dst,
            (X86insertqi VR128:$src, VR128:$src2, timm:$len, timm:$idx))]>,
      XD, Sched<[SchedWriteVecALU.XMM]>;
// Register-operand insertq ($mask).
def INSERTQ
    : I<0x79, MRMSrcReg, (outs VR128:$dst),
        (ins VR128:$src, VR128:$mask),
        "insertq\t{$mask, $src|$src, $mask}",
        [(set VR128:$dst,
          (int_x86_sse4a_insertq VR128:$src, VR128:$mask))]>,
      XD, Sched<[SchedWriteVecALU.XMM]>;
}
} // ExeDomain = SSEPackedInt

// Non-temporal (unaligned) scalar stores.
let AddedComplexity = 400 in { // Prefer non-temporal versions
let hasSideEffects = 0, mayStore = 1,
    SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in {
def MOVNTSS
    : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
        "movntss\t{$src, $dst|$dst, $src}", []>,
      XS;

def MOVNTSD
    : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
        "movntsd\t{$src, $dst|$dst, $src}", []>,
      XD;
} // SchedRW

// The scalar value is copied into the VR128 class before being stored.
def : Pat<(nontemporalstore FR32:$src, addr:$dst),
          (MOVNTSS addr:$dst,
                   (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;

def : Pat<(nontemporalstore FR64:$src, addr:$dst),
          (MOVNTSD addr:$dst,
                   (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;

} // AddedComplexity
} // HasSSE4A
7050
7051//===----------------------------------------------------------------------===//
7052// AVX Instructions
7053//===----------------------------------------------------------------------===//
7054
7055//===----------------------------------------------------------------------===//
7056// VBROADCAST - Load from memory and broadcast to all elements of the
7057//              destination operand
7058//
/// avx_broadcast_rm - Broadcast with a memory source: the result of type VT
/// in register class RC is produced from bcast_frag applied to the address.
class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, ValueType VT,
                       PatFrag bcast_frag, SchedWrite Sched>
    : AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
            OpcodeStr # "\t{$src, $dst|$dst, $src}",
            [(set RC:$dst, (VT (bcast_frag addr:$src)))]>,
      Sched<[Sched]>, VEX;
7066
// AVX2 adds register forms
/// avx2_broadcast_rr - Broadcast with a VR128 register source of type OpVT,
/// producing a ResVT result in register class RC via X86VBroadcast.
class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
                        ValueType ResVT, ValueType OpVT, SchedWrite Sched>
    : AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
             OpcodeStr # "\t{$src, $dst|$dst, $src}",
             [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
      Sched<[Sched]>, VEX;
7074
// Memory-source broadcasts of a 32-bit element (128-bit and 256-bit results).
let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in {
  def VBROADCASTSSrm
      : avx_broadcast_rm<0x18, "vbroadcastss", VR128, f32mem, v4f32,
                         X86VBroadcastld32, SchedWriteFShuffle.XMM.Folded>;
  def VBROADCASTSSYrm
      : avx_broadcast_rm<0x18, "vbroadcastss", VR256, f32mem, v8f32,
                         X86VBroadcastld32, SchedWriteFShuffle.XMM.Folded>,
        VEX_L;
}

// Memory-source broadcast of a 64-bit element (256-bit result only).
let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in {
  def VBROADCASTSDYrm
      : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem, v4f64,
                         X86VBroadcastld64, SchedWriteFShuffle.XMM.Folded>,
        VEX_L;
}
7087
// Register-source broadcasts of a single-precision element.
let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in {
  def VBROADCASTSSrr
      : avx2_broadcast_rr<0x18, "vbroadcastss", VR128, v4f32, v4f32,
                          SchedWriteFShuffle.XMM>;
  def VBROADCASTSSYrr
      : avx2_broadcast_rr<0x18, "vbroadcastss", VR256, v8f32, v4f32,
                          WriteFShuffle256>,
        VEX_L;
}

// Register-source broadcast of a double-precision element.
let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in {
  def VBROADCASTSDYrr
      : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256, v4f64, v2f64,
                          WriteFShuffle256>,
        VEX_L;
}
7097
7098//===----------------------------------------------------------------------===//
7099// VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both
7100//                  halves of a 256-bit vector.
7101//
// Both definitions carry no selection pattern, so mayLoad and hasSideEffects
// are set explicitly.
let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in {
  def VBROADCASTI128
      : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
              "vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
        Sched<[WriteShuffleLd]>, VEX, VEX_L;
}

let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX],
    ExeDomain = SSEPackedSingle in {
  def VBROADCASTF128
      : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
              "vbroadcastf128\t{$src, $dst|$dst, $src}", []>,
        Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L;
}
7114
// Select VBROADCASTF128 for a 128-bit subvector broadcast load of every
// 256-bit vector type.
// NOTE: We're using FP instructions here for the integer types too, but
// execution domain fixing can convert to integer when profitable.
let Predicates = [HasAVX, NoVLX] in {
  foreach VT = [v4f64, v8f32, v4i64, v8i32, v16i16, v16f16, v32i8] in {
    def : Pat<(VT (X86SubVBroadcastld128 addr:$src)),
              (VBROADCASTF128 addr:$src)>;
  }
}
7133
7134//===----------------------------------------------------------------------===//
7135// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
7136//
7137
// Instruction definitions only; selection patterns are attached separately
// through vperm2x128_lowering.
let ExeDomain = SSEPackedSingle in {
  let isCommutable = 1 in {
    def VPERM2F128rr
        : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
                  (ins VR256:$src1, VR256:$src2, u8imm:$src3),
                  "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                  []>,
          VEX_4V, VEX_L, Sched<[WriteFShuffle256]>;
  }
  def VPERM2F128rm
      : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
                (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
                "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                []>,
        VEX_4V, VEX_L,
        Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
}
7149
// Immediate transform to help with commuting: toggles bits 1 and 5 (mask
// 0x22) of the immediate.
def Perm2XCommuteImm : SDNodeXForm<timm, [{
  const auto Flipped = N->getZExtValue() ^ 0x22;
  return getI8Imm(Flipped, SDLoc(N));
}]>;
7154
/// vperm2x128_lowering - Selection patterns mapping X86VPerm2x128 of type VT
/// onto the InstrStr#"rr" / InstrStr#"rm" instructions. memop_frag matches a
/// source loaded from memory.
multiclass vperm2x128_lowering<string InstrStr, ValueType VT,
                               PatFrag memop_frag> {
  // Both sources in registers.
  def : Pat<(VT (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))),
            (!cast<Instruction>(InstrStr # "rr")
                VR256:$src1, VR256:$src2, timm:$imm)>;
  // Load in the second operand.
  def : Pat<(VT (X86VPerm2x128 VR256:$src1, (memop_frag addr:$src2),
                               (i8 timm:$imm))),
            (!cast<Instruction>(InstrStr # "rm")
                VR256:$src1, addr:$src2, timm:$imm)>;
  // Pattern with load in other operand: the sources are swapped and the
  // immediate goes through Perm2XCommuteImm.
  def : Pat<(VT (X86VPerm2x128 (memop_frag addr:$src2), VR256:$src1,
                               (i8 timm:$imm))),
            (!cast<Instruction>(InstrStr # "rm")
                VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
}
7165
// Floating-point element types: patterns enabled under HasAVX.
let Predicates = [HasAVX] in {
  defm : vperm2x128_lowering<"VPERM2F128", v4f64, loadv4f64>;
  defm : vperm2x128_lowering<"VPERM2F128", v8f32, loadv8f32>;
}

// Remaining 256-bit types: patterns enabled only under HasAVX1Only.
let Predicates = [HasAVX1Only] in {
  defm : vperm2x128_lowering<"VPERM2F128", v4i64, loadv4i64>;
  defm : vperm2x128_lowering<"VPERM2F128", v8i32, loadv8i32>;
  defm : vperm2x128_lowering<"VPERM2F128", v16i16, loadv16i16>;
  defm : vperm2x128_lowering<"VPERM2F128", v16f16, loadv16f16>;
  defm : vperm2x128_lowering<"VPERM2F128", v32i8, loadv32i8>;
}
7178
7179//===----------------------------------------------------------------------===//
7180// VINSERTF128 - Insert packed floating-point values
7181//
// Instruction definitions only; selection patterns are attached separately
// through vinsert_lowering.
let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
  def VINSERTF128rr
      : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
                (ins VR256:$src1, VR128:$src2, u8imm:$src3),
                "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                []>,
        Sched<[WriteFShuffle256]>, VEX_4V, VEX_L;
  let mayLoad = 1 in {
    def VINSERTF128rm
        : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
                  (ins VR256:$src1, f128mem:$src2, u8imm:$src3),
                  "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                  []>,
          Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>,
          VEX_4V, VEX_L;
  }
}
7193
// To create a 256-bit all ones value, we should produce VCMPTRUEPS
// with YMM register containing zero.
// FIXME: Avoid producing vxorps to clear the fake inputs.
let Predicates = [HasAVX1Only] in
def : Pat<(v8i32 immAllOnesV),
          (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>;
7200
/// vinsert_lowering - Selection patterns for inserting a 128-bit `From`
/// vector into a 256-bit `To` vector. InstrStr names the insert instruction
/// pair (rr/rm) and PermStr the instruction whose rm form is used when the
/// `To` vector itself comes from memory.
multiclass vinsert_lowering<string InstrStr, string PermStr,
                            ValueType From, ValueType To,
                            PatFrag frommemop_frag, PatFrag tomemop_frag> {
  // Both vectors in registers.
  def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2),
                                    (iPTR imm)),
            (!cast<Instruction>(InstrStr # "rr")
                VR256:$src1, VR128:$src2,
                (INSERT_get_vinsert128_imm VR256:$ins))>;
  // Inserted ("From") vector loaded from memory.
  def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
                                    (From (frommemop_frag addr:$src2)),
                                    (iPTR imm)),
            (!cast<Instruction>(InstrStr # "rm")
                VR256:$src1, addr:$src2,
                (INSERT_get_vinsert128_imm VR256:$ins))>;
  // Folding "To" vector - convert to perm2x128 and commute inputs.
  def : Pat<(vinsert128_insert:$ins (To (tomemop_frag addr:$src1)),
                                    (From VR128:$src2),
                                    (iPTR imm)),
            (!cast<Instruction>(PermStr # "rm")
                (INSERT_SUBREG (To (IMPLICIT_DEF)), VR128:$src2, sub_xmm),
                addr:$src1,
                (INSERT_get_vperm2x128_commutedimm VR256:$ins))>;
}
7221
// Floating-point instantiations of vinsert_lowering (v4f32->v8f32 and
// v2f64->v4f64), guarded by HasAVX and NoVLX.
7222let Predicates = [HasAVX, NoVLX] in {
7223  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v4f32, v8f32, loadv4f32, loadv8f32>;
7224  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v2f64, v4f64, loadv2f64, loadv4f64>;
7225}
7226
// AVX1-only instantiations of vinsert_lowering for the integer vector types
// (and v8f16/v16f16), all mapped onto VINSERTF128 / VPERM2F128.
// Each (From, To) type pair is instantiated exactly once; a second, identical
// v16i8/v32i8 instantiation only produced redundant duplicate patterns and
// has been dropped.
7227let Predicates = [HasAVX1Only] in {
7228  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v2i64, v4i64,  loadv2i64, loadv4i64>;
7229  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v4i32, v8i32,  loadv4i32, loadv8i32>;
7230  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v8i16, v16i16, loadv8i16, loadv16i16>;
7231  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v8f16, v16f16, loadv8f16, loadv16f16>;
7232  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v16i8, v32i8,  loadv16i8, loadv32i8>;
7234}
7235
7236//===----------------------------------------------------------------------===//
7237// VEXTRACTF128 - Extract packed floating-point values
7238//
// VEXTRACTF128 register and memory-destination forms (opcode 0x19,
// VEX + VEX_L). $src1 is the 256-bit source and $src2 an 8-bit immediate.
// Both records carry an empty pattern list; the selection patterns are
// provided separately by vextract_lowering, which casts
// "VEXTRACTF128" # rr / mr.
7239let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
7240def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
7241          (ins VR256:$src1, u8imm:$src2),
7242          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7243          []>, Sched<[WriteFShuffle256]>, VEX, VEX_L;
// Store form: the destination is an f128mem operand. mayStore is set
// explicitly since the pattern list is empty.
7244let mayStore = 1 in
7245def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
7246          (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
7247          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7248          []>, Sched<[WriteFStoreX]>, VEX, VEX_L;
7249}
7250
// Selection patterns for vextract128_extract: extracting a 128-bit vector of
// type To from a 256-bit vector of type From.
//   InstrStr - base name of the extract instruction ("rr"/"mr" appended).
// The immediate operand is derived from the matched node via
// EXTRACT_get_vextract128_imm.
7251multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
  // Extract into a register.
7252  def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
7253            (To (!cast<Instruction>(InstrStr#rr)
7254                                    (From VR256:$src1),
7255                                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
  // Extract whose result is stored: select the memory-destination form.
7256  def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1),
7257                                                 (iPTR imm))), addr:$dst),
7258            (!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1,
7259             (EXTRACT_get_vextract128_imm VR128:$ext))>;
7260}
7261
7262// AVX1 patterns
// Floating-point instantiations of vextract_lowering (v8f32->v4f32 and
// v4f64->v2f64), guarded by HasAVX and NoVLX.
7263let Predicates = [HasAVX, NoVLX] in {
7264  defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>;
7265  defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>;
7266}
7267
// AVX1-only instantiations of vextract_lowering for the integer vector types
// (and v16f16/v8f16), all mapped onto VEXTRACTF128.
// Each (From, To) type pair is instantiated exactly once; a second, identical
// v32i8/v16i8 instantiation only produced redundant duplicate patterns and
// has been dropped.
7268let Predicates = [HasAVX1Only] in {
7269  defm : vextract_lowering<"VEXTRACTF128", v4i64,  v2i64>;
7270  defm : vextract_lowering<"VEXTRACTF128", v8i32,  v4i32>;
7271  defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
7272  defm : vextract_lowering<"VEXTRACTF128", v16f16, v8f16>;
7273  defm : vextract_lowering<"VEXTRACTF128", v32i8,  v16i8>;
7275}
7276
7277//===----------------------------------------------------------------------===//
7278// VMASKMOV - Conditional SIMD Packed Loads and Stores
7279//
// Defines the four forms of a VEX-encoded masked move instruction:
//   rm / Yrm - 128-bit / 256-bit load forms (opcode opc_rm), selected from
//              the IntLd / IntLd256 intrinsics.
//   mr / Ymr - 128-bit / 256-bit store forms (opcode opc_mr), selected from
//              the IntSt / IntSt256 intrinsics.
// schedX / schedY supply the scheduling classes for the 128-bit and 256-bit
// forms (.RM for the load forms, .MR for the store forms).
// NOTE(review): $src1 is presumably the mask register - it is the operand
// passed right after the address in every intrinsic pattern; confirm against
// the intrinsic definitions.
7280multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
7281                          Intrinsic IntLd, Intrinsic IntLd256,
7282                          Intrinsic IntSt, Intrinsic IntSt256,
7283                          X86SchedWriteMaskMove schedX,
7284                          X86SchedWriteMaskMove schedY> {
7285  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
7286             (ins VR128:$src1, f128mem:$src2),
7287             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7288             [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
7289             VEX_4V, Sched<[schedX.RM]>;
7290  def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
7291             (ins VR256:$src1, f256mem:$src2),
7292             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7293             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
7294             VEX_4V, VEX_L, Sched<[schedY.RM]>;
7295  def mr  : AVX8I<opc_mr, MRMDestMem, (outs),
7296             (ins f128mem:$dst, VR128:$src1, VR128:$src2),
7297             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7298             [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>,
7299             VEX_4V, Sched<[schedX.MR]>;
7300  def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
7301             (ins f256mem:$dst, VR256:$src1, VR256:$src2),
7302             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7303             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
7304             VEX_4V, VEX_L, Sched<[schedY.MR]>;
7305}
7306
// Single-precision masked moves: load opcode 0x2C, store opcode 0x2E, bound
// to the int_x86_avx_maskload/maskstore_ps intrinsics.
7307let ExeDomain = SSEPackedSingle in
7308defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
7309                                 int_x86_avx_maskload_ps,
7310                                 int_x86_avx_maskload_ps_256,
7311                                 int_x86_avx_maskstore_ps,
7312                                 int_x86_avx_maskstore_ps_256,
7313                                 WriteFMaskMove32, WriteFMaskMove32Y>;
// Double-precision masked moves: load opcode 0x2D, store opcode 0x2F, bound
// to the int_x86_avx_maskload/maskstore_pd intrinsics.
7314let ExeDomain = SSEPackedDouble in
7315defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
7316                                 int_x86_avx_maskload_pd,
7317                                 int_x86_avx_maskload_pd_256,
7318                                 int_x86_avx_maskstore_pd,
7319                                 int_x86_avx_maskstore_pd_256,
7320                                 WriteFMaskMove64, WriteFMaskMove64Y>;
7321
7322//===----------------------------------------------------------------------===//
7323// AVX_VNNI
7324//===----------------------------------------------------------------------===//
// Defines the VEX-encoded AVX-VNNI forms of one dot-product instruction:
//   rr / rm   - 128-bit register and memory forms (v4i32 result).
//   Yrr / Yrm - 256-bit register and memory forms (v8i32 result, VEX_L).
//   opc          - opcode byte.
//   OpcodeStr    - assembly mnemonic.
//   OpNode       - SelectionDAG node matched as (OpNode $src1, $src2, $src3).
//   IsCommutable - sets isCommutable on the register forms.
// $src1 is tied to $dst through the Constraints string below.
// Scheduling: the 128-bit forms use SchedWriteVecIMul.XMM and the 256-bit
// forms SchedWriteVecIMul.YMM. The memory forms use the .Folded write plus
// one .ReadAfterFold per register source ($src1, $src2), matching the other
// folded-load instructions in this file.
7325let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI], Constraints = "$src1 = $dst",
7326    ExplicitVEXPrefix = 1, checkVEXPredicate = 1 in
7327multiclass avx_vnni_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
7328                       bit IsCommutable> {
7329  let isCommutable = IsCommutable in
7330  def rr  : AVX8I<opc, MRMSrcReg, (outs VR128:$dst),
7331             (ins VR128:$src1, VR128:$src2, VR128:$src3),
7332             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
7333             [(set VR128:$dst, (v4i32 (OpNode VR128:$src1,
7334                                       VR128:$src2, VR128:$src3)))]>,
7335             VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
7336
7337  def rm  : AVX8I<opc, MRMSrcMem, (outs VR128:$dst),
7338             (ins VR128:$src1, VR128:$src2, i128mem:$src3),
7339             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
7340             [(set VR128:$dst, (v4i32 (OpNode VR128:$src1, VR128:$src2,
7341                                      (loadv4i32 addr:$src3))))]>,
7342             VEX_4V, Sched<[SchedWriteVecIMul.XMM.Folded, SchedWriteVecIMul.XMM.ReadAfterFold, SchedWriteVecIMul.XMM.ReadAfterFold]>;
7343
7344  let isCommutable = IsCommutable in
7345  def Yrr  : AVX8I<opc, MRMSrcReg, (outs VR256:$dst),
7346             (ins VR256:$src1, VR256:$src2, VR256:$src3),
7347             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
7348             [(set VR256:$dst, (v8i32 (OpNode VR256:$src1,
7349                                       VR256:$src2, VR256:$src3)))]>,
7350             VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
7351
7352  def Yrm  : AVX8I<opc, MRMSrcMem, (outs VR256:$dst),
7353             (ins VR256:$src1, VR256:$src2, i256mem:$src3),
7354             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
7355             [(set VR256:$dst, (v8i32 (OpNode VR256:$src1, VR256:$src2,
7356                                      (loadv8i32 addr:$src3))))]>,
7357             VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM.Folded, SchedWriteVecIMul.YMM.ReadAfterFold, SchedWriteVecIMul.YMM.ReadAfterFold]>;
7358}
7359
// AVX-VNNI instantiations (opcodes 0x50-0x53). The last argument is
// IsCommutable: 0 for the VPDPBUSD/VPDPBUSDS forms, 1 for the
// VPDPWSSD/VPDPWSSDS forms.
7360defm VPDPBUSD   : avx_vnni_rm<0x50, "vpdpbusd", X86Vpdpbusd, 0>;
7361defm VPDPBUSDS  : avx_vnni_rm<0x51, "vpdpbusds", X86Vpdpbusds, 0>;
7362defm VPDPWSSD   : avx_vnni_rm<0x52, "vpdpwssd",  X86Vpdpwssd, 1>;
7363defm VPDPWSSDS  : avx_vnni_rm<0x53, "vpdpwssds", X86Vpdpwssds, 1>;
7364
// X86vpmaddwd restricted to nodes with exactly one use (the predicate body
// returns N->hasOneUse()).
7365def X86vpmaddwd_su : PatFrag<(ops node:$lhs, node:$rhs),
7366                             (X86vpmaddwd node:$lhs, node:$rhs), [{
7367  return N->hasOneUse();
7368}]>;
7369
// Fold an integer add of $src1 with a single-use X86vpmaddwd of
// ($src2, $src3) into one VPDPWSSD instruction. Four variants: 256-bit and
// 128-bit, each with $src3 in a register or loaded from memory.
7370let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI] in {
7371  def : Pat<(v8i32 (add VR256:$src1,
7372                        (X86vpmaddwd_su VR256:$src2, VR256:$src3))),
7373            (VPDPWSSDYrr VR256:$src1, VR256:$src2, VR256:$src3)>;
7374  def : Pat<(v8i32 (add VR256:$src1,
7375                        (X86vpmaddwd_su VR256:$src2, (load addr:$src3)))),
7376            (VPDPWSSDYrm VR256:$src1, VR256:$src2, addr:$src3)>;
7377  def : Pat<(v4i32 (add VR128:$src1,
7378                        (X86vpmaddwd_su VR128:$src2, VR128:$src3))),
7379            (VPDPWSSDrr VR128:$src1, VR128:$src2, VR128:$src3)>;
7380  def : Pat<(v4i32 (add VR128:$src1,
7381                        (X86vpmaddwd_su VR128:$src2, (load addr:$src3)))),
7382            (VPDPWSSDrm VR128:$src1, VR128:$src2, addr:$src3)>;
7383}
7384
7385//===----------------------------------------------------------------------===//
7386// VPERMIL - Permute Single and Double Floating-Point Values
7387//
7388
// Defines the four forms of a VPERMIL instruction over register class RC:
//   rr / rm - variable-control forms (opcode opc_rm), matched from
//             X86VPermilpv with the control vector in a register or loaded
//             through x86memop_i.
//   ri / mi - immediate-control forms (opcode opc_rmi), matched from
//             X86VPermilpi with the data in a register or loaded through
//             x86memop_f.
//   f_vt / i_vt      - value types of the data and of the control vector.
//   sched / varsched - scheduling classes for the immediate and variable
//                      forms respectively.
7389multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
7390                      RegisterClass RC, X86MemOperand x86memop_f,
7391                      X86MemOperand x86memop_i,
7392                      ValueType f_vt, ValueType i_vt,
7393                      X86FoldableSchedWrite sched,
7394                      X86FoldableSchedWrite varsched> {
7395  let Predicates = [HasAVX, NoVLX] in {
7396    def rr  : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
7397               (ins RC:$src1, RC:$src2),
7398               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7399               [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V,
7400               Sched<[varsched]>;
    // NOTE(review): the write comes from varsched but the ReadAfterFold from
    // sched - confirm this mix is intentional.
7401    def rm  : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
7402               (ins RC:$src1, x86memop_i:$src2),
7403               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7404               [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
7405                              (i_vt (load addr:$src2)))))]>, VEX_4V,
7406               Sched<[varsched.Folded, sched.ReadAfterFold]>;
7407
7408    def ri  : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
7409             (ins RC:$src1, u8imm:$src2),
7410             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7411             [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 timm:$src2))))]>, VEX,
7412             Sched<[sched]>;
7413    def mi  : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
7414             (ins x86memop_f:$src1, u8imm:$src2),
7415             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7416             [(set RC:$dst,
7417               (f_vt (X86VPermilpi (load addr:$src1), (i8 timm:$src2))))]>, VEX,
7418             Sched<[sched.Folded]>;
7419  }// Predicates = [HasAVX, NoVLX]
7420}
7421
// Single-precision VPERMILPS: variable form opcode 0x0C, immediate form
// opcode 0x04. The 256-bit instantiation adds VEX_L.
7422let ExeDomain = SSEPackedSingle in {
7423  defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
7424                               v4f32, v4i32, SchedWriteFShuffle.XMM,
7425                               SchedWriteFVarShuffle.XMM>;
7426  defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
7427                               v8f32, v8i32, SchedWriteFShuffle.YMM,
7428                               SchedWriteFVarShuffle.YMM>, VEX_L;
7429}
// Double-precision VPERMILPD: variable form opcode 0x0D, immediate form
// opcode 0x05. The 256-bit instantiation adds VEX_L.
7430let ExeDomain = SSEPackedDouble in {
7431  defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
7432                               v2f64, v2i64, SchedWriteFShuffle.XMM,
7433                               SchedWriteFVarShuffle.XMM>;
7434  defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
7435                               v4f64, v4i64, SchedWriteFShuffle.YMM,
7436                               SchedWriteFVarShuffle.YMM>, VEX_L;
7437}
7438
7439//===----------------------------------------------------------------------===//
7440// VZERO - Zero YMM registers
7441// Note: These instructions do not affect YMM16-YMM31.
7442//
7443
// VZEROALL and VZEROUPPER share opcode 0x77 and differ in encoding only by
// VEX_L, which is present on VZEROALL and absent on VZEROUPPER. Both are
// modelled as defining YMM0-YMM15, are scheduled as WriteSystem, require
// HasAVX and are selected from their respective intrinsics.
7444let SchedRW = [WriteSystem] in {
7445let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
7446            YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
7447  // Zero All YMM registers
7448  def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
7449                  [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L,
7450                  Requires<[HasAVX]>, VEX_WIG;
7451
7452  // Zero Upper bits of YMM registers
7453  def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
7454                     [(int_x86_avx_vzeroupper)]>, PS, VEX,
7455                     Requires<[HasAVX]>, VEX_WIG;
7456} // Defs
7457} // SchedRW
7458
7459//===----------------------------------------------------------------------===//
7460// Half precision conversion instructions
7461//
7462
// VCVTPH2PS (opcode 0x13, T8PD, VEX): half -> single precision conversion.
//   RC       - destination register class; the register source is always
//              VR128.
//   x86memop - memory operand type of the load form.
//   sched    - scheduling class (.Folded is used for the load form).
// Only the rr form carries a pattern (X86any_cvtph2ps). The rm form has an
// empty pattern list, so hasSideEffects/mayLoad are set explicitly.
7463multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop,
7464                      X86FoldableSchedWrite sched> {
7465  def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
7466             "vcvtph2ps\t{$src, $dst|$dst, $src}",
7467             [(set RC:$dst, (X86any_cvtph2ps VR128:$src))]>,
7468             T8PD, VEX, Sched<[sched]>;
7469  let hasSideEffects = 0, mayLoad = 1 in
7470  def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
7471             "vcvtph2ps\t{$src, $dst|$dst, $src}",
7472             []>, T8PD, VEX, Sched<[sched.Folded]>;
7473}
7474
// VCVTPS2PH (opcode 0x1D, TAPD, VEX): single -> half precision conversion
// with an immediate operand ($src2).
//   RC       - source register class; the register destination is always
//              VR128.
//   x86memop - memory operand type of the store form.
//   RR / MR  - scheduling classes of the register and store forms.
// Only the rr form carries a pattern (X86any_cvtps2ph). The mr form has an
// empty pattern list, so hasSideEffects/mayStore are set explicitly.
7475multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
7476                      SchedWrite RR, SchedWrite MR> {
7477  def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
7478               (ins RC:$src1, i32u8imm:$src2),
7479               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7480               [(set VR128:$dst, (X86any_cvtps2ph RC:$src1, timm:$src2))]>,
7481               TAPD, VEX, Sched<[RR]>;
7482  let hasSideEffects = 0, mayStore = 1 in
7483  def mr : Ii8<0x1D, MRMDestMem, (outs),
7484               (ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
7485               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
7486               TAPD, VEX, Sched<[MR]>;
7487}
7488
// F16C instantiations and the selection patterns for their memory forms
// (the rm/mr records themselves carry no patterns).
// The 128-bit forms use f64mem and the 256-bit (VEX_L) forms use f128mem.
7489let Predicates = [HasF16C, NoVLX] in {
7490  defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>, SIMD_EXC;
7491  defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L, SIMD_EXC;
7492  defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH,
7493                               WriteCvtPS2PHSt>, SIMD_EXC;
7494  defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY,
7495                               WriteCvtPS2PHYSt>, VEX_L, SIMD_EXC;
7496
7497  // Pattern match vcvtph2ps of a scalar i64 load.
7498  def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
7499            (VCVTPH2PSrm addr:$src)>;
7500  def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16
7501              (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
7502            (VCVTPH2PSrm addr:$src)>;
  // 256-bit result: fold a full v8i16 load into the Y load form.
7503  def : Pat<(v8f32 (X86any_cvtph2ps (loadv8i16 addr:$src))),
7504            (VCVTPH2PSYrm addr:$src)>;
7505
  // Fold a store of element 0 (viewed as f64 or i64) of a 128-bit
  // conversion result into the store form.
7506  def : Pat<(store (f64 (extractelt
7507                         (bc_v2f64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))),
7508                         (iPTR 0))), addr:$dst),
7509            (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
7510  def : Pat<(store (i64 (extractelt
7511                         (bc_v2i64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))),
7512                         (iPTR 0))), addr:$dst),
7513            (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
  // 256-bit source: the whole v8i16 result is stored.
7514  def : Pat<(store (v8i16 (X86any_cvtps2ph VR256:$src1, timm:$src2)), addr:$dst),
7515            (VCVTPS2PHYmr addr:$dst, VR256:$src1, timm:$src2)>;
7516}
7517
7518//===----------------------------------------------------------------------===//
7519// AVX2 Instructions
7520//===----------------------------------------------------------------------===//
7521
7522/// AVX2_blend_rmi - AVX2 blend with 8-bit immediate
///   opc          - opcode byte.
///   OpNode/OpVT  - SelectionDAG node and value type that are matched.
///   sched        - scheduling class (.Folded/.ReadAfterFold for the load
///                  form).
///   RC/x86memop  - register class and memory operand type.
///   commuteXForm - transform applied to the immediate when the pattern
///                  swaps the two sources to fold a load from the first one.
7523multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
7524                          ValueType OpVT, X86FoldableSchedWrite sched,
7525                          RegisterClass RC,
7526                          X86MemOperand x86memop, SDNodeXForm commuteXForm> {
7527  let isCommutable = 1 in
7528  def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
7529        (ins RC:$src1, RC:$src2, u8imm:$src3),
7530        !strconcat(OpcodeStr,
7531            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
7532        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
7533        Sched<[sched]>, VEX_4V;
7534  def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
7535        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
7536        !strconcat(OpcodeStr,
7537            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
7538        [(set RC:$dst,
7539          (OpVT (OpNode RC:$src1, (load addr:$src2), timm:$src3)))]>,
7540        Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V;
7541
7542  // Pattern to commute if load is in first source.
7543  def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, timm:$src3)),
7544            (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
7545                                            (commuteXForm timm:$src3))>;
7546}
7547
// VPBLENDD (opcode 0x02) for v4i32 and v8i32, plus patterns that select
// 64-bit-element X86Blendi nodes (v4i64, v2i64) as VPBLENDD. Those patterns
// rewrite the immediate through the BlendScale* transforms, using the
// *Commute* variants when the load is in the first source.
7548let Predicates = [HasAVX2] in {
7549defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
7550                               SchedWriteBlend.XMM, VR128, i128mem,
7551                               BlendCommuteImm4>;
7552defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
7553                                SchedWriteBlend.YMM, VR256, i256mem,
7554                                BlendCommuteImm8>, VEX_L;
7555
// v4i64 blends: register, load in second source, load in first source.
7556def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
7557          (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 timm:$src3))>;
7558def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
7559          (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
7560def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
7561          (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
7562
// v2i64 blends: register, load in second source, load in first source.
7563def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
7564          (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 timm:$src3))>;
7565def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
7566          (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 timm:$src3))>;
7567def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
7568          (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 timm:$src3))>;
7569}
7570
7571// For insertion into the zero index (low half) of a 256-bit vector, it is
7572// more efficient to generate a blend with immediate instead of an insert*128.
7573// NOTE: We're using FP instructions here, but execution domain fixing should
7574// take care of using integer instructions when profitable.
// insert_subvector at index 0 for every 256-bit integer type and v16f16,
// selected as VBLENDPSY. The 128-bit register is widened with INSERT_SUBREG
// into an IMPLICIT_DEF v8i32.
//   - Register destination vector: VBLENDPSYrri with immediate 0xf.
//   - Destination vector loaded from memory: operands are swapped so the
//     load can be folded (VBLENDPSYrmi), with immediate 0xf0.
7575let Predicates = [HasAVX] in {
7576def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
7577          (VBLENDPSYrri VR256:$src1,
7578                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7579                                       VR128:$src2, sub_xmm), 0xf)>;
7580def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
7581          (VBLENDPSYrri VR256:$src1,
7582                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7583                                       VR128:$src2, sub_xmm), 0xf)>;
7584def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
7585          (VBLENDPSYrri VR256:$src1,
7586                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7587                                       VR128:$src2, sub_xmm), 0xf)>;
7588def : Pat<(insert_subvector (v16f16 VR256:$src1), (v8f16 VR128:$src2), (iPTR 0)),
7589          (VBLENDPSYrri VR256:$src1,
7590                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7591                                       VR128:$src2, sub_xmm), 0xf)>;
7592def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
7593          (VBLENDPSYrri VR256:$src1,
7594                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7595                                       VR128:$src2, sub_xmm), 0xf)>;
7596
7597def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1), (iPTR 0)),
7598          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7599                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7600def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0)),
7601          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7602                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7603def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)),
7604          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7605                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7606def : Pat<(insert_subvector (loadv16f16 addr:$src2), (v8f16 VR128:$src1), (iPTR 0)),
7607          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7608                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7609def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)),
7610          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7611                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7612}
7613
7614//===----------------------------------------------------------------------===//
7615// VPBROADCAST - Load from memory and broadcast to all elements of the
7616//               destination operand
7617//
// Defines the four forms of an AVX2 integer broadcast:
//   rr / rm   - 128-bit destination, source in VR128 or memory.
//   Yrr / Yrm - 256-bit destination (VEX_L), source in VR128 or memory.
//   x86memop   - memory operand type of the load forms.
//   bcast_frag - broadcast-load fragment matched by the rm/Yrm forms.
//   OpVT128/OpVT256 - 128-bit and 256-bit value types.
//   prd        - extra predicate combined with HasAVX2.
// NOTE(review): Yrm is scheduled with SchedWriteShuffle.XMM.Folded while Yrr
// uses WriteShuffle256 - confirm this is intentional.
7618multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
7619                          X86MemOperand x86memop, PatFrag bcast_frag,
7620                          ValueType OpVT128, ValueType OpVT256, Predicate prd> {
7621  let Predicates = [HasAVX2, prd] in {
7622    def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
7623                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7624                  [(set VR128:$dst,
7625                   (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
7626                  Sched<[SchedWriteShuffle.XMM]>, VEX;
7627    def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
7628                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7629                  [(set VR128:$dst,
7630                   (OpVT128 (bcast_frag addr:$src)))]>,
7631                  Sched<[SchedWriteShuffle.XMM.Folded]>, VEX;
7632    def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
7633                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7634                   [(set VR256:$dst,
7635                    (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>,
7636                   Sched<[WriteShuffle256]>, VEX, VEX_L;
7637    def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
7638                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7639                   [(set VR256:$dst,
7640                    (OpVT256 (bcast_frag addr:$src)))]>,
7641                   Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L;
7642
7643    // Provide aliases for broadcast from the same register class that
7644    // automatically does the extract.
7645    def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))),
7646              (!cast<Instruction>(NAME#"Yrr")
7647                  (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>;
7648  }
7649}
7650
// Byte/word/dword/qword integer broadcasts (opcodes 0x78, 0x79, 0x58, 0x59).
// The byte and word forms are additionally guarded by NoVLX_Or_NoBWI, the
// dword and qword forms by NoVLX.
7651defm VPBROADCASTB  : avx2_broadcast<0x78, "vpbroadcastb", i8mem, X86VBroadcastld8,
7652                                    v16i8, v32i8, NoVLX_Or_NoBWI>;
7653defm VPBROADCASTW  : avx2_broadcast<0x79, "vpbroadcastw", i16mem, X86VBroadcastld16,
7654                                    v8i16, v16i16, NoVLX_Or_NoBWI>;
7655defm VPBROADCASTD  : avx2_broadcast<0x58, "vpbroadcastd", i32mem, X86VBroadcastld32,
7656                                    v4i32, v8i32, NoVLX>;
7657defm VPBROADCASTQ  : avx2_broadcast<0x59, "vpbroadcastq", i64mem, X86VBroadcastld64,
7658                                    v2i64, v4i64, NoVLX>;
7659
// AVX2 broadcasts of a scalar FP register (FR32/FR64): the scalar is copied
// into VR128 with COPY_TO_REGCLASS and then broadcast with the register form
// of VBROADCASTSS / VBROADCASTSD.
7660let Predicates = [HasAVX2, NoVLX] in {
7661  // Provide fallback in case the load node that is used in the patterns above
7662  // is used by additional users, which prevents the pattern selection.
7663    def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
7664              (VBROADCASTSSrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
7665    def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
7666              (VBROADCASTSSYrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
7667    def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
7668              (VBROADCASTSDYrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
7669}
7670
// Additional byte/word broadcast patterns selected as VPBROADCASTB/W:
//   - from GR8/GR16: the value is placed into an IMPLICIT_DEF i32 with
//     INSERT_SUBREG, moved to a vector register with VMOVDI2PDIrr, then
//     broadcast;
//   - f16 vectors (v8f16/v16f16): broadcast loads, vector-register sources
//     and FR16 scalars all reuse the word broadcast instructions.
7671let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
7672  def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
7673        (VPBROADCASTBrr (VMOVDI2PDIrr
7674                         (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7675                                             GR8:$src, sub_8bit))))>;
7676  def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
7677        (VPBROADCASTBYrr (VMOVDI2PDIrr
7678                          (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7679                                              GR8:$src, sub_8bit))))>;
7680
7681  def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
7682        (VPBROADCASTWrr (VMOVDI2PDIrr
7683                         (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7684                                             GR16:$src, sub_16bit))))>;
7685  def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
7686        (VPBROADCASTWYrr (VMOVDI2PDIrr
7687                          (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7688                                              GR16:$src, sub_16bit))))>;
7689
7690  def : Pat<(v8f16 (X86VBroadcastld16 addr:$src)),
7691            (VPBROADCASTWrm addr:$src)>;
7692  def : Pat<(v16f16 (X86VBroadcastld16 addr:$src)),
7693            (VPBROADCASTWYrm addr:$src)>;
7694
7695  def : Pat<(v8f16 (X86VBroadcast (v8f16 VR128:$src))),
7696            (VPBROADCASTWrr VR128:$src)>;
7697  def : Pat<(v16f16 (X86VBroadcast (v8f16 VR128:$src))),
7698            (VPBROADCASTWYrr VR128:$src)>;
7699
7700  def : Pat<(v8f16 (X86VBroadcast (f16 FR16:$src))),
7701            (VPBROADCASTWrr (COPY_TO_REGCLASS FR16:$src, VR128))>;
7702  def : Pat<(v16f16 (X86VBroadcast (f16 FR16:$src))),
7703            (VPBROADCASTWYrr (COPY_TO_REGCLASS FR16:$src, VR128))>;
7704}
// Dword/qword broadcasts from a general-purpose register: the GPR is first
// moved into a vector register (VMOVDI2PDIrr for GR32, VMOV64toPQIrr for
// GR64) and then broadcast with VPBROADCASTD/Q.
7705let Predicates = [HasAVX2, NoVLX] in {
7706  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
7707            (VPBROADCASTDrr (VMOVDI2PDIrr GR32:$src))>;
7708  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
7709            (VPBROADCASTDYrr (VMOVDI2PDIrr GR32:$src))>;
7710  def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
7711            (VPBROADCASTQrr (VMOV64toPQIrr GR64:$src))>;
7712  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
7713            (VPBROADCASTQYrr (VMOV64toPQIrr GR64:$src))>;
7714}
7715
7716// AVX1 broadcast patterns
// AVX1-only: integer broadcast loads are selected as the floating-point
// broadcast-load instructions (VBROADCASTSS / VBROADCASTSD memory forms).
7717let Predicates = [HasAVX1Only] in {
7718def : Pat<(v8i32 (X86VBroadcastld32 addr:$src)),
7719          (VBROADCASTSSYrm addr:$src)>;
7720def : Pat<(v4i64 (X86VBroadcastld64 addr:$src)),
7721          (VBROADCASTSDYrm addr:$src)>;
7722def : Pat<(v4i32 (X86VBroadcastld32 addr:$src)),
7723          (VBROADCASTSSrm addr:$src)>;
7724}
7725
7726  // Provide fallback in case the load node that is used in the patterns above
7727  // is used by additional users, which prevents the pattern selection.
// v2f64 broadcasts are selected as VMOVDDUP: from an f64 scalar (copied into
// VR128 first), from a 64-bit broadcast load, or from a v2f64 register.
7728let Predicates = [HasAVX, NoVLX] in {
7729  // 128bit broadcasts:
7730  def : Pat<(v2f64 (X86VBroadcast f64:$src)),
7731            (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
7732  def : Pat<(v2f64 (X86VBroadcastld64 addr:$src)),
7733            (VMOVDDUPrm addr:$src)>;
7734
7735  def : Pat<(v2f64 (X86VBroadcast v2f64:$src)),
7736            (VMOVDDUPrr VR128:$src)>;
7737}
7738
// AVX1-only register broadcasts, built from 128-bit shuffles:
//   - 128-bit results use VPERMILPSri 0 (f32), VPSHUFDri 0 (i32) or
//     VPSHUFDri 0x44 (i64).
//   - 256-bit results place the 128-bit splat in the low half via
//     INSERT_SUBREG and insert the same splat again with VINSERTF128rr and
//     immediate 1.
7739let Predicates = [HasAVX1Only] in {
7740  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
7741            (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>;
7742  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
7743            (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
7744              (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm),
7745              (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>;
7746  def : Pat<(v8f32 (X86VBroadcast v4f32:$src)),
7747            (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
7748              (v4f32 (VPERMILPSri VR128:$src, 0)), sub_xmm),
7749              (v4f32 (VPERMILPSri VR128:$src, 0)), 1)>;
7750  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
7751            (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
7752              (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm),
7753              (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>;
7754  def : Pat<(v4f64 (X86VBroadcast v2f64:$src)),
7755            (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
7756              (v2f64 (VMOVDDUPrr VR128:$src)), sub_xmm),
7757              (v2f64 (VMOVDDUPrr VR128:$src)), 1)>;
7758
7759  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
7760            (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)>;
7761  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
7762            (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7763              (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), sub_xmm),
7764              (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), 1)>;
7765  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
7766            (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
7767              (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), sub_xmm),
7768              (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), 1)>;
7769
7770  def : Pat<(v2i64 (X86VBroadcast i64:$src)),
7771            (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)>;
7772  def : Pat<(v2i64 (X86VBroadcastld64 addr:$src)),
7773            (VMOVDDUPrm addr:$src)>;
7774}
7775
7776//===----------------------------------------------------------------------===//
7777// VPERM - Permute instructions
7778//
7779
// Defines the 256-bit register (Yrr) and memory (Yrm) forms of a
// two-operand permute matched from X86VPermv.
//   opc/OpcodeStr - opcode byte and assembly mnemonic.
//   OpVT          - 256-bit value type of the result.
//   Sched         - scheduling class. This template argument has the same
//                   name as the Sched<> class it is passed to.
//   memOp         - memory operand type of the load form.
7780multiclass avx2_perm<bits<8> opc, string OpcodeStr,
7781                     ValueType OpVT, X86FoldableSchedWrite Sched,
7782                     X86MemOperand memOp> {
7783  let Predicates = [HasAVX2, NoVLX] in {
7784    def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
7785                     (ins VR256:$src1, VR256:$src2),
7786                     !strconcat(OpcodeStr,
7787                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7788                     [(set VR256:$dst,
7789                       (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
7790                     Sched<[Sched]>, VEX_4V, VEX_L;
7791    def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
7792                     (ins VR256:$src1, memOp:$src2),
7793                     !strconcat(OpcodeStr,
7794                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7795                     [(set VR256:$dst,
7796                       (OpVT (X86VPermv VR256:$src1,
7797                              (load addr:$src2))))]>,
7798                     Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX_4V, VEX_L;
7799  }
7800}
7801
// Dword-integer and single-precision instantiations of avx2_perm.
defm VPERMD : avx2_perm<0x36, "vpermd", v8i32,
                        WriteVarShuffle256, i256mem>;
let ExeDomain = SSEPackedSingle in {
  defm VPERMPS : avx2_perm<0x16, "vpermps", v8f32,
                           WriteFVarShuffle256, f256mem>;
}
7805
/// avx2_perm_imm - 256-bit permute controlled by an 8-bit immediate, expressed
/// with the X86VPermi node.  Yri reads the source from a register; Ymi reads
/// it from memory through mem_frag.  Both are guarded by [HasAVX2, NoVLX].
multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                         ValueType OpVT, X86FoldableSchedWrite Sched,
                         X86MemOperand memOp> {
  let Predicates = [HasAVX2, NoVLX] in {
    // Register source.
    def Yri : AVX2AIi8<opc, MRMSrcReg,
                       (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2),
                       OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR256:$dst,
                          (OpVT (X86VPermi VR256:$src1, (i8 timm:$src2))))]>,
              Sched<[Sched]>, VEX, VEX_L;
    // Memory source.
    def Ymi : AVX2AIi8<opc, MRMSrcMem,
                       (outs VR256:$dst), (ins memOp:$src1, u8imm:$src2),
                       OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR256:$dst,
                          (OpVT (X86VPermi (mem_frag addr:$src1),
                                           (i8 timm:$src2))))]>,
              Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VEX_L;
  }
}
7827
// Qword-integer and double-precision instantiations of avx2_perm_imm; both
// additionally inherit VEX_W.
defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
                            WriteShuffle256, i256mem>,
              VEX_W;
let ExeDomain = SSEPackedDouble in {
  defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
                               WriteFShuffle256, f256mem>,
                 VEX_W;
}
7833
7834//===----------------------------------------------------------------------===//
7835// VPERM2I128 - Permute Integer vector Values in 128-bit chunks
7836//
// The vperm2i128 defs carry no selection pattern of their own ([]).
// Only the register-register form is flagged commutable.
let isCommutable = 1 in {
  def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg,
            (outs VR256:$dst),
            (ins VR256:$src1, VR256:$src2, u8imm:$src3),
            "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
            []>,
            Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
}
def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem,
          (outs VR256:$dst),
          (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>,
          Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>,
          VEX_4V, VEX_L;
7846
// Instantiate the vperm2x128_lowering patterns for VPERM2I128, once per
// 256-bit vector type.  Each type must appear exactly once: a repeated line
// only emits a second, identical set of anonymous patterns.
let Predicates = [HasAVX2] in {
  defm : vperm2x128_lowering<"VPERM2I128", v4i64,  loadv4i64>;
  defm : vperm2x128_lowering<"VPERM2I128", v8i32,  loadv8i32>;
  defm : vperm2x128_lowering<"VPERM2I128", v16i16, loadv16i16>;
  defm : vperm2x128_lowering<"VPERM2I128", v16f16, loadv16f16>;
  defm : vperm2x128_lowering<"VPERM2I128", v32i8,  loadv32i8>;
}
7855
7856//===----------------------------------------------------------------------===//
7857// VINSERTI128 - Insert packed integer values
7858//
// Register and memory forms of vinserti128.  Neither def has a pattern, so
// hasSideEffects (and mayLoad for the memory form) are stated explicitly.
let hasSideEffects = 0 in
def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg,
          (outs VR256:$dst),
          (ins VR256:$src1, VR128:$src2, u8imm:$src3),
          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>,
          Sched<[WriteShuffle256]>, VEX_4V, VEX_L;

let hasSideEffects = 0, mayLoad = 1 in
def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem,
          (outs VR256:$dst),
          (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>,
          Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>,
          VEX_4V, VEX_L;
7870
// Instantiate the vinsert_lowering patterns for VINSERTI128 / VPERM2I128,
// once per (128-bit, 256-bit) vector type pair.  Each pair must appear exactly
// once: a repeated line only emits a second, identical set of patterns.
let Predicates = [HasAVX2, NoVLX] in {
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v2i64, v4i64,  loadv2i64,  loadv4i64>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v4i32, v8i32,  loadv4i32,  loadv8i32>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8i16, v16i16, loadv8i16,  loadv16i16>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8f16, v16f16, loadv8f16,  loadv16f16>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v16i8, v32i8,  loadv16i8,  loadv32i8>;
}
7879
7880//===----------------------------------------------------------------------===//
7881// VEXTRACTI128 - Extract packed integer values
7882//
// Register-destination form of vextracti128 (no pattern attached here).
def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg,
          (outs VR128:$dst),
          (ins VR256:$src1, u8imm:$src2),
          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          []>,
          Sched<[WriteShuffle256]>, VEX, VEX_L;

// Memory-destination form: no outs, the address is the first input operand.
// With no pattern to infer from, mayStore/hasSideEffects are set by hand.
let mayStore = 1, hasSideEffects = 0 in {
  def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem,
            (outs),
            (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
            "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
            []>,
            Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_L;
}
7892
// Instantiate the vextract_lowering patterns for VEXTRACTI128, once per
// (256-bit, 128-bit) vector type pair.  Each pair must appear exactly once:
// a repeated line only emits a second, identical set of patterns.
let Predicates = [HasAVX2, NoVLX] in {
  defm : vextract_lowering<"VEXTRACTI128", v4i64,  v2i64>;
  defm : vextract_lowering<"VEXTRACTI128", v8i32,  v4i32>;
  defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
  defm : vextract_lowering<"VEXTRACTI128", v16f16, v8f16>;
  defm : vextract_lowering<"VEXTRACTI128", v32i8,  v16i8>;
}
7901
7902//===----------------------------------------------------------------------===//
7903// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
7904//
/// avx2_pmovmask - Masked integer vector load/store instructions, selected
/// directly from the given load/store intrinsics.  Emits 128-bit (rm, mr) and
/// 256-bit (Yrm, Ymr) forms; schedX / schedY supply the scheduling classes
/// for the 128-bit and 256-bit forms respectively.
multiclass avx2_pmovmask<string OpcodeStr,
                         Intrinsic IntLd128, Intrinsic IntLd256,
                         Intrinsic IntSt128, Intrinsic IntSt256,
                         X86SchedWriteMaskMove schedX,
                         X86SchedWriteMaskMove schedY> {
  // Loads (opcode 0x8c).  The intrinsic takes the address first and the
  // register operand $src1 second.
  def rm  : AVX28I<0x8c, MRMSrcMem,
                   (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
                   OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                   [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>,
            VEX_4V, Sched<[schedX.RM]>;
  def Yrm : AVX28I<0x8c, MRMSrcMem,
                   (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
                   OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                   [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
            VEX_4V, VEX_L, Sched<[schedY.RM]>;

  // Stores (opcode 0x8e).  No register result; $dst is the memory operand.
  def mr  : AVX28I<0x8e, MRMDestMem,
                   (outs), (ins i128mem:$dst, VR128:$src1, VR128:$src2),
                   OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                   [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>,
            VEX_4V, Sched<[schedX.MR]>;
  def Ymr : AVX28I<0x8e, MRMDestMem,
                   (outs), (ins i256mem:$dst, VR256:$src1, VR256:$src2),
                   OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                   [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
            VEX_4V, VEX_L, Sched<[schedY.MR]>;
}
7931
// Dword and qword instantiations of avx2_pmovmask; the qword one additionally
// inherits VEX_W.
defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
                    int_x86_avx2_maskload_d,  int_x86_avx2_maskload_d_256,
                    int_x86_avx2_maskstore_d, int_x86_avx2_maskstore_d_256,
                    WriteVecMaskMove32, WriteVecMaskMove32Y>;
defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
                    int_x86_avx2_maskload_q,  int_x86_avx2_maskload_q_256,
                    int_x86_avx2_maskstore_q, int_x86_avx2_maskstore_q_256,
                    WriteVecMaskMove64, WriteVecMaskMove64Y>,
                  VEX_W;
7944
/// maskmov_lowering - Map generic masked_store / masked_load nodes of type VT
/// (mask type MaskVT, register class RC) onto the "mr" / "rm" forms of the
/// instruction whose name prefix is InstrStr.
multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
                          ValueType MaskVT> {
  // Masked store -> <InstrStr>mr (address, mask, value).
  def : Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)),
            (!cast<Instruction>(!strconcat(InstrStr, "mr"))
               addr:$ptr, RC:$mask, RC:$src)>;

  // Masked load -> <InstrStr>rm (mask, address).  Matched when the
  // pass-through operand is undef ...
  def : Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)),
            (!cast<Instruction>(!strconcat(InstrStr, "rm"))
               RC:$mask, addr:$ptr)>;
  // ... or an all-zeros vector.
  def : Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask),
                             (VT immAllZerosV))),
            (!cast<Instruction>(!strconcat(InstrStr, "rm"))
               RC:$mask, addr:$ptr)>;
}
// Floating-point vectors always go through VMASKMOVPS/PD.
let Predicates = [HasAVX] in {
  defm : maskmov_lowering<"VMASKMOVPS",  VR128, v4f32, v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD",  VR128, v2f64, v2i64>;
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64>;
}

// With AVX1 only, masked i32/i64 loads and stores are not supported, so the
// integer vector types reuse the ps/pd instructions.
let Predicates = [HasAVX1Only] in {
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64>;
  defm : maskmov_lowering<"VMASKMOVPS",  VR128, v4i32, v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD",  VR128, v2i64, v2i64>;
}

// With AVX2 the integer vector types use VPMASKMOVD/Q.
let Predicates = [HasAVX2] in {
  defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32>;
  defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64>;
  defm : maskmov_lowering<"VPMASKMOVD",  VR128, v4i32, v4i32>;
  defm : maskmov_lowering<"VPMASKMOVQ",  VR128, v2i64, v2i64>;
}
7976
7977//===----------------------------------------------------------------------===//
7978// Variable Bit Shifts
7979//
/// avx2_var_shift - Shift instructions whose second operand is a vector,
/// selected from OpNode.  Emits 128-bit (rr, rm) and 256-bit (Yrr, Yrm) forms
/// for the value types vt128 / vt256; the rm forms fold a load of $src2.
multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType vt128, ValueType vt256> {
  // 128-bit forms.
  def rr  : AVX28I<opc, MRMSrcReg,
                   (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                   OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                   [(set VR128:$dst,
                      (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
            VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>;
  def rm  : AVX28I<opc, MRMSrcMem,
                   (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
                   OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                   [(set VR128:$dst,
                      (vt128 (OpNode VR128:$src1,
                                     (vt128 (load addr:$src2)))))]>,
            VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded,
                           SchedWriteVarVecShift.XMM.ReadAfterFold]>;

  // 256-bit forms.
  def Yrr : AVX28I<opc, MRMSrcReg,
                   (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
                   OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                   [(set VR256:$dst,
                      (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
            VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>;
  def Yrm : AVX28I<opc, MRMSrcMem,
                   (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
                   OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                   [(set VR256:$dst,
                      (vt256 (OpNode VR256:$src1,
                                     (vt256 (load addr:$src2)))))]>,
            VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded,
                                  SchedWriteVarVecShift.YMM.ReadAfterFold]>;
}
8011
// Instantiations grouped by node; the qword variants inherit VEX_W.
// Only a dword form is defined here for X86vsrav.
let Predicates = [HasAVX2, NoVLX] in {
  // X86vshlv
  defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", X86vshlv, v4i32, v8i32>;
  defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", X86vshlv, v2i64, v4i64>,
                 VEX_W;
  // X86vsrlv
  defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", X86vsrlv, v4i32, v8i32>;
  defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", X86vsrlv, v2i64, v4i64>,
                 VEX_W;
  // X86vsrav
  defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", X86vsrav, v4i32, v8i32>;
}
8019
8020//===----------------------------------------------------------------------===//
8021// VGATHER - GATHER Operations
8022
8023// FIXME: Improve scheduling of gather instructions.
/// avx2_gather - Gather instruction definitions without selection patterns.
/// Each def has two results, $dst and $mask_wb.  The rm form always uses
/// VR128; RC256 is the register class used by the Yrm form.  memop128 /
/// memop256 are the memory operand kinds of the rm / Yrm forms.
multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
                       X86MemOperand memop128, X86MemOperand memop256> {
  // No patterns to infer from, so the memory behaviour is stated explicitly.
  let mayLoad = 1, hasSideEffects = 0 in {
    def rm  : AVX28I<opc, MRMSrcMem4VOp3,
                     (outs VR128:$dst, VR128:$mask_wb),
                     (ins VR128:$src1, memop128:$src2, VR128:$mask),
                     OpcodeStr#"\t{$mask, $src2, $dst|$dst, $src2, $mask}",
                     []>,
              VEX, Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
    def Yrm : AVX28I<opc, MRMSrcMem4VOp3,
                     (outs RC256:$dst, RC256:$mask_wb),
                     (ins RC256:$src1, memop256:$src2, RC256:$mask),
                     OpcodeStr#"\t{$mask, $src2, $dst|$dst, $src2, $mask}",
                     []>,
              VEX, VEX_L, Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
  }
}
8039
// Gather instantiations.  Every def gets the same operand constraints: both
// results are early-clobber, $dst is tied to $src1 and $mask_wb to $mask.
let Predicates = [HasAVX2] in {
  let mayLoad = 1, hasSideEffects = 0,
      Constraints = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
      in {
    // Integer gathers.
    defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256,
                                  vx128mem, vx256mem>, VEX_W;
    defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256,
                                  vx128mem, vy256mem>, VEX_W;
    defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256,
                                  vx128mem, vy256mem>;
    defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128,
                                  vx64mem, vy128mem>;

    // Double-precision gathers.
    let ExeDomain = SSEPackedDouble in {
      defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256,
                                    vx128mem, vx256mem>, VEX_W;
      defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256,
                                    vx128mem, vy256mem>, VEX_W;
    }

    // Single-precision gathers.
    let ExeDomain = SSEPackedSingle in {
      defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256,
                                    vx128mem, vy256mem>;
      defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128,
                                    vx64mem, vy128mem>;
    }
  }
}
8068
8069//===----------------------------------------------------------------------===//
8070// GFNI instructions
8071//===----------------------------------------------------------------------===//
8072
/// GF2P8MULB_rm - Register (rr) and load-folding (rm) forms selected from
/// X86GF2P8mulb.  Is2Addr picks the two-operand assembly string (destination
/// not repeated) over the three-operand one.  The defs pass "" as their own
/// asm string; the real one comes from the surrounding `let AsmString`.
multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
                        RegisterClass RC, PatFrag MemOpFrag,
                        X86MemOperand X86MemOp, X86FoldableSchedWrite sched,
                        bit Is2Addr = 0> {
  let ExeDomain = SSEPackedInt,
      AsmString = !if(Is2Addr,
        !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
        !strconcat(OpcodeStr,
                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}")) in {
    // Only the all-register form is commutable.
    let isCommutable = 1 in {
      def rr : PDI<0xCF, MRMSrcReg,
                   (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
                   [(set RC:$dst,
                      (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
               Sched<[sched]>, T8PD;
    }

    def rm : PDI<0xCF, MRMSrcMem,
                 (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
                 [(set RC:$dst,
                    (OpVT (X86GF2P8mulb RC:$src1, (MemOpFrag addr:$src2))))]>,
             Sched<[sched.Folded, sched.ReadAfterFold]>, T8PD;
  }
}
8092
/// GF2P8AFFINE_rmi - Register (rri) and load-folding (rmi) forms of an
/// instruction taking two vector sources and an 8-bit immediate, selected
/// from OpNode.  Is2Addr picks the assembly string that omits $src1.  The
/// defs pass "" as their own asm string; the real one comes from the
/// surrounding `let AsmString`.
multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
                           SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag,
                           X86MemOperand X86MemOp, X86FoldableSchedWrite sched,
                           bit Is2Addr = 0> {
  let AsmString = !if(Is2Addr,
      !strconcat(OpStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
      !strconcat(OpStr,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")) in {
    def rri : Ii8<Op, MRMSrcReg,
                  (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$src3), "",
                  [(set RC:$dst,
                     (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))],
                  SSEPackedInt>,
              Sched<[sched]>;
    def rmi : Ii8<Op, MRMSrcMem,
                  (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
                  [(set RC:$dst,
                     (OpVT (OpNode RC:$src1, (MemOpFrag addr:$src2),
                                   timm:$src3)))],
                  SSEPackedInt>,
              Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}
8112
/// GF2P8AFFINE_common - Instantiate GF2P8AFFINE_rmi three times for one
/// opcode: the legacy-SSE 128-bit form (NAME, two-address, $src1 tied to
/// $dst) and the VEX 128-bit / 256-bit forms (V#NAME, V#NAME#Y).
multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
  // Legacy-SSE encoding.  The load-folding form must use `memop`, exactly as
  // the SSE GF2P8MULB definition does: plain `load` would also fold loads
  // that are not known to satisfy the alignment a non-VEX 128-bit memory
  // operand requires.
  let Constraints = "$src1 = $dst",
      Predicates  = [HasGFNI, UseSSE2] in
  defm NAME         : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
                                      VR128, memop, i128mem, SchedWriteVecIMul.XMM, 1>;
  // VEX encodings: any load may be folded.
  let Predicates  = [HasGFNI, HasAVX, NoVLX] in {
    defm V#NAME    : GF2P8AFFINE_rmi<Op, "v"#OpStr, v16i8, OpNode, VR128,
                                     load, i128mem, SchedWriteVecIMul.XMM>,
                                     VEX_4V, VEX_W;
    defm V#NAME#Y : GF2P8AFFINE_rmi<Op, "v"#OpStr, v32i8, OpNode, VR256,
                                     load, i256mem, SchedWriteVecIMul.YMM>,
                                     VEX_4V, VEX_L, VEX_W;
  }
}
8127
8128// GF2P8MULB
// Legacy-SSE form: two-address ($src1 tied to $dst, Is2Addr = 1), memory
// operand matched through memop.
let Predicates  = [HasGFNI, UseSSE2],
    Constraints = "$src1 = $dst" in {
  defm GF2P8MULB : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop,
                                i128mem, SchedWriteVecALU.XMM, 1>;
}
// VEX forms: three-operand, memory operand matched through load.
let Predicates = [HasGFNI, HasAVX, NoVLX] in {
  defm VGF2P8MULB  : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load,
                                  i128mem, SchedWriteVecALU.XMM>,
                     VEX_4V;
  defm VGF2P8MULBY : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load,
                                  i256mem, SchedWriteVecALU.YMM>,
                     VEX_4V, VEX_L;
}
8139// GF2P8AFFINEINVQB, GF2P8AFFINEQB
// The affine instructions are explicitly marked non-commutable.
let isCommutable = 0 in {
  defm GF2P8AFFINEINVQB :
      GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb", X86GF2P8affineinvqb>, TAPD;
  defm GF2P8AFFINEQB :
      GF2P8AFFINE_common<0xCE, "gf2p8affineqb", X86GF2P8affineqb>, TAPD;
}
8146
8147// AVX-IFMA
/// avx_ifma_rm - VEX-encoded forms selected from OpNode: 128-bit (rr, rm)
/// and 256-bit (Yrr, Yrm).  $src1 is tied to $dst.  The rm/Yrm forms fold a
/// load of $src3 and therefore use the folded scheduling classes, with one
/// ReadAfterFold for each of the two register sources.
let Predicates = [HasAVXIFMA, NoVLX_Or_NoIFMA], Constraints = "$src1 = $dst",
    checkVEXPredicate = 1 in
multiclass avx_ifma_rm<bits<8> opc, string OpcodeStr, SDNode OpNode> {
  // NOTE: The SDNode have the multiply operands first with the add last.
  // This enables commuted load patterns to be autogenerated by tablegen.
  let isCommutable = 1 in {
    def rr  : AVX8I<opc, MRMSrcReg, (outs VR128:$dst),
               (ins VR128:$src1, VR128:$src2, VR128:$src3),
               !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
               [(set VR128:$dst, (v2i64 (OpNode VR128:$src2,
                                         VR128:$src3, VR128:$src1)))]>,
               VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
  }
    def rm  : AVX8I<opc, MRMSrcMem, (outs VR128:$dst),
               (ins VR128:$src1, VR128:$src2, i128mem:$src3),
               !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
               [(set VR128:$dst, (v2i64 (OpNode VR128:$src2,
                                        (loadv2i64 addr:$src3), VR128:$src1)))]>,
               VEX_4V, Sched<[SchedWriteVecIMul.XMM.Folded,
                              SchedWriteVecIMul.XMM.ReadAfterFold,
                              SchedWriteVecIMul.XMM.ReadAfterFold]>;
  let isCommutable = 1 in {
    def Yrr  : AVX8I<opc, MRMSrcReg, (outs VR256:$dst),
               (ins VR256:$src1, VR256:$src2, VR256:$src3),
               !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
               [(set VR256:$dst, (v4i64 (OpNode VR256:$src2,
                                         VR256:$src3, VR256:$src1)))]>,
               VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
  }
    def Yrm  : AVX8I<opc, MRMSrcMem, (outs VR256:$dst),
               (ins VR256:$src1, VR256:$src2, i256mem:$src3),
               !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
               [(set VR256:$dst, (v4i64 (OpNode VR256:$src2,
                                        (loadv4i64 addr:$src3), VR256:$src1)))]>,
               VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM.Folded,
                                     SchedWriteVecIMul.YMM.ReadAfterFold,
                                     SchedWriteVecIMul.YMM.ReadAfterFold]>;
}
8182
// High- and low-half instantiations of avx_ifma_rm; both inherit VEX_W and
// ExplicitVEXPrefix.
defm VPMADD52HUQ : avx_ifma_rm<0xb5, "vpmadd52huq", x86vpmadd52h>,
                   VEX_W, ExplicitVEXPrefix;
defm VPMADD52LUQ : avx_ifma_rm<0xb4, "vpmadd52luq", x86vpmadd52l>,
                   VEX_W, ExplicitVEXPrefix;
8185
8186// AVX-VNNI-INT8
/// avx_dotprod_rm - Three-source instruction selected from OpNode, with $src1
/// tied to $dst.  Emits a register form (rr) and a form that loads $src3
/// through MemOpFrag (rm).  IsCommutable is applied to the rr form only.
let Constraints = "$src1 = $dst" in
multiclass avx_dotprod_rm<bits<8> Opc, string OpcodeStr, ValueType OpVT,
                          RegisterClass RC, PatFrag MemOpFrag,
                          X86MemOperand X86memop, SDNode OpNode,
                          X86FoldableSchedWrite Sched,
                          bit IsCommutable> {
  let isCommutable = IsCommutable in {
    def rr : I<Opc, MRMSrcReg,
               (outs RC:$dst), (ins RC:$src1, RC:$src2, RC:$src3),
               OpcodeStr#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
               [(set RC:$dst,
                  (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>,
             VEX_4V, Sched<[Sched]>;
  }
  def rm : I<Opc, MRMSrcMem,
             (outs RC:$dst), (ins RC:$src1, RC:$src2, X86memop:$src3),
             OpcodeStr#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
             [(set RC:$dst,
                (OpVT (OpNode RC:$src1, RC:$src2, (MemOpFrag addr:$src3))))]>,
           VEX_4V, Sched<[Sched.Folded, Sched.ReadAfterFold]>;
}
8206
// AVX-VNNI-INT8 instantiations.  Opcode 0x50 is used for the plain forms and
// 0x51 for the forms whose mnemonic ends in "s".  The final template argument
// is IsCommutable: 1 for the *SS* and *UU* variants, 0 for the *SU* variants.
let Predicates = [HasAVXVNNIINT8] in {
  defm VPDPBSSD   : avx_dotprod_rm<0x50, "vpdpbssd", v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbssd,
                                   SchedWriteVecIMul.XMM, 1>,
                    T8XD;
  defm VPDPBSSDY  : avx_dotprod_rm<0x50, "vpdpbssd", v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbssd,
                                   SchedWriteVecIMul.YMM, 1>,
                    VEX_L, T8XD;
  defm VPDPBUUD   : avx_dotprod_rm<0x50, "vpdpbuud", v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbuud,
                                   SchedWriteVecIMul.XMM, 1>,
                    T8PS;
  defm VPDPBUUDY  : avx_dotprod_rm<0x50, "vpdpbuud", v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbuud,
                                   SchedWriteVecIMul.YMM, 1>,
                    VEX_L, T8PS;
  defm VPDPBSSDS  : avx_dotprod_rm<0x51, "vpdpbssds", v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbssds,
                                   SchedWriteVecIMul.XMM, 1>,
                    T8XD;
  defm VPDPBSSDSY : avx_dotprod_rm<0x51, "vpdpbssds", v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbssds,
                                   SchedWriteVecIMul.YMM, 1>,
                    VEX_L, T8XD;
  defm VPDPBUUDS  : avx_dotprod_rm<0x51, "vpdpbuuds", v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbuuds,
                                   SchedWriteVecIMul.XMM, 1>,
                    T8PS;
  defm VPDPBUUDSY : avx_dotprod_rm<0x51, "vpdpbuuds", v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbuuds,
                                   SchedWriteVecIMul.YMM, 1>,
                    VEX_L, T8PS;
  defm VPDPBSUD   : avx_dotprod_rm<0x50, "vpdpbsud", v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbsud,
                                   SchedWriteVecIMul.XMM, 0>,
                    T8XS;
  defm VPDPBSUDY  : avx_dotprod_rm<0x50, "vpdpbsud", v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbsud,
                                   SchedWriteVecIMul.YMM, 0>,
                    VEX_L, T8XS;
  defm VPDPBSUDS  : avx_dotprod_rm<0x51, "vpdpbsuds", v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbsuds,
                                   SchedWriteVecIMul.XMM, 0>,
                    T8XS;
  defm VPDPBSUDSY : avx_dotprod_rm<0x51, "vpdpbsuds", v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbsuds,
                                   SchedWriteVecIMul.YMM, 0>,
                    VEX_L, T8XS;
}
8245
8246// AVX-NE-CONVERT
/// AVX_NE_CONVERT_BASE - Memory-source-only instructions producing a 128-bit
/// (rm) or 256-bit (Yrm) result.  The selected intrinsic is looked up by name
/// as "int_x86_" + OpcodeStr + "128" / "256".
multiclass AVX_NE_CONVERT_BASE<bits<8> Opcode, string OpcodeStr,
                  X86MemOperand MemOp128, X86MemOperand MemOp256> {
  def rm  : I<Opcode, MRMSrcMem, (outs VR128:$dst), (ins MemOp128:$src),
              OpcodeStr#"\t{$src, $dst|$dst, $src}",
              [(set VR128:$dst,
                 (!cast<Intrinsic>(!strconcat("int_x86_", OpcodeStr, "128"))
                    addr:$src))]>,
            Sched<[WriteCvtPH2PS]>, VEX;
  def Yrm : I<Opcode, MRMSrcMem, (outs VR256:$dst), (ins MemOp256:$src),
              OpcodeStr#"\t{$src, $dst|$dst, $src}",
              [(set VR256:$dst,
                 (!cast<Intrinsic>(!strconcat("int_x86_", OpcodeStr, "256"))
                    addr:$src))]>,
            Sched<[WriteCvtPH2PSY]>, VEX, VEX_L;
}
8260
/// VCVTNEPS2BF16_BASE - vcvtneps2bf16 with a 128-bit source (rr, rm) or a
/// 256-bit source (Yrr, Yrm).  Every form writes a VR128 result.  The memory
/// forms add an optional {x} / {y} suffix to the mnemonic.
multiclass VCVTNEPS2BF16_BASE {
  // 128-bit source.
  def rr  : I<0x72, MRMSrcReg,
              (outs VR128:$dst), (ins VR128:$src),
              "vcvtneps2bf16\t{$src, $dst|$dst, $src}",
              [(set VR128:$dst, (int_x86_vcvtneps2bf16128 VR128:$src))]>,
            Sched<[WriteCvtPH2PS]>;
  def rm  : I<0x72, MRMSrcMem,
              (outs VR128:$dst), (ins f128mem:$src),
              "vcvtneps2bf16{x}\t{$src, $dst|$dst, $src}",
              [(set VR128:$dst,
                 (int_x86_vcvtneps2bf16128 (loadv4f32 addr:$src)))]>,
            Sched<[WriteCvtPH2PS]>;

  // 256-bit source.
  def Yrr : I<0x72, MRMSrcReg,
              (outs VR128:$dst), (ins VR256:$src),
              "vcvtneps2bf16\t{$src, $dst|$dst, $src}",
              [(set VR128:$dst, (int_x86_vcvtneps2bf16256 VR256:$src))]>,
            Sched<[WriteCvtPH2PSY]>, VEX_L;
  def Yrm : I<0x72, MRMSrcMem,
              (outs VR128:$dst), (ins f256mem:$src),
              "vcvtneps2bf16{y}\t{$src, $dst|$dst, $src}",
              [(set VR128:$dst,
                 (int_x86_vcvtneps2bf16256 (loadv8f32 addr:$src)))]>,
            Sched<[WriteCvtPH2PSY]>, VEX_L;
}
8279
// AVX-NE-CONVERT instantiations.  Forms sharing an opcode byte are told apart
// by their prefix class (T8XS / T8PD / T8XD / T8PS).
let Predicates = [HasAVXNECONVERT] in {
  // Opcode 0xb1: both widths take an f16mem operand.
  defm VBCSTNEBF162PS : AVX_NE_CONVERT_BASE<0xb1, "vbcstnebf162ps",
                                            f16mem, f16mem>, T8XS;
  defm VBCSTNESH2PS   : AVX_NE_CONVERT_BASE<0xb1, "vbcstnesh2ps",
                                            f16mem, f16mem>, T8PD;

  // Opcode 0xb0: full-width f128mem / f256mem operands.
  defm VCVTNEEBF162PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneebf162ps",
                                            f128mem, f256mem>, T8XS;
  defm VCVTNEEPH2PS   : AVX_NE_CONVERT_BASE<0xb0, "vcvtneeph2ps",
                                            f128mem, f256mem>, T8PD;
  defm VCVTNEOBF162PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneobf162ps",
                                            f128mem, f256mem>, T8XD;
  defm VCVTNEOPH2PS   : AVX_NE_CONVERT_BASE<0xb0, "vcvtneoph2ps",
                                            f128mem, f256mem>, T8PS;

  let checkVEXPredicate = 1 in {
    defm VCVTNEPS2BF16 : VCVTNEPS2BF16_BASE, VEX, T8XS, ExplicitVEXPrefix;
  }
}
8296
// AT&T-syntax aliases that accept an explicit x / y suffix on the register
// forms of vcvtneps2bf16.
def : InstAlias<"vcvtneps2bf16x\t{$src, $dst|$dst, $src}",
                (VCVTNEPS2BF16rr VR128:$dst, VR128:$src),
                0, "att">;
def : InstAlias<"vcvtneps2bf16y\t{$src, $dst|$dst, $src}",
                (VCVTNEPS2BF16Yrr VR128:$dst, VR256:$src),
                0, "att">;
8301