xref: /freebsd/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSSE.td (revision af23369a6deaaeb612ab266eb88b8bb8d560c322)
1//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file describes the X86 SSE instruction set, defining the instructions,
10// and properties of the instructions which are needed for code generation,
11// machine code emission, and analysis.
12//
13//===----------------------------------------------------------------------===//
14
15//===----------------------------------------------------------------------===//
16// SSE 1 & 2 Instructions Classes
17//===----------------------------------------------------------------------===//
18
19/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
///
/// Emits two SI-based records that share opcode `opc` and domain `d`:
///   rr - MRMSrcReg form taking two RC registers; its pattern is
///        (OpNode RC:$src1, RC:$src2) and it is marked isCommutable.
///   rm - MRMSrcMem form whose second operand is `x86memop`; its pattern
///        wraps that operand in (load addr:$src2) and it is scheduled with
///        sched.Folded / sched.ReadAfterFold instead of plain `sched`.
/// Is2Addr (default 1) selects the two-operand asm string; when it is 0 the
/// three-operand string that also names $src1 is used.
/// Both records are defined under isCodeGenOnly = 1.
20multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
21                           RegisterClass RC, X86MemOperand x86memop,
22                           Domain d, X86FoldableSchedWrite sched,
23                           bit Is2Addr = 1> {
24let isCodeGenOnly = 1 in {
  // Only the register-register form is flagged commutable.
25  let isCommutable = 1 in {
26    def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
27       !if(Is2Addr,
28           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
29           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
30       [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], d>,
31       Sched<[sched]>;
32  }
33  def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
34       !if(Is2Addr,
35           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
36           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
37       [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>,
38       Sched<[sched.Folded, sched.ReadAfterFold]>;
39}
40}
41
42/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
///
/// Emits rr_Int and rm_Int on top of SI_Int, both with hasSideEffects = 0.
/// Each pattern result is cast to `VT`. The memory form takes a `memopr`
/// operand, matches it through `mem_frags`, and is additionally marked
/// mayLoad = 1. Is2Addr (default 1) chooses between the two-operand and the
/// three-operand asm string, exactly as in the non-intrinsic scalar class.
43multiclass sse12_fp_scalar_int<bits<8> opc,
44                               SDPatternOperator OpNode, RegisterClass RC,
45                               ValueType VT, string asm, Operand memopr,
46                               PatFrags mem_frags, Domain d,
47                               X86FoldableSchedWrite sched, bit Is2Addr = 1> {
48let hasSideEffects = 0 in {
49  def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
50       !if(Is2Addr,
51           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
52           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
53       [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], d>,
54       Sched<[sched]>;
  // This brace-less `let` applies to rm_Int only.
55  let mayLoad = 1 in
56  def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
57       !if(Is2Addr,
58           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
59           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
60       [(set RC:$dst, (VT (OpNode RC:$src1, (mem_frags addr:$src2))))], d>,
61       Sched<[sched.Folded, sched.ReadAfterFold]>;
62}
63}
64
65/// sse12_fp_packed - SSE 1 & 2 packed instructions class
///
/// Emits a PI-based rr / rm pair for opcode `opc`:
///   rr - register-register, isCommutable, pattern result cast to `vt`.
///   rm - register-memory, mayLoad, memory operand matched via `mem_frag`.
/// Is2Addr (default 1) selects the two-operand asm string; 0 selects the
/// three-operand one.
/// NOTE(review): unlike rr, the rm pattern is not wrapped in (vt ...) -
/// presumably the type is inferred from mem_frag; confirm before changing.
66multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
67                           RegisterClass RC, ValueType vt,
68                           X86MemOperand x86memop, PatFrag mem_frag,
69                           Domain d, X86FoldableSchedWrite sched,
70                           bit Is2Addr = 1> {
71  let isCommutable = 1 in
72    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
73       !if(Is2Addr,
74           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
75           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
76       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>,
77       Sched<[sched]>;
78  let mayLoad = 1 in
79    def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
80       !if(Is2Addr,
81           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
82           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
83       [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
84          d>,
85       Sched<[sched.Folded, sched.ReadAfterFold]>;
86}
87
88/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
///
/// Same rr / rm shape as the other packed class, but the selection patterns
/// are not built here: the instantiation passes them in as `pat_rr` and
/// `pat_rm`. Both records set hasSideEffects = 0; rr is also isCommutable and
/// rm is also mayLoad. Is2Addr (default 1) picks the two-operand asm string.
89multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
90                                      string OpcodeStr, X86MemOperand x86memop,
91                                      X86FoldableSchedWrite sched,
92                                      list<dag> pat_rr, list<dag> pat_rm,
93                                      bit Is2Addr = 1> {
94  let isCommutable = 1, hasSideEffects = 0 in
95    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
96       !if(Is2Addr,
97           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
98           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
99       pat_rr, d>,
100       Sched<[sched]>;
101  let hasSideEffects = 0, mayLoad = 1 in
102  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
103       !if(Is2Addr,
104           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
105           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
106       pat_rm, d>,
107       Sched<[sched.Folded, sched.ReadAfterFold]>;
108}
109
110
111// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
112// This is expanded by ExpandPostRAPseudos.
//
// Four pseudo-instructions, one per scalar FP zero immediate (fp16imm0,
// fp32imm0, fp64imm0, fp128imm0). Each has no inputs and an empty asm string,
// and is flagged rematerializable, as cheap as a move, and foldable as a load.
// The f16 and f64 variants require HasSSE2; the f32 and f128 variants require
// HasSSE1. All four also require NoAVX512. The f128 variant writes VR128
// rather than an FR* class.
113let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
114    isPseudo = 1, SchedRW = [WriteZero] in {
115  def FsFLD0SH : I<0, Pseudo, (outs FR16:$dst), (ins), "",
116                   [(set FR16:$dst, fp16imm0)]>, Requires<[HasSSE2, NoAVX512]>;
117  def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
118                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
119  def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
120                   [(set FR64:$dst, fp64imm0)]>, Requires<[HasSSE2, NoAVX512]>;
121  def FsFLD0F128 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
122                     [(set VR128:$dst, fp128imm0)]>, Requires<[HasSSE1, NoAVX512]>;
123}
124
125//===----------------------------------------------------------------------===//
126// AVX & SSE - Zero/One Vectors
127//===----------------------------------------------------------------------===//
128
129// Alias instruction that maps zero vector to pxor / xorp* for sse.
130// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
131// swizzled by ExecutionDomainFix to pxor.
132// We set canFoldAsLoad because this can be converted to a constant-pool
133// load of an all-zeros value if folding it would be beneficial.
//
// The pseudo itself carries only the v4f32 all-zeros pattern and writes
// VR128; it is guarded by Predicates = [NoAVX512].
134let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
135    isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
136def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
137               [(set VR128:$dst, (v4f32 immAllZerosV))]>;
138}
139
// Select V_SET0 for all-zeros vectors of the remaining 128-bit element types
// (v4f32 is handled by the pattern on V_SET0 itself).
140let Predicates = [NoAVX512] in {
141def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
142def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
143def : Pat<(v8f16 immAllZerosV), (V_SET0)>;
144def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
145def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
146def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
147}
148
149
150// The same as done above but for AVX.  The 256-bit AVX1 ISA doesn't support PI,
151// and doesn't need it because on sandy bridge the register is set to zero
152// at the rename stage without using any execution unit, so SET0PSY
153// and SET0PDY can be used for vector int instructions without penalty
//
// NOTE(review): SET0PSY / SET0PDY are not defined in this part of the file;
// presumably they are older names for the single AVX_SET0 pseudo below -
// confirm and update the wording above.
// AVX_SET0 writes VR256, carries the v8i32 all-zeros pattern, and is guarded
// by Predicates = [NoAVX512].
154let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
155    isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
156def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
157                 [(set VR256:$dst, (v8i32 immAllZerosV))]>;
158}
159
// Select AVX_SET0 for all-zeros vectors of the remaining 256-bit element
// types (v8i32 is handled by the pattern on AVX_SET0 itself).
160let Predicates = [NoAVX512] in {
161def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
162def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
163def : Pat<(v16f16 immAllZerosV), (AVX_SET0)>;
164def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
165def : Pat<(v8f32 immAllZerosV), (AVX_SET0)>;
166def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
167}
168
168// We set canFoldAsLoad because this can be converted to a constant-pool
169// load of an all-ones value if folding it would be beneficial.
//
// Three all-ones pseudos:
//   V_SETALLONES    - VR128, v4i32 pattern, no predicate set in this block.
//   AVX1_SETALLONES - VR256, v8i32 pattern, needs HasAVX1Only and
//                     OptForMinSize.
//   AVX2_SETALLONES - VR256, v8i32 pattern, needs HasAVX2.
// All three are scheduled with WriteZero, same as the zeroing pseudos.
170let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
171    isPseudo = 1, SchedRW = [WriteZero] in {
172  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
173                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
174  let Predicates = [HasAVX1Only, OptForMinSize] in {
175  def AVX1_SETALLONES: I<0, Pseudo, (outs VR256:$dst), (ins), "",
176                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
177  }
178  let Predicates = [HasAVX2] in
179  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
180                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
181}
183
184//===----------------------------------------------------------------------===//
185// SSE 1 & 2 - Move FP Scalar Instructions
186//
187// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
188// register copies because it's a partial register update; Register-to-register
189// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
190// that the insert be implementable in terms of a copy, and as just mentioned, we
191// don't use movss/movsd for copies.
192//===----------------------------------------------------------------------===//
193
// sse12_move_rr - register-to-register scalar move pair.
//   rr     - opcode 0x10, MRMSrcReg, selects (vt (OpNode $src1, $src2)) on
//            VR128 and is marked isCommutable.
//   rr_REV - opcode 0x11, MRMDestReg, same operands and asm string but no
//            pattern; it is isCodeGenOnly with ForceDisassemble set and is
//            tied back to the rr record through FoldGenData<Name#rr>.
// The asm string is base_opc followed directly by asm_opr, so the caller
// chooses the operand list.
194multiclass sse12_move_rr<SDNode OpNode, ValueType vt, string base_opc,
195                         string asm_opr, Domain d, string Name> {
196  let isCommutable = 1 in
197  def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
198              (ins VR128:$src1, VR128:$src2),
199              !strconcat(base_opc, asm_opr),
200              [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], d>,
201              Sched<[SchedWriteFShuffle.XMM]>;
202
203  // For the disassembler
204  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
205  def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
206                  (ins VR128:$src1, VR128:$src2),
207                  !strconcat(base_opc, asm_opr), []>,
208                  Sched<[SchedWriteFShuffle.XMM]>, FoldGenData<Name#rr>;
209}
210
// sse12_move - scalar move family for one register class.
// Per instantiation it produces:
//   V#NAME rr / rr_REV - three-operand register moves, predicated on
//                        UseAVX and OptForSize.
//   V#NAME#mr          - store of RC to x86memop (opcode 0x11, MRMDestMem)
//                        with the VEX modifiers attached.
//   NAME rr / rr_REV   - two-operand register moves with $src1 tied to $dst,
//                        predicated on `pred` and NoSSE41_Or_OptForSize.
//   NAME#mr            - the same store without the VEX modifiers.
//   two InstAliases    - a ".s" suffixed spelling for each rr_REV record.
// The brace-less `let Predicates = ...` lines apply only to the defm that
// immediately follows them, not to the store defs.
211multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
212                      X86MemOperand x86memop, string OpcodeStr,
213                      Domain d, string Name, Predicate pred> {
214  // AVX
215  let Predicates = [UseAVX, OptForSize] in
216  defm V#NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
217                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d,
218                              "V"#Name>,
219                              VEX_4V, VEX_LIG, VEX_WIG;
220
221  def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
222                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
223                     [(store RC:$src, addr:$dst)], d>,
224                     VEX, VEX_LIG, Sched<[WriteFStore]>, VEX_WIG;
225  // SSE1 & 2
226  let Constraints = "$src1 = $dst" in {
227    let Predicates = [pred, NoSSE41_Or_OptForSize] in
228    defm NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
229                              "\t{$src2, $dst|$dst, $src2}", d, Name>;
230  }
231
232  def NAME#mr   : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
233                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
234                     [(store RC:$src, addr:$dst)], d>,
235                     Sched<[WriteFStore]>;
236
  // ".s" suffixed spellings that resolve to the rr_REV records. The last
  // InstAlias argument is 0 for both.
237  def : InstAlias<"v"#OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
238                  (!cast<Instruction>("V"#NAME#"rr_REV")
239                   VR128:$dst, VR128:$src1, VR128:$src2), 0>;
240  def : InstAlias<OpcodeStr#".s\t{$src2, $dst|$dst, $src2}",
241                  (!cast<Instruction>(NAME#"rr_REV")
242                   VR128:$dst, VR128:$src2), 0>;
243}
244
245// Loading from memory automatically zeroing upper bits.
//
// Produces four load records, all opcode 0x10 / MRMSrcMem / WriteFLoad:
//   V#NAME#rm, NAME#rm         - write VR128 and select
//                                (vt (vzloadfrag addr:$src)).
//   V#NAME#rm_alt, NAME#rm_alt - write the scalar class RC, select
//                                (mem_pat addr:$src), and are isCodeGenOnly.
// The V-prefixed records additionally carry VEX, VEX_LIG and VEX_WIG.
246multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
247                         PatFrag mem_pat, PatFrag vzloadfrag, string OpcodeStr,
248                         Domain d> {
249  def V#NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
250                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
251                     [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
252                     VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
253  def NAME#rm   : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
254                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
255                     [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
256                     Sched<[WriteFLoad]>;
257
258  // _alt version uses FR32/FR64 register class.
259  let isCodeGenOnly = 1 in {
260  def V#NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
261                         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
262                         [(set RC:$dst, (mem_pat addr:$src))], d>,
263                         VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
264  def NAME#rm_alt   : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
265                         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
266                         [(set RC:$dst, (mem_pat addr:$src))], d>,
267                         Sched<[WriteFLoad]>;
268  }
269}
270
// Instantiate the scalar move family for single (FR32 / v4f32 / f32mem, XS
// prefix, UseSSE1) and double (FR64 / v2f64 / f64mem, XD prefix, UseSSE2).
271defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
272                        SSEPackedSingle, "MOVSS", UseSSE1>, XS;
273defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
274                        SSEPackedDouble, "MOVSD", UseSSE2>, XD;
275
// The load forms reuse the MOVSS / MOVSD base names and are marked foldable
// as a load and rematerializable.
276let canFoldAsLoad = 1, isReMaterializable = 1 in {
277  defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss",
278                             SSEPackedSingle>, XS;
279  defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, X86vzload64, "movsd",
280                             SSEPackedDouble>, XD;
281}
282
282// Patterns
// With UseAVX: a scalar_to_vector of a scalar FP load selects the
// corresponding VMOVSSrm / VMOVSDrm directly.
283let Predicates = [UseAVX] in {
284  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
285            (VMOVSSrm addr:$src)>;
286  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
287            (VMOVSDrm addr:$src)>;
288
289  // Represent the same patterns above but in the form they appear for
290  // 256-bit types
  // The 128-bit load result is placed in the sub_xmm subregister via
  // SUBREG_TO_REG.
291  def : Pat<(v8f32 (X86vzload32 addr:$src)),
292            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
293  def : Pat<(v4f64 (X86vzload64 addr:$src)),
294            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
295}
297
// X86vzmovl lowering used only under UseAVX together with OptForSize:
// VMOVSSrr takes a V_SET0 zero vector as its first source and the input
// vector as its second. The same instruction is used for v4f32 and v4i32.
297let Predicates = [UseAVX, OptForSize] in {
298  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
299  // MOVSS to the lower bits.
300  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
301            (VMOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
302  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
303            (VMOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
304
305  // Move low f32 and clear high bits.
  // 256-bit inputs: take the sub_xmm half with EXTRACT_SUBREG, apply the
  // 128-bit sequence, then wrap the result back up with SUBREG_TO_REG.
306  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
307            (SUBREG_TO_REG (i32 0),
308             (v4f32 (VMOVSSrr (v4f32 (V_SET0)),
309              (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))), sub_xmm)>;
310  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
311            (SUBREG_TO_REG (i32 0),
312             (v4i32 (VMOVSSrr (v4i32 (V_SET0)),
313              (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
314}
316
// Non-VEX counterpart of the X86vzmovl lowering, guarded by UseSSE1 and
// NoSSE41_Or_OptForSize; covers the 128-bit v4f32 and v4i32 cases only.
316let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
317// Move scalar to XMM zero-extended, zeroing a VR128 then do a
318// MOVSS to the lower bits.
319def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
320          (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
321def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
322          (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
323}
325
// Non-VEX scalar_to_vector-of-load patterns: the f64 case needs UseSSE2 and
// selects MOVSDrm; the f32 case needs UseSSE1 and selects MOVSSrm.
325let Predicates = [UseSSE2] in
326def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
327          (MOVSDrm addr:$src)>;
328
329let Predicates = [UseSSE1] in
330def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
331          (MOVSSrm addr:$src)>;
333
334//===----------------------------------------------------------------------===//
335// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
336//===----------------------------------------------------------------------===//
337
// sse12_mov_packed - packed register move and packed load for one opcode.
//   rr - MRMSrcReg register copy with no selection pattern; flagged
//        isMoveReg and hasSideEffects = 0; scheduled with sched.RR.
//   rm - MRMSrcMem load selected via `ld_frag`; flagged canFoldAsLoad and
//        isReMaterializable; scheduled with sched.RM.
337multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
338                            X86MemOperand x86memop, PatFrag ld_frag,
339                            string asm, Domain d,
340                            X86SchedWriteMoveLS sched> {
341let hasSideEffects = 0, isMoveReg = 1 in
342  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
343              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
344           Sched<[sched.RR]>;
345let canFoldAsLoad = 1, isReMaterializable = 1 in
346  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
347              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
348                   [(set RC:$dst, (ld_frag addr:$src))], d>,
349           Sched<[sched.RM]>;
350}
352
// VEX packed moves, guarded by HasAVX and NoVLX. Opcode 0x28 is used with
// the alignedload* fragments (movaps / movapd) and opcode 0x10 with the plain
// load* fragments (movups / movupd). PS marks the single-precision variants
// and PD the double-precision ones.
352let Predicates = [HasAVX, NoVLX] in {
353defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
354                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
355                                PS, VEX, VEX_WIG;
356defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
357                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
358                                PD, VEX, VEX_WIG;
359defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
360                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
361                                PS, VEX, VEX_WIG;
362defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
363                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
364                                PD, VEX, VEX_WIG;
365
// 256-bit variants: VR256 / f256mem, YMM scheduling data, plus VEX_L.
366defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps",
367                                 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
368                                 PS, VEX, VEX_L, VEX_WIG;
369defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd",
370                                 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
371                                 PD, VEX, VEX_L, VEX_WIG;
372defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups",
373                                 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
374                                 PS, VEX, VEX_L, VEX_WIG;
375defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
376                                 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
377                                 PD, VEX, VEX_L, VEX_WIG;
378}
380
// Non-VEX packed moves. The single-precision pair is guarded by UseSSE1 and
// the double-precision pair by UseSSE2; opcodes and load fragments mirror
// the VEX instantiations (0x28 aligned, 0x10 unaligned).
380let Predicates = [UseSSE1] in {
381defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
382                               SSEPackedSingle, SchedWriteFMoveLS.XMM>,
383                               PS;
384defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
385                               SSEPackedSingle, SchedWriteFMoveLS.XMM>,
386                               PS;
387}
388let Predicates = [UseSSE2] in {
389defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
390                               SSEPackedDouble, SchedWriteFMoveLS.XMM>,
391                               PD;
392defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
393                               SSEPackedDouble, SchedWriteFMoveLS.XMM>,
394                               PD;
395}
397
// VEX packed stores (MRMDestMem), guarded by HasAVX and NoVLX. Opcode 0x29
// pairs with alignedstore and opcode 0x11 with plain store. The 128-bit
// group uses the XMM.MR scheduling data and the 256-bit group uses YMM.MR
// and adds VEX_L.
397let Predicates = [HasAVX, NoVLX]  in {
398let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
399def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
400                   "movaps\t{$src, $dst|$dst, $src}",
401                   [(alignedstore (v4f32 VR128:$src), addr:$dst)]>,
402                   VEX, VEX_WIG;
403def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
404                   "movapd\t{$src, $dst|$dst, $src}",
405                   [(alignedstore (v2f64 VR128:$src), addr:$dst)]>,
406                   VEX, VEX_WIG;
407def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
408                   "movups\t{$src, $dst|$dst, $src}",
409                   [(store (v4f32 VR128:$src), addr:$dst)]>,
410                   VEX, VEX_WIG;
411def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
412                   "movupd\t{$src, $dst|$dst, $src}",
413                   [(store (v2f64 VR128:$src), addr:$dst)]>,
414                   VEX, VEX_WIG;
415} // SchedRW
416
417let SchedRW = [SchedWriteFMoveLS.YMM.MR] in {
418def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
419                   "movaps\t{$src, $dst|$dst, $src}",
420                   [(alignedstore (v8f32 VR256:$src), addr:$dst)]>,
421                   VEX, VEX_L, VEX_WIG;
422def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
423                   "movapd\t{$src, $dst|$dst, $src}",
424                   [(alignedstore (v4f64 VR256:$src), addr:$dst)]>,
425                   VEX, VEX_L, VEX_WIG;
426def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
427                   "movups\t{$src, $dst|$dst, $src}",
428                   [(store (v8f32 VR256:$src), addr:$dst)]>,
429                   VEX, VEX_L, VEX_WIG;
430def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
431                   "movupd\t{$src, $dst|$dst, $src}",
432                   [(store (v4f64 VR256:$src), addr:$dst)]>,
433                   VEX, VEX_L, VEX_WIG;
434} // SchedRW
435} // Predicate
437
437// For disassembler
// Register-register records using the store opcodes (0x29 / 0x11) in
// MRMDestReg form. They have no selection pattern, are isCodeGenOnly with
// ForceDisassemble set, and each names its forward-encoded counterpart
// through FoldGenData. This block sets no Predicates.
438let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
439    isMoveReg = 1 in {
440let SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
441  def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
442                          (ins VR128:$src),
443                          "movaps\t{$src, $dst|$dst, $src}", []>,
444                          VEX, VEX_WIG, FoldGenData<"VMOVAPSrr">;
445  def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
446                           (ins VR128:$src),
447                           "movapd\t{$src, $dst|$dst, $src}", []>,
448                           VEX, VEX_WIG, FoldGenData<"VMOVAPDrr">;
449  def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
450                           (ins VR128:$src),
451                           "movups\t{$src, $dst|$dst, $src}", []>,
452                           VEX, VEX_WIG, FoldGenData<"VMOVUPSrr">;
453  def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
454                           (ins VR128:$src),
455                           "movupd\t{$src, $dst|$dst, $src}", []>,
456                           VEX, VEX_WIG, FoldGenData<"VMOVUPDrr">;
457} // SchedRW
458
459let SchedRW = [SchedWriteFMoveLS.YMM.RR] in {
460  def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
461                            (ins VR256:$src),
462                            "movaps\t{$src, $dst|$dst, $src}", []>,
463                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPSYrr">;
464  def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
465                            (ins VR256:$src),
466                            "movapd\t{$src, $dst|$dst, $src}", []>,
467                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPDYrr">;
468  def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
469                            (ins VR256:$src),
470                            "movups\t{$src, $dst|$dst, $src}", []>,
471                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPSYrr">;
472  def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
473                            (ins VR256:$src),
474                            "movupd\t{$src, $dst|$dst, $src}", []>,
475                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPDYrr">;
476} // SchedRW
477} // isCodeGenOnly, ForceDisassemble, hasSideEffects, isMoveReg
479
479// Reversed version with ".s" suffix for GAS compatibility.
// Each ".s" mnemonic appears twice: once resolving to the VR128 *_REV record
// and once to the VR256 *Y*_REV record. The last InstAlias argument is 0 on
// every alias.
480def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
481                (VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
482def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
483                (VMOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
484def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
485                (VMOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
486def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
487                (VMOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
488def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
489                (VMOVAPSYrr_REV VR256:$dst, VR256:$src), 0>;
490def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
491                (VMOVAPDYrr_REV VR256:$dst, VR256:$src), 0>;
492def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
493                (VMOVUPSYrr_REV VR256:$dst, VR256:$src), 0>;
494def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
495                (VMOVUPDYrr_REV VR256:$dst, VR256:$src), 0>;
497
// Non-VEX packed stores (MRMDestMem). Opcode 0x29 pairs with alignedstore
// and opcode 0x11 with plain store. No Predicates are set in this block.
497let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
498def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
499                   "movaps\t{$src, $dst|$dst, $src}",
500                   [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
501def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
502                   "movapd\t{$src, $dst|$dst, $src}",
503                   [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
504def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
505                   "movups\t{$src, $dst|$dst, $src}",
506                   [(store (v4f32 VR128:$src), addr:$dst)]>;
507def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
508                   "movupd\t{$src, $dst|$dst, $src}",
509                   [(store (v2f64 VR128:$src), addr:$dst)]>;
510} // SchedRW
512
512// For disassembler
// Non-VEX register-register records using the store opcodes (0x29 / 0x11)
// in MRMDestReg form: no pattern, isCodeGenOnly, ForceDisassemble, each tied
// to its forward-encoded counterpart through FoldGenData.
513let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
514    isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
515  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
516                         "movaps\t{$src, $dst|$dst, $src}", []>,
517                         FoldGenData<"MOVAPSrr">;
518  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
519                         "movapd\t{$src, $dst|$dst, $src}", []>,
520                         FoldGenData<"MOVAPDrr">;
521  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
522                         "movups\t{$src, $dst|$dst, $src}", []>,
523                         FoldGenData<"MOVUPSrr">;
524  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
525                         "movupd\t{$src, $dst|$dst, $src}", []>,
526                         FoldGenData<"MOVUPDrr">;
527}
529
529// Reversed version with ".s" suffix for GAS compatibility.
// One alias per non-VEX *_REV record. The last InstAlias argument is 0 on
// every alias.
530def : InstAlias<"movaps.s\t{$src, $dst|$dst, $src}",
531                (MOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
532def : InstAlias<"movapd.s\t{$src, $dst|$dst, $src}",
533                (MOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
534def : InstAlias<"movups.s\t{$src, $dst|$dst, $src}",
535                (MOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
536def : InstAlias<"movupd.s\t{$src, $dst|$dst, $src}",
537                (MOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
539
// Extra load/store selection patterns for HasAVX without VLX. Integer and
// f16 vector types are routed to the single-precision VMOVAPS* / VMOVUPS*
// instructions: aligned fragments pick the "A" form and plain load/store
// pick the "U" form.
539let Predicates = [HasAVX, NoVLX] in {
540  // 256-bit load/store need to use floating point load/store in case we don't
541  // have AVX2. Execution domain fixing will convert to integer if AVX2 is
542  // available and changing the domain is beneficial.
543  def : Pat<(alignedloadv4i64 addr:$src),
544            (VMOVAPSYrm addr:$src)>;
545  def : Pat<(alignedloadv8i32 addr:$src),
546            (VMOVAPSYrm addr:$src)>;
547  def : Pat<(alignedloadv16i16 addr:$src),
548            (VMOVAPSYrm addr:$src)>;
549  def : Pat<(alignedloadv32i8 addr:$src),
550            (VMOVAPSYrm addr:$src)>;
551  def : Pat<(loadv4i64 addr:$src),
552            (VMOVUPSYrm addr:$src)>;
553  def : Pat<(loadv8i32 addr:$src),
554            (VMOVUPSYrm addr:$src)>;
555  def : Pat<(loadv16i16 addr:$src),
556            (VMOVUPSYrm addr:$src)>;
557  def : Pat<(loadv32i8 addr:$src),
558            (VMOVUPSYrm addr:$src)>;
559
  // 256-bit integer stores, same aligned / unaligned split.
560  def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
561            (VMOVAPSYmr addr:$dst, VR256:$src)>;
562  def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
563            (VMOVAPSYmr addr:$dst, VR256:$src)>;
564  def : Pat<(alignedstore (v16i16 VR256:$src), addr:$dst),
565            (VMOVAPSYmr addr:$dst, VR256:$src)>;
566  def : Pat<(alignedstore (v32i8 VR256:$src), addr:$dst),
567            (VMOVAPSYmr addr:$dst, VR256:$src)>;
568  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
569            (VMOVUPSYmr addr:$dst, VR256:$src)>;
570  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
571            (VMOVUPSYmr addr:$dst, VR256:$src)>;
572  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
573            (VMOVUPSYmr addr:$dst, VR256:$src)>;
574  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
575            (VMOVUPSYmr addr:$dst, VR256:$src)>;
576
  // Half-precision vectors: v8f16 uses the 128-bit instructions and v16f16
  // the 256-bit ones.
577  def : Pat<(alignedloadv8f16 addr:$src),
578            (VMOVAPSrm addr:$src)>;
579  def : Pat<(loadv8f16 addr:$src),
580            (VMOVUPSrm addr:$src)>;
581  def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
582            (VMOVAPSmr addr:$dst, VR128:$src)>;
583  def : Pat<(store (v8f16 VR128:$src), addr:$dst),
584            (VMOVUPSmr addr:$dst, VR128:$src)>;
585  def : Pat<(alignedloadv16f16 addr:$src),
586            (VMOVAPSYrm addr:$src)>;
587  def : Pat<(loadv16f16 addr:$src),
588            (VMOVUPSYrm addr:$src)>;
589  def : Pat<(alignedstore (v16f16 VR256:$src), addr:$dst),
590            (VMOVAPSYmr addr:$dst, VR256:$src)>;
591  def : Pat<(store (v16f16 VR256:$src), addr:$dst),
592            (VMOVUPSYmr addr:$dst, VR256:$src)>;
593}
595
595// Use movaps / movups for SSE integer load / store (one byte shorter).
596// The instructions selected below are then converted to MOVDQA/MOVDQU
597// during the SSE domain pass.
// Covers v2i64, v4i32, v8i16 and v16i8: aligned fragments select MOVAPS*,
// plain load/store select MOVUPS*. Guarded by UseSSE1.
598let Predicates = [UseSSE1] in {
599  def : Pat<(alignedloadv2i64 addr:$src),
600            (MOVAPSrm addr:$src)>;
601  def : Pat<(alignedloadv4i32 addr:$src),
602            (MOVAPSrm addr:$src)>;
603  def : Pat<(alignedloadv8i16 addr:$src),
604            (MOVAPSrm addr:$src)>;
605  def : Pat<(alignedloadv16i8 addr:$src),
606            (MOVAPSrm addr:$src)>;
607  def : Pat<(loadv2i64 addr:$src),
608            (MOVUPSrm addr:$src)>;
609  def : Pat<(loadv4i32 addr:$src),
610            (MOVUPSrm addr:$src)>;
611  def : Pat<(loadv8i16 addr:$src),
612            (MOVUPSrm addr:$src)>;
613  def : Pat<(loadv16i8 addr:$src),
614            (MOVUPSrm addr:$src)>;
615
616  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
617            (MOVAPSmr addr:$dst, VR128:$src)>;
618  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
619            (MOVAPSmr addr:$dst, VR128:$src)>;
620  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
621            (MOVAPSmr addr:$dst, VR128:$src)>;
622  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
623            (MOVAPSmr addr:$dst, VR128:$src)>;
624  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
625            (MOVUPSmr addr:$dst, VR128:$src)>;
626  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
627            (MOVUPSmr addr:$dst, VR128:$src)>;
628  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
629            (MOVUPSmr addr:$dst, VR128:$src)>;
630  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
631            (MOVUPSmr addr:$dst, VR128:$src)>;
632}
634
// v8f16 loads and stores select the same MOVAPS/MOVUPS forms as the integer
// vector patterns, but are guarded by UseSSE2 instead of UseSSE1.
635let Predicates = [UseSSE2] in {
636  def : Pat<(alignedloadv8f16 addr:$src),
637            (MOVAPSrm addr:$src)>;
638  def : Pat<(loadv8f16 addr:$src),
639            (MOVUPSrm addr:$src)>;
640  def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
641            (MOVAPSmr addr:$dst, VR128:$src)>;
642  def : Pat<(store (v8f16 VR128:$src), addr:$dst),
643            (MOVUPSmr addr:$dst, VR128:$src)>;
644}
645
646//===----------------------------------------------------------------------===//
647// SSE 1 & 2 - Move Low packed FP Instructions
648//===----------------------------------------------------------------------===//
649
// Memory-source forms shared by the move-low / move-high packed families.
//   opc      - opcode byte used by both defs.
//   pdnode   - node used in the PDrm selection pattern.
//   base_opc - mnemonic stem; "s" or "d" is appended for PSrm / PDrm.
//   asm_opr  - operand part of the asm string, appended after the mnemonic.
// Both defs take a VR128 $src1 plus an f64mem $src2 and produce a VR128.
650multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDPatternOperator pdnode,
651                                      string base_opc, string asm_opr> {
652  // No pattern as they need to be special cased between high and low.
  // Because the pattern list is empty, mayLoad/hasSideEffects are spelled out.
653  let hasSideEffects = 0, mayLoad = 1 in
654  def PSrm : PI<opc, MRMSrcMem,
655                (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
656                !strconcat(base_opc, "s", asm_opr),
657                [], SSEPackedSingle>, PS,
658                Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
659
  // The PD form combines $src1 with an f64 load that is first wrapped in
  // scalar_to_vector, using pdnode, and yields a v2f64.
660  def PDrm : PI<opc, MRMSrcMem,
661         (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
662         !strconcat(base_opc, "d", asm_opr),
663     [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
664                              (scalar_to_vector (loadf64 addr:$src2)))))],
665              SSEPackedDouble>, PD,
666     Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
667}
668
// Instantiates sse12_mov_hilo_packed_base twice for one opcode:
//   V#NAME - guarded by UseAVX, three-operand asm string, VEX_4V + VEX_WIG.
//   NAME   - two-operand asm string with $src1 tied to $dst.
669multiclass sse12_mov_hilo_packed<bits<8>opc, SDPatternOperator pdnode,
670                                 string base_opc> {
671  let Predicates = [UseAVX] in
672    defm V#NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
673                                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
674                                    VEX_4V, VEX_WIG;
675
676  let Constraints = "$src1 = $dst" in
677    defm NAME : sse12_mov_hilo_packed_base<opc,  pdnode, base_opc,
678                                    "\t{$src2, $dst|$dst, $src2}">;
679}
680
// Load forms of movlps/movlpd (opcode 0x12); the PD pattern uses X86Movsd.
681defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;
682
// Store forms (opcode 0x13). Every def in this region is scheduled as
// WriteFStore.
683let SchedRW = [WriteFStore] in {
684let Predicates = [UseAVX] in {
// The PS store has an empty pattern, so its flags are given explicitly. This
// braceless `let` covers only the single def that follows it.
685let mayStore = 1, hasSideEffects = 0 in
686def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
687                     "movlps\t{$src, $dst|$dst, $src}",
688                     []>,
689                     VEX, VEX_WIG;
// The PD store matches a store of element 0 of the v2f64 source as an f64.
690def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
691                     "movlpd\t{$src, $dst|$dst, $src}",
692                     [(store (f64 (extractelt (v2f64 VR128:$src),
693                                   (iPTR 0))), addr:$dst)]>,
694                     VEX, VEX_WIG;
695}// UseAVX
// Non-VEX counterparts of the two stores above, with the same patterns.
696let mayStore = 1, hasSideEffects = 0 in
697def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
698                   "movlps\t{$src, $dst|$dst, $src}",
699                   []>;
700def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
701                   "movlpd\t{$src, $dst|$dst, $src}",
702                   [(store (f64 (extractelt (v2f64 VR128:$src),
703                                 (iPTR 0))), addr:$dst)]>;
704} // SchedRW
705
// Selection patterns that map v4f32 DAG shapes onto the pattern-less
// MOVLPSrm / MOVLPSmr definitions.
706let Predicates = [UseSSE1] in {
707  // This pattern helps select MOVLPS on SSE1 only targets. With SSE2 we'll
708  // end up with a movsd or blend instead of shufp.
709  // No need for aligned load, we're only loading 64-bits.
  // (i8 -28) is the byte 0xE4. The loaded vector is the first X86Shufp
  // operand and the register is the second.
710  def : Pat<(X86Shufp (v4f32 (simple_load addr:$src2)), VR128:$src1,
711                      (i8 -28)),
712            (MOVLPSrm VR128:$src1, addr:$src2)>;
713  def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)), VR128:$src1, (i8 -28)),
714            (MOVLPSrm VR128:$src1, addr:$src2)>;
715
  // A bare X86vzload64 producing v4f32 is selected as MOVLPSrm with a V_SET0
  // value supplied as the register input.
716  def : Pat<(v4f32 (X86vzload64 addr:$src)),
717            (MOVLPSrm (v4f32 (V_SET0)), addr:$src)>;
  // X86vextractstore64 of a v4f32 register selects the MOVLPS store.
718  def : Pat<(X86vextractstore64 (v4f32 VR128:$src), addr:$dst),
719            (MOVLPSmr addr:$dst, VR128:$src)>;
720}
721
722//===----------------------------------------------------------------------===//
723// SSE 1 & 2 - Move Hi packed FP Instructions
724//===----------------------------------------------------------------------===//
725
// Load forms of movhps/movhpd (opcode 0x16); the PD pattern uses X86Unpckl.
726defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">;
727
// Store forms (opcode 0x17), all scheduled as WriteFStore.
728let SchedRW = [WriteFStore] in {
729// v2f64 extract element 1 is always custom lowered to unpack high to low
730// and extract element 0 so the non-store version isn't too horrible.
731let Predicates = [UseAVX] in {
// Pattern-less PS store: flags are given explicitly, and this braceless `let`
// applies only to the next def.
732let mayStore = 1, hasSideEffects = 0 in
733def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
734                   "movhps\t{$src, $dst|$dst, $src}",
735                   []>, VEX, VEX_WIG;
// The PD store matches element 0 of X86Unpckh(src, src), i.e. the shape
// described in the comment above.
736def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
737                   "movhpd\t{$src, $dst|$dst, $src}",
738                   [(store (f64 (extractelt
739                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
740                                 (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
741} // UseAVX
// Non-VEX counterparts with the same patterns.
742let mayStore = 1, hasSideEffects = 0 in
743def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
744                   "movhps\t{$src, $dst|$dst, $src}",
745                   []>;
746def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
747                   "movhpd\t{$src, $dst|$dst, $src}",
748                   [(store (f64 (extractelt
749                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
750                                 (iPTR 0))), addr:$dst)]>;
751} // SchedRW
752
// Additional AVX selections for the VMOVHPD / VMOVLPD memory forms, covering
// DAG shapes that the instruction definitions' own patterns do not list.
753let Predicates = [UseAVX] in {
754  // MOVHPD patterns
  // Load form: X86Unpckl whose second operand is an X86vzload64.
755  def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
756            (VMOVHPDrm VR128:$src1, addr:$src2)>;
757
  // Store form: element 0 of X86VPermilpi(src, 1) stored as an f64.
758  def : Pat<(store (f64 (extractelt
759                          (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
760                          (iPTR 0))), addr:$dst),
761            (VMOVHPDmr addr:$dst, VR128:$src)>;
762
763  // MOVLPD patterns
  // Load form: X86Movsd whose second operand is an X86vzload64.
764  def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
765            (VMOVLPDrm VR128:$src1, addr:$src2)>;
766}
767
// Selection patterns that map v4f32 DAG shapes onto the pattern-less
// MOVHPSrm / MOVHPSmr definitions.
768let Predicates = [UseSSE1] in {
769  // This pattern helps select MOVHPS on SSE1 only targets. With SSE2 we'll
770  // end up with a movsd or blend instead of shufp.
771  // No need for aligned load, we're only loading 64-bits.
  // NOTE(review): the text above mentions shufp, but the patterns below match
  // X86Movlhps; it looks copied from the MOVLPS block. Confirm the wording.
772  def : Pat<(X86Movlhps VR128:$src1, (v4f32 (simple_load addr:$src2))),
773            (MOVHPSrm VR128:$src1, addr:$src2)>;
774  def : Pat<(X86Movlhps VR128:$src1, (v4f32 (X86vzload64 addr:$src2))),
775            (MOVHPSrm VR128:$src1, addr:$src2)>;
776
  // X86vextractstore64 of X86Movhlps(src, src) selects the MOVHPS store of
  // the original register.
777  def : Pat<(X86vextractstore64 (v4f32 (X86Movhlps VR128:$src, VR128:$src)),
778                                addr:$dst),
779            (MOVHPSmr addr:$dst, VR128:$src)>;
780}
781
// Non-VEX counterparts of the UseAVX MOVHPD/MOVLPD patterns. The store
// pattern here matches X86Shufp(src, src, 1) where the AVX block matches
// X86VPermilpi(src, 1).
782let Predicates = [UseSSE2] in {
783  // MOVHPD patterns
784  def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
785            (MOVHPDrm VR128:$src1, addr:$src2)>;
786
787  def : Pat<(store (f64 (extractelt
788                          (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
789                          (iPTR 0))), addr:$dst),
790            (MOVHPDmr addr:$dst, VR128:$src)>;
791
792  // MOVLPD patterns
793  def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
794            (MOVLPDrm VR128:$src1, addr:$src2)>;
795}
796
// Same X86Movsd shape but with a full v2f64 simple_load as the second
// operand; additionally gated on NoSSE41_Or_OptForSize.
797let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in {
798  // Use MOVLPD to load into the low bits from a full vector unless we can use
799  // BLENDPD.
800  def : Pat<(X86Movsd VR128:$src1, (v2f64 (simple_load addr:$src2))),
801            (MOVLPDrm VR128:$src1, addr:$src2)>;
802}
803
804//===----------------------------------------------------------------------===//
805// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
806//===----------------------------------------------------------------------===//
807
// Register-register movlhps (opcode 0x16) and movhlps (opcode 0x12).
// AVX forms: three-operand asm string, VEX_4V encoded, guarded by UseAVX.
808let Predicates = [UseAVX] in {
809  def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
810                                       (ins VR128:$src1, VR128:$src2),
811                      "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
812                      [(set VR128:$dst,
813                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
814                      VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG;
  // Only the movhlps form is marked commutable, and it is also tagged
  // NotMemoryFoldable.
815  let isCommutable = 1 in
816  def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
817                                       (ins VR128:$src1, VR128:$src2),
818                      "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
819                      [(set VR128:$dst,
820                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
821                      VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG,
822                      NotMemoryFoldable;
823}
// Non-VEX forms: same patterns, two-operand asm string, $src1 tied to $dst.
824let Constraints = "$src1 = $dst" in {
825  def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
826                                       (ins VR128:$src1, VR128:$src2),
827                      "movlhps\t{$src2, $dst|$dst, $src2}",
828                      [(set VR128:$dst,
829                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
830                      Sched<[SchedWriteFShuffle.XMM]>;
831  let isCommutable = 1 in
832  def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
833                                       (ins VR128:$src1, VR128:$src2),
834                      "movhlps\t{$src2, $dst|$dst, $src2}",
835                      [(set VR128:$dst,
836                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
837                      Sched<[SchedWriteFShuffle.XMM]>, NotMemoryFoldable;
838}
839
840//===----------------------------------------------------------------------===//
841// SSE 1 & 2 - Conversion Instructions
842//===----------------------------------------------------------------------===//
843
// Scalar conversion with one source operand, in register (rr) and memory
// (rm) flavours.
//   opc          - opcode byte.
//   SrcRC, DstRC - register classes of the source and the result.
//   OpNode       - node applied to the source in both patterns.
//   x86memop     - memory operand type of the rm form.
//   ld_frag      - load fragment wrapped around addr:$src in the rm pattern.
//   asm          - mnemonic used by the rr form.
//   mem          - mnemonic used by the rm form.
//   sched        - scheduling class; rm uses sched.Folded.
//   d            - execution domain applied to both defs.
//   Int2Fpu      - extra read class in the rr form's Sched list only
//                  (defaults to ReadDefault).
844multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
845                     SDPatternOperator OpNode, X86MemOperand x86memop, PatFrag ld_frag,
846                     string asm, string mem, X86FoldableSchedWrite sched,
847                     Domain d,
848                     SchedRead Int2Fpu = ReadDefault> {
849  let ExeDomain = d in {
850  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
851              !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
852              [(set DstRC:$dst, (OpNode SrcRC:$src))]>,
853              Sched<[sched, Int2Fpu]>;
854  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
855              mem#"\t{$src, $dst|$dst, $src}",
856              [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
857              Sched<[sched.Folded]>;
858  }
859}
860
// Packed signed-integer to floating-point conversion (any_sint_to_fp) from
// SrcTy to DstTy, in register (rr) and memory (rm) flavours.
//   RC       - register class of both source and result.
//   x86memop - memory operand type of the rm form.
//   ld_frag  - load fragment used in the rm pattern.
//   asm      - complete asm string (mnemonic and operands).
//   d        - domain passed to the I instruction class.
//   sched    - scheduling class; rm uses sched.Folded.
// Both defs read MXCSR and are marked as able to raise an FP exception.
861multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
862                       ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
863                       string asm, Domain d, X86FoldableSchedWrite sched> {
864let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in {
865  def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
866             [(set RC:$dst, (DstTy (any_sint_to_fp (SrcTy RC:$src))))], d>,
867             Sched<[sched]>;
868  let mayLoad = 1 in
869  def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
870             [(set RC:$dst, (DstTy (any_sint_to_fp
871                                    (SrcTy (ld_frag addr:$src)))))], d>,
872             Sched<[sched.Folded]>;
873}
874}
875
// Three-operand AVX scalar conversion forms, guarded by UseAVX. Both defs
// have empty patterns and take an extra DstRC input ($src1) ahead of the
// converted operand ($src).
//   asm - mnemonic; the rm form appends "{" # mem # "}" to it.
//   mem - suffix text placed inside those braces for the rm form.
//   d   - execution domain.
876multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
877                          X86MemOperand x86memop, string asm, string mem,
878                          X86FoldableSchedWrite sched, Domain d> {
879let hasSideEffects = 0, Predicates = [UseAVX], ExeDomain = d in {
880  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
881              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
882              Sched<[sched, ReadDefault, ReadInt2Fpu]>;
  // No pattern to infer it from, so mayLoad is set explicitly.
883  let mayLoad = 1 in
884  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
885              (ins DstRC:$src1, x86memop:$src),
886              asm#"{"#mem#"}\t{$src, $src1, $dst|$dst, $src1, $src}", []>,
887           Sched<[sched.Folded, sched.ReadAfterFold]>;
888} // hasSideEffects = 0
889}
890
// Codegen-only AVX scalar FP-to-integer conversions on FR32/FR64 sources.
// Opcode 0x2C entries are matched through any_fp_to_sint; opcode 0x2D
// entries through lrint (GR32 result) or llrint (GR64 result). The *64
// variants add VEX_W. All read MXCSR and may raise an FP exception.
891let isCodeGenOnly = 1, Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
892defm VCVTTSS2SI   : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
893                                "cvttss2si", "cvttss2si",
894                                WriteCvtSS2I, SSEPackedSingle>,
895                                XS, VEX, VEX_LIG;
896defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
897                                "cvttss2si", "cvttss2si",
898                                WriteCvtSS2I, SSEPackedSingle>,
899                                XS, VEX, VEX_W, VEX_LIG;
900defm VCVTTSD2SI   : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
901                                "cvttsd2si", "cvttsd2si",
902                                WriteCvtSD2I, SSEPackedDouble>,
903                                XD, VEX, VEX_LIG;
904defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
905                                "cvttsd2si", "cvttsd2si",
906                                WriteCvtSD2I, SSEPackedDouble>,
907                                XD, VEX, VEX_W, VEX_LIG;
908
909defm VCVTSS2SI   : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
910                               "cvtss2si", "cvtss2si",
911                               WriteCvtSS2I, SSEPackedSingle>,
912                               XS, VEX, VEX_LIG;
913defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
914                               "cvtss2si", "cvtss2si",
915                               WriteCvtSS2I, SSEPackedSingle>,
916                               XS, VEX, VEX_W, VEX_LIG;
917defm VCVTSD2SI   : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
918                               "cvtsd2si", "cvtsd2si",
919                               WriteCvtSD2I, SSEPackedDouble>,
920                               XD, VEX, VEX_LIG;
921defm VCVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
922                               "cvtsd2si", "cvtsd2si",
923                               WriteCvtSD2I, SSEPackedDouble>,
924                               XD, VEX, VEX_W, VEX_LIG;
925}
926
927// The assembler can recognize 64-bit rr instructions by seeing an rxx
928// register, but the same isn't true when only memory operands are used.
929// Provide explicit "l" and "q" assembly forms to address this where
930// appropriate.
// Codegen-only AVX integer-to-FP conversions (opcode 0x2A). The GR64-source
// variants add VEX_W. VCVTSI2SD (GR32 to FR64) is the only entry without
// SIMD_EXC.
// NOTE(review): presumably because that conversion is always exact - confirm.
931let isCodeGenOnly = 1 in {
932defm VCVTSI2SS   : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss", "l",
933                                  WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V,
934                                  VEX_LIG, SIMD_EXC;
935defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss", "q",
936                                  WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V,
937                                  VEX_W, VEX_LIG, SIMD_EXC;
938defm VCVTSI2SD   : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd", "l",
939                                  WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V,
940                                  VEX_LIG;
941defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd", "q",
942                                  WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V,
943                                  VEX_W, VEX_LIG, SIMD_EXC;
944} // isCodeGenOnly = 1
945
// Selection patterns for the pattern-less VCVTSI2* definitions, plus
// i64-result lrint patterns.
946let Predicates = [UseAVX] in {
  // Memory sources. The extra $src1 input of the three-operand forms is
  // filled with an IMPLICIT_DEF of the result type.
947  def : Pat<(f32 (any_sint_to_fp (loadi32 addr:$src))),
948            (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
949  def : Pat<(f32 (any_sint_to_fp (loadi64 addr:$src))),
950            (VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
951  def : Pat<(f64 (any_sint_to_fp (loadi32 addr:$src))),
952            (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
953  def : Pat<(f64 (any_sint_to_fp (loadi64 addr:$src))),
954            (VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
955
  // Register sources, same scheme.
956  def : Pat<(f32 (any_sint_to_fp GR32:$src)),
957            (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
958  def : Pat<(f32 (any_sint_to_fp GR64:$src)),
959            (VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
960  def : Pat<(f64 (any_sint_to_fp GR32:$src)),
961            (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
962  def : Pat<(f64 (any_sint_to_fp GR64:$src)),
963            (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
964
  // lrint producing an i64 selects the 64-bit conversions, which are
  // themselves defined with llrint.
965  def : Pat<(i64 (lrint FR32:$src)), (VCVTSS2SI64rr FR32:$src)>;
966  def : Pat<(i64 (lrint (loadf32 addr:$src))), (VCVTSS2SI64rm addr:$src)>;
967
968  def : Pat<(i64 (lrint FR64:$src)), (VCVTSD2SI64rr FR64:$src)>;
969  def : Pat<(i64 (lrint (loadf64 addr:$src))), (VCVTSD2SI64rm addr:$src)>;
970}
971
// Codegen-only non-VEX scalar conversions, all built from sse12_cvt_s.
//   0x2C - FP to integer via any_fp_to_sint.
//   0x2D - FP to integer via lrint (GR32) / llrint (GR64).
//   0x2A - integer to FP via any_sint_to_fp; these pass ReadInt2Fpu and use
//          an "{l}" / "{q}" suffixed mnemonic for the memory form.
// 64-bit GPR variants add REX_W. Every entry except CVTSI2SD carries
// SIMD_EXC.
972let isCodeGenOnly = 1 in {
973defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
974                      "cvttss2si", "cvttss2si",
975                      WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC;
976defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
977                      "cvttss2si", "cvttss2si",
978                      WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC;
979defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
980                      "cvttsd2si", "cvttsd2si",
981                      WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC;
982defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
983                      "cvttsd2si", "cvttsd2si",
984                      WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC;
985
986defm CVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
987                     "cvtss2si", "cvtss2si",
988                     WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC;
989defm CVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
990                     "cvtss2si", "cvtss2si",
991                     WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC;
992defm CVTSD2SI : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
993                     "cvtsd2si", "cvtsd2si",
994                     WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC;
995defm CVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
996                     "cvtsd2si", "cvtsd2si",
997                     WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC;
998
999defm CVTSI2SS  : sse12_cvt_s<0x2A, GR32, FR32, any_sint_to_fp, i32mem, loadi32,
1000                      "cvtsi2ss", "cvtsi2ss{l}",
1001                      WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, SIMD_EXC;
1002defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, any_sint_to_fp, i64mem, loadi64,
1003                      "cvtsi2ss", "cvtsi2ss{q}",
1004                      WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, REX_W, SIMD_EXC;
1005defm CVTSI2SD  : sse12_cvt_s<0x2A, GR32, FR64, any_sint_to_fp, i32mem, loadi32,
1006                      "cvtsi2sd", "cvtsi2sd{l}",
1007                      WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD;
1008defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, any_sint_to_fp, i64mem, loadi64,
1009                      "cvtsi2sd", "cvtsi2sd{q}",
1010                      WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD, REX_W, SIMD_EXC;
1011} // isCodeGenOnly = 1
1012
// lrint producing an i64 selects the 64-bit non-VEX conversions, which are
// themselves defined with llrint. f32 sources need UseSSE1, f64 sources
// need UseSSE2.
1013let Predicates = [UseSSE1] in {
1014  def : Pat<(i64 (lrint FR32:$src)), (CVTSS2SI64rr FR32:$src)>;
1015  def : Pat<(i64 (lrint (loadf32 addr:$src))), (CVTSS2SI64rm addr:$src)>;
1016}
1017
1018let Predicates = [UseSSE2] in {
1019  def : Pat<(i64 (lrint FR64:$src)), (CVTSD2SI64rr FR64:$src)>;
1020  def : Pat<(i64 (lrint (loadf64 addr:$src))), (CVTSD2SI64rm addr:$src)>;
1021}
1022
1023// Conversion Instructions Intrinsics - Match intrinsics which expect MM
1024// and/or XMM operand(s).
1025
// Conversion forms whose pattern applies OpNode to a source typed SrcVT and
// yields DstVT. Defines rr_Int (register source) and rm_Int (memory source).
//   SrcRC, DstRC - register classes of source and result.
//   memop        - operand type of the memory source.
//   mem_frags    - fragments used to match the memory source.
//   asm          - mnemonic, shared by both forms.
//   sched        - scheduling class; rm_Int uses sched.Folded.
//   d            - execution domain.
1026multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
1027                          ValueType DstVT, ValueType SrcVT, SDNode OpNode,
1028                          Operand memop, PatFrags mem_frags, string asm,
1029                          X86FoldableSchedWrite sched, Domain d> {
1030let ExeDomain = d in {
1031  def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
1032                  !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
1033                  [(set DstRC:$dst, (DstVT (OpNode (SrcVT SrcRC:$src))))]>,
1034               Sched<[sched]>;
1035  def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
1036                  !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
1037                  [(set DstRC:$dst, (DstVT (OpNode (SrcVT (mem_frags addr:$src)))))]>,
1038               Sched<[sched.Folded]>;
1039}
1040}
1041
// Pattern-less conversion forms that take a DstRC input ($src1) next to the
// converted operand ($src2). Defines rr_Int and rm_Int.
//   asm     - mnemonic; rm_Int appends "{" # mem # "}" to it.
//   mem     - suffix text placed inside those braces for rm_Int.
//   Is2Addr - selects the two-operand asm string when set (the default),
//             otherwise the three-operand one.
// The mnemonic prefix is concatenated once and only the operand string is
// chosen by !if; the resulting asm strings are unchanged.
1042multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
1043                    RegisterClass DstRC, X86MemOperand x86memop,
1044                    string asm, string mem, X86FoldableSchedWrite sched,
1045                    Domain d, bit Is2Addr = 1> {
1046let hasSideEffects = 0, ExeDomain = d in {
1047  def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
1048                  !strconcat(asm, !if(Is2Addr,
1049                      "\t{$src2, $dst|$dst, $src2}",
1050                      "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
1051                  []>, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
1052  let mayLoad = 1 in
1053  def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst),
1054                  (ins DstRC:$src1, x86memop:$src2),
1055                  !strconcat(asm, "{", mem, !if(Is2Addr,
1056                      "}\t{$src2, $dst|$dst, $src2}",
1057                      "}\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
1058                  []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
1059}
1060}
1061
// _Int forms of cvtsd2si: the source is a VR128 typed v2f64 and the pattern
// node is X86cvts2si. All four read MXCSR and may raise an FP exception;
// the V-prefixed pair is guarded by UseAVX, and the 64-bit results add
// VEX_W / REX_W.
1062let Uses = [MXCSR], mayRaiseFPException = 1 in {
1063let Predicates = [UseAVX] in {
1064defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64,
1065                  X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
1066                  WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
1067defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64,
1068                    X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
1069                    WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_W, VEX_LIG;
1070}
1071defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, X86cvts2si,
1072                 sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I,
1073                 SSEPackedDouble>, XD;
1074defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si,
1075                   sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I,
1076                   SSEPackedDouble>, XD, REX_W;
1077}
1078
// _Int forms of cvtsi2ss / cvtsi2sd with a VR128 result (opcode 0x2A).
// AVX forms pass Is2Addr = 0, so they use the three-operand asm string.
1079let Predicates = [UseAVX] in {
1080defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1081          i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle, 0>,
1082          XS, VEX_4V, VEX_LIG, SIMD_EXC;
1083defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1084          i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle, 0>,
1085          XS, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC;
1086defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1087          i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble, 0>,
1088          XD, VEX_4V, VEX_LIG;
1089defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1090          i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble, 0>,
1091          XD, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC;
1092}
// Non-VEX forms keep the default Is2Addr = 1 and tie $src1 to $dst.
1093let Constraints = "$src1 = $dst" in {
1094  defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1095                        i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle>,
1096                        XS, SIMD_EXC;
1097  defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1098                        i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle>,
1099                        XS, REX_W, SIMD_EXC;
1100  defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1101                        i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble>,
1102                        XD;
1103  defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1104                        i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble>,
1105                        XD, REX_W, SIMD_EXC;
1106}
1107
// Assembler aliases for the cvtsi2ss / cvtsi2sd _Int forms, all registered
// for the "att" variant.
// AVX register forms: accept the mnemonic with an "{l}" / "{q}" suffix.
1108def : InstAlias<"vcvtsi2ss{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1109               (VCVTSI2SSrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
1110def : InstAlias<"vcvtsi2ss{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1111               (VCVTSI642SSrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
1112def : InstAlias<"vcvtsi2sd{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1113               (VCVTSI2SDrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
1114def : InstAlias<"vcvtsi2sd{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1115               (VCVTSI642SDrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
1116
// AVX memory forms: the mnemonic without a suffix maps to the i32mem form.
1117def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
1118              (VCVTSI2SSrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
1119def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
1120              (VCVTSI2SDrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
1121
// Non-VEX register forms with an "{l}" / "{q}" suffix.
1122def : InstAlias<"cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
1123                (CVTSI2SSrr_Int VR128:$dst, GR32:$src), 0, "att">;
1124def : InstAlias<"cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
1125                (CVTSI642SSrr_Int VR128:$dst, GR64:$src), 0, "att">;
1126def : InstAlias<"cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
1127                (CVTSI2SDrr_Int VR128:$dst, GR32:$src), 0, "att">;
1128def : InstAlias<"cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
1129                (CVTSI642SDrr_Int VR128:$dst, GR64:$src), 0, "att">;
1130
// Non-VEX memory forms: the mnemonic without a suffix maps to the i32mem
// form.
1131def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
1132                (CVTSI2SSrm_Int VR128:$dst, i32mem:$src), 0, "att">;
1133def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
1134                (CVTSI2SDrm_Int VR128:$dst, i32mem:$src), 0, "att">;
1135
1136/// SSE 1 Only
1137
1138// Aliases for intrinsics
// AVX _Int forms of the truncating conversions (opcode 0x2C). The source is
// a whole VR128 (v4f32 or v2f64) and the pattern node is X86cvtts2Int. All
// read MXCSR and may raise an FP exception; the 64-bit results add VEX_W.
// The double-source entries use WriteCvtSD2I, matching both the codegen-only
// VCVTTSD2SI definitions and the non-VEX CVTTSD2SI _Int definitions; they
// previously used the single-precision class WriteCvtSS2I.
1139let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1140defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
1141                                ssmem, sse_load_f32, "cvttss2si",
1142                                WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
1143defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
1144                               X86cvtts2Int, ssmem, sse_load_f32,
1145                               "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
1146                               XS, VEX, VEX_LIG, VEX_W;
1147defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
1148                                sdmem, sse_load_f64, "cvttsd2si",
1149                                WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
1150defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
1151                              X86cvtts2Int, sdmem, sse_load_f64,
1152                              "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>,
1153                              XD, VEX, VEX_LIG, VEX_W;
1154}
// Non-VEX _Int forms of the truncating conversions (opcode 0x2C), matched
// through X86cvtts2Int on a VR128 source. Single-precision sources use
// WriteCvtSS2I and double-precision sources use WriteCvtSD2I; 64-bit results
// add REX_W.
1155let Uses = [MXCSR], mayRaiseFPException = 1 in {
1156defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
1157                                    ssmem, sse_load_f32, "cvttss2si",
1158                                    WriteCvtSS2I, SSEPackedSingle>, XS;
1159defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
1160                                   X86cvtts2Int, ssmem, sse_load_f32,
1161                                   "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
1162                                   XS, REX_W;
1163defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
1164                                    sdmem, sse_load_f64, "cvttsd2si",
1165                                    WriteCvtSD2I, SSEPackedDouble>, XD;
1166defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
1167                                  X86cvtts2Int, sdmem, sse_load_f64,
1168                                  "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>,
1169                                  XD, REX_W;
1170}
1171
// Assembler aliases, registered for the "att" variant, that accept the
// truncating conversions with an explicit "{l}" (GR32 result) or "{q}"
// (GR64 result) suffix. Each mnemonic is aliased for both the register and
// the memory source form.
// AVX forms:
1172def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
1173                (VCVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1174def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
1175                (VCVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
1176def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
1177                (VCVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1178def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
1179                (VCVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
1180def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
1181                (VCVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1182def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
1183                (VCVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
1184def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
1185                (VCVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1186def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
1187                (VCVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;
1188
// Non-VEX forms:
1189def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
1190                (CVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1191def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
1192                (CVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
1193def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
1194                (CVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1195def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
1196                (CVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
1197def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
1198                (CVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1199def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
1200                (CVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
1201def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
1202                (CVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1203def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
1204                (CVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;
1205
// VEX-encoded cvtss2si (opcode 0x2D, XS prefix), instantiated from
// sse12_cvt_sint with the X86cvts2si node. The enclosing let gates both on
// UseAVX and marks them as reading MXCSR and possibly raising FP exceptions.
// The "64" variant produces a GR64/i64 result and adds VEX_W.
// NOTE(review): the mnemonic passed is "cvtss2si" without the "v";
// presumably sse12_cvt_sint or a later rename supplies it -- confirm.
1206let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1207defm VCVTSS2SI   : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
1208                                  ssmem, sse_load_f32, "cvtss2si",
1209                                  WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
1210defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
1211                                  ssmem, sse_load_f32, "cvtss2si",
1212                                  WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_W, VEX_LIG;
1213}
// Everything in this let reads MXCSR and may raise an FP exception:
//  * CVTSS2SI / CVTSS2SI64: non-VEX cvtss2si (0x2D, XS); the 64-bit result
//    form adds REX_W.
//  * VCVTDQ2PS / VCVTDQ2PSY / CVTDQ2PS: packed dword -> packed single (0x5B,
//    PS) for 128-bit VEX, 256-bit VEX (VEX_L) and legacy SSE2 respectively.
// Predicates are attached per-defm via Requires<> here, not by the let.
// NOTE(review): CVTSS2SI* carry no Requires<> in this block; presumably the
// multiclass or instruction class supplies the predicate -- confirm.
1214let Uses = [MXCSR], mayRaiseFPException = 1 in {
1215defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
1216                               ssmem, sse_load_f32, "cvtss2si",
1217                               WriteCvtSS2I, SSEPackedSingle>, XS;
1218defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
1219                                 ssmem, sse_load_f32, "cvtss2si",
1220                                 WriteCvtSS2I, SSEPackedSingle>, XS, REX_W;
1221
// The VEX forms fold memory with `load`; the SSE form below uses `memop`.
1222defm VCVTDQ2PS   : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load,
1223                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
1224                               SSEPackedSingle, WriteCvtI2PS>,
1225                               PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
1226defm VCVTDQ2PSY  : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load,
1227                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
1228                               SSEPackedSingle, WriteCvtI2PSY>,
1229                               PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;
1230
1231defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop,
1232                            "cvtdq2ps\t{$src, $dst|$dst, $src}",
1233                            SSEPackedSingle, WriteCvtI2PS>,
1234                            PS, Requires<[UseSSE2]>;
1235}
1236
1237// AVX aliases
// "l"/"q"-suffixed spellings of vcvtss2si and vcvtsd2si, mapped onto the
// GR32 and GR64 *_Int instructions. Memory forms use the ssmem / sdmem
// operand classes.
1238def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
1239                (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1240def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
1241                (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
1242def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
1243                (VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1244def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
1245                (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
1246def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
1247                (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1248def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
1249                (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
1250def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
1251                (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1252def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
1253                (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
1254
1255// SSE aliases
// Same suffix aliases as the AVX group, for the non-VEX cvtss2si / cvtsd2si
// *_Int instructions: {l} -> GR32 destination, {q} -> GR64 destination.
1256def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
1257                (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1258def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
1259                (CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
1260def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
1261                (CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1262def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
1263                (CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
1264def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
1265                (CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1266def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
1267                (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
1268def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
1269                (CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1270def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
1271                (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
1272
1273/// SSE 2 Only
1274
1275// Convert scalar double to scalar single
// Scalar-register (FR32/FR64) VEX forms of cvtsd2ss, codegen-only and
// gated on UseAVX. Both have an empty pattern list, so instruction selection
// has to reach them through separate Pat<> records.
// $src1 is the extra VEX_4V source operand.
1276let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX],
1277    ExeDomain = SSEPackedSingle in {
// NOTE(review): the asm string here lacks the leading "v" that the rm form
// spells out; presumably the VSDI class prepends it -- confirm.
1278def VCVTSD2SSrr  : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
1279                        (ins FR32:$src1, FR64:$src2),
1280                        "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1281                        VEX_4V, VEX_LIG, VEX_WIG,
1282                        Sched<[WriteCvtSD2SS]>, SIMD_EXC;
// mayLoad is stated explicitly for the memory form (its pattern list is
// empty); the brace-less let applies to this one def only.
1283let mayLoad = 1 in
1284def VCVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst),
1285                     (ins FR32:$src1, f64mem:$src2),
1286                     "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1287                     XD, VEX_4V, VEX_LIG, VEX_WIG,
1288                     Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC;
1289}
1290
// Select f64 -> f32 rounding to VCVTSD2SSrr under UseAVX. The instruction's
// first source operand ($src1) is fed an IMPLICIT_DEF.
1291def : Pat<(f32 (any_fpround FR64:$src)),
1292            (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
1293          Requires<[UseAVX]>;
1294
// Non-VEX scalar-register cvtsd2ss, codegen-only, selected directly from
// any_fpround. The load-folding form additionally requires OptForSize and
// lists only the Folded scheduling write.
// NOTE(review): the rr form has no explicit Requires<>; presumably the SDI
// class provides the predicate -- confirm.
1295let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in {
1296def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
1297                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
1298                      [(set FR32:$dst, (any_fpround FR64:$src))]>,
1299                      Sched<[WriteCvtSD2SS]>, SIMD_EXC;
1300def CVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
1301                    "cvtsd2ss\t{$src, $dst|$dst, $src}",
1302                    [(set FR32:$dst, (any_fpround (loadf64 addr:$src)))]>,
1303                    XD, Requires<[UseSSE2, OptForSize]>,
1304                    Sched<[WriteCvtSD2SS.Folded]>, SIMD_EXC;
1305}
1306
// Vector-register (VR128) "_Int" forms of (v)cvtsd2ss, selected from the
// X86frounds node. $src1 is X86frounds' first operand and $src2 (register or
// sse_load_f64 memory) its second. All four defs inherit Uses = [MXCSR] and
// mayRaiseFPException = 1 from the outer let.
1307let Uses = [MXCSR], mayRaiseFPException = 1, ExeDomain = SSEPackedSingle in {
// VEX encodings: $src1 is a separate VEX_4V operand; gated on UseAVX.
1308def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
1309                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1310                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1311                       [(set VR128:$dst,
1312                         (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
1313                       XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
1314                       Sched<[WriteCvtSD2SS]>;
1315def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
1316                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
1317                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1318                       [(set VR128:$dst,
1319                         (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
1320                       XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
1321                       Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
// Legacy encodings: two-address, so $src1 is tied to $dst and does not
// appear in the asm string; gated on UseSSE2.
1322let Constraints = "$src1 = $dst" in {
1323def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
1324                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1325                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
1326                       [(set VR128:$dst,
1327                         (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
1328                       XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;
1329def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
1330                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
1331                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
1332                       [(set VR128:$dst,
1333                         (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
1334                       XD, Requires<[UseSSE2]>,
1335                       Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
1336}
1337}
1338
1339// Convert scalar single to scalar double
1340// SSE2 instructions with XS prefix
// Scalar-register (FR64/FR32) VEX forms of cvtss2sd (0x5A, XS), codegen-only
// with empty pattern lists; they are selected through the Pat<> records that
// follow. The register form requires UseAVX; the load-folding form also
// requires OptForSize.
1341let isCodeGenOnly = 1, hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
1342def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
1343                    (ins FR64:$src1, FR32:$src2),
1344                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1345                    XS, VEX_4V, VEX_LIG, VEX_WIG,
1346                    Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>, SIMD_EXC;
// Brace-less let: mayLoad applies to the next def only.
1347let mayLoad = 1 in
1348def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
1349                    (ins FR64:$src1, f32mem:$src2),
1350                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1351                    XS, VEX_4V, VEX_LIG, VEX_WIG,
1352                    Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>,
1353                    Requires<[UseAVX, OptForSize]>, SIMD_EXC;
1354} // isCodeGenOnly = 1, hasSideEffects = 0
1355
// Select f32 -> f64 extension to the VEX scalar forms, passing IMPLICIT_DEF
// for the $src1 operand. The load-folding pattern carries the same
// [UseAVX, OptForSize] predicate as VCVTSS2SDrm itself.
1356def : Pat<(f64 (any_fpextend FR32:$src)),
1357    (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
1358def : Pat<(any_fpextend (loadf32 addr:$src)),
1359    (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;
1360
// Non-VEX scalar-register cvtss2sd (0x5A, XS), codegen-only, selected
// directly from any_fpextend. As with cvtsd2ss, the load-folding form is
// restricted to OptForSize and lists only the Folded scheduling write.
1361let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in {
1362def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
1363                   "cvtss2sd\t{$src, $dst|$dst, $src}",
1364                   [(set FR64:$dst, (any_fpextend FR32:$src))]>,
1365                   XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>, SIMD_EXC;
1366def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
1367                   "cvtss2sd\t{$src, $dst|$dst, $src}",
1368                   [(set FR64:$dst, (any_fpextend (loadf32 addr:$src)))]>,
1369                   XS, Requires<[UseSSE2, OptForSize]>,
1370                   Sched<[WriteCvtSS2SD.Folded]>, SIMD_EXC;
1371} // isCodeGenOnly = 1
1372
// Vector-register (VR128) "_Int" forms of (v)cvtss2sd. Unlike the cvtsd2ss
// _Int forms, these have empty pattern lists, so hasSideEffects and (for the
// memory forms) mayLoad are spelled out by hand. All four inherit
// Uses = [MXCSR] and mayRaiseFPException = 1 from the outer let.
// NOTE(review): the VEX forms are gated on HasAVX here, whereas the
// VCVTSD2SS*_Int forms use UseAVX -- confirm the difference is intended.
1373let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1,
1374    ExeDomain = SSEPackedSingle in {
1375def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
1376                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1377                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1378                    []>, XS, VEX_4V, VEX_LIG, VEX_WIG,
1379                    Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;
1380let mayLoad = 1 in
1381def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
1382                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
1383                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1384                    []>, XS, VEX_4V, VEX_LIG, VEX_WIG, Requires<[HasAVX]>,
1385                    Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
// Legacy encodings are two-address: $src1 is tied to $dst.
1386let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
1387def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
1388                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1389                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
1390                    []>, XS, Requires<[UseSSE2]>,
1391                    Sched<[WriteCvtSS2SD]>;
1392let mayLoad = 1 in
1393def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
1394                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
1395                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
1396                    []>, XS, Requires<[UseSSE2]>,
1397                    Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
1398}
1399} // hasSideEffects = 0
1400
1401// Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and
1402// (v)cvtss2sd intrinsic sequences from clang which produce unnecessary
1403// vmovs{s,d} instructions
// AVX versions. Each pattern matches an X86Movss/X86Movsd of $dst with a
// scalar_to_vector of a scalar conversion result, and selects the matching
// VEX *_Int instruction with $dst as its first source operand.
1404let Predicates = [UseAVX] in {
// f64 element 0 rounded to f32 -> vcvtsd2ss.
1405def : Pat<(v4f32 (X86Movss
1406                   (v4f32 VR128:$dst),
1407                   (v4f32 (scalar_to_vector
1408                     (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
1409          (VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
1410
// f32 element 0 extended to f64 -> vcvtss2sd.
1411def : Pat<(v2f64 (X86Movsd
1412                   (v2f64 VR128:$dst),
1413                   (v2f64 (scalar_to_vector
1414                     (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
1415          (VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
1416
// Signed integer -> f32: 64-bit then 32-bit source, register then load.
1417def : Pat<(v4f32 (X86Movss
1418                   (v4f32 VR128:$dst),
1419                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
1420          (VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
1421
1422def : Pat<(v4f32 (X86Movss
1423                   (v4f32 VR128:$dst),
1424                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
1425          (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>;
1426
1427def : Pat<(v4f32 (X86Movss
1428                   (v4f32 VR128:$dst),
1429                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
1430          (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
1431
1432def : Pat<(v4f32 (X86Movss
1433                   (v4f32 VR128:$dst),
1434                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
1435          (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>;
1436
// Signed integer -> f64: 64-bit then 32-bit source, register then load.
1437def : Pat<(v2f64 (X86Movsd
1438                   (v2f64 VR128:$dst),
1439                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
1440          (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
1441
1442def : Pat<(v2f64 (X86Movsd
1443                   (v2f64 VR128:$dst),
1444                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
1445          (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>;
1446
1447def : Pat<(v2f64 (X86Movsd
1448                   (v2f64 VR128:$dst),
1449                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
1450          (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
1451
1452def : Pat<(v2f64 (X86Movsd
1453                   (v2f64 VR128:$dst),
1454                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
1455          (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>;
1456} // Predicates = [UseAVX]
1457
// SSE2 versions of the movss/movsd + scalar-convert folds: the fp<->fp
// conversions and the integer -> f64 conversions, selecting the non-VEX
// *_Int instructions.
1458let Predicates = [UseSSE2] in {
1459def : Pat<(v4f32 (X86Movss
1460                   (v4f32 VR128:$dst),
1461                   (v4f32 (scalar_to_vector
1462                     (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
1463          (CVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
1464
1465def : Pat<(v2f64 (X86Movsd
1466                   (v2f64 VR128:$dst),
1467                   (v2f64 (scalar_to_vector
1468                     (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
1469          (CVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
1470
1471def : Pat<(v2f64 (X86Movsd
1472                   (v2f64 VR128:$dst),
1473                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
1474          (CVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
1475
1476def : Pat<(v2f64 (X86Movsd
1477                   (v2f64 VR128:$dst),
1478                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
1479          (CVTSI642SDrm_Int VR128:$dst, addr:$src)>;
1480
1481def : Pat<(v2f64 (X86Movsd
1482                   (v2f64 VR128:$dst),
1483                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
1484          (CVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
1485
1486def : Pat<(v2f64 (X86Movsd
1487                   (v2f64 VR128:$dst),
1488                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
1489          (CVTSI2SDrm_Int VR128:$dst, addr:$src)>;
1490} // Predicates = [UseSSE2]
1491
// SSE1 versions: only the integer -> f32 folds (X86Movss), selecting the
// non-VEX CVTSI2SS / CVTSI642SS *_Int instructions, register and load forms.
1492let Predicates = [UseSSE1] in {
1493def : Pat<(v4f32 (X86Movss
1494                   (v4f32 VR128:$dst),
1495                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
1496          (CVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
1497
1498def : Pat<(v4f32 (X86Movss
1499                   (v4f32 VR128:$dst),
1500                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
1501          (CVTSI642SSrm_Int VR128:$dst, addr:$src)>;
1502
1503def : Pat<(v4f32 (X86Movss
1504                   (v4f32 VR128:$dst),
1505                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
1506          (CVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
1507
1508def : Pat<(v4f32 (X86Movss
1509                   (v4f32 VR128:$dst),
1510                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
1511          (CVTSI2SSrm_Int VR128:$dst, addr:$src)>;
1512} // Predicates = [UseSSE1]
1513
// VEX packed single -> packed dword conversions (0x5B) selected from
// X86cvtp2Int: 128-bit (VR128) and 256-bit (VR256, VEX_L), each with a
// register and a load-folding form. Gated on HasAVX && NoVLX.
// NOTE(review): the asm strings omit the leading "v"; presumably the VPDI
// class prepends it -- confirm.
1514let Predicates = [HasAVX, NoVLX] in {
1515// Convert packed single/double fp to doubleword
1516def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1517                       "cvtps2dq\t{$src, $dst|$dst, $src}",
1518                       [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
1519                       VEX, Sched<[WriteCvtPS2I]>, VEX_WIG, SIMD_EXC;
1520def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1521                       "cvtps2dq\t{$src, $dst|$dst, $src}",
1522                       [(set VR128:$dst,
1523                         (v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>,
1524                       VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG, SIMD_EXC;
1525def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
1526                        "cvtps2dq\t{$src, $dst|$dst, $src}",
1527                        [(set VR256:$dst,
1528                          (v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>,
1529                        VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG, SIMD_EXC;
1530def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
1531                        "cvtps2dq\t{$src, $dst|$dst, $src}",
1532                        [(set VR256:$dst,
1533                          (v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>,
1534                        VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, VEX_WIG, SIMD_EXC;
1535}
// Non-VEX cvtps2dq (0x5B) selected from X86cvtp2Int. The load-folding form
// matches memopv4f32 where the VEX form matches loadv4f32.
// NOTE(review): no Requires<> is written here; presumably the PDI class
// supplies the SSE2 predicate -- confirm.
1536def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1537                     "cvtps2dq\t{$src, $dst|$dst, $src}",
1538                     [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
1539                     Sched<[WriteCvtPS2I]>, SIMD_EXC;
1540def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1541                     "cvtps2dq\t{$src, $dst|$dst, $src}",
1542                     [(set VR128:$dst,
1543                       (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>,
1544                     Sched<[WriteCvtPS2ILd]>, SIMD_EXC;
1545
1546
1547// Convert Packed Double FP to Packed DW Integers
// VEX packed double -> packed dword conversions (0xE6) selected from
// X86cvtp2Int. Both source widths produce a VR128 result, so the memory
// forms carry an {x} / {y} mnemonic suffix to tell the 128-bit and 256-bit
// sources apart. MXCSR use and FP-exception behaviour come from the let.
1548let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1549// The assembler can recognize rr 256-bit instructions by seeing a ymm
1550// register, but the same isn't true when using memory operands instead.
1551// Provide other assembly rr and rm forms to address this explicitly.
1552def VCVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1553                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
1554                       [(set VR128:$dst,
1555                         (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
1556                       VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
1557
1558// XMM only
1559def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1560                      "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
1561                      [(set VR128:$dst,
1562                        (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
1563                      Sched<[WriteCvtPD2ILd]>, VEX_WIG;
1564
1565// YMM only
1566def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1567                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
1568                       [(set VR128:$dst,
1569                         (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
1570                       VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
1571def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1572                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
1573                       [(set VR128:$dst,
1574                         (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
1575                       VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
1576}
1577
// Accept the explicit "x" / "y" suffixed mnemonics on the register forms
// too, mapping them to the VR128-source and VR256-source instructions.
1578def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
1579                (VCVTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
1580def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
1581                (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
1582
// Non-VEX cvtpd2dq (0xE6) selected from X86cvtp2Int; the memory form is
// listed first and folds a memopv2f64 load. FP-exception behaviour is
// attached per-def through SIMD_EXC.
1583def CVTPD2DQrm  : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1584                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
1585                      [(set VR128:$dst,
1586                        (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>,
1587                      Sched<[WriteCvtPD2ILd]>, SIMD_EXC;
1588def CVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1589                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
1590                      [(set VR128:$dst,
1591                        (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
1592                      Sched<[WriteCvtPD2I]>, SIMD_EXC;
1593
1594// Convert with truncation packed single/double fp to doubleword
1595// SSE2 packed instructions with XS prefix
// Truncating packed single -> packed dword conversions (0x5B) selected from
// X86any_cvttp2si. The outer let marks every def as reading MXCSR and
// possibly raising an FP exception; the inner let adds the HasAVX && NoVLX
// predicate to the VEX forms only.
1596let Uses = [MXCSR], mayRaiseFPException = 1 in {
1597let Predicates = [HasAVX, NoVLX] in {
1598def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1599                         "cvttps2dq\t{$src, $dst|$dst, $src}",
1600                         [(set VR128:$dst,
1601                           (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
1602                         VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
1603def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1604                         "cvttps2dq\t{$src, $dst|$dst, $src}",
1605                         [(set VR128:$dst,
1606                           (v4i32 (X86any_cvttp2si (loadv4f32 addr:$src))))]>,
1607                         VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
1608def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
1609                          "cvttps2dq\t{$src, $dst|$dst, $src}",
1610                          [(set VR256:$dst,
1611                            (v8i32 (X86any_cvttp2si (v8f32 VR256:$src))))]>,
1612                          VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
1613def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
1614                          "cvttps2dq\t{$src, $dst|$dst, $src}",
1615                          [(set VR256:$dst,
1616                            (v8i32 (X86any_cvttp2si (loadv8f32 addr:$src))))]>,
1617                          VEX, VEX_L,
1618                          Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
1619}
1620
// Non-VEX forms; the load-folding variant matches memopv4f32.
1621def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1622                       "cvttps2dq\t{$src, $dst|$dst, $src}",
1623                       [(set VR128:$dst,
1624                         (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
1625                       Sched<[WriteCvtPS2I]>;
1626def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1627                       "cvttps2dq\t{$src, $dst|$dst, $src}",
1628                       [(set VR128:$dst,
1629                         (v4i32 (X86any_cvttp2si (memopv4f32 addr:$src))))]>,
1630                       Sched<[WriteCvtPS2ILd]>;
1631}
1632
1633// The assembler can recognize rr 256-bit instructions by seeing a ymm
1634// register, but the same isn't true when using memory operands instead.
1635// Provide other assembly rr and rm forms to address this explicitly.
// VEX truncating packed double -> packed dword conversions (0xE6) selected
// from X86any_cvttp2si. Both source widths write a VR128 result; the memory
// forms carry {x} / {y} mnemonic suffixes to distinguish them.
1636let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1637// XMM only
1638def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1639                        "cvttpd2dq\t{$src, $dst|$dst, $src}",
1640                        [(set VR128:$dst,
1641                          (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
1642                        VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
1643def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1644                        "cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
1645                        [(set VR128:$dst,
1646                          (v4i32 (X86any_cvttp2si (loadv2f64 addr:$src))))]>,
1647                        VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG;
1648
1649// YMM only
1650def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1651                         "cvttpd2dq\t{$src, $dst|$dst, $src}",
1652                         [(set VR128:$dst,
1653                           (v4i32 (X86any_cvttp2si (v4f64 VR256:$src))))]>,
1654                         VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
1655def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1656                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
1657                         [(set VR128:$dst,
1658                           (v4i32 (X86any_cvttp2si (loadv4f64 addr:$src))))]>,
1659                         VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
1660} // Predicates = [HasAVX, NoVLX]
1661
// Accept the "x" / "y" suffixed mnemonics on the register forms, mapping
// them to the VR128-source and VR256-source instructions respectively.
1662def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
1663                (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
1664def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
1665                (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
1666
// Also select the generic any_fp_to_sint node (v4f64 -> v4i32) to the
// 256-bit-source truncating convert, for both register and loaded operands.
1667let Predicates = [HasAVX, NoVLX] in {
1668  def : Pat<(v4i32 (any_fp_to_sint (v4f64 VR256:$src))),
1669            (VCVTTPD2DQYrr VR256:$src)>;
1670  def : Pat<(v4i32 (any_fp_to_sint (loadv4f64 addr:$src))),
1671            (VCVTTPD2DQYrm addr:$src)>;
1672}
1673
// Non-VEX truncating cvttpd2dq (0xE6) selected from X86any_cvttp2si; the
// load-folding form matches memopv2f64. FP-exception behaviour is attached
// per-def through SIMD_EXC.
1674def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1675                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
1676                      [(set VR128:$dst,
1677                        (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
1678                      Sched<[WriteCvtPD2I]>, SIMD_EXC;
1679def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
1680                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
1681                      [(set VR128:$dst,
1682                        (v4i32 (X86any_cvttp2si (memopv2f64 addr:$src))))]>,
1683                      Sched<[WriteCvtPD2ILd]>, SIMD_EXC;
1684
1685// Convert packed single to packed double
// VEX packed single -> packed double (0x5A, PS). The source is half the
// width of the result: the 128-bit form reads an f64mem (two floats) and the
// 256-bit form reads an f128mem / VR128 (four floats). The 128-bit register
// form matches X86any_vfpext, the 256-bit one any_fpextend; the memory forms
// match the extloadv2f32 / extloadv4f32 fragments.
1686let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1687                  // SSE2 instructions without OpSize prefix
1688def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1689                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
1690                    [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
1691                    PS, VEX, Sched<[WriteCvtPS2PD]>, VEX_WIG;
1692def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
1693                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
1694                    [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
1695                    PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, VEX_WIG;
1696def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
1697                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
1698                     [(set VR256:$dst, (v4f64 (any_fpextend (v4f32 VR128:$src))))]>,
1699                     PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, VEX_WIG;
1700def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
1701                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
1702                     [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>,
1703                     PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, VEX_WIG;
1704}
1705
// Non-VEX cvtps2pd (0x5A, PS), gated on UseSSE2. Same patterns as the
// 128-bit VEX form: X86any_vfpext for the register source, extloadv2f32 for
// the f64mem source.
1706let Predicates = [UseSSE2], Uses = [MXCSR], mayRaiseFPException = 1 in {
1707def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1708                   "cvtps2pd\t{$src, $dst|$dst, $src}",
1709                   [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
1710                   PS, Sched<[WriteCvtPS2PD]>;
1711def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
1712                   "cvtps2pd\t{$src, $dst|$dst, $src}",
1713                   [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
1714                   PS, Sched<[WriteCvtPS2PD.Folded]>;
1715}
1716
1717// Convert Packed DW Integers to Packed Double FP
// VEX packed dword -> packed double (0xE6). The 128-bit forms match
// X86any_VSintToFP on a v4i32 and produce v2f64; the 256-bit forms match
// any_sint_to_fp from v4i32 to v4f64. No MXCSR use or FP-exception flag is
// set in this block.
1718let Predicates = [HasAVX, NoVLX] in {
// Brace-less let: applies to VCVTDQ2PDrm only. Its pattern matches a 64-bit
// scalar load (loadi64) placed in a v2i64 and bitcast to v4i32, matching the
// i64mem operand.
1719let hasSideEffects = 0, mayLoad = 1 in
1720def VCVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
1721                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1722                        [(set VR128:$dst,
1723                          (v2f64 (X86any_VSintToFP
1724                                  (bc_v4i32
1725                                   (v2i64 (scalar_to_vector
1726                                           (loadi64 addr:$src)))))))]>,
1727                        VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG;
1728def VCVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1729                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1730                        [(set VR128:$dst,
1731                          (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
1732                        VEX, Sched<[WriteCvtI2PD]>, VEX_WIG;
1733def VCVTDQ2PDYrm  : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
1734                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1735                         [(set VR256:$dst,
1736                           (v4f64 (any_sint_to_fp (loadv4i32 addr:$src))))]>,
1737                         VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>,
1738                         VEX_WIG;
1739def VCVTDQ2PDYrr  : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
1740                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1741                         [(set VR256:$dst,
1742                           (v4f64 (any_sint_to_fp (v4i32 VR128:$src))))]>,
1743                         VEX, VEX_L, Sched<[WriteCvtI2PDY]>, VEX_WIG;
1744}
1745
// Non-VEX cvtdq2pd (0xE6), with the same patterns as the 128-bit VEX forms.
// The brace-less let covers CVTDQ2PDrm only, so mayLoad is not applied to
// the register form.
1746let hasSideEffects = 0, mayLoad = 1 in
1747def CVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
1748                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
1749                       [(set VR128:$dst,
1750                         (v2f64 (X86any_VSintToFP
1751                                 (bc_v4i32
1752                                  (v2i64 (scalar_to_vector
1753                                          (loadi64 addr:$src)))))))]>,
1754                       Sched<[WriteCvtI2PDLd]>;
1755def CVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1756                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
1757                       [(set VR128:$dst,
1758                         (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
1759                       Sched<[WriteCvtI2PD]>;
1760
1761// AVX register conversion intrinsics
// Additional load-folding pattern for VCVTDQ2PDrm: the 64-bit source may
// also appear as an X86vzload64 node rather than scalar_to_vector(loadi64).
1762let Predicates = [HasAVX, NoVLX] in {
1763  def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
1764            (VCVTDQ2PDrm addr:$src)>;
1765} // Predicates = [HasAVX, NoVLX]
1766
1767// SSE2 register conversion intrinsics
// Same X86vzload64 load-folding pattern, selecting the non-VEX CVTDQ2PDrm.
1768let Predicates = [UseSSE2] in {
1769  def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
1770            (CVTDQ2PDrm addr:$src)>;
1771} // Predicates = [UseSSE2]
1772
1773// Convert packed double to packed single
1774// The assembler can recognize rr 256-bit instructions by seeing a ymm
1775// register, but the same isn't true when using memory operands instead.
1776// Provide other assembly rr and rm forms to address this explicitly.
// VEX packed double -> packed single (0x5A) selected from X86any_vfpround.
// Both the VR128 and VR256 sources write a VR128 result, so the memory forms
// carry {x} / {y} mnemonic suffixes to distinguish the source width.
1777let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1778// XMM only
1779def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1780                       "cvtpd2ps\t{$src, $dst|$dst, $src}",
1781                       [(set VR128:$dst, (v4f32 (X86any_vfpround (v2f64 VR128:$src))))]>,
1782                       VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG;
1783def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1784                       "cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
1785                       [(set VR128:$dst, (v4f32 (X86any_vfpround (loadv2f64 addr:$src))))]>,
1786                       VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG;
1787
// 256-bit source (VEX_L); the result is still a VR128.
1788def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1789                        "cvtpd2ps\t{$src, $dst|$dst, $src}",
1790                        [(set VR128:$dst, (v4f32 (X86any_vfpround (v4f64 VR256:$src))))]>,
1791                        VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG;
1792def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1793                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
1794                        [(set VR128:$dst, (v4f32 (X86any_vfpround (loadv4f64 addr:$src))))]>,
1795                        VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG;
1796} // Predicates = [HasAVX, NoVLX]
1797
// Aliases that accept the explicitly suffixed mnemonics vcvtpd2psx and
// vcvtpd2psy for the register forms VCVTPD2PSrr and VCVTPD2PSYrr. Both are
// registered for the "att" variant.
// NOTE(review): the 0 argument presumably keeps the alias from being used
// when printing — confirm against the InstAlias class definition.
1798def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
1799                (VCVTPD2PSrr VR128:$dst, VR128:$src), 0, "att">;
1800def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
1801                (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0, "att">;
1802
// cvtpd2ps without any VEX modifier: register and memory forms, opcode 0x5A,
// both tagged SIMD_EXC. The memory form matches memopv2f64, whereas the
// V-prefixed memory form matches loadv2f64.
1803def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1804                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
1805                     [(set VR128:$dst, (v4f32 (X86any_vfpround (v2f64 VR128:$src))))]>,
1806                     Sched<[WriteCvtPD2PS]>, SIMD_EXC;
1807def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1808                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
1809                     [(set VR128:$dst, (v4f32 (X86any_vfpround (memopv2f64 addr:$src))))]>,
1810                     Sched<[WriteCvtPD2PS.Folded]>, SIMD_EXC;
1811
1812//===----------------------------------------------------------------------===//
1813// SSE 1 & 2 - Compare Instructions
1814//===----------------------------------------------------------------------===//
1815
1816// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
//
// Emits four records, all with opcode 0xC2 and a trailing u8imm:$cc operand
// that the patterns match as timm:$cc:
//   rr_Int / rm_Int - operate on VR128; $src1 is typed as VT and the memory
//                     form takes the `memop` operand matched by `mem_frags`.
//   rr / rm         - isCodeGenOnly forms on register class RC; the memory
//                     form takes `x86memop` matched by `ld_frag`. Only rr is
//                     marked isCommutable.
// `asm` is the assembly string shared by all four records. Register forms are
// scheduled with `sched`, memory forms with sched.Folded and
// sched.ReadAfterFold.
1817multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
1818                            Operand memop, SDNode OpNode, ValueType VT,
1819                            PatFrag ld_frag, string asm,
1820                            X86FoldableSchedWrite sched,
1821                            PatFrags mem_frags> {
1822  def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
1823                    (ins VR128:$src1, VR128:$src2, u8imm:$cc), asm,
1824                    [(set VR128:$dst, (OpNode (VT VR128:$src1),
1825                                              VR128:$src2, timm:$cc))]>,
1826           Sched<[sched]>, SIMD_EXC;
1827  let mayLoad = 1 in
1828  def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
1829                    (ins VR128:$src1, memop:$src2, u8imm:$cc), asm,
1830                    [(set VR128:$dst, (OpNode (VT VR128:$src1),
1831                                              (mem_frags addr:$src2), timm:$cc))]>,
1832           Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
1833
1834  let isCodeGenOnly = 1 in {
1835    let isCommutable = 1 in
1836    def rr : SIi8<0xC2, MRMSrcReg,
1837                  (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
1838                  [(set RC:$dst, (OpNode RC:$src1, RC:$src2, timm:$cc))]>,
1839                  Sched<[sched]>, SIMD_EXC;
1840    def rm : SIi8<0xC2, MRMSrcMem,
1841                  (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
1842                  [(set RC:$dst, (OpNode RC:$src1,
1843                                         (ld_frag addr:$src2), timm:$cc))]>,
1844                  Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
1845  }
1846}
1847
// VEX-tagged scalar compares (VEX_4V): the assembly string names $dst, $src1
// and $src2 separately, and no operand constraint is applied.
1848let ExeDomain = SSEPackedSingle in
1849defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
1850                 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1851                 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>,
1852                 XS, VEX_4V, VEX_LIG, VEX_WIG;
1853let ExeDomain = SSEPackedDouble in
1854defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
1855                 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1856                 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
1857                 XD, VEX_4V, VEX_LIG, VEX_WIG;
1858
// Scalar compares without VEX modifiers: $src1 is tied to $dst by the
// constraint, so the assembly string only prints $dst, $src2 and $cc.
1859let Constraints = "$src1 = $dst" in {
1860  let ExeDomain = SSEPackedSingle in
1861  defm CMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
1862                  "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1863                  SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
1864  let ExeDomain = SSEPackedDouble in
1865  defm CMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
1866                  "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1867                  SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
1868}
1869
1870// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
//
// Emits rr and rm records with no outputs; the pattern result is written to
// EFLAGS. $src1 is typed as `vt` on register class RC, and the rm form loads
// $src2 through `ld_frag` from an `x86memop` operand. `d` becomes the
// ExeDomain of both records and `sched` defaults to WriteFComX.
1871multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDPatternOperator OpNode,
1872                         ValueType vt, X86MemOperand x86memop,
1873                         PatFrag ld_frag, string OpcodeStr, Domain d,
1874                         X86FoldableSchedWrite sched = WriteFComX> {
1875  let ExeDomain = d in {
1876  def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
1877                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1878                     [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
1879          Sched<[sched]>, SIMD_EXC;
1880  let mayLoad = 1 in
1881  def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
1882                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1883                     [(set EFLAGS, (OpNode (vt RC:$src1),
1884                                           (ld_frag addr:$src2)))]>,
1885          Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
1886}
1887}
1888
1889// sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
//
// Same shape as sse12_ord_cmp, but the records are named rr_Int / rm_Int and
// the memory form takes a generic `memop` operand matched by `mem_frags`
// (PatFrags) instead of an X86MemOperand with a single load fragment.
1890multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
1891                             ValueType vt, Operand memop,
1892                             PatFrags mem_frags, string OpcodeStr,
1893                             Domain d,
1894                             X86FoldableSchedWrite sched = WriteFComX> {
1895let ExeDomain = d in {
1896  def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
1897                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1898                     [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
1899          Sched<[sched]>, SIMD_EXC;
1900let mayLoad = 1 in
1901  def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
1902                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1903                     [(set EFLAGS, (OpNode (vt RC:$src1),
1904                                           (mem_frags addr:$src2)))]>,
1905          Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
1906}
1907}
1908
// Instantiations of the EFLAGS-setting scalar compares. Each name is defined
// twice: once through sse12_ord_cmp (FR32/FR64 forms rr/rm) and once through
// sse12_ord_cmp_int (VR128 forms rr_Int/rm_Int, isCodeGenOnly).
// The ucomis* forms use opcode 0x2E and match X86any_fcmp / X86ucomi; the
// comis* forms use opcode 0x2F and match X86strict_fcmps / X86comi.
1909let Defs = [EFLAGS] in {
  // VEX-tagged forms on scalar FP register classes.
1910  defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
1911                               "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
1912  defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
1913                               "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
1914  defm VCOMISS  : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
1915                               "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
1916  defm VCOMISD  : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
1917                               "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
1918
  // VEX-tagged forms on VR128.
1919  let isCodeGenOnly = 1 in {
1920    defm VUCOMISS  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
1921                      sse_load_f32, "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
1922    defm VUCOMISD  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
1923                      sse_load_f64, "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
1924
1925    defm VCOMISS  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
1926                       sse_load_f32, "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
1927    defm VCOMISD  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
1928                       sse_load_f64, "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
1929  }
  // Forms without VEX modifiers on scalar FP register classes.
1930  defm UCOMISS  : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
1931                                  "ucomiss", SSEPackedSingle>, PS;
1932  defm UCOMISD  : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
1933                                  "ucomisd", SSEPackedDouble>, PD;
1934  defm COMISS   : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
1935                                  "comiss", SSEPackedSingle>, PS;
1936  defm COMISD   : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
1937                                  "comisd", SSEPackedDouble>, PD;
1938
  // Forms without VEX modifiers on VR128.
1939  let isCodeGenOnly = 1 in {
1940    defm UCOMISS  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
1941                            sse_load_f32, "ucomiss", SSEPackedSingle>, PS;
1942    defm UCOMISD  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
1943                            sse_load_f64, "ucomisd", SSEPackedDouble>, PD;
1944
1945    defm COMISS  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
1946                                sse_load_f32, "comiss", SSEPackedSingle>, PS;
1947    defm COMISD  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
1948                                    sse_load_f64, "comisd", SSEPackedDouble>, PD;
1949  }
1950} // Defs = [EFLAGS]
1951
1952// sse12_cmp_packed - sse 1 & 2 compare packed instructions
//
// Emits rri and rmi records with opcode 0xC2 that match X86any_cmpp on
// register class RC with result type VT and a u8imm:$cc predicate (matched as
// timm:$cc). The rmi form loads $src2 through `ld_frag` from an `x86memop`
// operand. Only rri is marked isCommutable. `d` is passed to PIi8 as the
// domain and `asm` is the full assembly string.
1953multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
1954                            ValueType VT, string asm,
1955                            X86FoldableSchedWrite sched,
1956                            Domain d, PatFrag ld_frag> {
1957  let isCommutable = 1 in
1958  def rri : PIi8<0xC2, MRMSrcReg,
1959             (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
1960             [(set RC:$dst, (VT (X86any_cmpp RC:$src1, RC:$src2, timm:$cc)))], d>,
1961            Sched<[sched]>, SIMD_EXC;
1962  def rmi : PIi8<0xC2, MRMSrcMem,
1963             (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
1964             [(set RC:$dst,
1965               (VT (X86any_cmpp RC:$src1, (ld_frag addr:$src2), timm:$cc)))], d>,
1966            Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
1967}
1968
// Packed compare instantiations. The V-prefixed forms are tagged VEX_4V, use
// the load* fragments and print $dst, $src1 and $src2; the Y variants operate
// on VR256/f256mem and add VEX_L.
1969defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
1970               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1971               SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG;
1972defm VCMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
1973               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1974               SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG;
1975defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, v8f32,
1976               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1977               SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, VEX_WIG;
1978defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, v4f64,
1979               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1980               SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, VEX_WIG;
// Forms without VEX modifiers: $src1 is tied to $dst and the memory operand
// is matched with the memop* fragments.
1981let Constraints = "$src1 = $dst" in {
1982  defm CMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
1983                 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1984                 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS;
1985  defm CMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
1986                 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1987                 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD;
1988}
1989
// CommutableCMPCC - matches a timm compare predicate whose low three bits
// (Imm & 0x7) equal 0x00, 0x03, 0x04 or 0x07. The patterns that follow use it
// to accept compares whose load is the first operand and emit them with the
// operands swapped.
// NOTE(review): these values presumably encode the symmetric predicates
// (EQ, UNORD, NEQ, ORD) — confirm against the CMPPS/CMPPD immediate encoding.
1990def CommutableCMPCC : PatLeaf<(timm), [{
1991  uint64_t Imm = N->getZExtValue() & 0x7;
1992  return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07);
1993}]>;
1994
1995// Patterns to select compares with loads in first operand.
// Each pattern matches a compare whose first operand is a load and whose
// predicate satisfies CommutableCMPCC, and emits the memory form with the
// register as $src1 and the address as $src2. The packed patterns match
// X86any_cmpp; the scalar ones match X86cmps.
1996let Predicates = [HasAVX] in {
1997  def : Pat<(v4f64 (X86any_cmpp (loadv4f64 addr:$src2), VR256:$src1,
1998                                CommutableCMPCC:$cc)),
1999            (VCMPPDYrmi VR256:$src1, addr:$src2, timm:$cc)>;
2000
2001  def : Pat<(v8f32 (X86any_cmpp (loadv8f32 addr:$src2), VR256:$src1,
2002                                CommutableCMPCC:$cc)),
2003            (VCMPPSYrmi VR256:$src1, addr:$src2, timm:$cc)>;
2004
2005  def : Pat<(v2f64 (X86any_cmpp (loadv2f64 addr:$src2), VR128:$src1,
2006                                CommutableCMPCC:$cc)),
2007            (VCMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;
2008
2009  def : Pat<(v4f32 (X86any_cmpp (loadv4f32 addr:$src2), VR128:$src1,
2010                                CommutableCMPCC:$cc)),
2011            (VCMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;
2012
2013  def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
2014                          CommutableCMPCC:$cc)),
2015            (VCMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
2016
2017  def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
2018                          CommutableCMPCC:$cc)),
2019            (VCMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
2020}
2021
// SSE2 counterparts of the load-in-first-operand compare patterns: the packed
// pattern matches memopv2f64, the scalar one loadf64; both emit the memory
// form with the register as $src1 and the address as $src2.
2022let Predicates = [UseSSE2] in {
2023  def : Pat<(v2f64 (X86any_cmpp (memopv2f64 addr:$src2), VR128:$src1,
2024                                CommutableCMPCC:$cc)),
2025            (CMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;
2026
2027  def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
2028                          CommutableCMPCC:$cc)),
2029            (CMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
2030}
2031
// SSE1 counterparts of the load-in-first-operand compare patterns: the packed
// pattern matches memopv4f32, the scalar one loadf32; both emit the memory
// form with the register as $src1 and the address as $src2.
2032let Predicates = [UseSSE1] in {
2033  def : Pat<(v4f32 (X86any_cmpp (memopv4f32 addr:$src2), VR128:$src1,
2034                                CommutableCMPCC:$cc)),
2035            (CMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;
2036
2037  def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
2038                          CommutableCMPCC:$cc)),
2039            (CMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
2040}
2041
2042//===----------------------------------------------------------------------===//
2043// SSE 1 & 2 - Shuffle Instructions
2044//===----------------------------------------------------------------------===//
2045
2046/// sse12_shuffle - sse 1 & 2 fp shuffle instructions
///
/// Emits rmi and rri records with opcode 0xC6 that match X86Shufp with result
/// type `vt` and an i8 target-immediate $src3. The rmi form loads $src2
/// through `mem_frag`. `IsCommutable` (default 0) is applied to the rri
/// record only.
2047multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
2048                         ValueType vt, string asm, PatFrag mem_frag,
2049                         X86FoldableSchedWrite sched, Domain d,
2050                         bit IsCommutable = 0> {
2051  def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
2052                   (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
2053                   [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
2054                                       (i8 timm:$src3))))], d>,
2055            Sched<[sched.Folded, sched.ReadAfterFold]>;
2056  let isCommutable = IsCommutable in
2057  def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
2058                 (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
2059                 [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
2060                                     (i8 timm:$src3))))], d>,
2061            Sched<[sched]>;
2062}
2063
// VEX-tagged shuffles (VEX_4V) for 128-bit and 256-bit (VEX_L) vectors. All
// of them leave IsCommutable at its default of 0.
2064let Predicates = [HasAVX, NoVLX] in {
2065  defm VSHUFPS  : sse12_shuffle<VR128, f128mem, v4f32,
2066           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2067           loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>,
2068           PS, VEX_4V, VEX_WIG;
2069  defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
2070           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2071           loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>,
2072           PS, VEX_4V, VEX_L, VEX_WIG;
2073  defm VSHUFPD  : sse12_shuffle<VR128, f128mem, v2f64,
2074           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2075           loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>,
2076           PD, VEX_4V, VEX_WIG;
2077  defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
2078           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2079           loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>,
2080           PD, VEX_4V, VEX_L, VEX_WIG;
2081}
// Shuffles without VEX modifiers: $src1 is tied to $dst. SHUFPD is the only
// instantiation that passes IsCommutable = 1.
2082let Constraints = "$src1 = $dst" in {
2083  defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
2084                    "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
2085                    memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2086  defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
2087                    "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
2088                    memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
2089}
2090
2091//===----------------------------------------------------------------------===//
2092// SSE 1 & 2 - Unpack FP Instructions
2093//===----------------------------------------------------------------------===//
2094
2095/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
///
/// Emits rr and rm records for opcode `opc` that match `OpNode` with result
/// type `vt` on register class RC. The rm form loads $src2 through
/// `mem_frag` from an `x86memop` operand. `IsCommutable` (default 0) is
/// applied to the rr record only.
2096multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
2097                                   PatFrag mem_frag, RegisterClass RC,
2098                                   X86MemOperand x86memop, string asm,
2099                                   X86FoldableSchedWrite sched, Domain d,
2100                                   bit IsCommutable = 0> {
2101    let isCommutable = IsCommutable in
2102    def rr : PI<opc, MRMSrcReg,
2103                (outs RC:$dst), (ins RC:$src1, RC:$src2),
2104                asm, [(set RC:$dst,
2105                           (vt (OpNode RC:$src1, RC:$src2)))], d>,
2106                Sched<[sched]>;
2107    def rm : PI<opc, MRMSrcMem,
2108                (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
2109                asm, [(set RC:$dst,
2110                           (vt (OpNode RC:$src1,
2111                                       (mem_frag addr:$src2))))], d>,
2112             Sched<[sched.Folded, sched.ReadAfterFold]>;
2113}
2114
// VEX-tagged unpack/interleave forms: opcode 0x15 matches X86Unpckh and
// opcode 0x14 matches X86Unpckl. VUNPCKHPD is the only instantiation in this
// group that passes IsCommutable = 1.
2115let Predicates = [HasAVX, NoVLX] in {
2116defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load,
2117      VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2118                     SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
2119defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load,
2120      VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2121                     SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG;
2122defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load,
2123      VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2124                     SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
2125defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load,
2126      VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2127                     SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
2128
// 256-bit variants on VR256/f256mem, additionally tagged VEX_L.
2129defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load,
2130      VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2131                     SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
2132defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load,
2133      VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2134                     SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
2135defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load,
2136      VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2137                     SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
2138defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load,
2139      VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2140                     SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
2141}// Predicates = [HasAVX, NoVLX]
2142
// Unpack/interleave forms without VEX modifiers: $src1 is tied to $dst and
// the memory operand is matched with `memop`. UNPCKHPD is the only
// instantiation in this group that passes IsCommutable = 1.
2143let Constraints = "$src1 = $dst" in {
2144  defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memop,
2145        VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
2146                       SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2147  defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memop,
2148        VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
2149                       SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
2150  defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop,
2151        VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
2152                       SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2153  defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop,
2154        VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
2155                       SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
2156} // Constraints = "$src1 = $dst"
2157
// With only AVX1, 256-bit integer unpacks are selected to the floating-point
// unpack instructions: v8i32 maps to the PS forms and v4i64 to the PD forms,
// each with a register/register and a register/load pattern.
2158let Predicates = [HasAVX1Only] in {
2159  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (loadv8i32 addr:$src2))),
2160            (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
2161  def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
2162            (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
2163  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (loadv8i32 addr:$src2))),
2164            (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
2165  def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
2166            (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
2167
2168  def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
2169            (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
2170  def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
2171            (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
2172  def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
2173            (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
2174  def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
2175            (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
2176}
2177
2178let Predicates = [UseSSE2] in {
2179  // Use MOVHPD if the load isn't aligned enough for UNPCKLPD.
  // The pattern matches X86Unpckl of a register with a v2f64 simple_load and
  // emits MOVHPDrm with the same register and address operands.
2180  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
2181                              (v2f64 (simple_load addr:$src2)))),
2182            (MOVHPDrm VR128:$src1, addr:$src2)>;
2183}
2184
2185//===----------------------------------------------------------------------===//
2186// SSE 1 & 2 - Extract Floating-Point Sign mask
2187//===----------------------------------------------------------------------===//
2188
2189/// sse12_extr_sign_mask - sse 1 & 2 extract floating-point sign mask
///
/// Emits a single rr record with opcode 0x50 that matches X86movmsk on a
/// `vt` value in register class RC and writes the result to GR32orGR64.
/// `asm` is the bare mnemonic; the operand string is appended here.
2190multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
2191                                string asm, Domain d> {
2192  def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
2193              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
2194              [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], d>,
2195              Sched<[WriteFMOVMSK]>;
2196}
2197
// VEX-tagged movmskps/movmskpd for 128-bit and 256-bit (VEX_L) sources, plus
// patterns that map X86movmsk on integer vectors to the same instructions:
// 32-bit elements use the PS form and 64-bit elements the PD form.
2198let Predicates = [HasAVX] in {
2199  defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
2200                                        SSEPackedSingle>, PS, VEX, VEX_WIG;
2201  defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
2202                                        SSEPackedDouble>, PD, VEX, VEX_WIG;
2203  defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
2204                                         SSEPackedSingle>, PS, VEX, VEX_L, VEX_WIG;
2205  defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
2206                                         SSEPackedDouble>, PD, VEX, VEX_L, VEX_WIG;
2207
2208  // Also support integer VTs to avoid a int->fp bitcast in the DAG.
2209  def : Pat<(X86movmsk (v4i32 VR128:$src)),
2210            (VMOVMSKPSrr VR128:$src)>;
2211  def : Pat<(X86movmsk (v2i64 VR128:$src)),
2212            (VMOVMSKPDrr VR128:$src)>;
2213  def : Pat<(X86movmsk (v8i32 VR256:$src)),
2214            (VMOVMSKPSYrr VR256:$src)>;
2215  def : Pat<(X86movmsk (v4i64 VR256:$src)),
2216            (VMOVMSKPDYrr VR256:$src)>;
2217}
2218
// movmskps/movmskpd without VEX modifiers, on VR128 sources only.
2219defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
2220                                     SSEPackedSingle>, PS;
2221defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
2222                                     SSEPackedDouble>, PD;
2223
2224let Predicates = [UseSSE2] in {
2225  // Also support integer VTs to avoid a int->fp bitcast in the DAG.
  // v4i32 is selected to MOVMSKPSrr and v2i64 to MOVMSKPDrr.
2226  def : Pat<(X86movmsk (v4i32 VR128:$src)),
2227            (MOVMSKPSrr VR128:$src)>;
2228  def : Pat<(X86movmsk (v2i64 VR128:$src)),
2229            (MOVMSKPDrr VR128:$src)>;
2230}
2231
2232//===---------------------------------------------------------------------===//
2233// SSE2 - Packed Integer Logical Instructions
2234//===---------------------------------------------------------------------===//
2235
2236let ExeDomain = SSEPackedInt in { // SSE integer instructions
2237
2238/// PDI_binop_rm - Simple SSE2 binary operator.
///
/// Emits rr and rm records for opcode `opc` that match `OpNode` with result
/// type `OpVT` on register class RC. The rm form loads $src2 through
/// `memop_frag` from an `x86memop` operand. `IsCommutable` is applied to the
/// rr record only. `Is2Addr` selects the assembly string: when set, only
/// $src2 and $dst are printed; otherwise $src1 is printed as well.
2239multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
2240                        ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
2241                        X86MemOperand x86memop, X86FoldableSchedWrite sched,
2242                        bit IsCommutable, bit Is2Addr> {
2243  let isCommutable = IsCommutable in
2244  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
2245       (ins RC:$src1, RC:$src2),
2246       !if(Is2Addr,
2247           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2248           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
2249       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
2250       Sched<[sched]>;
2251  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
2252       (ins RC:$src1, x86memop:$src2),
2253       !if(Is2Addr,
2254           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2255           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
2256       [(set RC:$dst, (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
2257       Sched<[sched.Folded, sched.ReadAfterFold]>;
2258}
2259} // ExeDomain = SSEPackedInt
2260
// PDI_binop_all - instantiates PDI_binop_rm three times for one operation:
//   V#NAME   - under [HasAVX, prd]: VR128/i128mem with `load`, "v"-prefixed
//              mnemonic, Is2Addr = 0, tagged VEX_4V.
//   NAME     - $src1 tied to $dst: VR128/i128mem with `memop`, Is2Addr = 1.
//   V#NAME#Y - under [HasAVX2, prd]: VR256/i256mem with `load` and type
//              OpVT256, Is2Addr = 0, tagged VEX_4V and VEX_L.
// `sched.XMM` is used for the 128-bit forms and `sched.YMM` for the 256-bit
// form; `IsCommutable` is forwarded unchanged to all three.
2261multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
2262                         ValueType OpVT128, ValueType OpVT256,
2263                         X86SchedWriteWidths sched, bit IsCommutable,
2264                         Predicate prd> {
2265let Predicates = [HasAVX, prd] in
2266  defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
2267                             VR128, load, i128mem, sched.XMM,
2268                             IsCommutable, 0>, VEX_4V, VEX_WIG;
2269
2270let Constraints = "$src1 = $dst" in
2271  defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
2272                           memop, i128mem, sched.XMM, IsCommutable, 1>;
2273
2274let Predicates = [HasAVX2, prd] in
2275  defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
2276                               OpVT256, VR256, load, i256mem, sched.YMM,
2277                               IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG;
2278}
2279
2280// These are ordered here for pattern ordering requirements with the fp versions
// Each operation is instantiated for v2i64 (128-bit) and v4i64 (256-bit) with
// NoVLX as the extra predicate. PAND, POR and PXOR pass IsCommutable = 1;
// PANDN (X86andnp) passes 0.
2281
2282defm PAND  : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
2283                           SchedWriteVecLogic, 1, NoVLX>;
2284defm POR   : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
2285                           SchedWriteVecLogic, 1, NoVLX>;
2286defm PXOR  : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
2287                           SchedWriteVecLogic, 1, NoVLX>;
2288defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
2289                           SchedWriteVecLogic, 0, NoVLX>;
2290
2291//===----------------------------------------------------------------------===//
2292// SSE 1 & 2 - Logical Instructions
2293//===----------------------------------------------------------------------===//
2294
2295/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
2296///
2297/// There are no patterns here because isel prefers integer versions for SSE2
2298/// and later. There are SSE1 v4f32 patterns later.
///
/// For one opcode this instantiates sse12_fp_packed_logical_rm six times:
/// V#NAME#PSY / V#NAME#PDY (VR256, sched.YMM, VEX_L) and V#NAME#PS /
/// V#NAME#PD (VR128, sched.XMM) under [HasAVX, NoVLX], plus PS / PD with
/// $src1 tied to $dst. The "ps"/"pd" suffix is appended to `OpcodeStr`, and
/// both pattern-list arguments are empty.
/// NOTE(review): the trailing 0 on the VEX forms is presumably the Is2Addr
/// flag of sse12_fp_packed_logical_rm — confirm against its definition.
2299multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
2300                                   X86SchedWriteWidths sched> {
2301  let Predicates = [HasAVX, NoVLX] in {
2302  defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
2303        !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM,
2304        [], [], 0>, PS, VEX_4V, VEX_L, VEX_WIG;
2305
2306  defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
2307        !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM,
2308        [], [], 0>, PD, VEX_4V, VEX_L, VEX_WIG;
2309
2310  defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
2311       !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
2312       [], [], 0>, PS, VEX_4V, VEX_WIG;
2313
2314  defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
2315       !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
2316       [], [], 0>, PD, VEX_4V, VEX_WIG;
2317  }
2318
2319  let Constraints = "$src1 = $dst" in {
2320    defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
2321         !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
2322         [], []>, PS;
2323
2324    defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
2325         !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
2326         [], []>, PD;
2327  }
2328}
2329
// Packed FP logical instantiations (opcodes 0x54-0x57). ANDN is wrapped in
// `let isCommutable = 0`, which overrides that field for every record it
// expands to.
2330defm AND  : sse12_fp_packed_logical<0x54, "and", SchedWriteFLogic>;
2331defm OR   : sse12_fp_packed_logical<0x56, "or", SchedWriteFLogic>;
2332defm XOR  : sse12_fp_packed_logical<0x57, "xor", SchedWriteFLogic>;
2333let isCommutable = 0 in
2334  defm ANDN : sse12_fp_packed_logical<0x55, "andn", SchedWriteFLogic>;
2335
// AVX2 patterns that select 256-bit and/or/xor/andnp on v32i8, v16i16 and
// v8i32 to the VPAND/VPOR/VPXOR/VPANDN Y instructions. v4i64 is not listed
// here; the Y instructions are themselves instantiated with v4i64 by
// PDI_binop_all.
2336let Predicates = [HasAVX2, NoVLX] in {
  // Register/register forms.
2337  def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
2338            (VPANDYrr VR256:$src1, VR256:$src2)>;
2339  def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
2340            (VPANDYrr VR256:$src1, VR256:$src2)>;
2341  def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
2342            (VPANDYrr VR256:$src1, VR256:$src2)>;
2343
2344  def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
2345            (VPORYrr VR256:$src1, VR256:$src2)>;
2346  def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
2347            (VPORYrr VR256:$src1, VR256:$src2)>;
2348  def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
2349            (VPORYrr VR256:$src1, VR256:$src2)>;
2350
2351  def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
2352            (VPXORYrr VR256:$src1, VR256:$src2)>;
2353  def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
2354            (VPXORYrr VR256:$src1, VR256:$src2)>;
2355  def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
2356            (VPXORYrr VR256:$src1, VR256:$src2)>;
2357
2358  def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
2359            (VPANDNYrr VR256:$src1, VR256:$src2)>;
2360  def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
2361            (VPANDNYrr VR256:$src1, VR256:$src2)>;
2362  def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
2363            (VPANDNYrr VR256:$src1, VR256:$src2)>;
2364
  // Register/load forms; the element type comes from the load fragment.
2365  def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
2366            (VPANDYrm VR256:$src1, addr:$src2)>;
2367  def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
2368            (VPANDYrm VR256:$src1, addr:$src2)>;
2369  def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
2370            (VPANDYrm VR256:$src1, addr:$src2)>;
2371
2372  def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
2373            (VPORYrm VR256:$src1, addr:$src2)>;
2374  def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
2375            (VPORYrm VR256:$src1, addr:$src2)>;
2376  def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
2377            (VPORYrm VR256:$src1, addr:$src2)>;
2378
2379  def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
2380            (VPXORYrm VR256:$src1, addr:$src2)>;
2381  def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
2382            (VPXORYrm VR256:$src1, addr:$src2)>;
2383  def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
2384            (VPXORYrm VR256:$src1, addr:$src2)>;
2385
2386  def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
2387            (VPANDNYrm VR256:$src1, addr:$src2)>;
2388  def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
2389            (VPANDNYrm VR256:$src1, addr:$src2)>;
2390  def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
2391            (VPANDNYrm VR256:$src1, addr:$src2)>;
2392}
2393
2394// If only AVX1 is supported, we need to handle integer operations with
2395// floating point instructions since the integer versions aren't available.
// Every 256-bit integer type (v32i8, v16i16, v8i32 and v4i64) is mapped to the
// single-precision VANDPSY/VORPSY/VXORPSY/VANDNPSY instructions: first the
// register-register (rr) forms, then the register-memory (rm) forms that fold
// a loadv* of the second operand.
2396let Predicates = [HasAVX1Only] in {
2397  def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
2398            (VANDPSYrr VR256:$src1, VR256:$src2)>;
2399  def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
2400            (VANDPSYrr VR256:$src1, VR256:$src2)>;
2401  def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
2402            (VANDPSYrr VR256:$src1, VR256:$src2)>;
2403  def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)),
2404            (VANDPSYrr VR256:$src1, VR256:$src2)>;
2405
2406  def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
2407            (VORPSYrr VR256:$src1, VR256:$src2)>;
2408  def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
2409            (VORPSYrr VR256:$src1, VR256:$src2)>;
2410  def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
2411            (VORPSYrr VR256:$src1, VR256:$src2)>;
2412  def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)),
2413            (VORPSYrr VR256:$src1, VR256:$src2)>;
2414
2415  def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
2416            (VXORPSYrr VR256:$src1, VR256:$src2)>;
2417  def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
2418            (VXORPSYrr VR256:$src1, VR256:$src2)>;
2419  def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
2420            (VXORPSYrr VR256:$src1, VR256:$src2)>;
2421  def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)),
2422            (VXORPSYrr VR256:$src1, VR256:$src2)>;
2423
2424  def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
2425            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2426  def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
2427            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2428  def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
2429            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2430  def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)),
2431            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2432
2433  def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
2434            (VANDPSYrm VR256:$src1, addr:$src2)>;
2435  def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
2436            (VANDPSYrm VR256:$src1, addr:$src2)>;
2437  def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
2438            (VANDPSYrm VR256:$src1, addr:$src2)>;
2439  def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)),
2440            (VANDPSYrm VR256:$src1, addr:$src2)>;
2441
2442  def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
2443            (VORPSYrm VR256:$src1, addr:$src2)>;
2444  def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
2445            (VORPSYrm VR256:$src1, addr:$src2)>;
2446  def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
2447            (VORPSYrm VR256:$src1, addr:$src2)>;
2448  def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)),
2449            (VORPSYrm VR256:$src1, addr:$src2)>;
2450
2451  def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
2452            (VXORPSYrm VR256:$src1, addr:$src2)>;
2453  def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
2454            (VXORPSYrm VR256:$src1, addr:$src2)>;
2455  def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
2456            (VXORPSYrm VR256:$src1, addr:$src2)>;
2457  def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)),
2458            (VXORPSYrm VR256:$src1, addr:$src2)>;
2459
2460  def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
2461            (VANDNPSYrm VR256:$src1, addr:$src2)>;
2462  def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
2463            (VANDNPSYrm VR256:$src1, addr:$src2)>;
2464  def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
2465            (VANDNPSYrm VR256:$src1, addr:$src2)>;
2466  def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)),
2467            (VANDNPSYrm VR256:$src1, addr:$src2)>;
2468}
2469
// 128-bit integer logic selection for [HasAVX, NoVLX].
// Maps and/or/xor/X86andnp on v16i8, v8i16 and v4i32 to the VEX-named
// VPAND/VPOR/VPXOR/VPANDN instructions: register-register (rr) forms first,
// then register-memory (rm) forms that fold a loadv* of the second operand.
// NOTE(review): v2i64 is not listed here; presumably it is matched by the
// instruction definitions themselves -- confirm.
2470let Predicates = [HasAVX, NoVLX] in {
2471  def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
2472            (VPANDrr VR128:$src1, VR128:$src2)>;
2473  def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
2474            (VPANDrr VR128:$src1, VR128:$src2)>;
2475  def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
2476            (VPANDrr VR128:$src1, VR128:$src2)>;
2477
2478  def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
2479            (VPORrr VR128:$src1, VR128:$src2)>;
2480  def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
2481            (VPORrr VR128:$src1, VR128:$src2)>;
2482  def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
2483            (VPORrr VR128:$src1, VR128:$src2)>;
2484
2485  def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
2486            (VPXORrr VR128:$src1, VR128:$src2)>;
2487  def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
2488            (VPXORrr VR128:$src1, VR128:$src2)>;
2489  def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
2490            (VPXORrr VR128:$src1, VR128:$src2)>;
2491
2492  def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
2493            (VPANDNrr VR128:$src1, VR128:$src2)>;
2494  def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
2495            (VPANDNrr VR128:$src1, VR128:$src2)>;
2496  def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
2497            (VPANDNrr VR128:$src1, VR128:$src2)>;
2498
2499  def : Pat<(and VR128:$src1, (loadv16i8 addr:$src2)),
2500            (VPANDrm VR128:$src1, addr:$src2)>;
2501  def : Pat<(and VR128:$src1, (loadv8i16 addr:$src2)),
2502            (VPANDrm VR128:$src1, addr:$src2)>;
2503  def : Pat<(and VR128:$src1, (loadv4i32 addr:$src2)),
2504            (VPANDrm VR128:$src1, addr:$src2)>;
2505
2506  def : Pat<(or VR128:$src1, (loadv16i8 addr:$src2)),
2507            (VPORrm VR128:$src1, addr:$src2)>;
2508  def : Pat<(or VR128:$src1, (loadv8i16 addr:$src2)),
2509            (VPORrm VR128:$src1, addr:$src2)>;
2510  def : Pat<(or VR128:$src1, (loadv4i32 addr:$src2)),
2511            (VPORrm VR128:$src1, addr:$src2)>;
2512
2513  def : Pat<(xor VR128:$src1, (loadv16i8 addr:$src2)),
2514            (VPXORrm VR128:$src1, addr:$src2)>;
2515  def : Pat<(xor VR128:$src1, (loadv8i16 addr:$src2)),
2516            (VPXORrm VR128:$src1, addr:$src2)>;
2517  def : Pat<(xor VR128:$src1, (loadv4i32 addr:$src2)),
2518            (VPXORrm VR128:$src1, addr:$src2)>;
2519
2520  def : Pat<(X86andnp VR128:$src1, (loadv16i8 addr:$src2)),
2521            (VPANDNrm VR128:$src1, addr:$src2)>;
2522  def : Pat<(X86andnp VR128:$src1, (loadv8i16 addr:$src2)),
2523            (VPANDNrm VR128:$src1, addr:$src2)>;
2524  def : Pat<(X86andnp VR128:$src1, (loadv4i32 addr:$src2)),
2525            (VPANDNrm VR128:$src1, addr:$src2)>;
2526}
2527
// 128-bit integer logic selection for [UseSSE2] (non-VEX instructions).
// Maps and/or/xor/X86andnp on v16i8, v8i16 and v4i32 to PAND/POR/PXOR/PANDN:
// register-register (rr) forms first, then register-memory (rm) forms.
// Unlike the AVX pattern groups, the folded operand uses the memopv*
// fragments rather than loadv*.
// NOTE(review): memop presumably adds an alignment requirement for folding --
// confirm against the fragment definitions.
2528let Predicates = [UseSSE2] in {
2529  def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
2530            (PANDrr VR128:$src1, VR128:$src2)>;
2531  def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
2532            (PANDrr VR128:$src1, VR128:$src2)>;
2533  def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
2534            (PANDrr VR128:$src1, VR128:$src2)>;
2535
2536  def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
2537            (PORrr VR128:$src1, VR128:$src2)>;
2538  def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
2539            (PORrr VR128:$src1, VR128:$src2)>;
2540  def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
2541            (PORrr VR128:$src1, VR128:$src2)>;
2542
2543  def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
2544            (PXORrr VR128:$src1, VR128:$src2)>;
2545  def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
2546            (PXORrr VR128:$src1, VR128:$src2)>;
2547  def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
2548            (PXORrr VR128:$src1, VR128:$src2)>;
2549
2550  def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
2551            (PANDNrr VR128:$src1, VR128:$src2)>;
2552  def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
2553            (PANDNrr VR128:$src1, VR128:$src2)>;
2554  def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
2555            (PANDNrr VR128:$src1, VR128:$src2)>;
2556
2557  def : Pat<(and VR128:$src1, (memopv16i8 addr:$src2)),
2558            (PANDrm VR128:$src1, addr:$src2)>;
2559  def : Pat<(and VR128:$src1, (memopv8i16 addr:$src2)),
2560            (PANDrm VR128:$src1, addr:$src2)>;
2561  def : Pat<(and VR128:$src1, (memopv4i32 addr:$src2)),
2562            (PANDrm VR128:$src1, addr:$src2)>;
2563
2564  def : Pat<(or VR128:$src1, (memopv16i8 addr:$src2)),
2565            (PORrm VR128:$src1, addr:$src2)>;
2566  def : Pat<(or VR128:$src1, (memopv8i16 addr:$src2)),
2567            (PORrm VR128:$src1, addr:$src2)>;
2568  def : Pat<(or VR128:$src1, (memopv4i32 addr:$src2)),
2569            (PORrm VR128:$src1, addr:$src2)>;
2570
2571  def : Pat<(xor VR128:$src1, (memopv16i8 addr:$src2)),
2572            (PXORrm VR128:$src1, addr:$src2)>;
2573  def : Pat<(xor VR128:$src1, (memopv8i16 addr:$src2)),
2574            (PXORrm VR128:$src1, addr:$src2)>;
2575  def : Pat<(xor VR128:$src1, (memopv4i32 addr:$src2)),
2576            (PXORrm VR128:$src1, addr:$src2)>;
2577
2578  def : Pat<(X86andnp VR128:$src1, (memopv16i8 addr:$src2)),
2579            (PANDNrm VR128:$src1, addr:$src2)>;
2580  def : Pat<(X86andnp VR128:$src1, (memopv8i16 addr:$src2)),
2581            (PANDNrm VR128:$src1, addr:$src2)>;
2582  def : Pat<(X86andnp VR128:$src1, (memopv4i32 addr:$src2)),
2583            (PANDNrm VR128:$src1, addr:$src2)>;
2584}
2585
2586// Patterns for packed operations when we don't have integer type available.
// These are not wrapped in any Predicates block. The X86fand/X86for/X86fxor/
// X86fandn nodes on v4f32 are selected to the non-VEX ANDPS/ORPS/XORPS/ANDNPS
// instructions: register-register (rr) forms first, then register-memory (rm)
// forms that fold a memopv4f32 of the second operand.
2587def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
2588          (ANDPSrr VR128:$src1, VR128:$src2)>;
2589def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)),
2590          (ORPSrr VR128:$src1, VR128:$src2)>;
2591def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)),
2592          (XORPSrr VR128:$src1, VR128:$src2)>;
2593def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)),
2594          (ANDNPSrr VR128:$src1, VR128:$src2)>;
2595
2596def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)),
2597          (ANDPSrm VR128:$src1, addr:$src2)>;
2598def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)),
2599          (ORPSrm VR128:$src1, addr:$src2)>;
2600def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)),
2601          (XORPSrm VR128:$src1, addr:$src2)>;
2602def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
2603          (ANDNPSrm VR128:$src1, addr:$src2)>;
2604
2605//===----------------------------------------------------------------------===//
2606// SSE 1 & 2 - Arithmetic Instructions
2607//===----------------------------------------------------------------------===//
2608
2609/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
2610/// vector forms.
2611///
2612/// In addition, we also have a special variant of the scalar form here to
2613/// represent the associated intrinsic operation.  This form is unlike the
2614/// plain scalar form, in that it takes an entire vector (instead of a scalar)
2615/// and leaves the top elements unmodified (therefore these cannot be commuted).
2616///
2617/// These three forms can each be reg+reg or reg+mem.
2618///
2619
2620/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
2621/// classes below
/// basic_sse12_fp_binop_p - Packed (vector) forms of an SSE 1 & 2 FP binop.
///
///   opc       - opcode byte shared by every form.
///   OpcodeStr - mnemonic stem; "ps" / "pd" is appended per form.
///   OpNode    - DAG operator passed through to sse12_fp_packed.
///   sched     - scheduling sizes; the PS/PD and XMM/YMM members are used.
///
/// Under [HasAVX, NoVLX] it emits the VEX forms V<NAME>PS / V<NAME>PD (VR128)
/// and V<NAME>PSY / V<NAME>PDY (VR256, VEX_L), which use the loadv* fragments
/// and pass 0 as the final sse12_fp_packed argument.
/// It also emits the PS / PD forms with $src1 tied to $dst, which use the
/// memopv* fragments and leave that final argument at its default.
/// Every definition is marked Uses = [MXCSR] and mayRaiseFPException = 1.
/// NOTE(review): the trailing 0 is presumably Is2Addr, as in sse12_fp_scalar
/// -- confirm against sse12_fp_packed.
2622multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
2623                                  SDPatternOperator OpNode, X86SchedWriteSizes sched> {
2624let Uses = [MXCSR], mayRaiseFPException = 1 in {
2625  let Predicates = [HasAVX, NoVLX] in {
2626  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
2627                               VR128, v4f32, f128mem, loadv4f32,
2628                               SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX_4V, VEX_WIG;
2629  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
2630                               VR128, v2f64, f128mem, loadv2f64,
2631                               SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX_4V, VEX_WIG;
2632
2633  defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
2634                        OpNode, VR256, v8f32, f256mem, loadv8f32,
2635                        SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX_4V, VEX_L, VEX_WIG;
2636  defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
2637                        OpNode, VR256, v4f64, f256mem, loadv4f64,
2638                        SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX_4V, VEX_L, VEX_WIG;
2639  }
2640
2641  let Constraints = "$src1 = $dst" in {
2642    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
2643                              v4f32, f128mem, memopv4f32, SSEPackedSingle,
2644                              sched.PS.XMM>, PS;
2645    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
2646                              v2f64, f128mem, memopv2f64, SSEPackedDouble,
2647                              sched.PD.XMM>, PD;
2648  }
2649}
2650}
2651
/// basic_sse12_fp_binop_s - Plain scalar forms of an SSE 1 & 2 FP binop.
///
///   opc       - opcode byte shared by every form.
///   OpcodeStr - mnemonic stem; "ss" / "sd" is appended per form.
///   OpNode    - DAG operator passed through to sse12_fp_scalar.
///   sched     - scheduling sizes; the PS.Scl and PD.Scl members are used.
///
/// Emits the VEX forms V<NAME>SS (FR32) and V<NAME>SD (FR64), which pass
/// Is2Addr = 0 and so use the three-operand assembly string.
/// It also emits the SS / SD forms with $src1 tied to $dst and the default
/// Is2Addr = 1.
/// Every definition is marked Uses = [MXCSR] and mayRaiseFPException = 1.
/// Unlike the packed multiclass, no Predicates are set here.
2652multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
2653                                  X86SchedWriteSizes sched> {
2654let Uses = [MXCSR], mayRaiseFPException = 1 in {
2655  defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
2656                         OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>,
2657                         XS, VEX_4V, VEX_LIG, VEX_WIG;
2658  defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
2659                         OpNode, FR64, f64mem, SSEPackedDouble, sched.PD.Scl, 0>,
2660                         XD, VEX_4V, VEX_LIG, VEX_WIG;
2661
2662  let Constraints = "$src1 = $dst" in {
2663    defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
2664                              OpNode, FR32, f32mem, SSEPackedSingle,
2665                              sched.PS.Scl>, XS;
2666    defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
2667                              OpNode, FR64, f64mem, SSEPackedDouble,
2668                              sched.PD.Scl>, XD;
2669  }
2670}
2671}
2672
/// basic_sse12_fp_binop_s_int - Intrinsic-style scalar forms of an SSE 1 & 2
/// FP binop. These operate on whole VR128 vectors (v4f32 / v2f64) instead of
/// FR32 / FR64 scalars.
///
///   opc       - opcode byte shared by every form.
///   OpcodeStr - mnemonic stem; "ss" / "sd" is appended per form.
///   OpNode    - DAG operator passed through to sse12_fp_scalar_int.
///   sched     - scheduling sizes; the PS.Scl and PD.Scl members are used.
///
/// Emits the VEX forms V<NAME>SS / V<NAME>SD, which pass 0 as the final
/// sse12_fp_scalar_int argument. It also emits the SS / SD forms with $src1
/// tied to $dst, which leave that argument at its default.
/// Memory operands are ssmem / sdmem with the sse_load_f32 / sse_load_f64
/// fragments.
/// Every definition is marked Uses = [MXCSR] and mayRaiseFPException = 1.
2673multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
2674                                      SDPatternOperator OpNode,
2675                                      X86SchedWriteSizes sched> {
2676let Uses = [MXCSR], mayRaiseFPException = 1 in {
2677  defm V#NAME#SS : sse12_fp_scalar_int<opc, OpNode, VR128, v4f32,
2678                   !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
2679                   SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG;
2680  defm V#NAME#SD : sse12_fp_scalar_int<opc, OpNode, VR128, v2f64,
2681                   !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
2682                   SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG;
2683
2684  let Constraints = "$src1 = $dst" in {
2685    defm SS : sse12_fp_scalar_int<opc, OpNode, VR128, v4f32,
2686                   !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
2687                   SSEPackedSingle, sched.PS.Scl>, XS;
2688    defm SD : sse12_fp_scalar_int<opc, OpNode, VR128, v2f64,
2689                   !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
2690                   SSEPackedDouble, sched.PD.Scl>, XD;
2691  }
2692}
2693}
2694
2695// Binary Arithmetic instructions
// Each operation instantiates three multiclasses: packed (_p), plain scalar
// (_s) and intrinsic-style scalar (_s_int).
// For add/mul/sub/div the _s_int forms pass null_frag as OpNode, so they get
// no operator from this instantiation; the scalar_math_patterns instantiations
// in this file select the ADD/SUB/MUL/DIV rr_Int / rm_Int forms instead.
// For max/min the _s_int forms use X86fmaxs / X86fmins.
2696defm ADD : basic_sse12_fp_binop_p<0x58, "add", any_fadd, SchedWriteFAddSizes>,
2697           basic_sse12_fp_binop_s<0x58, "add", any_fadd, SchedWriteFAddSizes>,
2698           basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SchedWriteFAddSizes>;
2699defm MUL : basic_sse12_fp_binop_p<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
2700           basic_sse12_fp_binop_s<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
2701           basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SchedWriteFMulSizes>;
// SUB, DIV, MAX and MIN are all defined with isCommutable forced to 0.
2702let isCommutable = 0 in {
2703  defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
2704             basic_sse12_fp_binop_s<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
2705             basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SchedWriteFAddSizes>;
2706  defm DIV : basic_sse12_fp_binop_p<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
2707             basic_sse12_fp_binop_s<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
2708             basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>;
2709  defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
2710             basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
2711             basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>;
2712  defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
2713             basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
2714             basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>;
2715}
2716
// Codegen-only duplicates of MAX / MIN: same opcodes (0x5F / 0x5D) and
// mnemonics ("max" / "min"), but selected from X86fmaxc / X86fminc and not
// wrapped in isCommutable = 0. Only the packed and plain scalar forms are
// instantiated; there is no _s_int variant.
// NOTE(review): presumably these are the commutable variants of max/min --
// confirm against the definitions of X86fmaxc / X86fminc.
2717let isCodeGenOnly = 1 in {
2718  defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>,
2719             basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>;
2720  defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SchedWriteFCmpSizes>,
2721             basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SchedWriteFCmpSizes>;
2722}
2723
2724// Patterns used to select SSE scalar fp arithmetic instructions from
2725// either:
2726//
2727// (1) a scalar fp operation followed by a blend
2728//
2729// The effect is that the backend no longer emits unnecessary vector
2730// insert instructions immediately after SSE scalar fp instructions
2731// like addss or mulss.
2732//
2733// For example, given the following code:
2734//   __m128 foo(__m128 A, __m128 B) {
2735//     A[0] += B[0];
2736//     return A;
2737//   }
2738//
2739// Previously we generated:
2740//   addss %xmm0, %xmm1
2741//   movss %xmm1, %xmm0
2742//
2743// We now generate:
2744//   addss %xmm1, %xmm0
2745//
2746// (2) a vector packed single/double fp operation followed by a vector insert
2747//
2748// The effect is that the backend converts the packed fp instruction
2749// followed by a vector insert into a single SSE scalar fp instruction.
2750//
2751// For example, given the following code:
2752//   __m128 foo(__m128 A, __m128 B) {
2753//     __m128 C = A + B;
2754//     return (__m128) {C[0], A[1], A[2], A[3]};
2755//   }
2756//
2757// Previously we generated:
2758//   addps %xmm0, %xmm1
2759//   movss %xmm1, %xmm0
2760//
2761// We now generate:
2762//   addss %xmm1, %xmm0
2763
2764// TODO: Some canonicalization in lowering would simplify the number of
2765// patterns we have to try to match.
/// scalar_math_patterns - Select the intrinsic-style scalar instructions
/// (<OpcPrefix>rr_Int / <OpcPrefix>rm_Int) for the DAG shape
///   Move(dst, scalar_to_vector(Op(extractelt(dst, 0), src)))
/// i.e. a scalar op on element 0 of $dst whose result is inserted back into
/// $dst through Move.
///
///   Op            - scalar operator being matched.
///   OpcPrefix     - instruction name stem, e.g. "ADDSS"; "V" is prepended
///                   for the AVX patterns.
///   Move          - node that performs the insert (X86Movss / X86Movsd in
///                   the instantiations in this file).
///   VT            - 128-bit vector type of $dst.
///   EltTy         - scalar element type extracted from $dst.
///   RC            - scalar register class of the register operand.
///   ld_frag       - load fragment matched for the memory form.
///   BasePredicate - predicate guarding the non-AVX patterns.
///
/// The register form copies the scalar $src into VR128 via COPY_TO_REGCLASS,
/// because the *_Int instructions take a vector second operand.
2766multiclass scalar_math_patterns<SDPatternOperator Op, string OpcPrefix, SDNode Move,
2767                                ValueType VT, ValueType EltTy,
2768                                RegisterClass RC, PatFrag ld_frag,
2769                                Predicate BasePredicate> {
2770  let Predicates = [BasePredicate] in {
2771    // extracted scalar math op with insert via movss/movsd
2772    def : Pat<(VT (Move (VT VR128:$dst),
2773                        (VT (scalar_to_vector
2774                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2775                                 RC:$src))))),
2776              (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst,
2777               (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
2778    def : Pat<(VT (Move (VT VR128:$dst),
2779                        (VT (scalar_to_vector
2780                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2781                                 (ld_frag addr:$src)))))),
2782              (!cast<Instruction>(OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
2783  }
2784
2785  // Repeat for AVX versions of the instructions.
2786  let Predicates = [UseAVX] in {
2787    // extracted scalar math op with insert via movss/movsd
2788    def : Pat<(VT (Move (VT VR128:$dst),
2789                        (VT (scalar_to_vector
2790                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2791                                 RC:$src))))),
2792              (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst,
2793               (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
2794    def : Pat<(VT (Move (VT VR128:$dst),
2795                        (VT (scalar_to_vector
2796                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2797                                 (ld_frag addr:$src)))))),
2798              (!cast<Instruction>("V"#OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
2799  }
2800}
2801
// Single-precision add/sub/mul/div: v4f32 vectors with f32 elements in FR32,
// inserted via X86Movss; the non-AVX patterns require UseSSE1.
2802defm : scalar_math_patterns<any_fadd, "ADDSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2803defm : scalar_math_patterns<any_fsub, "SUBSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2804defm : scalar_math_patterns<any_fmul, "MULSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2805defm : scalar_math_patterns<any_fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2806
// Double-precision add/sub/mul/div: v2f64 vectors with f64 elements in FR64,
// inserted via X86Movsd; the non-AVX patterns require UseSSE2.
2807defm : scalar_math_patterns<any_fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2808defm : scalar_math_patterns<any_fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2809defm : scalar_math_patterns<any_fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2810defm : scalar_math_patterns<any_fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2811
2812/// Unop Arithmetic
2813/// In addition, we also have a special variant of the scalar form here to
2814/// represent the associated intrinsic operation.  This form is unlike the
2815/// plain scalar form, in that it takes an entire vector (instead of a
2816/// scalar) and leaves the top elements undefined.
2817///
2818/// And, we have a special variant form for a full-vector intrinsic form.
2819
2820/// sse_fp_unop_s - SSE1 unops in scalar form
2821/// For the non-AVX defs, we need $src1 to be tied to $dst because
2822/// the HW instructions are 2 operand / destructive.
///
///   opc       - opcode byte shared by all four definitions.
///   OpcodeStr - full mnemonic used in the assembly strings.
///   RC        - scalar register class for the r / m forms.
///   x86memop  - memory operand of the m form.
///   intmemop  - memory operand of the m_Int form.
///   OpNode    - operator matched by the r / m patterns.
///   d         - execution domain.
///   sched     - scheduling class (sched.Folded for the memory forms).
///   target    - predicate required by the r / m forms.
///
/// Definitions:
///   r     - codegen-only; matches (OpNode RC:$src1); requires target.
///   m     - codegen-only; matches OpNode of a load; requires target and
///           OptForSize.
///   r_Int - VR128 form, $src1 tied to $dst, no pattern.
///   m_Int - VR128 / intmemop form, $src1 tied to $dst, no pattern.
2823multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
2824                          X86MemOperand x86memop, Operand intmemop,
2825                          SDPatternOperator OpNode, Domain d,
2826                          X86FoldableSchedWrite sched, Predicate target> {
2827  let isCodeGenOnly = 1, hasSideEffects = 0 in {
2828  def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
2829              !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
2830            [(set RC:$dst, (OpNode RC:$src1))], d>, Sched<[sched]>,
2831            Requires<[target]>;
2832  let mayLoad = 1 in
2833  def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1),
2834            !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
2835            [(set RC:$dst, (OpNode (load addr:$src1)))], d>,
2836            Sched<[sched.Folded]>,
2837            Requires<[target, OptForSize]>;
2838  }
2839
2840  let hasSideEffects = 0, Constraints = "$src1 = $dst", ExeDomain = d in {
2841  def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
2842                !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
2843                Sched<[sched]>;
2844  let mayLoad = 1 in
2845  def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2),
2846                !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
2847                Sched<[sched.Folded, sched.ReadAfterFold]>;
2848  }
2849
2850}
2851
/// sse_fp_unop_s_intr - Selection patterns mapping a scalar unop intrinsic
/// onto the non-AVX <NAME>r_Int / <NAME>m_Int instructions.
///
///   vt        - vector type used for the IMPLICIT_DEF pass-through operand
///               of the memory form.
///   mem_frags - load fragments matched for the memory form.
///   Intr      - intrinsic being selected.
///   target    - predicate guarding both patterns.
///
/// The register pattern passes $src as both operands of r_Int. The memory
/// pattern is additionally gated on OptForSize.
2852multiclass sse_fp_unop_s_intr<ValueType vt, PatFrags mem_frags,
2853                              Intrinsic Intr, Predicate target> {
2854  let Predicates = [target] in {
2855  // These are unary operations, but they are modeled as having 2 source operands
2856  // because the high elements of the destination are unchanged in SSE.
2857  def : Pat<(Intr VR128:$src),
2858            (!cast<Instruction>(NAME#r_Int) VR128:$src, VR128:$src)>;
2859  }
2860  // We don't want to fold scalar loads into these instructions unless
2861  // optimizing for size. This is because the folded instruction will have a
2862  // partial register update, while the unfolded sequence will not, e.g.
2863  // movss mem, %xmm0
2864  // rcpss %xmm0, %xmm0
2865  // which has a clobber before the rcp, vs.
2866  // rcpss mem, %xmm0
2867  let Predicates = [target, OptForSize] in {
2868    def : Pat<(Intr (mem_frags addr:$src2)),
2869               (!cast<Instruction>(NAME#m_Int)
2870                      (vt (IMPLICIT_DEF)), addr:$src2)>;
2871  }
2872}
2873
/// avx_fp_unop_s_intr - AVX counterpart of sse_fp_unop_s_intr: selects a
/// scalar unop intrinsic onto <NAME>r_Int / <NAME>m_Int.
///
///   vt        - vector type used for the IMPLICIT_DEF first operand of the
///               memory form.
///   mem_frags - load fragments matched for the memory form.
///   Intr      - intrinsic being selected.
///   target    - predicate guarding both patterns.
///
/// The register pattern passes $src as both source operands. The memory
/// pattern is only enabled together with OptForSize.
2874multiclass avx_fp_unop_s_intr<ValueType vt, PatFrags mem_frags,
2875                              Intrinsic Intr, Predicate target> {
2876  let Predicates = [target] in {
2877   def : Pat<(Intr VR128:$src),
2878             (!cast<Instruction>(NAME#r_Int) VR128:$src,
2879                                 VR128:$src)>;
2880  }
2881  let Predicates = [target, OptForSize] in {
2882    def : Pat<(Intr (mem_frags addr:$src2)),
2883              (!cast<Instruction>(NAME#m_Int)
2884                    (vt (IMPLICIT_DEF)), addr:$src2)>;
2885  }
2886}
2887
/// avx_fp_unop_s - AVX scalar unops, modeled with two source operands.
///
///   opc       - opcode byte shared by all four definitions.
///   OpcodeStr - full mnemonic used in the three-operand assembly strings.
///   RC        - scalar register class for the r / m forms.
///   ScalarVT  - scalar value type used for IMPLICIT_DEF and the load pattern.
///   x86memop  - memory operand of the m form.
///   intmemop  - memory operand of the m_Int form.
///   OpNode    - operator matched by the standalone patterns below.
///   d         - execution domain.
///   sched     - scheduling class (Folded / ReadAfterFold for memory forms).
///   target    - predicate guarding the patterns.
///
/// None of the four instruction definitions (r, m, r_Int, m_Int) carries a
/// pattern. Selection is done by the two Pat records at the end, which pass
/// IMPLICIT_DEF as the first source operand.
2888multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
2889                          ValueType ScalarVT, X86MemOperand x86memop,
2890                          Operand intmemop, SDPatternOperator OpNode, Domain d,
2891                          X86FoldableSchedWrite sched, Predicate target> {
2892  let isCodeGenOnly = 1, hasSideEffects = 0 in {
2893  def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
2894            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2895            [], d>, Sched<[sched]>;
2896  let mayLoad = 1 in
2897  def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
2898             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2899            [], d>, Sched<[sched.Folded, sched.ReadAfterFold]>;
2900  }
2901  let hasSideEffects = 0, ExeDomain = d in {
2902  def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
2903                (ins VR128:$src1, VR128:$src2),
2904             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2905             []>, Sched<[sched]>;
2906  let mayLoad = 1 in
2907  def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
2908                (ins VR128:$src1, intmemop:$src2),
2909             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2910             []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
2911  }
2912
2913  // We don't want to fold scalar loads into these instructions unless
2914  // optimizing for size. This is because the folded instruction will have a
2915  // partial register update, while the unfolded sequence will not, e.g.
2916  // vmovss mem, %xmm0
2917  // vrcpss %xmm0, %xmm0, %xmm0
2918  // which has a clobber before the rcp, vs.
2919  // vrcpss mem, %xmm0, %xmm0
2920  // TODO: In theory, we could fold the load, and avoid the stall caused by
2921  // the partial register store, either in BreakFalseDeps or with smarter RA.
2922  let Predicates = [target] in {
2923   def : Pat<(OpNode RC:$src),  (!cast<Instruction>(NAME#r)
2924                                (ScalarVT (IMPLICIT_DEF)), RC:$src)>;
2925  }
2926  let Predicates = [target, OptForSize] in {
2927    def : Pat<(ScalarVT (OpNode (load addr:$src))),
2928              (!cast<Instruction>(NAME#m) (ScalarVT (IMPLICIT_DEF)),
2929            addr:$src)>;
2930  }
2931}
2932
2933/// sse1_fp_unop_p - SSE1 unops in packed form.
///
///   opc       - opcode byte shared by every form.
///   OpcodeStr - mnemonic stem; "ps" is appended, and "v" is prepended for
///               the VEX forms.
///   OpNode    - operator matched on v4f32 / v8f32.
///   sched     - scheduling widths; XMM / YMM and their Folded variants.
///   prds      - predicate list applied to the VEX forms only.
///
/// Emits V<NAME>PSr / PSm (VR128) and V<NAME>PSYr / PSYm (VR256, VEX_L) under
/// prds, with loadv* fragments for the memory forms. PSr / PSm are defined
/// outside that Predicates block and use memopv4f32.
2934multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
2935                          X86SchedWriteWidths sched, list<Predicate> prds> {
2936let Predicates = prds in {
2937  def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2938                       !strconcat("v", OpcodeStr,
2939                                  "ps\t{$src, $dst|$dst, $src}"),
2940                       [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
2941                       VEX, Sched<[sched.XMM]>, VEX_WIG;
2942  def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2943                       !strconcat("v", OpcodeStr,
2944                                  "ps\t{$src, $dst|$dst, $src}"),
2945                       [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>,
2946                       VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
2947  def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
2948                        !strconcat("v", OpcodeStr,
2949                                   "ps\t{$src, $dst|$dst, $src}"),
2950                        [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>,
2951                        VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
2952  def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
2953                        !strconcat("v", OpcodeStr,
2954                                   "ps\t{$src, $dst|$dst, $src}"),
2955                        [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))]>,
2956                        VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
2957}
2958
2959  def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2960                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
2961                [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
2962                Sched<[sched.XMM]>;
2963  def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2964                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
2965                [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>,
2966                Sched<[sched.XMM.Folded]>;
2967}
2968
2969/// sse2_fp_unop_p - SSE2 unops in vector forms.
///
///   opc       - opcode byte shared by every form.
///   OpcodeStr - mnemonic stem; "pd" is appended, and "v" is prepended for
///               the VEX forms.
///   OpNode    - operator matched on v2f64 / v4f64.
///   sched     - scheduling widths; XMM / YMM and their Folded variants.
///
/// Emits V<NAME>PDr / PDm (VR128) and V<NAME>PDYr / PDYm (VR256, VEX_L) under
/// the fixed predicates [HasAVX, NoVLX], with loadv* fragments for the memory
/// forms. PDr / PDm are defined outside that Predicates block and use
/// memopv2f64.
2970multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
2971                          SDPatternOperator OpNode, X86SchedWriteWidths sched> {
2972let Predicates = [HasAVX, NoVLX] in {
2973  def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2974                       !strconcat("v", OpcodeStr,
2975                                  "pd\t{$src, $dst|$dst, $src}"),
2976                       [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
2977                       VEX, Sched<[sched.XMM]>, VEX_WIG;
2978  def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2979                       !strconcat("v", OpcodeStr,
2980                                  "pd\t{$src, $dst|$dst, $src}"),
2981                       [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>,
2982                       VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
2983  def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
2984                        !strconcat("v", OpcodeStr,
2985                                   "pd\t{$src, $dst|$dst, $src}"),
2986                        [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>,
2987                        VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
2988  def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
2989                        !strconcat("v", OpcodeStr,
2990                                   "pd\t{$src, $dst|$dst, $src}"),
2991                        [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))]>,
2992                        VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
2993}
2994
2995  def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2996                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
2997                [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
2998                Sched<[sched.XMM]>;
2999  def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
3000                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
3001                [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>,
3002                Sched<[sched.XMM.Folded]>;
3003}
3004
3005multiclass sse1_fp_unop_s_intr<string OpcodeStr, Predicate AVXTarget> { // scalar "ss" forms keyed on the int_x86_sse_<OpcodeStr>_ss intrinsic
3006  defm SS        :  sse_fp_unop_s_intr<v4f32, sse_load_f32,
3007                      !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss), // intrinsic record looked up by pasted name
3008                      UseSSE1>, XS; // SSE variant: UseSSE1 is passed to the helper multiclass
3009  defm V#NAME#SS  : avx_fp_unop_s_intr<v4f32, sse_load_f32,
3010                      !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss),
3011                      AVXTarget>, // AVX variant: predicate is supplied by the instantiation
3012                      XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
3013}
3014
3015multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, // scalar single-precision ("ss") unary op, SSE + AVX
3016                          X86SchedWriteWidths sched, Predicate AVXTarget> {
3017  defm SS        :  sse_fp_unop_s<opc, OpcodeStr#ss, FR32, f32mem, // mnemonic is OpcodeStr with "ss" pasted on
3018                      ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS;
3019  defm V#NAME#SS  : avx_fp_unop_s<opc, "v"#OpcodeStr#ss, FR32, f32, // AVX twin: "v" prefix, caller-chosen predicate
3020                      f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>,
3021                       XS, VEX_4V, VEX_LIG, VEX_WIG;
3022}
3023
3024multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, // scalar double-precision ("sd") unary op, SSE + AVX
3025                          X86SchedWriteWidths sched, Predicate AVXTarget> {
3026  defm SD         : sse_fp_unop_s<opc, OpcodeStr#sd, FR64, f64mem, // mnemonic is OpcodeStr with "sd" pasted on
3027                         sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD;
3028  defm V#NAME#SD  : avx_fp_unop_s<opc, "v"#OpcodeStr#sd, FR64, f64, // AVX twin: "v" prefix, caller-chosen predicate
3029                         f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>,
3030                         XD, VEX_4V, VEX_LIG, VEX_WIG;
3031}
3032
3033// Square root: opcode 0x51, selected from any_fsqrt, in scalar/packed single and double forms.
3034defm SQRT  : sse1_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, UseAVX>, // scalar single
3035             sse1_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>, // packed single
3036             sse2_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64, UseAVX>, // scalar double
3037             sse2_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64>, SIMD_EXC; // packed double
3038
3039// Reciprocal approximations. Note that these typically require refinement
3040// in order to obtain suitable precision. Only single-precision forms are instantiated here.
3041defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>, // scalar, X86frsqrt node
3042             sse1_fp_unop_s_intr<"rsqrt", HasAVX>, // scalar, int_x86_sse_rsqrt_ss intrinsic
3043             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>; // packed
3044defm RCP   : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>, // scalar, X86frcp node
3045             sse1_fp_unop_s_intr<"rcp", HasAVX>, // scalar, int_x86_sse_rcp_ss intrinsic
3046             sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>; // packed
3047
3048// There is no f64 version of the reciprocal approximation instructions.
3049
3050multiclass scalar_unary_math_patterns<SDPatternOperator OpNode, string OpcPrefix, SDNode Move, // folds "op on element 0, merged into $dst via Move" into the <OpcPrefix>r_Int instruction
3051                                      ValueType VT, Predicate BasePredicate> {
3052  let Predicates = [BasePredicate] in { // SSE instruction, looked up by name OpcPrefix + "r_Int"
3053    def : Pat<(VT (Move VT:$dst, (scalar_to_vector
3054                                  (OpNode (extractelt VT:$src, 0))))),
3055              (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3056  }
3057
3058  // Repeat for AVX versions of the instructions ("V"-prefixed record name, UseAVX predicate).
3059  let Predicates = [UseAVX] in {
3060    def : Pat<(VT (Move VT:$dst, (scalar_to_vector
3061                                  (OpNode (extractelt VT:$src, 0))))),
3062              (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3063  }
3064}
3065
3066defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>; // targets SQRTSSr_Int / VSQRTSSr_Int
3067defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>; // targets SQRTSDr_Int / VSQRTSDr_Int
3068
3069multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix, // folds "Move $dst, (Intr $src)" into the <OpcPrefix>r_Int instruction
3070                                           SDNode Move, ValueType VT,
3071                                           Predicate BasePredicate> {
3072  let Predicates = [BasePredicate] in { // SSE instruction, looked up by name OpcPrefix + "r_Int"
3073    def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
3074              (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3075  }
3076
3077  // Repeat for AVX versions of the instructions ("V"-prefixed record name, HasAVX predicate).
3078  let Predicates = [HasAVX] in {
3079    def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
3080              (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3081  }
3082}
3083
3084defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss, // targets RCPSSr_Int / VRCPSSr_Int
3085                                       v4f32, UseSSE1>;
3086defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss, // targets RSQRTSSr_Int / VRSQRTSSr_Int
3087                                       v4f32, UseSSE1>;
3088
3089
3090//===----------------------------------------------------------------------===//
3091// SSE 1 & 2 - Non-temporal stores
3092//===----------------------------------------------------------------------===//
3093
3094let AddedComplexity = 400 in { // Prefer non-temporal versions
3095let Predicates = [HasAVX, NoVLX] in { // VEX-tagged aligned non-temporal vector stores
3096let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in { // 128-bit floating-point stores
3097def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
3098                     (ins f128mem:$dst, VR128:$src),
3099                     "movntps\t{$src, $dst|$dst, $src}",
3100                     [(alignednontemporalstore (v4f32 VR128:$src),
3101                                               addr:$dst)]>, VEX, VEX_WIG;
3102def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
3103                     (ins f128mem:$dst, VR128:$src),
3104                     "movntpd\t{$src, $dst|$dst, $src}",
3105                     [(alignednontemporalstore (v2f64 VR128:$src),
3106                                               addr:$dst)]>, VEX, VEX_WIG;
3107} // SchedRW
3108
3109let SchedRW = [SchedWriteFMoveLSNT.YMM.MR] in { // 256-bit floating-point stores (VEX_L)
3110def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
3111                     (ins f256mem:$dst, VR256:$src),
3112                     "movntps\t{$src, $dst|$dst, $src}",
3113                     [(alignednontemporalstore (v8f32 VR256:$src),
3114                                               addr:$dst)]>, VEX, VEX_L, VEX_WIG;
3115def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
3116                     (ins f256mem:$dst, VR256:$src),
3117                     "movntpd\t{$src, $dst|$dst, $src}",
3118                     [(alignednontemporalstore (v4f64 VR256:$src),
3119                                               addr:$dst)]>, VEX, VEX_L, VEX_WIG;
3120} // SchedRW
3121
3122let ExeDomain = SSEPackedInt in { // integer-domain stores; only the v2i64/v4i64 patterns are attached here
3123def VMOVNTDQmr    : VPDI<0xE7, MRMDestMem, (outs),
3124                         (ins i128mem:$dst, VR128:$src),
3125                         "movntdq\t{$src, $dst|$dst, $src}",
3126                         [(alignednontemporalstore (v2i64 VR128:$src),
3127                                                   addr:$dst)]>, VEX, VEX_WIG,
3128                         Sched<[SchedWriteVecMoveLSNT.XMM.MR]>;
3129def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
3130                    (ins i256mem:$dst, VR256:$src),
3131                    "movntdq\t{$src, $dst|$dst, $src}",
3132                    [(alignednontemporalstore (v4i64 VR256:$src),
3133                                              addr:$dst)]>, VEX, VEX_L, VEX_WIG,
3134                    Sched<[SchedWriteVecMoveLSNT.YMM.MR]>;
3135} // ExeDomain
3136} // Predicates
3137
3138let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in { // legacy (non-VEX) 128-bit FP non-temporal stores
3139def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3140                    "movntps\t{$src, $dst|$dst, $src}",
3141                    [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
3142def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3143                    "movntpd\t{$src, $dst|$dst, $src}",
3144                    [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>;
3145} // SchedRW
3146
3147let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLSNT.XMM.MR] in // legacy integer-domain non-temporal store
3148def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), // i128mem, consistent with VMOVNTDQmr and the v2i64 pattern
3149                    "movntdq\t{$src, $dst|$dst, $src}",
3150                    [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)]>;
3151
3152let SchedRW = [WriteStoreNT] in { // non-temporal stores from general-purpose registers
3153// There is no AVX form for instructions below this point
3154def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), // 32-bit GPR source
3155                 "movnti{l}\t{$src, $dst|$dst, $src}",
3156                 [(nontemporalstore (i32 GR32:$src), addr:$dst)]>,
3157               PS, Requires<[HasSSE2]>;
3158def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), // 64-bit GPR source, RI format class
3159                     "movnti{q}\t{$src, $dst|$dst, $src}",
3160                     [(nontemporalstore (i64 GR64:$src), addr:$dst)]>,
3161                  PS, Requires<[HasSSE2]>;
3162} // SchedRW = [WriteStoreNT]
3163
3164let Predicates = [HasAVX, NoVLX] in { // route the remaining vector element types to VMOVNTDQ(Y)mr
3165  def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst), // 256-bit types
3166            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3167  def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
3168            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3169  def : Pat<(alignednontemporalstore (v16f16 VR256:$src), addr:$dst),
3170            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3171  def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
3172            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3173
3174  def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), // 128-bit types
3175            (VMOVNTDQmr addr:$dst, VR128:$src)>;
3176  def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
3177            (VMOVNTDQmr addr:$dst, VR128:$src)>;
3178  def : Pat<(alignednontemporalstore (v8f16 VR128:$src), addr:$dst),
3179            (VMOVNTDQmr addr:$dst, VR128:$src)>;
3180  def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
3181            (VMOVNTDQmr addr:$dst, VR128:$src)>;
3182}
3183
3184let Predicates = [UseSSE2] in { // route the remaining 128-bit element types to the legacy MOVNTDQmr
3185  def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
3186            (MOVNTDQmr addr:$dst, VR128:$src)>;
3187  def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
3188            (MOVNTDQmr addr:$dst, VR128:$src)>;
3189  def : Pat<(alignednontemporalstore (v8f16 VR128:$src), addr:$dst),
3190            (MOVNTDQmr addr:$dst, VR128:$src)>;
3191  def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
3192            (MOVNTDQmr addr:$dst, VR128:$src)>;
3193}
3194
3195} // AddedComplexity
3196
3197//===----------------------------------------------------------------------===//
3198// SSE 1 & 2 - Prefetch and memory fence
3199//===----------------------------------------------------------------------===//
3200
3201// Prefetch intrinsic: four hint variants sharing opcode 0x18, distinguished by the MRMnm form.
3202let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in {
3203def PREFETCHT0   : I<0x18, MRM1m, (outs), (ins i8mem:$src), // third pattern operand 3 (presumably the locality hint - confirm)
3204    "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>, TB;
3205def PREFETCHT1   : I<0x18, MRM2m, (outs), (ins i8mem:$src), // third pattern operand 2
3206    "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>, TB;
3207def PREFETCHT2   : I<0x18, MRM3m, (outs), (ins i8mem:$src), // third pattern operand 1
3208    "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>, TB;
3209def PREFETCHNTA  : I<0x18, MRM0m, (outs), (ins i8mem:$src), // third pattern operand 0
3210    "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>, TB;
3211}
3212
3213// FIXME: How should the flush instruction be modeled? It is currently scheduled as WriteLoad.
3214let SchedRW = [WriteLoad] in {
3215// Flush cache
3216def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src), // selected from the int_x86_sse2_clflush intrinsic
3217               "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
3218               PS, Requires<[HasSSE2]>;
3219}
3220
3221let SchedRW = [WriteNop] in {
3222// Pause. This "instruction" is encoded as "rep; nop", so even though it
3223// was introduced with SSE2, it's backward compatible (hence no Requires<> predicate below).
3224def PAUSE : I<0x90, RawFrm, (outs), (ins), // selected from the int_x86_sse2_pause intrinsic
3225              "pause", [(int_x86_sse2_pause)]>, OBXS;
3226}
3227
3228let SchedRW = [WriteFence] in {
3229// Load, store, and memory fence
3230// TODO: As with mfence, we may want to ease the availability of sfence/lfence
3231// to include any 64-bit target.
3232def SFENCE : I<0xAE, MRM7X, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>, // requires SSE1
3233               PS, Requires<[HasSSE1]>;
3234def LFENCE : I<0xAE, MRM5X, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>, // requires SSE2
3235               PS, Requires<[HasSSE2]>;
3236def MFENCE : I<0xAE, MRM6X, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>, // gated on HasMFence rather than HasSSE2
3237               PS, Requires<[HasMFence]>;
3238} // SchedRW
3239
3240def : Pat<(X86MFence), (MFENCE)>; // the X86MFence node also selects MFENCE
3241
3242//===----------------------------------------------------------------------===//
3243// SSE 1 & 2 - Load/Store MXCSR register
3244//===----------------------------------------------------------------------===//
3245
3246let mayLoad=1, hasSideEffects=1, Defs = [MXCSR] in // ldmxcsr loads MXCSR from memory: declare the implicit register def
3247def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
3248               "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
3249               VEX, Sched<[WriteLDMXCSR]>, VEX_WIG;
3250let mayStore=1, hasSideEffects=1, Uses = [MXCSR] in // stmxcsr stores MXCSR to memory: declare the implicit register use
3251def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
3252               "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
3253               VEX, Sched<[WriteSTMXCSR]>, VEX_WIG;
3254
3255let mayLoad=1, hasSideEffects=1, Defs = [MXCSR] in // legacy (non-VEX) encoding of the same load
3256def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
3257              "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
3258              PS, Sched<[WriteLDMXCSR]>;
3259let mayStore=1, hasSideEffects=1, Uses = [MXCSR] in // legacy (non-VEX) encoding of the same store
3260def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
3261              "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
3262              PS, Sched<[WriteSTMXCSR]>;
3263
3264//===---------------------------------------------------------------------===//
3265// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
3266//===---------------------------------------------------------------------===//
3267
3268let ExeDomain = SSEPackedInt in { // SSE integer instructions
3269
3270let hasSideEffects = 0 in { // VEX register-to-register moves (opcode 0x6F); no selection patterns attached
3271def VMOVDQArr  : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3272                      "movdqa\t{$src, $dst|$dst, $src}", []>,
3273                      Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
3274def VMOVDQUrr  : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3275                      "movdqu\t{$src, $dst|$dst, $src}", []>,
3276                      Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
3277def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), // 256-bit forms add VEX_L
3278                      "movdqa\t{$src, $dst|$dst, $src}", []>,
3279                      Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
3280def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3281                      "movdqu\t{$src, $dst|$dst, $src}", []>,
3282                      Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
3283}
3284
3285// For Disassembler: 0x7F/MRMDestReg encodings of the register moves, each tied to its 0x6F twin via FoldGenData.
3286let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
3287def VMOVDQArr_REV  : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3288                          "movdqa\t{$src, $dst|$dst, $src}", []>,
3289                          Sched<[SchedWriteVecMoveLS.XMM.RR]>,
3290                          VEX, VEX_WIG, FoldGenData<"VMOVDQArr">;
3291def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
3292                          "movdqa\t{$src, $dst|$dst, $src}", []>,
3293                          Sched<[SchedWriteVecMoveLS.YMM.RR]>,
3294                          VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQAYrr">;
3295def VMOVDQUrr_REV  : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3296                          "movdqu\t{$src, $dst|$dst, $src}", []>,
3297                          Sched<[SchedWriteVecMoveLS.XMM.RR]>,
3298                          VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">;
3299def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
3300                          "movdqu\t{$src, $dst|$dst, $src}", []>,
3301                          Sched<[SchedWriteVecMoveLS.YMM.RR]>,
3302                          VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQUYrr">;
3303}
3304
3305let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, // VEX vector integer loads
3306    hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
3307def VMOVDQArm  : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), // aligned v2i64 load pattern
3308                      "movdqa\t{$src, $dst|$dst, $src}",
3309                      [(set VR128:$dst, (alignedloadv2i64 addr:$src))]>,
3310                      Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
3311def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), // 256-bit form carries no pattern here
3312                      "movdqa\t{$src, $dst|$dst, $src}", []>,
3313                      Sched<[SchedWriteVecMoveLS.YMM.RM]>,
3314                      VEX, VEX_L, VEX_WIG;
3315def VMOVDQUrm  : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), // plain I<> class: mnemonic spells the "v" itself, XS added below
3316                   "vmovdqu\t{$src, $dst|$dst, $src}",
3317                   [(set VR128:$dst, (loadv2i64 addr:$src))]>,
3318                   Sched<[SchedWriteVecMoveLS.XMM.RM]>,
3319                   XS, VEX, VEX_WIG;
3320def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), // 256-bit form carries no pattern here
3321                   "vmovdqu\t{$src, $dst|$dst, $src}", []>,
3322                   Sched<[SchedWriteVecMoveLS.YMM.RM]>,
3323                   XS, VEX, VEX_L, VEX_WIG;
3324}
3325
3326let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in { // VEX vector integer stores (opcode 0x7F)
3327def VMOVDQAmr  : VPDI<0x7F, MRMDestMem, (outs),
3328                      (ins i128mem:$dst, VR128:$src),
3329                      "movdqa\t{$src, $dst|$dst, $src}",
3330                      [(alignedstore (v2i64 VR128:$src), addr:$dst)]>, // aligned v2i64 store pattern
3331                      Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_WIG;
3332def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
3333                      (ins i256mem:$dst, VR256:$src),
3334                      "movdqa\t{$src, $dst|$dst, $src}", []>, // 256-bit form carries no pattern here
3335                     Sched<[SchedWriteVecMoveLS.YMM.MR]>, VEX, VEX_L, VEX_WIG;
3336def VMOVDQUmr  : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3337                   "vmovdqu\t{$src, $dst|$dst, $src}",
3338                   [(store (v2i64 VR128:$src), addr:$dst)]>, // unaligned v2i64 store pattern
3339                   Sched<[SchedWriteVecMoveLS.XMM.MR]>, XS, VEX, VEX_WIG;
3340def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
3341                   "vmovdqu\t{$src, $dst|$dst, $src}",[]>, // 256-bit form carries no pattern here
3342                   Sched<[SchedWriteVecMoveLS.YMM.MR]>, XS, VEX, VEX_L, VEX_WIG;
3343}
3344
3345let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in { // legacy (non-VEX) register-to-register moves
3346let hasSideEffects = 0 in {
3347def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3348                   "movdqa\t{$src, $dst|$dst, $src}", []>;
3349
3350def MOVDQUrr :   I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), // XS-prefixed, explicitly requires UseSSE2
3351                   "movdqu\t{$src, $dst|$dst, $src}", []>,
3352                   XS, Requires<[UseSSE2]>;
3353}
3354
3355// For Disassembler: 0x7F/MRMDestReg encodings, tied to the 0x6F defs via FoldGenData.
3356let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
3357def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3358                       "movdqa\t{$src, $dst|$dst, $src}", []>,
3359                       FoldGenData<"MOVDQArr">;
3360
3361def MOVDQUrr_REV :   I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3362                       "movdqu\t{$src, $dst|$dst, $src}", []>,
3363                       XS, Requires<[UseSSE2]>, FoldGenData<"MOVDQUrr">;
3364}
3365} // SchedRW
3366
3367let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, // legacy vector integer loads
3368    hasSideEffects = 0, SchedRW = [SchedWriteVecMoveLS.XMM.RM] in {
3369def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3370                   "movdqa\t{$src, $dst|$dst, $src}",
3371                   [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>; // pattern is commented out, so the list is empty
3372def MOVDQUrm :   I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3373                   "movdqu\t{$src, $dst|$dst, $src}",
3374                   [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>, // pattern is commented out, so the list is empty
3375                 XS, Requires<[UseSSE2]>;
3376}
3377
3378let mayStore = 1, hasSideEffects = 0, // legacy vector integer stores
3379    SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
3380def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3381                   "movdqa\t{$src, $dst|$dst, $src}",
3382                   [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>; // pattern is commented out, so the list is empty
3383def MOVDQUmr :   I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3384                   "movdqu\t{$src, $dst|$dst, $src}",
3385                   [/*(store (v2i64 VR128:$src), addr:$dst)*/]>, // pattern is commented out, so the list is empty
3386                 XS, Requires<[UseSSE2]>;
3387}
3388
3389} // ExeDomain = SSEPackedInt
3390
3391// Reversed version with ".s" suffix for GAS compatibility (VEX forms map to the *_REV defs).
3392def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
3393                (VMOVDQArr_REV VR128:$dst, VR128:$src), 0>;
3394def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
3395                (VMOVDQAYrr_REV VR256:$dst, VR256:$src), 0>;
3396def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
3397                (VMOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
3398def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
3399                (VMOVDQUYrr_REV VR256:$dst, VR256:$src), 0>;
3400
3401// Reversed version with ".s" suffix for GAS compatibility (legacy forms map to the *_REV defs).
3402def : InstAlias<"movdqa.s\t{$src, $dst|$dst, $src}",
3403                (MOVDQArr_REV VR128:$dst, VR128:$src), 0>;
3404def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}",
3405                (MOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
3406
3407let Predicates = [HasAVX, NoVLX] in {
3408  // Additional patterns for other integer sizes (and v8f16); the v2i64 patterns live on the instruction defs.
3409  def : Pat<(alignedloadv4i32 addr:$src), // aligned loads -> VMOVDQArm
3410            (VMOVDQArm addr:$src)>;
3411  def : Pat<(alignedloadv8i16 addr:$src),
3412            (VMOVDQArm addr:$src)>;
3413  def : Pat<(alignedloadv8f16 addr:$src),
3414            (VMOVDQArm addr:$src)>;
3415  def : Pat<(alignedloadv16i8 addr:$src),
3416            (VMOVDQArm addr:$src)>;
3417  def : Pat<(loadv4i32 addr:$src), // unaligned loads -> VMOVDQUrm
3418            (VMOVDQUrm addr:$src)>;
3419  def : Pat<(loadv8i16 addr:$src),
3420            (VMOVDQUrm addr:$src)>;
3421  def : Pat<(loadv8f16 addr:$src),
3422            (VMOVDQUrm addr:$src)>;
3423  def : Pat<(loadv16i8 addr:$src),
3424            (VMOVDQUrm addr:$src)>;
3425
3426  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst), // aligned stores -> VMOVDQAmr
3427            (VMOVDQAmr addr:$dst, VR128:$src)>;
3428  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
3429            (VMOVDQAmr addr:$dst, VR128:$src)>;
3430  def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
3431            (VMOVDQAmr addr:$dst, VR128:$src)>;
3432  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
3433            (VMOVDQAmr addr:$dst, VR128:$src)>;
3434  def : Pat<(store (v4i32 VR128:$src), addr:$dst), // unaligned stores -> VMOVDQUmr
3435            (VMOVDQUmr addr:$dst, VR128:$src)>;
3436  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
3437            (VMOVDQUmr addr:$dst, VR128:$src)>;
3438  def : Pat<(store (v8f16 VR128:$src), addr:$dst),
3439            (VMOVDQUmr addr:$dst, VR128:$src)>;
3440  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
3441            (VMOVDQUmr addr:$dst, VR128:$src)>;
3442}
3443
3444//===---------------------------------------------------------------------===//
3445// SSE2 - Packed Integer Arithmetic Instructions
3446//===---------------------------------------------------------------------===//
3447
3448let ExeDomain = SSEPackedInt in { // SSE integer instructions
3449
3450/// PDI_binop_rm2 - SSE2 binary operator whose result type (DstVT) differs from its source type (SrcVT).
3451multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
3452                         ValueType DstVT, ValueType SrcVT, RegisterClass RC,
3453                         PatFrag memop_frag, X86MemOperand x86memop,
3454                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
3455  let isCommutable = 1 in // applies to the register-register form only
3456  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
3457       (ins RC:$src1, RC:$src2),
3458       OpcodeStr # !if(Is2Addr, // two-operand vs three-operand assembly string
3459           "\t{$src2, $dst|$dst, $src2}",
3460           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3461       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
3462       Sched<[sched]>;
3463  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
3464       (ins RC:$src1, x86memop:$src2),
3465       OpcodeStr # !if(Is2Addr, // same operand strings as the rr form
3466           "\t{$src2, $dst|$dst, $src2}",
3467           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3468       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
3469                                     (memop_frag addr:$src2))))]>,
3470       Sched<[sched.Folded, sched.ReadAfterFold]>;
3471}
3472} // ExeDomain = SSEPackedInt
3473
3474defm PADDB   : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8, // args: opcode, mnemonic, node, 128-bit VT, 256-bit VT, sched, flag, predicate
3475                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; // the 1/0 flag is presumably IsCommutable - confirm against PDI_binop_all
3476defm PADDW   : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
3477                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3478defm PADDD   : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
3479                             SchedWriteVecALU, 1, NoVLX>;
3480defm PADDQ   : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
3481                             SchedWriteVecALU, 1, NoVLX>;
3482defm PADDSB  : PDI_binop_all<0xEC, "paddsb", saddsat, v16i8, v32i8, // saturating adds (saddsat/uaddsat nodes)
3483                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3484defm PADDSW  : PDI_binop_all<0xED, "paddsw", saddsat, v8i16, v16i16,
3485                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3486defm PADDUSB : PDI_binop_all<0xDC, "paddusb", uaddsat, v16i8, v32i8,
3487                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3488defm PADDUSW : PDI_binop_all<0xDD, "paddusw", uaddsat, v8i16, v16i16,
3489                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3490defm PMULLW  : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16, // multiplies use SchedWriteVecIMul
3491                             SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
3492defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
3493                             SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
3494defm PMULHW  : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
3495                             SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
3496defm PSUBB   : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8, // all subtract variants pass 0 for the flag
3497                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3498defm PSUBW   : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
3499                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3500defm PSUBD   : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
3501                             SchedWriteVecALU, 0, NoVLX>;
3502defm PSUBQ   : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
3503                             SchedWriteVecALU, 0, NoVLX>;
3504defm PSUBSB  : PDI_binop_all<0xE8, "psubsb", ssubsat, v16i8, v32i8, // saturating subtracts (ssubsat/usubsat nodes)
3505                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3506defm PSUBSW  : PDI_binop_all<0xE9, "psubsw", ssubsat, v8i16, v16i16,
3507                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3508defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", usubsat, v16i8, v32i8,
3509                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3510defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", usubsat, v8i16, v16i16,
3511                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3512defm PMINUB  : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8, // min/max: unsigned bytes and signed words only
3513                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3514defm PMINSW  : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16,
3515                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3516defm PMAXUB  : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8,
3517                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3518defm PMAXSW  : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
3519                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3520defm PAVGB   : PDI_binop_all<0xE0, "pavgb", avgceilu, v16i8, v32i8, // averages select from the avgceilu node
3521                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3522defm PAVGW   : PDI_binop_all<0xE3, "pavgw", avgceilu, v8i16, v16i16,
3523                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3524defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64, // selects from the target-specific X86pmuludq node
3525                             SchedWriteVecIMul, 1, NoVLX>;
3526
3527let Predicates = [HasAVX, NoVLX_Or_NoBWI] in // VEX 128-bit pmaddwd: v8i16 sources, v4i32 result, three-operand (Is2Addr = 0)
3528defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
3529                              load, i128mem, SchedWriteVecIMul.XMM, 0>,
3530                              VEX_4V, VEX_WIG;
3531
3532let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in // 256-bit form requires AVX2
3533defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
3534                               VR256, load, i256mem, SchedWriteVecIMul.YMM,
3535                               0>, VEX_4V, VEX_L, VEX_WIG;
3536let Constraints = "$src1 = $dst" in // legacy form: destination tied to $src1, memop fragment for the load
3537defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
3538                             memop, i128mem, SchedWriteVecIMul.XMM>;
3539
3540let Predicates = [HasAVX, NoVLX_Or_NoBWI] in // VEX 128-bit psadbw: v16i8 sources, v2i64 result
3541defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
3542                             load, i128mem, SchedWritePSADBW.XMM, 0>,
3543                             VEX_4V, VEX_WIG;
3544let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in // 256-bit form requires AVX2
3545defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
3546                             load, i256mem, SchedWritePSADBW.YMM, 0>,
3547                             VEX_4V, VEX_L, VEX_WIG;
3548let Constraints = "$src1 = $dst" in // legacy form: destination tied to $src1
3549defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
3550                            memop, i128mem, SchedWritePSADBW.XMM>;
3551
3552//===---------------------------------------------------------------------===//
3553// SSE2 - Packed Integer Logical Instructions
3554//===---------------------------------------------------------------------===//
3555
3556multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm, // rr/rm use opc + OpNode; ri uses opc2/ImmForm + OpNode2
3557                         string OpcodeStr, SDNode OpNode,
3558                         SDNode OpNode2, RegisterClass RC,
3559                         X86FoldableSchedWrite sched,
3560                         X86FoldableSchedWrite schedImm,
3561                         ValueType DstVT, ValueType SrcVT,
3562                         PatFrag ld_frag, bit Is2Addr = 1> {
3563  // The second source is a 128-bit register/memory operand regardless of RC.
3564  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
3565       (ins RC:$src1, VR128:$src2),
3566       OpcodeStr # !if(Is2Addr, // two-operand vs three-operand assembly string
3567           "\t{$src2, $dst|$dst, $src2}",
3568           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3569       [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))]>,
3570       Sched<[sched]>;
3571  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
3572       (ins RC:$src1, i128mem:$src2),
3573       OpcodeStr # !if(Is2Addr, // same operand strings as the rr form
3574           "\t{$src2, $dst|$dst, $src2}",
3575           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3576       [(set RC:$dst, (DstVT (OpNode RC:$src1,
3577                       (SrcVT (ld_frag addr:$src2)))))]>,
3578       Sched<[sched.Folded, sched.ReadAfterFold]>;
3579  def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
3580       (ins RC:$src1, u8imm:$src2),
3581       OpcodeStr # !if(Is2Addr, // immediate form: $src2 is a u8imm, matched as (i8 timm)
3582           "\t{$src2, $dst|$dst, $src2}",
3583           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3584       [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 timm:$src2))))]>,
3585       Sched<[schedImm]>;
3586}
3587
/// PDI_binop_rmi_all - Instantiates PDI_binop_rmi three times for one
/// mnemonic:
///   V#NAME   - VR128, "v"-prefixed mnemonic, predicates [HasAVX, prd]
///   V#NAME#Y - VR256, "v"-prefixed mnemonic, predicates [HasAVX2, prd]
///   NAME     - VR128, plain mnemonic, $src1 tied to $dst
/// The two V* variants pass Is2Addr = 0 and the load fragment; the plain
/// variant keeps the default Is2Addr and uses memop.  SrcVT is shared by all
/// three variants.
3588multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
3589                             string OpcodeStr, SDNode OpNode,
3590                             SDNode OpNode2, ValueType DstVT128,
3591                             ValueType DstVT256, ValueType SrcVT,
3592                             X86SchedWriteWidths sched,
3593                             X86SchedWriteWidths schedImm, Predicate prd> {
3594let Predicates = [HasAVX, prd] in
3595  defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
3596                              OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM,
3597                              DstVT128, SrcVT, load, 0>, VEX_4V, VEX_WIG;
3598let Predicates = [HasAVX2, prd] in
3599  defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
3600                                OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM,
3601                                DstVT256, SrcVT, load, 0>, VEX_4V, VEX_L,
3602                                VEX_WIG;
3603let Constraints = "$src1 = $dst" in
3604  defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
3605                            VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT,
3606                            memop>;
3607}
3608
/// PDI_binop_ri - Packed-integer op that only has a register + 8-bit
/// immediate form.  Defines a single `ri` record on class RC with result type
/// VT; the immediate is matched as an i8 timm.  Is2Addr selects the
/// two-operand assembly string when set and the three-operand one otherwise.
3609multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
3610                        SDNode OpNode, RegisterClass RC, ValueType VT,
3611                        X86FoldableSchedWrite sched, bit Is2Addr = 1> {
3612  def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
3613       !if(Is2Addr,
3614           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3615           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3616       [(set RC:$dst, (VT (OpNode RC:$src1, (i8 timm:$src2))))]>,
3617       Sched<[sched]>;
3618}
3619
/// PDI_binop_ri_all - Instantiates PDI_binop_ri for the VEX 128-bit (V#NAME),
/// VEX 256-bit (V#NAME#Y) and legacy SSE (NAME) encodings of one mnemonic.
/// The value types are fixed to v16i8 (VR128) and v32i8 (VR256), and the VEX
/// variants are always gated on NoVLX_Or_NoBWI.  The legacy variant ties
/// $src1 to $dst.
3620multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr,
3621                            SDNode OpNode, X86SchedWriteWidths sched> {
3622let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
3623  defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
3624                             VR128, v16i8, sched.XMM, 0>, VEX_4V, VEX_WIG;
3625let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
3626  defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
3627                               VR256, v32i8, sched.YMM, 0>,
3628                               VEX_4V, VEX_L, VEX_WIG;
3629let Constraints = "$src1 = $dst" in
3630  defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8,
3631                           sched.XMM>;
3632}
3633
// Packed shift instructions.  Each PDI_binop_rmi_all line yields the SSE,
// VEX-128 and VEX-256 variants with rr/rm/ri forms.  The first opcode is the
// register/memory-count encoding; the second opcode plus the MRMnr format is
// the immediate-count encoding.  Word-element shifts are gated on
// NoVLX_Or_NoBWI, dword/qword-element shifts on NoVLX.
3634let ExeDomain = SSEPackedInt in {
  // Left shifts: X86vshl (register/memory count), X86vshli (immediate), MRM6r.
3635  defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
3636                                 v8i16, v16i16, v8i16, SchedWriteVecShift,
3637                                 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
3638  defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
3639                                 v4i32, v8i32, v4i32, SchedWriteVecShift,
3640                                 SchedWriteVecShiftImm, NoVLX>;
3641  defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
3642                                 v2i64, v4i64, v2i64, SchedWriteVecShift,
3643                                 SchedWriteVecShiftImm, NoVLX>;
3644
  // Right shifts using X86vsrl / X86vsrli, MRM2r.
3645  defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
3646                                 v8i16, v16i16, v8i16, SchedWriteVecShift,
3647                                 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
3648  defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
3649                                 v4i32, v8i32, v4i32, SchedWriteVecShift,
3650                                 SchedWriteVecShiftImm, NoVLX>;
3651  defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
3652                                 v2i64, v4i64, v2i64, SchedWriteVecShift,
3653                                 SchedWriteVecShiftImm, NoVLX>;
3654
  // Right shifts using X86vsra / X86vsrai, MRM4r.  Only word and dword
  // element sizes are defined in this block.
3655  defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
3656                                 v8i16, v16i16, v8i16, SchedWriteVecShift,
3657                                 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
3658  defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
3659                                 v4i32, v8i32, v4i32, SchedWriteVecShift,
3660                                 SchedWriteVecShiftImm, NoVLX>;
3661
  // Immediate-only forms (PDI_binop_ri_all).  Both share opcode 0x73 and are
  // distinguished by the format: MRM7r for pslldq, MRM3r for psrldq.
3662  defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq,
3663                                 SchedWriteShuffle>;
3664  defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq,
3665                                 SchedWriteShuffle>;
3666} // ExeDomain = SSEPackedInt
3667
3668//===---------------------------------------------------------------------===//
3669// SSE2 - Packed Integer Comparison Instructions
3670//===---------------------------------------------------------------------===//
3671
// Packed integer compares for byte, word and dword elements, instantiated
// from PDI_binop_all with TruePredicate.
// NOTE(review): the 1/0 argument after the sched class is presumably the
// commutable flag (1 for X86pcmpeq, 0 for X86pcmpgt) -- PDI_binop_all is
// defined outside this view, confirm there.
3672defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
3673                             SchedWriteVecALU, 1, TruePredicate>;
3674defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
3675                             SchedWriteVecALU, 1, TruePredicate>;
3676defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
3677                             SchedWriteVecALU, 1, TruePredicate>;
3678defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
3679                             SchedWriteVecALU, 0, TruePredicate>;
3680defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
3681                             SchedWriteVecALU, 0, TruePredicate>;
3682defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
3683                             SchedWriteVecALU, 0, TruePredicate>;
3684
3685//===---------------------------------------------------------------------===//
3686// SSE2 - Packed Integer Shuffle Instructions
3687//===---------------------------------------------------------------------===//
3688
3689let ExeDomain = SSEPackedInt in {
/// sse2_pshuffle - Shuffles taking one vector source plus an 8-bit immediate,
/// all at opcode 0x70.  Six records are defined per instantiation:
///   V#NAME#ri / V#NAME#mi   - VEX 128-bit, predicates [HasAVX, prd]
///   V#NAME#Yri / V#NAME#Ymi - VEX 256-bit, predicates [HasAVX2, prd]
///   ri / mi                 - legacy encoding, predicate [UseSSE2]
/// vt128 / vt256 type the result for the 128- and 256-bit variants.  No
/// prefix is set here; the instantiation appends one (see the defm lines that
/// follow the multiclass).
3690multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
3691                         SDNode OpNode, X86SchedWriteWidths sched,
3692                         Predicate prd> {
3693let Predicates = [HasAVX, prd] in {
3694  def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
3695                      (ins VR128:$src1, u8imm:$src2),
3696                      !strconcat("v", OpcodeStr,
3697                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3698                      [(set VR128:$dst,
3699                        (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
3700                      VEX, Sched<[sched.XMM]>, VEX_WIG;
  // The memory forms in this multiclass list only sched.*.Folded: the sole
  // vector source is the memory operand itself.
3701  def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
3702                      (ins i128mem:$src1, u8imm:$src2),
3703                      !strconcat("v", OpcodeStr,
3704                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3705                     [(set VR128:$dst,
3706                       (vt128 (OpNode (load addr:$src1),
3707                        (i8 timm:$src2))))]>, VEX,
3708                  Sched<[sched.XMM.Folded]>, VEX_WIG;
3709}
3710
3711let Predicates = [HasAVX2, prd] in {
3712  def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
3713                       (ins VR256:$src1, u8imm:$src2),
3714                       !strconcat("v", OpcodeStr,
3715                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3716                       [(set VR256:$dst,
3717                         (vt256 (OpNode VR256:$src1, (i8 timm:$src2))))]>,
3718                       VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
3719  def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
3720                       (ins i256mem:$src1, u8imm:$src2),
3721                       !strconcat("v", OpcodeStr,
3722                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3723                      [(set VR256:$dst,
3724                        (vt256 (OpNode (load addr:$src1),
3725                         (i8 timm:$src2))))]>, VEX, VEX_L,
3726                   Sched<[sched.YMM.Folded]>, VEX_WIG;
3727}
3728
  // Legacy forms: no $src1 = $dst constraint is applied (the destination is a
  // separate operand) and the memory form matches through memop, not load.
3729let Predicates = [UseSSE2] in {
3730  def ri : Ii8<0x70, MRMSrcReg,
3731               (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
3732               !strconcat(OpcodeStr,
3733                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3734               [(set VR128:$dst,
3735                 (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
3736               Sched<[sched.XMM]>;
3737  def mi : Ii8<0x70, MRMSrcMem,
3738               (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
3739               !strconcat(OpcodeStr,
3740                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3741               [(set VR128:$dst,
3742                 (vt128 (OpNode (memop addr:$src1),
3743                        (i8 timm:$src2))))]>,
3744               Sched<[sched.XMM.Folded]>;
3745}
3746}
3747} // ExeDomain = SSEPackedInt
3748
// The three shuffles share opcode 0x70 inside sse2_pshuffle and differ only
// in the prefix class appended here (PD, XS, XD), the node, and the element
// types.  The word variants are gated on NoVLX_Or_NoBWI, the dword variant on
// NoVLX.
3749defm PSHUFD  : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd,
3750                             SchedWriteShuffle, NoVLX>, PD;
3751defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw,
3752                             SchedWriteShuffle, NoVLX_Or_NoBWI>, XS;
3753defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw,
3754                             SchedWriteShuffle, NoVLX_Or_NoBWI>, XD;
3755
3756//===---------------------------------------------------------------------===//
3757// Packed Integer Pack Instructions (SSE & AVX)
3758//===---------------------------------------------------------------------===//
3759
3760let ExeDomain = SSEPackedInt in {
/// sse2_pack - Pack instructions built on the PDI instruction class.
/// Defines rr and rm forms computing (OutVT (OpNode (ArgVT src1), src2)) on
/// register class RC.  The rm form reads its second operand from x86memop
/// through ld_frag.  Is2Addr selects the two-operand assembly string when set
/// and the three-operand string otherwise.
3761multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
3762                     ValueType ArgVT, SDNode OpNode, RegisterClass RC,
3763                     X86MemOperand x86memop, X86FoldableSchedWrite sched,
3764                     PatFrag ld_frag, bit Is2Addr = 1> {
3765  def rr : PDI<opc, MRMSrcReg,
3766               (outs RC:$dst), (ins RC:$src1, RC:$src2),
3767               !if(Is2Addr,
3768                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3769                   !strconcat(OpcodeStr,
3770                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3771               [(set RC:$dst,
3772                     (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
3773               Sched<[sched]>;
3774  def rm : PDI<opc, MRMSrcMem,
3775               (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
3776               !if(Is2Addr,
3777                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3778                   !strconcat(OpcodeStr,
3779                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3780               [(set RC:$dst,
3781                     (OutVT (OpNode (ArgVT RC:$src1),
3782                                    (ld_frag addr:$src2))))]>,
3783               Sched<[sched.Folded, sched.ReadAfterFold]>;
3784}
3785
/// sse4_pack - Same parameters, operands and patterns as sse2_pack, but the
/// records derive from the SS48I instruction class instead of PDI.  Used for
/// the packusdw variants (opcode 0x2B).
3786multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
3787                     ValueType ArgVT, SDNode OpNode, RegisterClass RC,
3788                     X86MemOperand x86memop, X86FoldableSchedWrite sched,
3789                     PatFrag ld_frag, bit Is2Addr = 1> {
3790  def rr : SS48I<opc, MRMSrcReg,
3791                 (outs RC:$dst), (ins RC:$src1, RC:$src2),
3792                 !if(Is2Addr,
3793                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3794                     !strconcat(OpcodeStr,
3795                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3796                 [(set RC:$dst,
3797                       (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
3798                 Sched<[sched]>;
3799  def rm : SS48I<opc, MRMSrcMem,
3800                 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
3801                 !if(Is2Addr,
3802                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3803                     !strconcat(OpcodeStr,
3804                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3805                 [(set RC:$dst,
3806                       (OutVT (OpNode (ArgVT RC:$src1),
3807                                      (ld_frag addr:$src2))))]>,
3808                 Sched<[sched.Folded, sched.ReadAfterFold]>;
3809}
3810
// VEX 128-bit pack instructions: three-operand syntax (Is2Addr = 0), load
// fragment, VR128 / i128mem.  X86Packss backs the *SS* mnemonics and
// X86Packus the *US* ones; vpackusdw is the only one built from sse4_pack.
3811let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
3812  defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128,
3813                             i128mem, SchedWriteShuffle.XMM, load, 0>,
3814                             VEX_4V, VEX_WIG;
3815  defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128,
3816                             i128mem, SchedWriteShuffle.XMM, load, 0>,
3817                             VEX_4V, VEX_WIG;
3818
3819  defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128,
3820                             i128mem, SchedWriteShuffle.XMM, load, 0>,
3821                             VEX_4V, VEX_WIG;
3822  defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128,
3823                             i128mem, SchedWriteShuffle.XMM, load, 0>,
3824                             VEX_4V, VEX_WIG;
3825}
3826
// VEX 256-bit pack instructions: same opcodes and nodes as the 128-bit VEX
// forms, with VR256 / i256mem operands, doubled element counts, the YMM
// scheduling class and the VEX_L tag.  Gated on HasAVX2.
3827let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
3828  defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256,
3829                              i256mem, SchedWriteShuffle.YMM, load, 0>,
3830                              VEX_4V, VEX_L, VEX_WIG;
3831  defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256,
3832                              i256mem, SchedWriteShuffle.YMM, load, 0>,
3833                              VEX_4V, VEX_L, VEX_WIG;
3834
3835  defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256,
3836                              i256mem, SchedWriteShuffle.YMM, load, 0>,
3837                              VEX_4V, VEX_L, VEX_WIG;
3838  defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256,
3839                              i256mem, SchedWriteShuffle.YMM, load, 0>,
3840                              VEX_4V, VEX_L, VEX_WIG;
3841}
3842
// Legacy pack instructions: $src1 is tied to $dst, the default Is2Addr gives
// the two-operand assembly string, and memory is matched through memop.  No
// Predicates list is set in this block.
3843let Constraints = "$src1 = $dst" in {
3844  defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128,
3845                            i128mem, SchedWriteShuffle.XMM, memop>;
3846  defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128,
3847                            i128mem, SchedWriteShuffle.XMM, memop>;
3848
3849  defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128,
3850                            i128mem, SchedWriteShuffle.XMM, memop>;
3851
3852  defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128,
3853                            i128mem, SchedWriteShuffle.XMM, memop>;
3854}
3855} // ExeDomain = SSEPackedInt
3856
3857//===---------------------------------------------------------------------===//
3858// SSE2 - Packed Integer Unpack Instructions
3859//===---------------------------------------------------------------------===//
3860
3861let ExeDomain = SSEPackedInt in {
/// sse2_unpack - Unpack (interleave) instructions built on PDI.  Defines rr
/// and rm forms computing (vt (OpNode src1, src2)) on register class RC.  The
/// rm form reads $src2 from x86memop through ld_frag.  Is2Addr selects the
/// two-operand assembly string when set and the three-operand one otherwise.
3862multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
3863                       SDNode OpNode, RegisterClass RC, X86MemOperand x86memop,
3864                       X86FoldableSchedWrite sched, PatFrag ld_frag,
3865                       bit Is2Addr = 1> {
3866  def rr : PDI<opc, MRMSrcReg,
3867      (outs RC:$dst), (ins RC:$src1, RC:$src2),
3868      !if(Is2Addr,
3869          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
3870          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3871      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
3872      Sched<[sched]>;
3873  def rm : PDI<opc, MRMSrcMem,
3874      (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
3875      !if(Is2Addr,
3876          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
3877          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3878      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
3879      Sched<[sched.Folded, sched.ReadAfterFold]>;
3880}
3881
// VEX 128-bit unpacks on byte and word elements (v16i8 / v8i16), gated on
// NoVLX_Or_NoBWI.  X86Unpckl backs the "l" mnemonics and X86Unpckh the "h"
// mnemonics.
3882let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
3883  defm VPUNPCKLBW  : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128,
3884                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
3885                                 VEX_4V, VEX_WIG;
3886  defm VPUNPCKLWD  : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128,
3887                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
3888                                 VEX_4V, VEX_WIG;
3889  defm VPUNPCKHBW  : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128,
3890                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
3891                                 VEX_4V, VEX_WIG;
3892  defm VPUNPCKHWD  : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128,
3893                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
3894                                 VEX_4V, VEX_WIG;
3895}
3896
// VEX 128-bit unpacks on dword and qword elements (v4i32 / v2i64).  These are
// gated on NoVLX only, unlike the byte/word variants which use
// NoVLX_Or_NoBWI.
3897let Predicates = [HasAVX, NoVLX] in {
3898  defm VPUNPCKLDQ  : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128,
3899                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
3900                                 VEX_4V, VEX_WIG;
3901  defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128,
3902                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
3903                                 VEX_4V, VEX_WIG;
3904  defm VPUNPCKHDQ  : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128,
3905                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
3906                                 VEX_4V, VEX_WIG;
3907  defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128,
3908                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
3909                                 VEX_4V, VEX_WIG;
3910}
3911
// VEX 256-bit unpacks on byte and word elements (v32i8 / v16i16): same
// opcodes as the 128-bit forms, with VR256 / i256mem operands, the YMM
// scheduling class and the VEX_L tag.  Gated on HasAVX2.
3912let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
3913  defm VPUNPCKLBWY  : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256,
3914                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
3915                                  VEX_4V, VEX_L, VEX_WIG;
3916  defm VPUNPCKLWDY  : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256,
3917                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
3918                                  VEX_4V, VEX_L, VEX_WIG;
3919  defm VPUNPCKHBWY  : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256,
3920                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
3921                                  VEX_4V, VEX_L, VEX_WIG;
3922  defm VPUNPCKHWDY  : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256,
3923                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
3924                                  VEX_4V, VEX_L, VEX_WIG;
3925}
3926
// VEX 256-bit unpacks on dword and qword elements (v8i32 / v4i64), gated on
// HasAVX2 and NoVLX.
3927let Predicates = [HasAVX2, NoVLX] in {
3928  defm VPUNPCKLDQY  : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256,
3929                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
3930                                  VEX_4V, VEX_L, VEX_WIG;
3931  defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256,
3932                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
3933                                  VEX_4V, VEX_L, VEX_WIG;
3934  defm VPUNPCKHDQY  : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256,
3935                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
3936                                  VEX_4V, VEX_L, VEX_WIG;
3937  defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256,
3938                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
3939                                  VEX_4V, VEX_L, VEX_WIG;
3940}
3941
// Legacy unpack instructions for all four element sizes: $src1 is tied to
// $dst, the default Is2Addr gives the two-operand assembly string, and
// memory is matched through memop.  No Predicates list is set in this block.
3942let Constraints = "$src1 = $dst" in {
3943  defm PUNPCKLBW  : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128,
3944                                i128mem, SchedWriteShuffle.XMM, memop>;
3945  defm PUNPCKLWD  : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128,
3946                                i128mem, SchedWriteShuffle.XMM, memop>;
3947  defm PUNPCKLDQ  : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128,
3948                                i128mem, SchedWriteShuffle.XMM, memop>;
3949  defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128,
3950                                i128mem, SchedWriteShuffle.XMM, memop>;
3951
3952  defm PUNPCKHBW  : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128,
3953                                i128mem, SchedWriteShuffle.XMM, memop>;
3954  defm PUNPCKHWD  : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128,
3955                                i128mem, SchedWriteShuffle.XMM, memop>;
3956  defm PUNPCKHDQ  : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128,
3957                                i128mem, SchedWriteShuffle.XMM, memop>;
3958  defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128,
3959                                i128mem, SchedWriteShuffle.XMM, memop>;
3960}
3961} // ExeDomain = SSEPackedInt
3962
3963//===---------------------------------------------------------------------===//
3964// SSE2 - Packed Integer Extract and Insert
3965//===---------------------------------------------------------------------===//
3966
3967let ExeDomain = SSEPackedInt in {
/// sse2_pinsrw - Word insert into a VR128 (opcode 0xC4, node X86pinsrw).
///   rr - value comes from a GR32orGR64 register
///   rm - value comes from a 16-bit memory location, matched with extloadi16
/// $src3 is the 8-bit immediate passed to X86pinsrw as a timm.  Unlike the
/// other multiclasses in this file section, the mnemonic is hard-coded:
/// Is2Addr selects the "pinsrw" two-operand string when set and the "vpinsrw"
/// three-operand string otherwise.
3968multiclass sse2_pinsrw<bit Is2Addr = 1> {
3969  def rr : Ii8<0xC4, MRMSrcReg,
3970       (outs VR128:$dst), (ins VR128:$src1,
3971        GR32orGR64:$src2, u8imm:$src3),
3972       !if(Is2Addr,
3973           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
3974           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
3975       [(set VR128:$dst,
3976         (X86pinsrw VR128:$src1, GR32orGR64:$src2, timm:$src3))]>,
3977       Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
3978  def rm : Ii8<0xC4, MRMSrcMem,
3979                      (outs VR128:$dst), (ins VR128:$src1,
3980                       i16mem:$src2, u8imm:$src3),
3981       !if(Is2Addr,
3982           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
3983           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
3984       [(set VR128:$dst,
3985         (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
3986                    timm:$src3))]>,
3987       Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
3988}
3989
3990// Extract
// Word extract from a v8i16 VR128 into a GR32orGR64 register (opcode 0xC5,
// node X86pextrw).  $src2 is the 8-bit immediate passed to the node as a
// timm.  Only register-destination forms are defined here.
// VEX form: gated on HasAVX and NoBWI; prefix/encoding tags are attached
// explicitly (PD, VEX, VEX_WIG) because the record derives from plain Ii8.
3991let Predicates = [HasAVX, NoBWI] in
3992def VPEXTRWrr : Ii8<0xC5, MRMSrcReg,
3993                    (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
3994                    "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
3995                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
3996                                            timm:$src2))]>,
3997                PD, VEX, VEX_WIG, Sched<[WriteVecExtract]>;
// Legacy form: derives from PDIi8 and has no let-Predicates wrapper of its
// own at this point.
3998def PEXTRWrr : PDIi8<0xC5, MRMSrcReg,
3999                    (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
4000                    "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
4001                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
4002                                            timm:$src2))]>,
4003               Sched<[WriteVecExtract]>;
4004
4005// Insert
// VEX form: Is2Addr = 0 selects the "vpinsrw" three-operand string inside
// sse2_pinsrw; gated on HasAVX and NoBWI.
4006let Predicates = [HasAVX, NoBWI] in
4007defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V, VEX_WIG;
4008
// Legacy form: default Is2Addr selects the "pinsrw" two-operand string;
// $src1 is tied to $dst and the records are gated on UseSSE2.
4009let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
4010defm PINSRW : sse2_pinsrw, PD;
4011
4012} // ExeDomain = SSEPackedInt
4013
4014// Always select FP16 instructions if available.
// f16 load/store/bitcast expressed with the SSE2 word insert/extract
// records.  AddedComplexity = -10 lowers the priority of these patterns,
// presumably so that patterns using native FP16 instructions win when both
// match (see the comment above).  Each pattern works on word element 0 of a
// v8i16:
//   load     - PINSRWrm into an IMPLICIT_DEF vector, then copy to FR16
//   store    - PEXTRWrr element 0, store its sub_16bit part with MOV16mr
//   f16->i16 - PEXTRWrr element 0, take the sub_16bit subregister
//   i16->f16 - widen GR16 with INSERT_SUBREG, PINSRWrr, then copy to FR16
4015let Predicates = [UseSSE2], AddedComplexity = -10 in {
4016  def : Pat<(f16 (load addr:$src)), (COPY_TO_REGCLASS (PINSRWrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0), FR16)>;
4017  def : Pat<(store f16:$src, addr:$dst), (MOV16mr addr:$dst, (EXTRACT_SUBREG (PEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit))>;
4018  def : Pat<(i16 (bitconvert f16:$src)), (EXTRACT_SUBREG (PEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit)>;
4019  def : Pat<(f16 (bitconvert i16:$src)), (COPY_TO_REGCLASS (PINSRWrr (v8i16 (IMPLICIT_DEF)), (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit), 0), FR16)>;
4020}
4021
// AVX counterparts of the SSE2 f16 patterns, using VPINSRWrm / VPEXTRWrr /
// VPINSRWrr and gated on HasAVX and NoBWI.  No AddedComplexity adjustment is
// applied here, and this block has no f16 store pattern.
// NOTE(review): the AVX f16 store is presumably matched elsewhere in the file
// -- confirm.
4022let Predicates = [HasAVX, NoBWI] in {
4023  def : Pat<(f16 (load addr:$src)), (COPY_TO_REGCLASS (VPINSRWrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0), FR16)>;
4024  def : Pat<(i16 (bitconvert f16:$src)), (EXTRACT_SUBREG (VPEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0), sub_16bit)>;
4025  def : Pat<(f16 (bitconvert i16:$src)), (COPY_TO_REGCLASS (VPINSRWrr (v8i16 (IMPLICIT_DEF)), (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit), 0), FR16)>;
4026}
4027
4028//===---------------------------------------------------------------------===//
4029// SSE2 - Packed Mask Creation
4030//===---------------------------------------------------------------------===//
4031
// Byte mask extraction: X86movmsk of a byte vector into a GR32orGR64
// register, opcode 0xD7.  Register-source forms only.
4032let ExeDomain = SSEPackedInt in {
4033
// VEX 128-bit form.  NOTE(review): the asm string is written without the "v"
// prefix and no Predicates are set here; presumably the VPDI class supplies
// both -- confirm in the instruction format definitions.
4034def VPMOVMSKBrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
4035           (ins VR128:$src),
4036           "pmovmskb\t{$src, $dst|$dst, $src}",
4037           [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
4038           Sched<[WriteVecMOVMSK]>, VEX, VEX_WIG;
4039
// VEX 256-bit form: v32i8 source in VR256, gated on HasAVX2, tagged VEX_L and
// scheduled with WriteVecMOVMSKY.
4040let Predicates = [HasAVX2] in {
4041def VPMOVMSKBYrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
4042           (ins VR256:$src),
4043           "pmovmskb\t{$src, $dst|$dst, $src}",
4044           [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>,
4045           Sched<[WriteVecMOVMSKY]>, VEX, VEX_L, VEX_WIG;
4046}
4047
// Legacy form, derived from PDI.
4048def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
4049           "pmovmskb\t{$src, $dst|$dst, $src}",
4050           [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
4051           Sched<[WriteVecMOVMSK]>;
4052
4053} // ExeDomain = SSEPackedInt
4054
4055//===---------------------------------------------------------------------===//
4056// SSE2 - Conditional Store
4057//===---------------------------------------------------------------------===//
4058
// maskmovdqu (opcode 0xF7): matched from int_x86_sse2_maskmov_dqu with the
// source vector, the mask vector and an implicit destination pointer in EDI
// or RDI.  The records have no explicit outputs; the pointer register is
// declared through `Uses` and named in the pattern.  Each encoding (VEX and
// legacy) has an EDI variant and an RDI variant; the RDI variants are gated
// on In64BitMode.
4059let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
4060// As VEX does not have separate instruction contexts for address size
4061// overrides, VMASKMOVDQU and VMASKMOVDQU64 would have a decode conflict.
4062// Prefer VMASKMOVDQU64.
// The EDI variant is therefore marked isAsmParserOnly = 1.
4063let Uses = [EDI], Predicates = [HasAVX], isAsmParserOnly = 1 in
4064def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
4065           (ins VR128:$src, VR128:$mask),
4066           "maskmovdqu\t{$mask, $src|$src, $mask}",
4067           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
4068           VEX, VEX_WIG;
4069let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
4070def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
4071           (ins VR128:$src, VR128:$mask),
4072           "maskmovdqu\t{$mask, $src|$src, $mask}",
4073           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>,
4074           VEX, VEX_WIG;
4075
// Legacy encodings; neither variant is marked isAsmParserOnly.
4076let Uses = [EDI], Predicates = [UseSSE2] in
4077def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
4078           "maskmovdqu\t{$mask, $src|$src, $mask}",
4079           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>;
4080let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
4081def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
4082           "maskmovdqu\t{$mask, $src|$src, $mask}",
4083           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;
4084
4085} // ExeDomain = SSEPackedInt
4086
4087//===---------------------------------------------------------------------===//
4088// SSE2 - Move Doubleword/Quadword
4089//===---------------------------------------------------------------------===//
4090
4091//===---------------------------------------------------------------------===//
4092// Move Int Doubleword to Packed Double Int
4093//
4094let ExeDomain = SSEPackedInt in {
4095def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
4096                        "movd\t{$src, $dst|$dst, $src}",
4097                        [(set VR128:$dst,
4098                          (v4i32 (scalar_to_vector GR32:$src)))]>,
4099                          VEX, Sched<[WriteVecMoveFromGpr]>;
4100def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
4101                        "movd\t{$src, $dst|$dst, $src}",
4102                        [(set VR128:$dst,
4103                          (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
4104                        VEX, Sched<[WriteVecLoad]>;
4105def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
4106                          "movq\t{$src, $dst|$dst, $src}",
4107                          [(set VR128:$dst,
4108                            (v2i64 (scalar_to_vector GR64:$src)))]>,
4109                          VEX, Sched<[WriteVecMoveFromGpr]>;
4110let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
4111def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4112                          "movq\t{$src, $dst|$dst, $src}", []>,
4113                          VEX, Sched<[WriteVecLoad]>;
4114let isCodeGenOnly = 1 in
4115def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
4116                         "movq\t{$src, $dst|$dst, $src}",
4117                         [(set FR64:$dst, (bitconvert GR64:$src))]>,
4118                         VEX, Sched<[WriteVecMoveFromGpr]>;
4119
4120def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
4121                      "movd\t{$src, $dst|$dst, $src}",
4122                      [(set VR128:$dst,
4123                        (v4i32 (scalar_to_vector GR32:$src)))]>,
4124                      Sched<[WriteVecMoveFromGpr]>;
4125def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
4126                      "movd\t{$src, $dst|$dst, $src}",
4127                      [(set VR128:$dst,
4128                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
4129                      Sched<[WriteVecLoad]>;
4130def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
4131                        "movq\t{$src, $dst|$dst, $src}",
4132                        [(set VR128:$dst,
4133                          (v2i64 (scalar_to_vector GR64:$src)))]>,
4134                        Sched<[WriteVecMoveFromGpr]>;
4135let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
4136def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4137                        "movq\t{$src, $dst|$dst, $src}", []>,
4138                        Sched<[WriteVecLoad]>;
// MOV64toSDrr: "movq" from GR64 to FR64 (opcode 0x6E, MRMSrcReg). Selected for
// a bitconvert of a GR64 value into FR64; isCodeGenOnly.
4139let isCodeGenOnly = 1 in
4140def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
4141                       "movq\t{$src, $dst|$dst, $src}",
4142                       [(set FR64:$dst, (bitconvert GR64:$src))]>,
4143                       Sched<[WriteVecMoveFromGpr]>;
4144} // ExeDomain = SSEPackedInt
4145
4146//===---------------------------------------------------------------------===//
4147// Move Int Doubleword to Single Scalar
4148//
// Two "movd" records moving GR32 to FR32 (opcode 0x6E, MRMSrcReg), both
// selected for a bitconvert of GR32 into FR32 and both isCodeGenOnly.
// VMOVDI2SSrr carries the VEX tag; MOVDI2SSrr does not.
4149let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
4150  def VMOVDI2SSrr  : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
4151                        "movd\t{$src, $dst|$dst, $src}",
4152                        [(set FR32:$dst, (bitconvert GR32:$src))]>,
4153                        VEX, Sched<[WriteVecMoveFromGpr]>;
4154
4155  def MOVDI2SSrr  : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
4156                        "movd\t{$src, $dst|$dst, $src}",
4157                        [(set FR32:$dst, (bitconvert GR32:$src))]>,
4158                        Sched<[WriteVecMoveFromGpr]>;
4159
4160} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4161
4162//===---------------------------------------------------------------------===//
4163// Move Packed Doubleword Int to Doubleword Int
4164//
// "movd" records with opcode 0x7E that take element 0 of a v4i32 VR128 source:
//   *rr (MRMDestReg) - extractelt index 0 into a GR32 result.
//   *mr (MRMDestMem) - store of the i32 extractelt index 0 to an i32mem address.
// The V-prefixed records carry the VEX tag; the others do not.
4165let ExeDomain = SSEPackedInt in {
4166def VMOVPDI2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
4167                         "movd\t{$src, $dst|$dst, $src}",
4168                         [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
4169                                          (iPTR 0)))]>, VEX,
4170                         Sched<[WriteVecMoveToGpr]>;
4171def VMOVPDI2DImr  : VS2I<0x7E, MRMDestMem, (outs),
4172                         (ins i32mem:$dst, VR128:$src),
4173                         "movd\t{$src, $dst|$dst, $src}",
4174                         [(store (i32 (extractelt (v4i32 VR128:$src),
4175                                       (iPTR 0))), addr:$dst)]>,
4176                         VEX, Sched<[WriteVecStore]>;
4177def MOVPDI2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
4178                       "movd\t{$src, $dst|$dst, $src}",
4179                       [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
4180                                        (iPTR 0)))]>,
4181                   Sched<[WriteVecMoveToGpr]>;
4182def MOVPDI2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
4183                       "movd\t{$src, $dst|$dst, $src}",
4184                       [(store (i32 (extractelt (v4i32 VR128:$src),
4185                                     (iPTR 0))), addr:$dst)]>,
4186                       Sched<[WriteVecStore]>;
4187} // ExeDomain = SSEPackedInt
4188
4189//===---------------------------------------------------------------------===//
4190// Move Packed Quadword Int first element to Quadword Int
4191//
// "movq" records with opcode 0x7E that read a VR128 source:
//   *rr (MRMDestReg) - extractelt index 0 of the v2i64 source into GR64; both
//                      share SchedRW = [WriteVecMoveToGpr] from the inner let.
//   *mr (MRMDestMem) - i64mem destination, empty pattern list; these set
//                      mayStore = 1 / hasSideEffects = 0 explicitly and are
//                      isCodeGenOnly with ForceDisassemble.
// The V-prefixed records carry the VEX tag; the others do not.
4192let ExeDomain = SSEPackedInt in {
4193let SchedRW = [WriteVecMoveToGpr] in {
4194def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
4195                          "movq\t{$src, $dst|$dst, $src}",
4196                          [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
4197                                                        (iPTR 0)))]>,
4198                      VEX;
4199
4200def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
4201                        "movq\t{$src, $dst|$dst, $src}",
4202                        [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
4203                                                         (iPTR 0)))]>;
4204} //SchedRW
4205
4206let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
4207def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs),
4208                          (ins i64mem:$dst, VR128:$src),
4209                          "movq\t{$src, $dst|$dst, $src}", []>,
4210                          VEX, Sched<[WriteVecStore]>;
4211let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
4212def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4213                        "movq\t{$src, $dst|$dst, $src}", []>,
4214                        Sched<[WriteVecStore]>;
4215} // ExeDomain = SSEPackedInt
4216
4217//===---------------------------------------------------------------------===//
4218// Bitcast FR64 <-> GR64
4219//
// Two "movq" records moving FR64 to GR64 (opcode 0x7E, MRMDestReg), both
// selected for a bitconvert of FR64 into GR64 and both isCodeGenOnly.
// VMOVSDto64rr carries the VEX tag; MOVSDto64rr does not.
4220let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
4221  def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
4222                           "movq\t{$src, $dst|$dst, $src}",
4223                           [(set GR64:$dst, (bitconvert FR64:$src))]>,
4224                           VEX, Sched<[WriteVecMoveToGpr]>;
4225
4226  def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
4227                         "movq\t{$src, $dst|$dst, $src}",
4228                         [(set GR64:$dst, (bitconvert FR64:$src))]>,
4229                         Sched<[WriteVecMoveToGpr]>;
4230} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4231
4232//===---------------------------------------------------------------------===//
4233// Move Scalar Single to Doubleword Int
4234//
// Two "movd" records moving FR32 to GR32 (opcode 0x7E, MRMDestReg), both
// selected for a bitconvert of FR32 into GR32 and both isCodeGenOnly.
// VMOVSS2DIrr carries the VEX tag; MOVSS2DIrr does not.
4235let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
4236  def VMOVSS2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
4237                        "movd\t{$src, $dst|$dst, $src}",
4238                        [(set GR32:$dst, (bitconvert FR32:$src))]>,
4239                        VEX, Sched<[WriteVecMoveToGpr]>;
4240  def MOVSS2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
4241                        "movd\t{$src, $dst|$dst, $src}",
4242                        [(set GR32:$dst, (bitconvert FR32:$src))]>,
4243                        Sched<[WriteVecMoveToGpr]>;
4244} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4245
// Selection patterns guarded by UseAVX:
//  - X86vzmovl of a scalar_to_vector'd GR32 / GR64 selects VMOVDI2PDIrr /
//    VMOV64toPQIrr directly.
//  - X86vzload32 selects VMOVDI2PDIrm; for the v8i32 result the 128-bit
//    instruction is wrapped in SUBREG_TO_REG ... sub_xmm.
4246let Predicates = [UseAVX] in {
4247  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
4248            (VMOVDI2PDIrr GR32:$src)>;
4249
4250  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
4251            (VMOV64toPQIrr GR64:$src)>;
4252
4253  // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
4254  // These instructions also write zeros in the high part of a 256-bit register.
4255  def : Pat<(v4i32 (X86vzload32 addr:$src)),
4256            (VMOVDI2PDIrm addr:$src)>;
4257  def : Pat<(v8i32 (X86vzload32 addr:$src)),
4258            (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
4259}
4260
// Selection patterns guarded by UseSSE2: X86vzmovl of a scalar_to_vector'd
// GR32 / GR64 selects MOVDI2PDIrr / MOV64toPQIrr, and a v4i32 X86vzload32
// selects MOVDI2PDIrm.
4261let Predicates = [UseSSE2] in {
4262  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
4263            (MOVDI2PDIrr GR32:$src)>;
4264
4265  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
4266            (MOV64toPQIrr GR64:$src)>;
4267  def : Pat<(v4i32 (X86vzload32 addr:$src)),
4268            (MOVDI2PDIrm addr:$src)>;
4269}
4270
4271// Before the MC layer of LLVM existed, clang emitted "movd" assembly instead of
4272// "movq" due to a MacOS parsing limitation. In order to parse old assembly, we
4273// add these aliases.
// Each alias maps the "movd"/"vmovd" spelling onto a 64-bit GPR <-> XMM movq
// record. NOTE(review): the trailing 0 is presumably InstAlias's emit argument
// (alias accepted by the parser but not used for printing) - confirm against
// the InstAlias class definition.
4274def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
4275                (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
4276def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
4277                (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
4278// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
4279def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
4280                (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
4281def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
4282                (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;
4283
4284//===---------------------------------------------------------------------===//
4285// SSE2 - Move Quadword
4286//===---------------------------------------------------------------------===//
4287
4288//===---------------------------------------------------------------------===//
4289// Move Quadword Int to Packed Quadword Int
4290//
4291
// Memory-source "movq"/"vmovq" records (opcode 0x7E, MRMSrcMem, XS prefix) that
// match a v2i64 scalar_to_vector of a loadi64. Both inherit
// SchedRW = [WriteVecLoad] from the enclosing let. VMOVQI2PQIrm requires
// UseAVX and is tagged VEX/VEX_WIG; MOVQI2PQIrm requires UseSSE2.
4292let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in {
4293def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4294                    "vmovq\t{$src, $dst|$dst, $src}",
4295                    [(set VR128:$dst,
4296                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
4297                    VEX, Requires<[UseAVX]>, VEX_WIG;
4298def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4299                    "movq\t{$src, $dst|$dst, $src}",
4300                    [(set VR128:$dst,
4301                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
4302                    XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
4303} // ExeDomain, SchedRW
4304
4305//===---------------------------------------------------------------------===//
4306// Move Packed Quadword Int to Quadword Int
4307//
// Store-form "movq" records (opcode 0xD6, MRMDestMem) that match a store of
// the i64 extractelt index 0 of a v2i64 VR128 source to an i64mem address.
// Both inherit SchedRW = [WriteVecStore]; VMOVPQI2QImr adds VEX and VEX_WIG.
4308let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in {
4309def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4310                        "movq\t{$src, $dst|$dst, $src}",
4311                        [(store (i64 (extractelt (v2i64 VR128:$src),
4312                                      (iPTR 0))), addr:$dst)]>,
4313                        VEX, VEX_WIG;
4314def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4315                      "movq\t{$src, $dst|$dst, $src}",
4316                      [(store (i64 (extractelt (v2i64 VR128:$src),
4317                                    (iPTR 0))), addr:$dst)]>;
4318} // ExeDomain, SchedRW
4319
4320// For disassembler only
// Register-to-register forms of opcode 0xD6 (MRMDestReg, VR128 -> VR128) with
// empty pattern lists; isCodeGenOnly with ForceDisassemble, hasSideEffects = 0,
// scheduled as SchedWriteVecLogic.XMM. VMOVPQI2QIrr adds VEX and VEX_WIG.
4321let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
4322    SchedRW = [SchedWriteVecLogic.XMM] in {
4323def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
4324                     "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_WIG;
4325def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
4326                      "movq\t{$src, $dst|$dst, $src}", []>;
4327}
4328
// "vmovq.s" / "movq.s" spellings that map onto the 0xD6 register-to-register
// records VMOVPQI2QIrr / MOVPQI2QIrr. NOTE(review): the trailing 0 is
// presumably InstAlias's emit argument (parse-only alias) - confirm.
4329def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
4330                (VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
4331def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
4332                (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
4333
// Selection patterns guarded by UseAVX:
//  - X86vzload64 selects VMOVQI2PQIrm; for the v4i64 result the 128-bit
//    instruction is wrapped in SUBREG_TO_REG ... sub_xmm.
//  - X86vextractstore64 of a v2i64 VR128 selects the store form VMOVPQI2QImr.
4334let Predicates = [UseAVX] in {
4335  def : Pat<(v2i64 (X86vzload64 addr:$src)),
4336            (VMOVQI2PQIrm addr:$src)>;
4337  def : Pat<(v4i64 (X86vzload64 addr:$src)),
4338            (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
4339
4340  def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
4341            (VMOVPQI2QImr addr:$dst, VR128:$src)>;
4342}
4343
// Selection patterns guarded by UseSSE2: a v2i64 X86vzload64 selects
// MOVQI2PQIrm, and X86vextractstore64 of a v2i64 VR128 selects MOVPQI2QImr.
4344let Predicates = [UseSSE2] in {
4345  def : Pat<(v2i64 (X86vzload64 addr:$src)), (MOVQI2PQIrm addr:$src)>;
4346
4347  def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
4348            (MOVPQI2QImr addr:$dst, VR128:$src)>;
4349}
4350
4351//===---------------------------------------------------------------------===//
4352// Move from XMM to XMM and clear the upper 64 bits. Note: there is an error in
4353// the IA32 documentation; movq xmm1, xmm2 does clear the high bits.
4354//
// Register-to-register "movq"/"vmovq" (opcode 0x7E, MRMSrcReg, XS prefix) that
// match X86vzmovl on a v2i64 VR128 value. Both inherit
// SchedRW = [SchedWriteVecLogic.XMM]. The V form requires UseAVX and is tagged
// VEX/VEX_WIG; the plain form requires UseSSE2.
4355let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
4356def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4357                        "vmovq\t{$src, $dst|$dst, $src}",
4358                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
4359                         XS, VEX, Requires<[UseAVX]>, VEX_WIG;
4360def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4361                        "movq\t{$src, $dst|$dst, $src}",
4362                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
4363                        XS, Requires<[UseSSE2]>;
4364} // ExeDomain, SchedRW
4365
// Reuse the integer-domain movq records for the v2f64 flavour of X86vzmovl:
// VMOVZPQILo2PQIrr under UseAVX, MOVZPQILo2PQIrr under UseSSE2.
4366let Predicates = [UseAVX] in {
4367  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
4368            (VMOVZPQILo2PQIrr VR128:$src)>;
4369}
4370let Predicates = [UseSSE2] in {
4371  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
4372            (MOVZPQILo2PQIrr VR128:$src)>;
4373}
4374
// 256-bit X86vzmovl (v4f64 and v4i64) under UseAVX: extract the sub_xmm
// subregister of the VR256 source, run VMOVZPQILo2PQIrr on it, and place the
// result back at sub_xmm with SUBREG_TO_REG.
4375let Predicates = [UseAVX] in {
4376  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
4377            (SUBREG_TO_REG (i32 0),
4378             (v2f64 (VMOVZPQILo2PQIrr
4379                     (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
4380             sub_xmm)>;
4381  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
4382            (SUBREG_TO_REG (i32 0),
4383             (v2i64 (VMOVZPQILo2PQIrr
4384                     (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
4385             sub_xmm)>;
4386}
4387
4388//===---------------------------------------------------------------------===//
4389// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
4390//===---------------------------------------------------------------------===//
4391
/// sse3_replicate_sfp - Unary op multiclass producing an rr and an rm record.
///   op        - opcode byte shared by both records.
///   OpNode    - SDNode matched by both patterns.
///   OpcodeStr - mnemonic; "\t{$src, $dst|$dst, $src}" is appended to it.
///   vt        - result value type applied to the rr pattern.
///   RC        - register class of the destination (and of the rr source).
///   mem_frag  - load fragment wrapped around addr:$src in the rm pattern.
///   x86memop  - memory operand type of the rm source.
///   sched     - scheduling class; rm uses sched.Folded.
4392multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
4393                              ValueType vt, RegisterClass RC, PatFrag mem_frag,
4394                              X86MemOperand x86memop, X86FoldableSchedWrite sched> {
4395def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
4396                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4397                      [(set RC:$dst, (vt (OpNode RC:$src)))]>,
4398                      Sched<[sched]>;
4399def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
4400                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4401                      [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>,
4402                      Sched<[sched.Folded]>;
4403}
4404
// VEX-tagged instantiations of sse3_replicate_sfp under [HasAVX, NoVLX]:
// opcode 0x16 for vmovshdup and 0x12 for vmovsldup. The plain names use
// v4f32/VR128/f128mem; the Y-suffixed names use v8f32/VR256/f256mem and add
// VEX_L.
4405let Predicates = [HasAVX, NoVLX] in {
4406  defm VMOVSHDUP  : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
4407                                       v4f32, VR128, loadv4f32, f128mem,
4408                                       SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
4409  defm VMOVSLDUP  : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
4410                                       v4f32, VR128, loadv4f32, f128mem,
4411                                       SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
4412  defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
4413                                       v8f32, VR256, loadv8f32, f256mem,
4414                                       SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
4415  defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
4416                                       v8f32, VR256, loadv8f32, f256mem,
4417                                       SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
4418}
// Non-VEX instantiations of sse3_replicate_sfp (v4f32, VR128, f128mem). They
// use the memopv4f32 fragment for the rm form, where the VEX versions use
// loadv4f32.
4419defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
4420                                   memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
4421defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
4422                                   memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
4423
// Integer-typed (v4i32 / v8i32) X86Movshdup and X86Movsldup under
// [HasAVX, NoVLX]: register and `load` operands select the rr / rm records of
// VMOVSHDUP, VMOVSLDUP and their Y (VR256) variants.
4424let Predicates = [HasAVX, NoVLX] in {
4425  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
4426            (VMOVSHDUPrr VR128:$src)>;
4427  def : Pat<(v4i32 (X86Movshdup (load addr:$src))),
4428            (VMOVSHDUPrm addr:$src)>;
4429  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
4430            (VMOVSLDUPrr VR128:$src)>;
4431  def : Pat<(v4i32 (X86Movsldup (load addr:$src))),
4432            (VMOVSLDUPrm addr:$src)>;
4433  def : Pat<(v8i32 (X86Movshdup VR256:$src)),
4434            (VMOVSHDUPYrr VR256:$src)>;
4435  def : Pat<(v8i32 (X86Movshdup (load addr:$src))),
4436            (VMOVSHDUPYrm addr:$src)>;
4437  def : Pat<(v8i32 (X86Movsldup VR256:$src)),
4438            (VMOVSLDUPYrr VR256:$src)>;
4439  def : Pat<(v8i32 (X86Movsldup (load addr:$src))),
4440            (VMOVSLDUPYrm addr:$src)>;
4441}
4442
// Integer-typed (v4i32) X86Movshdup and X86Movsldup under UseSSE3: register
// and `memop` operands select the rr / rm records of MOVSHDUP and MOVSLDUP.
4443let Predicates = [UseSSE3] in {
4444  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
4445            (MOVSHDUPrr VR128:$src)>;
4446  def : Pat<(v4i32 (X86Movshdup (memop addr:$src))),
4447            (MOVSHDUPrm addr:$src)>;
4448  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
4449            (MOVSLDUPrr VR128:$src)>;
4450  def : Pat<(v4i32 (X86Movsldup (memop addr:$src))),
4451            (MOVSLDUPrm addr:$src)>;
4452}
4453
4454//===---------------------------------------------------------------------===//
4455// SSE3 - Replicate Double FP - MOVDDUP
4456//===---------------------------------------------------------------------===//
4457
/// sse3_replicate_dfp - 128-bit X86Movddup multiclass (opcode 0x12).
///   OpcodeStr - mnemonic; "\t{$src, $dst|$dst, $src}" is appended to it.
///   sched     - scheduling widths; rr uses sched.XMM, rm uses sched.XMM.Folded.
/// rr matches X86Movddup of a VR128 source. rm takes an f64mem operand and
/// matches X86Movddup of a scalar_to_vector of a loadf64.
4458multiclass sse3_replicate_dfp<string OpcodeStr, X86SchedWriteWidths sched> {
4459def rr  : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4460                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4461                    [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))]>,
4462                    Sched<[sched.XMM]>;
4463def rm  : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
4464                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4465                    [(set VR128:$dst,
4466                      (v2f64 (X86Movddup
4467                              (scalar_to_vector (loadf64 addr:$src)))))]>,
4468                    Sched<[sched.XMM.Folded]>;
4469}
4470
4471// FIXME: Merge with above classes when there are patterns for the ymm version
/// sse3_replicate_dfp_y - 256-bit X86Movddup multiclass (opcode 0x12).
///   OpcodeStr - mnemonic; "\t{$src, $dst|$dst, $src}" is appended to it.
///   sched     - scheduling widths; rr uses sched.YMM, rm uses sched.YMM.Folded.
/// rr matches X86Movddup of a VR256 source. rm takes an f256mem operand and
/// matches X86Movddup of a loadv4f64.
4472multiclass sse3_replicate_dfp_y<string OpcodeStr, X86SchedWriteWidths sched> {
4473def rr  : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
4474                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4475                    [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
4476                    Sched<[sched.YMM]>;
4477def rm  : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
4478                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4479                    [(set VR256:$dst,
4480                      (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
4481                    Sched<[sched.YMM.Folded]>;
4482}
4483
// VEX-tagged "vmovddup" under [HasAVX, NoVLX]: the 128-bit records come from
// sse3_replicate_dfp, the 256-bit (VEX_L) records from sse3_replicate_dfp_y.
4484let Predicates = [HasAVX, NoVLX] in {
4485  defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup", SchedWriteFShuffle>,
4486                                      VEX, VEX_WIG;
4487  defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup", SchedWriteFShuffle>,
4488                                        VEX, VEX_L, VEX_WIG;
4489}
4490
// Non-VEX "movddup": 128-bit rr/rm records from sse3_replicate_dfp.
4491defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;
4492
4493
// Select the memory form VMOVDDUPrm when X86Movddup is applied to a v2f64
// X86vzload64. The predicate list comes solely from the enclosing
// `let Predicates = [HasAVX, NoVLX]`: a top-level let is applied after parent
// classes are inherited, so a trailing Requires<[HasAVX]> on the pattern would
// be overridden and is deliberately not used here.
4494let Predicates = [HasAVX, NoVLX] in {
4495  def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
4496            (VMOVDDUPrm addr:$src)>;
4497}
4498
// Under UseSSE3, select the memory form MOVDDUPrm when X86Movddup is applied
// to a v2f64 X86vzload64.
4499let Predicates = [UseSSE3] in {
4500  def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
4501            (MOVDDUPrm addr:$src)>;
4502}
4503
4504//===---------------------------------------------------------------------===//
4505// SSE3 - Move Unaligned Integer
4506//===---------------------------------------------------------------------===//
4507
// VEX-tagged "vlddqu" loads (opcode 0xF0, MRMSrcMem) under HasAVX.
// VLDDQUrm reads i128mem into VR128 and matches int_x86_sse3_ldu_dq.
// VLDDQUYrm reads i256mem into VR256, matches int_x86_avx_ldu_dq_256 and
// adds VEX_L.
4508let Predicates = [HasAVX] in {
4509  def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
4510                      "vlddqu\t{$src, $dst|$dst, $src}",
4511                      [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
4512                      Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
4513  def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
4514                       "vlddqu\t{$src, $dst|$dst, $src}",
4515                       [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
4516                       Sched<[SchedWriteVecMoveLS.YMM.RM]>, VEX, VEX_L, VEX_WIG;
4517} // Predicates
4518
// LDDQUrm: non-VEX "lddqu" load (opcode 0xF0, MRMSrcMem) from i128mem into
// VR128; matches the int_x86_sse3_ldu_dq intrinsic.
4519def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
4520                   "lddqu\t{$src, $dst|$dst, $src}",
4521                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
4522                   Sched<[SchedWriteVecMoveLS.XMM.RM]>;
4523
4524//===---------------------------------------------------------------------===//
4525// SSE3 - Arithmetic
4526//===---------------------------------------------------------------------===//
4527
/// sse3_addsub - X86Addsub multiclass producing rr and rm records (opcode 0xD0).
///   OpcodeStr - mnemonic prefix of the assembly string.
///   vt        - result value type of the pattern.
///   RC        - register class of $dst, $src1 and (for rr) $src2.
///   x86memop  - memory operand type of the rm $src2.
///   sched     - scheduling class; rm uses sched.Folded and sched.ReadAfterFold.
///   ld_frag   - load fragment wrapped around addr:$src2 in the rm pattern.
///   Is2Addr   - 1 selects the two-operand asm string ($src2, $dst); 0 selects
///               the three-operand string ($src2, $src1, $dst).
/// Both records list MXCSR in Uses and set mayRaiseFPException.
4528multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC,
4529                       X86MemOperand x86memop, X86FoldableSchedWrite sched,
4530                       PatFrag ld_frag, bit Is2Addr = 1> {
4531let Uses = [MXCSR], mayRaiseFPException = 1 in {
4532  def rr : I<0xD0, MRMSrcReg,
4533       (outs RC:$dst), (ins RC:$src1, RC:$src2),
4534       !if(Is2Addr,
4535           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4536           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4537       [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))]>,
4538       Sched<[sched]>;
4539  def rm : I<0xD0, MRMSrcMem,
4540       (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4541       !if(Is2Addr,
4542           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4543           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4544       [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>,
4545       Sched<[sched.Folded, sched.ReadAfterFold]>;
4546}
4547}
4548
// VEX_4V instantiations of sse3_addsub under HasAVX, all with Is2Addr = 0:
// the PS forms (XD prefix, SSEPackedSingle domain) and the PD forms (PD
// prefix, SSEPackedDouble domain), each in a 128-bit and a 256-bit (Y, VEX_L)
// variant.
4549let Predicates = [HasAVX] in {
4550  let ExeDomain = SSEPackedSingle in {
4551    defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem,
4552                                 SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>,
4553                                 XD, VEX_4V, VEX_WIG;
4554    defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem,
4555                                  SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>,
4556                                  XD, VEX_4V, VEX_L, VEX_WIG;
4557  }
4558  let ExeDomain = SSEPackedDouble in {
4559    defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem,
4560                                 SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>,
4561                                 PD, VEX_4V, VEX_WIG;
4562    defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem,
4563                                  SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>,
4564                                  PD, VEX_4V, VEX_L, VEX_WIG;
4565  }
4566}
// Non-VEX instantiations of sse3_addsub under UseSSE3. Is2Addr keeps its
// default of 1 and $src1 is tied to $dst through Constraints; the rm forms use
// the memopv4f32 / memopv2f64 fragments.
4567let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
4568  let ExeDomain = SSEPackedSingle in
4569  defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem,
4570                              SchedWriteFAddSizes.PS.XMM, memopv4f32>, XD;
4571  let ExeDomain = SSEPackedDouble in
4572  defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem,
4573                              SchedWriteFAddSizes.PD.XMM, memopv2f64>, PD;
4574}
4575
4576//===---------------------------------------------------------------------===//
4577// SSE3 Instructions
4578//===---------------------------------------------------------------------===//
4579
4580// Horizontal ops
/// S3D_Int - Binary op multiclass producing rr and rm records on the S3DI base.
///   o         - opcode byte shared by both records.
///   OpcodeStr - mnemonic prefix of the assembly string.
///   vt        - result value type of the pattern.
///   RC        - register class of $dst, $src1 and (for rr) $src2.
///   x86memop  - memory operand type of the rm $src2.
///   OpNode    - SDNode matched by both patterns.
///   sched     - scheduling class; rm uses sched.Folded and sched.ReadAfterFold.
///   ld_frag   - load fragment wrapped around addr:$src2 in the rm pattern.
///   Is2Addr   - 1 selects the two-operand asm string, 0 the three-operand one.
/// Both records list MXCSR in Uses and set mayRaiseFPException.
4581multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
4582                   X86MemOperand x86memop, SDNode OpNode,
4583                   X86FoldableSchedWrite sched, PatFrag ld_frag,
4584                   bit Is2Addr = 1> {
4585let Uses = [MXCSR], mayRaiseFPException = 1 in {
4586  def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
4587       !if(Is2Addr,
4588         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4589         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4590      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
4591      Sched<[sched]>;
4592
4593  def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4594       !if(Is2Addr,
4595         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4596         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4597      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
4598      Sched<[sched.Folded, sched.ReadAfterFold]>;
4599}
4600}
/// S3_Int - Same shape as S3D_Int (rr and rm records for a binary OpNode), but
/// built on the S3I base class instead of S3DI. Parameters:
///   o / OpcodeStr / vt / RC / x86memop / OpNode - opcode, mnemonic, result
///       type, register class, memory operand type and matched SDNode.
///   sched     - scheduling class; rm uses sched.Folded and sched.ReadAfterFold.
///   ld_frag   - load fragment wrapped around addr:$src2 in the rm pattern.
///   Is2Addr   - 1 selects the two-operand asm string, 0 the three-operand one.
/// Both records list MXCSR in Uses and set mayRaiseFPException.
4601multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
4602                  X86MemOperand x86memop, SDNode OpNode,
4603                  X86FoldableSchedWrite sched, PatFrag ld_frag,
4604                  bit Is2Addr = 1> {
4605let Uses = [MXCSR], mayRaiseFPException = 1 in {
4606  def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
4607       !if(Is2Addr,
4608         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4609         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4610      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
4611        Sched<[sched]>;
4612
4613  def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4614       !if(Is2Addr,
4615         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4616         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4617      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
4618        Sched<[sched.Folded, sched.ReadAfterFold]>;
4619}
4620}
4621
// VEX_4V horizontal add/sub under HasAVX, all with Is2Addr = 0. Opcode 0x7C is
// used for the hadd forms and 0x7D for the hsub forms. The PS forms use
// S3D_Int, the PD forms use S3_Int; Y-suffixed names are the VR256 variants
// (VEX_L, WriteFHAddY).
4622let Predicates = [HasAVX] in {
4623  let ExeDomain = SSEPackedSingle in {
4624    defm VHADDPS  : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
4625                            X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
4626    defm VHSUBPS  : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
4627                            X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
4628    defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
4629                            X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
4630    defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
4631                            X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
4632  }
4633  let ExeDomain = SSEPackedDouble in {
4634    defm VHADDPD  : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem,
4635                           X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
4636    defm VHSUBPD  : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem,
4637                           X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
4638    defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem,
4639                           X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
4640    defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem,
4641                           X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
4642  }
4643}
4644
// Non-VEX horizontal add/sub. Is2Addr keeps its default of 1 and $src1 is tied
// to $dst through Constraints; the rm forms use the memopv4f32 / memopv2f64
// fragments. PS forms use S3D_Int, PD forms use S3_Int.
4645let Constraints = "$src1 = $dst" in {
4646  let ExeDomain = SSEPackedSingle in {
4647    defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
4648                          WriteFHAdd, memopv4f32>;
4649    defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
4650                          WriteFHAdd, memopv4f32>;
4651  }
4652  let ExeDomain = SSEPackedDouble in {
4653    defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
4654                         WriteFHAdd, memopv2f64>;
4655    defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
4656                         WriteFHAdd, memopv2f64>;
4657  }
4658}
4659
4660//===---------------------------------------------------------------------===//
4661// SSSE3 - Packed Absolute Instructions
4662//===---------------------------------------------------------------------===//
4663
4664/// SS3I_unop_rm - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
///   opc       - opcode byte shared by both records.
///   OpcodeStr - mnemonic; "\t{$src, $dst|$dst, $src}" is appended to it.
///   vt        - result value type of the pattern.
///   OpNode    - SDNode matched by both patterns.
///   sched     - scheduling widths; rr uses sched.XMM, rm uses sched.XMM.Folded.
///   ld_frag   - load fragment wrapped around addr:$src in the rm pattern.
/// Both records are 128-bit: VR128 destination, VR128 or i128mem source.
4665multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
4666                        SDNode OpNode, X86SchedWriteWidths sched, PatFrag ld_frag> {
4667  def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
4668                 (ins VR128:$src),
4669                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4670                 [(set VR128:$dst, (vt (OpNode VR128:$src)))]>,
4671                 Sched<[sched.XMM]>;
4672
4673  def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
4674                 (ins i128mem:$src),
4675                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4676                 [(set VR128:$dst,
4677                   (vt (OpNode (ld_frag addr:$src))))]>,
4678                 Sched<[sched.XMM.Folded]>;
4679}
4680
4681/// SS3I_unop_rm_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
///   opc       - opcode byte shared by both records.
///   OpcodeStr - mnemonic; "\t{$src, $dst|$dst, $src}" is appended to it.
///   vt        - result value type of the pattern.
///   OpNode    - SDNode matched by both patterns.
///   sched     - scheduling widths; Yrr uses sched.YMM, Yrm uses sched.YMM.Folded.
/// 256-bit counterpart of the 128-bit unary multiclass: records are named
/// Yrr / Yrm, operate on VR256 / i256mem, and the memory pattern always uses
/// the plain `load` fragment (there is no ld_frag parameter).
4682multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
4683                          SDNode OpNode, X86SchedWriteWidths sched> {
4684  def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
4685                  (ins VR256:$src),
4686                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4687                  [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
4688                  Sched<[sched.YMM]>;
4689
4690  def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
4691                  (ins i256mem:$src),
4692                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4693                  [(set VR256:$dst,
4694                    (vt (OpNode (load addr:$src))))]>,
4695                  Sched<[sched.YMM.Folded]>;
4696}
4697
// VEX-tagged packed-abs instantiations (OpNode = abs). Opcodes: 0x1C vpabsb,
// 0x1D vpabsw, 0x1E vpabsd. The 128-bit records come from SS3I_unop_rm under
// HasAVX; the 256-bit Yrr/Yrm records come from SS3I_unop_rm_y under HasAVX2
// with VEX_L. The byte/word forms are guarded by NoVLX_Or_NoBWI, the dword
// forms by NoVLX.
4698let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
4699  defm VPABSB  : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU,
4700                              load>, VEX, VEX_WIG;
4701  defm VPABSW  : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU,
4702                              load>, VEX, VEX_WIG;
4703}
4704let Predicates = [HasAVX, NoVLX] in {
4705  defm VPABSD  : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU,
4706                              load>, VEX, VEX_WIG;
4707}
4708let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4709  defm VPABSB  : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>,
4710                                VEX, VEX_L, VEX_WIG;
4711  defm VPABSW  : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SchedWriteVecALU>,
4712                                VEX, VEX_L, VEX_WIG;
4713}
4714let Predicates = [HasAVX2, NoVLX] in {
4715  defm VPABSD  : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SchedWriteVecALU>,
4716                                VEX, VEX_L, VEX_WIG;
4717}
4718
// Non-VEX packed-abs instantiations of SS3I_unop_rm (OpNode = abs), using the
// `memop` fragment for the rm forms: 0x1C pabsb, 0x1D pabsw, 0x1E pabsd.
4719defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU,
4720                          memop>;
4721defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU,
4722                          memop>;
4723defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU,
4724                          memop>;
4725
4726//===---------------------------------------------------------------------===//
4727// SSSE3 - Packed Binary Operator Instructions
4728//===---------------------------------------------------------------------===//
4729
4730/// SS3I_binop_rm - Simple SSSE3 bin op
///   opc        - opcode byte shared by both records.
///   OpcodeStr  - mnemonic prefix of the assembly string.
///   OpNode     - SDNode matched by both patterns.
///   DstVT      - value type of the result.
///   OpVT       - value type applied to $src1 in the patterns.
///   RC         - register class of $dst, $src1 and (for rr) $src2.
///   memop_frag - load fragment wrapped around addr:$src2 in the rm pattern.
///   x86memop   - memory operand type of the rm $src2.
///   sched      - scheduling class; rm uses sched.Folded and sched.ReadAfterFold.
///   Is2Addr    - 1 selects the two-operand asm string, 0 the three-operand one.
/// Only the rr record is wrapped in `let isCommutable = 1`.
4731multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
4732                         ValueType DstVT, ValueType OpVT, RegisterClass RC,
4733                         PatFrag memop_frag, X86MemOperand x86memop,
4734                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
4735  let isCommutable = 1 in
4736  def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
4737       (ins RC:$src1, RC:$src2),
4738       !if(Is2Addr,
4739         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4740         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4741       [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))]>,
4742       Sched<[sched]>;
4743  def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
4744       (ins RC:$src1, x86memop:$src2),
4745       !if(Is2Addr,
4746         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4747         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4748       [(set RC:$dst,
4749         (DstVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))]>,
4750       Sched<[sched.Folded, sched.ReadAfterFold]>;
4751}
4752
4753/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
///   opc       - opcode byte shared by both records.
///   OpcodeStr - mnemonic prefix of the assembly string.
///   IntId128  - intrinsic matched by both patterns.
///   sched     - scheduling class; rm uses sched.Folded and sched.ReadAfterFold.
///   ld_frag   - load fragment wrapped around addr:$src2 in the rm pattern.
///   Is2Addr   - 1 selects the two-operand asm string, 0 the three-operand one.
/// Both records are 128-bit (VR128 / i128mem); only rr is wrapped in
/// `let isCommutable = 1`.
4754multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
4755                             Intrinsic IntId128, X86FoldableSchedWrite sched,
4756                             PatFrag ld_frag, bit Is2Addr = 1> {
4757  let isCommutable = 1 in
4758  def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
4759       (ins VR128:$src1, VR128:$src2),
4760       !if(Is2Addr,
4761         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4762         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4763       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
4764       Sched<[sched]>;
4765  def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
4766       (ins VR128:$src1, i128mem:$src2),
4767       !if(Is2Addr,
4768         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4769         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4770       [(set VR128:$dst,
4771         (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
4772       Sched<[sched.Folded, sched.ReadAfterFold]>;
4773}
4774
/// SS3I_binop_rm_int_y - 256-bit variant of SS3I_binop_rm_int.
///
/// Emits `Yrr` and `Yrm` on VR256 / i256mem, selected through the intrinsic
/// `IntId256`.  Unlike the 128-bit multiclass there is no Is2Addr switch (the
/// three-operand asm string is always used) and the memory form always
/// matches the plain `load` fragment.
4775multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
4776                               Intrinsic IntId256,
4777                               X86FoldableSchedWrite sched> {
  // Register-register form.
4778  let isCommutable = 1 in
4779  def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
4780       (ins VR256:$src1, VR256:$src2),
4781       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4782       [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
4783       Sched<[sched]>;
  // Register-memory form.
4784  def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
4785       (ins VR256:$src1, i256mem:$src2),
4786       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4787       [(set VR256:$dst,
4788         (IntId256 VR256:$src1, (load addr:$src2)))]>,
4789       Sched<[sched.Folded, sched.ReadAfterFold]>;
4790}
4791
// VEX-encoded 128-bit byte shuffle and multiply forms.  Gated on
// NoVLX_Or_NoBWI in addition to HasAVX.  Is2Addr is passed as 0, so the
// three-operand asm string is used, and the memory forms match plain `load`.
4792let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
// These two are instantiated with isCommutable = 0.
4793let isCommutable = 0 in {
4794  defm VPSHUFB    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
4795                                  VR128, load, i128mem,
4796                                  SchedWriteVarShuffle.XMM, 0>, VEX_4V, VEX_WIG;
4797  defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
4798                                  v16i8, VR128, load, i128mem,
4799                                  SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
4800}
// VPMULHRSW is left outside the isCommutable = 0 scope above.
4801defm VPMULHRSW    : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
4802                                  VR128, load, i128mem,
4803                                  SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
4804}
4805
// VEX-encoded 128-bit horizontal add/sub and sign instructions, gated on
// HasAVX only.  All are instantiated with isCommutable = 0 and Is2Addr = 0.
// The add/sub forms are selected through the X86hadd / X86hsub nodes; the
// sign and saturating forms are selected through their ssse3 intrinsics.
4806let ImmT = NoImm, Predicates = [HasAVX] in {
4807let isCommutable = 0 in {
4808  defm VPHADDW    : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
4809                                  load, i128mem,
4810                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4811  defm VPHADDD    : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
4812                                  load, i128mem,
4813                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4814  defm VPHSUBW    : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
4815                                  load, i128mem,
4816                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4817  defm VPHSUBD    : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
4818                                  load, i128mem,
4819                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4820  defm VPSIGNB    : SS3I_binop_rm_int<0x08, "vpsignb",
4821                                      int_x86_ssse3_psign_b_128,
4822                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
4823  defm VPSIGNW    : SS3I_binop_rm_int<0x09, "vpsignw",
4824                                      int_x86_ssse3_psign_w_128,
4825                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
4826  defm VPSIGND    : SS3I_binop_rm_int<0x0A, "vpsignd",
4827                                      int_x86_ssse3_psign_d_128,
4828                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
4829  defm VPHADDSW   : SS3I_binop_rm_int<0x03, "vphaddsw",
4830                                      int_x86_ssse3_phadd_sw_128,
4831                                      SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
4832  defm VPHSUBSW   : SS3I_binop_rm_int<0x07, "vphsubsw",
4833                                      int_x86_ssse3_phsub_sw_128,
4834                                      SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
4835}
4836}
4837
// 256-bit (VEX_L) counterparts of VPSHUFB / VPMADDUBSW / VPMULHRSW, gated on
// HasAVX2 and NoVLX_Or_NoBWI.  Same structure as the 128-bit group: the first
// two sit under isCommutable = 0, VPMULHRSWY does not.
4838let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4839let isCommutable = 0 in {
4840  defm VPSHUFBY   : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
4841                                  VR256, load, i256mem,
4842                                  SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4843  defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
4844                                   v32i8, VR256, load, i256mem,
4845                                   SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4846}
4847defm VPMULHRSWY   : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
4848                                  VR256, load, i256mem,
4849                                  SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4850}
4851
// 256-bit (VEX_L) horizontal add/sub and sign instructions, gated on HasAVX2.
// The SDNode-selected forms carry an explicit `Y` in the defm name.  The
// intrinsic-selected forms reuse the 128-bit defm names (VPSIGNB, ...), since
// SS3I_binop_rm_int_y itself appends `Yrr` / `Yrm` to the record names.
4852let ImmT = NoImm, Predicates = [HasAVX2] in {
4853let isCommutable = 0 in {
4854  defm VPHADDWY   : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
4855                                  VR256, load, i256mem,
4856                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4857  defm VPHADDDY   : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
4858                                  load, i256mem,
4859                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4860  defm VPHSUBWY   : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
4861                                  VR256, load, i256mem,
4862                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4863  defm VPHSUBDY   : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
4864                                  load, i256mem,
4865                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4866  defm VPSIGNB   : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
4867                                       SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
4868  defm VPSIGNW   : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
4869                                       SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
4870  defm VPSIGND   : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
4871                                       SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
4872  defm VPHADDSW  : SS3I_binop_rm_int_y<0x03, "vphaddsw",
4873                                       int_x86_avx2_phadd_sw,
4874                                       SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
4875  defm VPHSUBSW  : SS3I_binop_rm_int_y<0x07, "vphsubsw",
4876                                       int_x86_avx2_phsub_sw,
4877                                       SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
4878}
4879}
4880
// Legacy-encoded SSSE3 binary operations.  $src1 is tied to $dst, Is2Addr is
// left at its default of 1 (two-operand asm string), and the memory forms
// match the `memop` fragment instead of plain `load`.
4881// None of these have i8 immediate fields.
4882let ImmT = NoImm, Constraints = "$src1 = $dst" in {
4883let isCommutable = 0 in {
4884  defm PHADDW    : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
4885                                 memop, i128mem, SchedWritePHAdd.XMM>;
4886  defm PHADDD    : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
4887                                 memop, i128mem, SchedWritePHAdd.XMM>;
4888  defm PHSUBW    : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
4889                                 memop, i128mem, SchedWritePHAdd.XMM>;
4890  defm PHSUBD    : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
4891                                 memop, i128mem, SchedWritePHAdd.XMM>;
4892  defm PSIGNB    : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
4893                                     SchedWriteVecALU.XMM, memop>;
4894  defm PSIGNW    : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
4895                                     SchedWriteVecALU.XMM, memop>;
4896  defm PSIGND    : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
4897                                     SchedWriteVecALU.XMM, memop>;
4898  defm PSHUFB    : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
4899                                 memop, i128mem, SchedWriteVarShuffle.XMM>;
4900  defm PHADDSW   : SS3I_binop_rm_int<0x03, "phaddsw",
4901                                     int_x86_ssse3_phadd_sw_128,
4902                                     SchedWritePHAdd.XMM, memop>;
4903  defm PHSUBSW   : SS3I_binop_rm_int<0x07, "phsubsw",
4904                                     int_x86_ssse3_phsub_sw_128,
4905                                     SchedWritePHAdd.XMM, memop>;
4906  defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
4907                                 v16i8, VR128, memop, i128mem,
4908                                 SchedWriteVecIMul.XMM>;
4909}
// PMULHRSW is left outside the isCommutable = 0 scope above.
4910defm PMULHRSW    : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
4911                                 VR128, memop, i128mem, SchedWriteVecIMul.XMM>;
4912}
4913
4914//===---------------------------------------------------------------------===//
4915// SSSE3 - Packed Align Instruction Patterns
4916//===---------------------------------------------------------------------===//
4917
/// ssse3_palignr - Packed align right with an 8-bit immediate.
///
/// Emits `rri` (register sources) and `rmi` (second source from memory), both
/// with opcode 0x0F in the SS3AI format and selected through X86PAlignr.
///   asm        - mnemonic; the operand list is appended here.
///   VT         - vector type of the result.
///   RC         - register class of $dst, $src1 and $src2.
///   memop_frag - load fragment matched for the memory operand.
///   x86memop   - memory operand class of the `rmi` form.
///   sched      - foldable scheduling class.
///   Is2Addr    - 1 selects the asm string without $src1, 0 the one with it.
4918multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
4919                         PatFrag memop_frag, X86MemOperand x86memop,
4920                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
4921  let hasSideEffects = 0 in {
  // Register form; the immediate is matched as a target constant (timm).
4922  def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst),
4923      (ins RC:$src1, RC:$src2, u8imm:$src3),
4924      !if(Is2Addr,
4925        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
4926        !strconcat(asm,
4927                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
4928      [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 timm:$src3))))]>,
4929      Sched<[sched]>;
  // Memory form; mayLoad is set explicitly in addition to the load pattern.
4930  let mayLoad = 1 in
4931  def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst),
4932      (ins RC:$src1, x86memop:$src2, u8imm:$src3),
4933      !if(Is2Addr,
4934        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
4935        !strconcat(asm,
4936                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
4937      [(set RC:$dst, (VT (X86PAlignr RC:$src1,
4938                                     (memop_frag addr:$src2),
4939                                     (i8 timm:$src3))))]>,
4940      Sched<[sched.Folded, sched.ReadAfterFold]>;
4941  }
4942}
4943
// Instantiations of ssse3_palignr: VEX 128-bit, VEX 256-bit (VEX_L), and the
// legacy two-operand form with $src1 tied to $dst.  The VEX forms match plain
// `load`; the legacy form matches `memop`.
4944let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
4945  defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem,
4946                                SchedWriteShuffle.XMM, 0>, VEX_4V, VEX_WIG;
4947let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
4948  defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem,
4949                                 SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4950let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
4951  defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem,
4952                               SchedWriteShuffle.XMM>;
4953
4954//===---------------------------------------------------------------------===//
4955// SSE3 - Thread synchronization
4956//===---------------------------------------------------------------------===//
4957
// MONITOR and MWAIT.  Neither instruction has explicit operands; the
// registers they read are listed as implicit Uses.  All three records require
// HasSSE3 and are scheduled as WriteSystem.
4958let SchedRW = [WriteSystem] in {
// 32-bit-mode MONITOR: implicit uses are EAX, ECX and EDX.
4959let Uses = [EAX, ECX, EDX] in
4960def MONITOR32rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
4961                     TB, Requires<[HasSSE3, Not64BitMode]>;
// 64-bit-mode MONITOR: same encoding, but the first implicit use is RAX.
4962let Uses = [RAX, ECX, EDX] in
4963def MONITOR64rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
4964                     TB, Requires<[HasSSE3, In64BitMode]>;
4965
// MWAIT is selected from int_x86_sse3_mwait with ECX and EAX as operands.
4966let Uses = [ECX, EAX] in
4967def MWAITrr   : I<0x01, MRM_C9, (outs), (ins), "mwait",
4968                  [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
4969} // SchedRW
4970
// Assembler aliases that accept the implicit registers written out
// explicitly, with one spelling per mode.
4971def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
4972def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;
4973
4974def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITOR32rrr)>,
4975      Requires<[Not64BitMode]>;
4976def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITOR64rrr)>,
4977      Requires<[In64BitMode]>;
4978
4979//===----------------------------------------------------------------------===//
4980// SSE4.1 - Packed Move with Sign/Zero Extend
4981// NOTE: Any Extend is promoted to Zero Extend in X86ISelDAGToDAG.cpp
4982//===----------------------------------------------------------------------===//
4983
/// SS41I_pmovx_rrrm - Register and memory forms of one packed extend move.
///
/// Both records are defined with an empty pattern list; the selection
/// patterns are attached separately by the *_patterns multiclasses later in
/// this file.
///   MemOp - memory operand class of the `rm` form.
///   OutRC - register class of the result.
///   InRC  - register class of the source in the `rr` form.
///   sched - foldable scheduling class (`rm` uses only sched.Folded).
4984multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
4985                            RegisterClass OutRC, RegisterClass InRC,
4986                            X86FoldableSchedWrite sched> {
4987  def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
4988                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
4989                 Sched<[sched]>;
4990
4991  def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
4992                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
4993                 Sched<[sched.Folded]>;
4994}
4995
/// SS41I_pmovx_rm_all - Legacy, VEX 128-bit and VEX 256-bit forms of one
/// packed extend move.
///
///   MemOp  - memory operand class of the 128-bit forms.
///   MemYOp - memory operand class of the 256-bit form.
///   prd    - extra predicate added next to HasAVX / HasAVX2 on the VEX forms.
4996multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
4997                              X86MemOperand MemOp, X86MemOperand MemYOp,
4998                              Predicate prd> {
  // Legacy form: no Predicates are set on it inside this multiclass.
4999  defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128,
5000                               SchedWriteShuffle.XMM>;
  // VEX 128-bit form: the mnemonic gets a "v" prefix.
5001  let Predicates = [HasAVX, prd] in
5002    defm V#NAME   : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
5003                                     VR128, VR128, SchedWriteShuffle.XMM>,
5004                                     VEX, VEX_WIG;
  // VEX 256-bit form: VR256 result from a VR128 source (or MemYOp memory).
5005  let Predicates = [HasAVX2, prd] in
5006    defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
5007                                     VR256, VR128, WriteVPMOV256>,
5008                                     VEX, VEX_L, VEX_WIG;
5009}
5010
/// SS41I_pmovx_rm - Sign- and zero-extending variants for one element pair.
///
/// OpcodeStr is the element-size suffix (e.g. "bw").  The sign-extending form
/// uses `opc`, the zero-extending form uses `opc + 0x10`.
5011multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
5012                          X86MemOperand MemYOp, Predicate prd> {
5013  defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
5014                                        MemOp, MemYOp, prd>;
5015  defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
5016                                        !strconcat("pmovzx", OpcodeStr),
5017                                        MemOp, MemYOp, prd>;
5018}
5019
// One instantiation per source/destination element pair, grouped by the
// memory operand size that is passed.  Only BW is gated on NoVLX_Or_NoBWI;
// the others are gated on NoVLX.
5020defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
5021defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>;
5022defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>;
5023
5024defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>;
5025defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;
5026
5027defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;
5028
5029// AVX2 Patterns
//
// Selection patterns for the 256-bit packed extend moves.
//   OpcPrefix - instruction name prefix; the element suffix and Yrr/Yrm are
//               pasted onto it and the result is cast to an instruction.
//   ExtTy     - prefix of the extending-load fragment name
//               (ExtTy#"extloadvi8" and so on).
//   ExtOp     - node used where the 128-bit source is matched whole.
//   InVecOp   - node used for the remaining (BD, BQ, WQ) source/result pairs.
5030multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy,
5031                                     SDNode ExtOp, SDNode InVecOp> {
5032  // Register-Register patterns
5033  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
5034  def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
5035            (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
5036  }
5037  let Predicates = [HasAVX2, NoVLX] in {
5038  def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))),
5039            (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
5040  def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))),
5041            (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;
5042
5043  def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
5044            (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
5045  def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))),
5046            (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;
5047
5048  def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
5049            (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
5050  }
5051
5052  // Simple Register-Memory patterns
  // (extending-load fragments, plus ExtOp of a full 128-bit load for BW).
5053  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
5054  def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5055            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
5056
5057  def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
5058            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
5059  }
5060
5061  let Predicates = [HasAVX2, NoVLX] in {
5062  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5063            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5064  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5065            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5066
5067  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5068            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
5069  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5070            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5071
5072  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
5073            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
5074  }
5075
5076  // AVX2 Register-Memory patterns
  // (fold a narrower scalar or zero-extending vector load that was bitcast
  // into the 128-bit source).
5077  let Predicates = [HasAVX2, NoVLX] in {
5078  def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
5079            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
5080
5081  def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5082            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5083  def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5084            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5085  def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
5086            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5087
5088  def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
5089            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
5090
5091  def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5092            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
  // NOTE(review): this X86vzload32 is typed v2i64, while the 128-bit patterns
  // in SS41I_pmovx_patterns type X86vzload32 as v4i32 -- confirm intended.
5093  def : Pat<(v4i64 (InVecOp (bc_v16i8 (v2i64 (X86vzload32 addr:$src))))),
5094            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5095
5096  def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5097            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5098  def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5099            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5100  def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
5101            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5102  }
5103}
5104
// Instantiate the AVX2 patterns for the sign- and zero-extending families.
5105defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>;
5106defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>;
5107
5108// SSE4.1/AVX patterns.
//
// Selection patterns for the 128-bit packed extend moves.
//   OpcPrefix - instruction name prefix; the element suffix and rr/rm are
//               pasted onto it and the result is cast to an instruction.
//   ExtTy     - prefix of the extending-load fragment name.
//   ExtOp     - extend node matched on the 128-bit source.
// Every pattern group sets its own Predicates (HasAVX plus NoVLX or
// NoVLX_Or_NoBWI); only the BW patterns use NoVLX_Or_NoBWI.
5109multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
5110                                SDNode ExtOp> {
  // Register-register patterns.
5111  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5112  def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
5113            (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
5114  }
5115  let Predicates = [HasAVX, NoVLX] in {
5116  def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
5117            (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
5118  def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
5119            (!cast<I>(OpcPrefix#BQrr) VR128:$src)>;
5120
5121  def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
5122            (!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
5123  def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
5124            (!cast<I>(OpcPrefix#WQrr) VR128:$src)>;
5125
5126  def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
5127            (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;
5128  }
  // Register-memory patterns matching the extending-load fragments.
5129  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5130  def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5131            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5132  }
5133  let Predicates = [HasAVX, NoVLX] in {
5134  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5135            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5136  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5137            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5138
5139  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5140            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5141  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5142            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5143
5144  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
5145            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5146  }
  // Register-memory patterns that fold a scalar load, a zero-extending vector
  // load or a full vector load feeding ExtOp.
5147  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5148  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5149            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5150  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5151            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5152  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
5153            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5154  def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))),
5155            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5156  }
5157  let Predicates = [HasAVX, NoVLX] in {
5158  def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5159            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5160  def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
5161            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5162  def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))),
5163            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5164
  // BQ: the scalar source here is an i16 memory value extended to i32.
5165  def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
5166            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5167  def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))),
5168            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5169
5170  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5171            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5172  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5173            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5174  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
5175            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5176  def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))),
5177            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5178
5179  def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5180            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5181  def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))),
5182            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5183  def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))),
5184            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5185
5186  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5187            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5188  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5189            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5190  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
5191            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5192  def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))),
5193            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5194  }
5195}
5196
// VEX instantiations: these use the Predicates set inside the multiclass.
5197defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec>;
5198defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec>;
5199
// Legacy SSE4.1 instantiations.
// NOTE(review): the multiclass sets Predicates on every pattern itself; this
// relies on the enclosing `let Predicates = [UseSSE41]` taking precedence over
// those -- confirm against TableGen's let/defm semantics.
5200let Predicates = [UseSSE41] in {
5201  defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec>;
5202  defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec>;
5203}
5204
5205//===----------------------------------------------------------------------===//
5206// SSE4.1 - Extract Instructions
5207//===----------------------------------------------------------------------===//
5208
5209/// SS41I_extract8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
///
/// `rr` writes the X86pextrb result to a GR32orGR64 register.  `mr` stores
/// the result truncated to i8 to an i8mem destination.  The lane index is an
/// 8-bit immediate matched as a target constant (timm).
5210multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
5211  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5212                 (ins VR128:$src1, u8imm:$src2),
5213                 !strconcat(OpcodeStr,
5214                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5215                 [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
5216                                         timm:$src2))]>,
5217                  Sched<[WriteVecExtract]>;
  // Store form: no register output; mayStore is set explicitly.
5218  let hasSideEffects = 0, mayStore = 1 in
5219  def mr : SS4AIi8<opc, MRMDestMem, (outs),
5220                 (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
5221                 !strconcat(OpcodeStr,
5222                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5223                 [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), timm:$src2))),
5224                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5225}
5226
// VEX form (gated on HasAVX and NoBWI) and legacy form of the byte extract.
5227let Predicates = [HasAVX, NoBWI] in
5228  defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX, VEX_WIG;
5229
5230defm PEXTRB      : SS41I_extract8<0x14, "pextrb">;
5231
5232
5233/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
///
/// `mr` stores the X86pextrw result truncated to i16.  The register form is
/// only provided as `rr_REV`: it has no selection pattern, is marked
/// isCodeGenOnly with ForceDisassemble, and is linked to NAME#rr through
/// FoldGenData.
5234multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
5235  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
5236  def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5237                   (ins VR128:$src1, u8imm:$src2),
5238                   !strconcat(OpcodeStr,
5239                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
5240                   Sched<[WriteVecExtract]>, FoldGenData<NAME#rr>;
5241
  // Store form: no register output; mayStore is set explicitly.
5242  let hasSideEffects = 0, mayStore = 1 in
5243  def mr : SS4AIi8<opc, MRMDestMem, (outs),
5244                 (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
5245                 !strconcat(OpcodeStr,
5246                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5247                 [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), timm:$src2))),
5248                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5249}
5250
// VEX form (gated on HasAVX and NoBWI) and legacy form of the word extract.
5251let Predicates = [HasAVX, NoBWI] in
5252  defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX, VEX_WIG;
5253
5254defm PEXTRW      : SS41I_extract16<0x15, "pextrw">;
5255
// f16 stores: the FR16 value is copied into VR128, viewed as v8i16, and lane
// 0 is stored with the word-extract store form.
5256let Predicates = [UseSSE41] in
5257  def : Pat<(store f16:$src, addr:$dst), (PEXTRWmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>;
5258
5259let Predicates = [HasAVX, NoBWI] in
5260  def : Pat<(store f16:$src, addr:$dst), (VPEXTRWmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>;
5261
5262
5263/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
///
/// Both forms are selected from `extractelt` on a v4i32 source with an `imm`
/// lane index.  `rr` writes a GR32 register; `mr` stores to i32mem.
5264multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
5265  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
5266                 (ins VR128:$src1, u8imm:$src2),
5267                 !strconcat(OpcodeStr,
5268                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5269                 [(set GR32:$dst,
5270                  (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
5271                  Sched<[WriteVecExtract]>;
5272  def mr : SS4AIi8<opc, MRMDestMem, (outs),
5273                 (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
5274                 !strconcat(OpcodeStr,
5275                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5276                 [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
5277                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5278}
5279
// VEX form (gated on HasAVX and NoDQI) and legacy form of the dword extract.
5280let Predicates = [HasAVX, NoDQI] in
5281  defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;
5282
5283defm PEXTRD      : SS41I_extract32<0x16, "pextrd">;
5284
5285/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
///
/// Both forms are selected from `extractelt` on a v2i64 source with an `imm`
/// lane index.  `rr` writes a GR64 register; `mr` stores to i64mem.
5286multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
5287  def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
5288                 (ins VR128:$src1, u8imm:$src2),
5289                 !strconcat(OpcodeStr,
5290                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5291                 [(set GR64:$dst,
5292                  (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
5293                  Sched<[WriteVecExtract]>;
5294  def mr : SS4AIi8<opc, MRMDestMem, (outs),
5295                 (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
5296                 !strconcat(OpcodeStr,
5297                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5298                 [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
5299                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5300}
5301
// Qword extract.  It shares opcode 0x16 with the dword extract above; the VEX
// form adds VEX_W and the legacy form adds REX_W.
5302let Predicates = [HasAVX, NoDQI] in
5303  defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;
5304
5305defm PEXTRQ      : SS41I_extract64<0x16, "pextrq">, REX_W;
5306
5307/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
5308/// destination
///
/// The v4f32 source is bitcast to v4i32 and the selected lane is extracted as
/// an integer.  `rr` writes a GR32orGR64 register; `mr` stores to f32mem.
5309multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
5310  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5311                   (ins VR128:$src1, u8imm:$src2),
5312                   !strconcat(OpcodeStr,
5313                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5314                   [(set GR32orGR64:$dst,
5315                      (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
5316                   Sched<[WriteVecExtract]>;
5317  def mr : SS4AIi8<opc, MRMDestMem, (outs),
5318                   (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
5319                   !strconcat(OpcodeStr,
5320                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5321                   [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
5322                            addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5323}
5324
// Both forms are placed in the SSEPackedSingle execution domain.  The VEX
// form is gated on UseAVX; the legacy form sets no predicate here.
5325let ExeDomain = SSEPackedSingle in {
5326  let Predicates = [UseAVX] in
5327    defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, VEX_WIG;
5328  defm EXTRACTPS   : SS41I_extractf32<0x17, "extractps">;
5329}
5330
5331//===----------------------------------------------------------------------===//
5332// SSE4.1 - Insert Instructions
5333//===----------------------------------------------------------------------===//
5334
/// SS41I_insert8 - Insert a byte into a 128-bit vector, selected through
/// X86pinsrb.
///
/// `rr` takes the byte from a GR32orGR64 register; `rm` takes it from an
/// i8mem operand through `extloadi8`.  The lane index is an 8-bit immediate
/// matched as a target constant (timm).
///   Is2Addr - 1 selects the asm string without $src1, 0 the one with it.
5335multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
5336  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5337      (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
5338      !if(Is2Addr,
5339        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5340        !strconcat(asm,
5341                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5342      [(set VR128:$dst,
5343        (X86pinsrb VR128:$src1, GR32orGR64:$src2, timm:$src3))]>,
5344      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
5345  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5346      (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
5347      !if(Is2Addr,
5348        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5349        !strconcat(asm,
5350                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5351      [(set VR128:$dst,
5352        (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), timm:$src3))]>,
5353                   Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
5354}
5355
// VEX three-operand form (HasAVX and NoBWI) and legacy form with $src1 tied
// to $dst.
5356let Predicates = [HasAVX, NoBWI] in
5357  defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V, VEX_WIG;
5358let Constraints = "$src1 = $dst" in
5359  defm PINSRB  : SS41I_insert8<0x20, "pinsrb">;
5360
/// SS41I_insert32 - Insert a dword into a 128-bit vector, selected from
/// `insertelt` producing v4i32.
///
/// `rr` takes the value from a GR32 register; `rm` loads it with `loadi32`
/// from an i32mem operand.  The lane index is matched as `imm`.
///   Is2Addr - 1 selects the asm string without $src1, 0 the one with it.
5361multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
5362  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5363      (ins VR128:$src1, GR32:$src2, u8imm:$src3),
5364      !if(Is2Addr,
5365        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5366        !strconcat(asm,
5367                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5368      [(set VR128:$dst,
5369        (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
5370      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
5371  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5372      (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
5373      !if(Is2Addr,
5374        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5375        !strconcat(asm,
5376                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5377      [(set VR128:$dst,
5378        (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), imm:$src3)))]>,
5379                   Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
5380}
5381
// Opcode 0x22 dword insert. The VEX-encoded vpinsrd (three-operand assembly)
// is defined only under HasAVX with NoDQI (NOTE(review): presumably a
// DQI-specific definition exists elsewhere -- confirm). The legacy pinsrd
// ties $src1 to $dst.
5382let Predicates = [HasAVX, NoDQI] in
5383  defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
5384let Constraints = "$src1 = $dst" in
5385  defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
5386
// SS41I_insert64 - Register (rr) and memory (rm) forms that match a v2i64
// insertelt of a 64-bit scalar at immediate index $src3.
//   opc     - opcode byte forwarded to SS4AIi8.
//   asm     - mnemonic; the operand string is concatenated onto it.
//   Is2Addr - when 1 (the default) the assembly string omits $src1;
//             when 0 all three sources are printed.
5387multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
  // Register form: the scalar comes from a GR64 register.
5388  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5389      (ins VR128:$src1, GR64:$src2, u8imm:$src3),
5390      !if(Is2Addr,
5391        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5392        !strconcat(asm,
5393                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5394      [(set VR128:$dst,
5395        (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
5396      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  // Memory form: the scalar is a loadi64 from the i64mem operand.
5397  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5398      (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
5399      !if(Is2Addr,
5400        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5401        !strconcat(asm,
5402                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5403      [(set VR128:$dst,
5404        (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), imm:$src3)))]>,
5405                   Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
5406}
5407
// Qword insert. It reuses opcode 0x22 (the same byte the dword insert above
// uses); the 64-bit variants are distinguished here by VEX_W on vpinsrq and
// REX_W on pinsrq. As with the dword form, the VEX variant is gated on
// HasAVX with NoDQI and the legacy variant ties $src1 to $dst.
5408let Predicates = [HasAVX, NoDQI] in
5409  defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
5410let Constraints = "$src1 = $dst" in
5411  defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
5412
5413// insertps has a few different modes: the first two here below are
5414// optimized inserts that won't zero arbitrary elements in the destination
5415// vector. The next one matches the intrinsic and could zero arbitrary elements
5416// in the target vector.
// NOTE(review): this multiclass defines only two records (rr and rm), and
// both match the X86insertps node; the "first two / next one" wording above
// may predate the current layout -- confirm.
//   opc     - opcode byte forwarded to SS4AIi8.
//   asm     - mnemonic; the operand string is concatenated onto it.
//   Is2Addr - when 1 (the default) the assembly string omits $src1;
//             when 0 all three sources are printed.
5417multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
  // Register form; only this form is flagged isCommutable.
5418  let isCommutable = 1 in
5419  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5420      (ins VR128:$src1, VR128:$src2, u8imm:$src3),
5421      !if(Is2Addr,
5422        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5423        !strconcat(asm,
5424                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5425      [(set VR128:$dst,
5426        (X86insertps VR128:$src1, VR128:$src2, timm:$src3))]>,
5427      Sched<[SchedWriteFShuffle.XMM]>;
  // Memory form: the second source is a scalar f32 load wrapped in
  // scalar_to_vector to form the v4f32 operand of X86insertps.
5428  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5429      (ins VR128:$src1, f32mem:$src2, u8imm:$src3),
5430      !if(Is2Addr,
5431        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5432        !strconcat(asm,
5433                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5434      [(set VR128:$dst,
5435        (X86insertps VR128:$src1,
5436                   (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
5437                    timm:$src3))]>,
5438      Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
5439}
5440
// Instantiate SS41I_insertf32 (opcode 0x21) in the packed-single execution
// domain: VEX-encoded vinsertps with three-operand assembly under UseAVX,
// and legacy insertps with $src1 tied to $dst (Is2Addr passed explicitly
// as 1, which is also the multiclass default).
5441let ExeDomain = SSEPackedSingle in {
5442  let Predicates = [UseAVX] in
5443    defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>,
5444                     VEX_4V, VEX_WIG;
5445  let Constraints = "$src1 = $dst" in
5446    defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>;
5447}
5448
5449//===----------------------------------------------------------------------===//
5450// SSE4.1 - Round Instructions
5451//===----------------------------------------------------------------------===//
5452
// sse41_fp_unop_p - Packed unary FP operation with an immediate operand,
// in register (r) and memory (m) source forms.
//   opc       - opcode byte forwarded to SS4AIi8.
//   OpcodeStr - mnemonic; the operand string is concatenated onto it.
//   x86memop  - memory operand class used by the m form.
//   RC        - register class of the destination and register source.
//   VT        - value type the pattern result is cast to.
//   mem_frag  - load fragment used to match the memory source.
//   OpNode    - DAG operator applied to (source, timm:$src2).
//   sched     - scheduling class; the m form uses sched.Folded.
5453multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
5454                           X86MemOperand x86memop, RegisterClass RC,
5455                           ValueType VT, PatFrag mem_frag, SDPatternOperator OpNode,
5456                           X86FoldableSchedWrite sched> {
5457  // Both forms read MXCSR and are marked as possibly raising FP exceptions.
5458  // Vector intrinsic operation, reg
5459let Uses = [MXCSR], mayRaiseFPException = 1 in {
5460  def r : SS4AIi8<opc, MRMSrcReg,
5461                  (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
5462                  !strconcat(OpcodeStr,
5463                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5464                  [(set RC:$dst, (VT (OpNode RC:$src1, timm:$src2)))]>,
5465                  Sched<[sched]>;
5466
5467  // Vector intrinsic operation, mem
5468  def m : SS4AIi8<opc, MRMSrcMem,
5469                  (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
5470                  !strconcat(OpcodeStr,
5471                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5472                  [(set RC:$dst,
5473                        (VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>,
5474                  Sched<[sched.Folded]>;
5475}
5476}
5477
// avx_fp_unop_rm - Scalar FR32/FR64 forms with two register-class sources
// plus an immediate ($src1, $src2, $src3), printed with three-operand
// assembly. All four records (SSr, SSm, SDr, SDm) are isCodeGenOnly, have
// hasSideEffects = 0 and carry an empty pattern list, so they are not
// selected from patterns declared inside this multiclass.
//   opcss / opcsd - opcode bytes for the "ss" and "sd" variants.
//   OpcodeStr     - mnemonic stem; "ss" / "sd" is appended.
//   sched         - scheduling class; memory forms use Folded/ReadAfterFold.
5478multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
5479                          string OpcodeStr, X86FoldableSchedWrite sched> {
5480let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
5481  def SSr : SS4AIi8<opcss, MRMSrcReg,
5482        (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
5483        !strconcat(OpcodeStr,
5484            "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5485      []>, Sched<[sched]>;
5486
  // No pattern is present to imply the load, so mayLoad is set explicitly.
5487  let mayLoad = 1 in
5488  def SSm : SS4AIi8<opcss, MRMSrcMem,
5489        (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
5490        !strconcat(OpcodeStr,
5491             "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5492        []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5493} // ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1
5494
5495let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
5496  def SDr : SS4AIi8<opcsd, MRMSrcReg,
5497        (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
5498        !strconcat(OpcodeStr,
5499              "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5500        []>, Sched<[sched]>;
5501
  // No pattern is present to imply the load, so mayLoad is set explicitly.
5502  let mayLoad = 1 in
5503  def SDm : SS4AIi8<opcsd, MRMSrcMem,
5504        (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
5505        !strconcat(OpcodeStr,
5506             "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5507        []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5508} // ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1
5509}
5510
// sse41_fp_unop_s - Scalar FR32/FR64 forms with a single source plus an
// immediate ($src1, $src2). All four records (SSr, SSm, SDr, SDm) read
// MXCSR, may raise FP exceptions, are isCodeGenOnly with hasSideEffects = 0,
// and carry an empty pattern list.
//   opcss / opcsd - opcode bytes for the "ss" and "sd" variants.
//   OpcodeStr     - mnemonic stem; "ss" / "sd" is appended.
//   sched         - scheduling class; memory forms use Folded/ReadAfterFold.
5511multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
5512                           string OpcodeStr, X86FoldableSchedWrite sched> {
5513let Uses = [MXCSR], mayRaiseFPException = 1 in {
5514let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
5515  def SSr : SS4AIi8<opcss, MRMSrcReg,
5516                    (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
5517                    !strconcat(OpcodeStr,
5518                               "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5519                    []>, Sched<[sched]>;
5520
  // No pattern is present to imply the load, so mayLoad is set explicitly.
5521  let mayLoad = 1 in
5522  def SSm : SS4AIi8<opcss, MRMSrcMem,
5523                    (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
5524                    !strconcat(OpcodeStr,
5525                               "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5526                    []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5527} // ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1
5528
5529let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
5530  def SDr : SS4AIi8<opcsd, MRMSrcReg,
5531                    (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
5532                    !strconcat(OpcodeStr,
5533                               "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5534                    []>, Sched<[sched]>;
5535
  // No pattern is present to imply the load, so mayLoad is set explicitly.
5536  let mayLoad = 1 in
5537  def SDm : SS4AIi8<opcsd, MRMSrcMem,
5538                    (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
5539                    !strconcat(OpcodeStr,
5540                               "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5541                    []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5542} // ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1
5543}
5544}
5545
// sse41_fp_binop_s - VR128 ("_Int") scalar forms taking two vector sources
// and an immediate, matched from OpNode. Defines SSr_Int / SSm_Int and
// SDr_Int / SDm_Int; all read MXCSR and may raise FP exceptions.
//   opcss / opcsd - opcode bytes for the "ss" and "sd" variants.
//   OpcodeStr     - mnemonic stem; "ss" / "sd" is appended.
//   sched         - scheduling class; memory forms use Folded/ReadAfterFold.
//   VT32 / VT64   - result value types for the register-form patterns.
//   OpNode        - DAG node applied to ($src1, $src2, timm:$src3).
//   Is2Addr       - when 1 (the default) the assembly string omits $src1;
//                   when 0 all three sources are printed.
5546multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
5547                            string OpcodeStr, X86FoldableSchedWrite sched,
5548                            ValueType VT32, ValueType VT64,
5549                            SDNode OpNode, bit Is2Addr = 1> {
5550let Uses = [MXCSR], mayRaiseFPException = 1 in {
5551let ExeDomain = SSEPackedSingle in {
5552  def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
5553        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
5554        !if(Is2Addr,
5555            !strconcat(OpcodeStr,
5556                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5557            !strconcat(OpcodeStr,
5558                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5559        [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
5560        Sched<[sched]>;
5561
  // Memory form: second source is matched through sse_load_f32 on an ssmem
  // operand; the pattern result carries no explicit VT32 cast.
5562  def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
5563        (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
5564        !if(Is2Addr,
5565            !strconcat(OpcodeStr,
5566                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5567            !strconcat(OpcodeStr,
5568                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5569        [(set VR128:$dst,
5570             (OpNode VR128:$src1, (sse_load_f32 addr:$src2), timm:$src3))]>,
5571        Sched<[sched.Folded, sched.ReadAfterFold]>;
5572} // ExeDomain = SSEPackedSingle
5573
5574let ExeDomain = SSEPackedDouble in {
5575  def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
5576        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
5577        !if(Is2Addr,
5578            !strconcat(OpcodeStr,
5579                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5580            !strconcat(OpcodeStr,
5581                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5582        [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
5583        Sched<[sched]>;
5584
  // Memory form: second source is matched through sse_load_f64 on an sdmem
  // operand; the pattern result carries no explicit VT64 cast.
5585  def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
5586        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
5587        !if(Is2Addr,
5588            !strconcat(OpcodeStr,
5589                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5590            !strconcat(OpcodeStr,
5591                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5592        [(set VR128:$dst,
5593              (OpNode VR128:$src1, (sse_load_f64 addr:$src2), timm:$src3))]>,
5594        Sched<[sched.Folded, sched.ReadAfterFold]>;
5595} // ExeDomain = SSEPackedDouble
5596}
5597}
5598
5599// FP round - roundss, roundps, roundsd, roundpd
// AVX instantiations. The packed forms (128-bit and 256-bit "Y" variants,
// opcodes 0x08 for ps and 0x09 for pd) match X86any_VRndScale and are gated
// on HasAVX with NoVLX. Uses/mayRaiseFPException are set here as well as
// inside sse41_fp_unop_p itself.
5600let Predicates = [HasAVX, NoVLX] in {
5601  let ExeDomain = SSEPackedSingle, Uses = [MXCSR], mayRaiseFPException = 1 in {
5602    // Intrinsic form
5603    defm VROUNDPS  : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32,
5604                                     loadv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>,
5605                                   VEX, VEX_WIG;
5606    defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32,
5607                                     loadv8f32, X86any_VRndScale, SchedWriteFRnd.YMM>,
5608                                   VEX, VEX_L, VEX_WIG;
5609  }
5610
5611  let ExeDomain = SSEPackedDouble, Uses = [MXCSR], mayRaiseFPException = 1 in {
5612    defm VROUNDPD  : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64,
5613                                     loadv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>,
5614                                   VEX, VEX_WIG;
5615    defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64,
5616                                     loadv4f64, X86any_VRndScale, SchedWriteFRnd.YMM>,
5617                                   VEX, VEX_L, VEX_WIG;
5618  }
5619}
// Scalar AVX forms (opcodes 0x0A for ss, 0x0B for sd). The VROUND prefix is
// instantiated twice: sse41_fp_binop_s contributes the *_Int records matched
// from X86RndScales, and avx_fp_unop_rm contributes the pattern-less
// SSr/SSm/SDr/SDm records.
5620let Predicates = [UseAVX] in {
5621  defm VROUND  : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
5622                                  v4f32, v2f64, X86RndScales, 0>,
5623                                  VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC;
5624  defm VROUND  : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>,
5625                                VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC;
5626}
5627
// Select scalar X86any_VRndScale on FR32/FR64 to the AVX VROUNDSS/VROUNDSD
// register forms. Those instructions take an extra leading source operand,
// which is filled with IMPLICIT_DEF here.
5628let Predicates = [UseAVX] in {
5629  def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
5630            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>;
5631  def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
5632            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>;
5633}
5634
// Load-folding variants of the patterns above; these are only enabled when
// OptForSize also holds.
5635let Predicates = [UseAVX, OptForSize] in {
5636  def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
5637            (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
5638  def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
5639            (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
5640}
5641
// Legacy-encoded packed round forms (opcodes 0x08 / 0x09). Unlike the VEX
// instantiations, the memory patterns use the memopv4f32 / memopv2f64
// fragments rather than loadv4f32 / loadv2f64.
5642let ExeDomain = SSEPackedSingle in
5643defm ROUNDPS  : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32,
5644                                memopv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>;
5645let ExeDomain = SSEPackedDouble in
5646defm ROUNDPD  : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64,
5647                                memopv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>;
5648
// Legacy scalar forms: the pattern-less FR32/FR64 records (ROUNDSSr etc.)...
5649defm ROUND  : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>;
5650
// ...and the VR128 *_Int records matched from X86RndScales, with $src1 tied
// to $dst (Is2Addr left at its default of 1).
5651let Constraints = "$src1 = $dst" in
5652defm ROUND  : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
5653                               v4f32, v2f64, X86RndScales>;
5654
// Select scalar X86any_VRndScale on FR32/FR64 to the legacy ROUNDSS/ROUNDSD
// register forms when UseSSE41 holds. These take only the source and the
// immediate (no extra pass-through operand).
5655let Predicates = [UseSSE41] in {
5656  def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
5657            (ROUNDSSr FR32:$src1, timm:$src2)>;
5658  def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
5659            (ROUNDSDr FR64:$src1, timm:$src2)>;
5660}
5661
// Load-folding variants of the patterns above; these are only enabled when
// OptForSize also holds.
5662let Predicates = [UseSSE41, OptForSize] in {
5663  def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
5664            (ROUNDSSm addr:$src1, timm:$src2)>;
5665  def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
5666            (ROUNDSDm addr:$src1, timm:$src2)>;
5667}
5668
5669//===----------------------------------------------------------------------===//
5670// SSE4.1 - Packed Bit Test
5671//===----------------------------------------------------------------------===//
5672
5673// ptest: X86ISelLowering lowers to this instruction, primarily from the
5674// corresponding Intel intrinsic.
// VEX-encoded forms (opcode 0x17) under HasAVX. They have no register
// outputs; each pattern sets EFLAGS from X86ptest. 128-bit forms operate on
// VR128 / v2i64, the "Y" forms (VEX_L) on VR256 / v4i64.
5675let Defs = [EFLAGS], Predicates = [HasAVX] in {
5676def VPTESTrr  : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
5677                "vptest\t{$src2, $src1|$src1, $src2}",
5678                [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
5679                Sched<[SchedWriteVecTest.XMM]>, VEX, VEX_WIG;
5680def VPTESTrm  : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
5681                "vptest\t{$src2, $src1|$src1, $src2}",
5682                [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
5683                Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>,
5684                VEX, VEX_WIG;
5685
5686def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
5687                "vptest\t{$src2, $src1|$src1, $src2}",
5688                [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
5689                Sched<[SchedWriteVecTest.YMM]>, VEX, VEX_L, VEX_WIG;
5690def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
5691                "vptest\t{$src2, $src1|$src1, $src2}",
5692                [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
5693                Sched<[SchedWriteVecTest.YMM.Folded, SchedWriteVecTest.YMM.ReadAfterFold]>,
5694                VEX, VEX_L, VEX_WIG;
5695}
5696
// Legacy-encoded ptest (opcode 0x17): no register outputs, EFLAGS set from
// X86ptest. The memory form matches through memopv2i64 (the VEX form above
// uses loadv2i64).
5697let Defs = [EFLAGS] in {
5698def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
5699              "ptest\t{$src2, $src1|$src1, $src2}",
5700              [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
5701              Sched<[SchedWriteVecTest.XMM]>;
5702def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
5703              "ptest\t{$src2, $src1|$src1, $src2}",
5704              [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
5705              Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>;
5706}
5707
5708// The bit test instructions below are AVX only
// avx_bittest - Register (rr) and memory (rm) forms, both VEX-encoded, with
// no register outputs; each pattern sets EFLAGS from X86testp.
//   opc       - opcode byte forwarded to SS48I.
//   OpcodeStr - mnemonic; the operand string is concatenated onto it.
//   RC        - register class of both register operands.
//   x86memop  - memory operand class for the rm form.
//   mem_frag  - load fragment matching the memory source.
//   vt        - value type applied to the register second source.
//   sched     - scheduling class; rm uses Folded/ReadAfterFold.
5709multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
5710                       X86MemOperand x86memop, PatFrag mem_frag, ValueType vt,
5711                       X86FoldableSchedWrite sched> {
5712  def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
5713            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
5714            [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
5715            Sched<[sched]>, VEX;
5716  def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
5717            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
5718            [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
5719            Sched<[sched.Folded, sched.ReadAfterFold]>, VEX;
5720}
5721
// Instantiate avx_bittest under HasAVX: vtestps (opcode 0x0E, packed-single
// domain) and vtestpd (opcode 0x0F, packed-double domain), each in a 128-bit
// form and a 256-bit "Y" form that adds VEX_L.
5722let Defs = [EFLAGS], Predicates = [HasAVX] in {
5723let ExeDomain = SSEPackedSingle in {
5724defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32,
5725                            SchedWriteFTest.XMM>;
5726defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32,
5727                            SchedWriteFTest.YMM>, VEX_L;
5728}
5729let ExeDomain = SSEPackedDouble in {
5730defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64,
5731                            SchedWriteFTest.XMM>;
5732defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64,
5733                            SchedWriteFTest.YMM>, VEX_L;
5734}
5735}
5736
5737//===----------------------------------------------------------------------===//
5738// SSE4.1 - Misc Instructions
5739//===----------------------------------------------------------------------===//
5740
// popcnt for 16-, 32- and 64-bit general-purpose registers, each with a
// register (rr) and a memory (rm) source form. All use opcode 0xB8 with the
// XS prefix, are gated on HasPOPCNT, select from ctpop, and list EFLAGS as
// an implicit result. Operand size is selected with OpSize16 / OpSize32 for
// the I-class records; the 64-bit records use the RI class instead.
5741let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
5742  def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
5743                     "popcnt{w}\t{$src, $dst|$dst, $src}",
5744                     [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
5745                     Sched<[WritePOPCNT]>, OpSize16, XS;
5746  def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
5747                     "popcnt{w}\t{$src, $dst|$dst, $src}",
5748                     [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
5749                      (implicit EFLAGS)]>,
5750                      Sched<[WritePOPCNT.Folded]>, OpSize16, XS;
5751
5752  def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
5753                     "popcnt{l}\t{$src, $dst|$dst, $src}",
5754                     [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>,
5755                     Sched<[WritePOPCNT]>, OpSize32, XS;
5756
5757  def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
5758                     "popcnt{l}\t{$src, $dst|$dst, $src}",
5759                     [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
5760                      (implicit EFLAGS)]>,
5761                      Sched<[WritePOPCNT.Folded]>, OpSize32, XS;
5762
5763  def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
5764                      "popcnt{q}\t{$src, $dst|$dst, $src}",
5765                      [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>,
5766                      Sched<[WritePOPCNT]>, XS;
5767  def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
5768                      "popcnt{q}\t{$src, $dst|$dst, $src}",
5769                      [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
5770                       (implicit EFLAGS)]>,
5771                       Sched<[WritePOPCNT.Folded]>, XS;
5772}
5773
5774// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
//   opc       - opcode byte forwarded to SS48I.
//   OpcodeStr - mnemonic; the operand string is concatenated onto it.
//   OpNode    - DAG node applied to the single source.
//   ld_frag   - load fragment matching the memory source of the rm form.
//   Sched     - scheduling class; the rm form uses Sched.Folded. Note that
//               inside the body this parameter shares its name with the
//               Sched<...> class it is passed to.
5775multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
5776                                 SDNode OpNode, PatFrag ld_frag,
5777                                 X86FoldableSchedWrite Sched> {
  // Register form: source and result are both typed v8i16.
5778  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
5779                 (ins VR128:$src),
5780                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5781                 [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>,
5782                 Sched<[Sched]>;
  // Memory form: the source is matched via ld_frag on an i128mem operand.
5783  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
5784                  (ins i128mem:$src),
5785                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5786                  [(set VR128:$dst,
5787                    (v8i16 (OpNode (ld_frag addr:$src))))]>,
5788                 Sched<[Sched.Folded]>;
5789}
5790
5791// PHMINPOSUW (opcode 0x41, X86phminpos) is instantiated with the
5792// WritePHMINPOS scheduling class for both the VEX and legacy encodings.
// NOTE(review): this comment previously stated that PHMIN shares PSAD's
// scheduling model "although the naming is misleading"; the code below passes
// WritePHMINPOS, so that remark appears to be out of date -- confirm.
// The VEX form (HasAVX) matches memory through `load`, the legacy form
// through `memop`.
5793let Predicates = [HasAVX] in
5794defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw",
5795                                         X86phminpos, load,
5796                                         WritePHMINPOS>, VEX, VEX_WIG;
5797defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
5798                                         X86phminpos, memop,
5799                                         WritePHMINPOS>;
5800
5801/// SS48I_binop_rm - Simple SSE41 binary operator.
///   opc        - opcode byte forwarded to SS48I.
///   OpcodeStr  - mnemonic; the operand string is concatenated onto it.
///   OpNode     - DAG node applied to ($src1, $src2).
///   OpVT       - value type of the pattern result.
///   RC         - register class of the destination and register sources.
///   memop_frag - load fragment matching the memory source of the rm form.
///   x86memop   - memory operand class for the rm form.
///   sched      - scheduling class; rm uses Folded/ReadAfterFold.
///   Is2Addr    - when 1 (the default) the assembly string omits $src1;
///                when 0 both sources are printed.
5802multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
5803                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
5804                          X86MemOperand x86memop, X86FoldableSchedWrite sched,
5805                          bit Is2Addr = 1> {
  // Only the register-register form is flagged isCommutable.
5806  let isCommutable = 1 in
5807  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
5808       (ins RC:$src1, RC:$src2),
5809       !if(Is2Addr,
5810           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5811           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5812       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
5813       Sched<[sched]>;
5814  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
5815       (ins RC:$src1, x86memop:$src2),
5816       !if(Is2Addr,
5817           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5818           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5819       [(set RC:$dst,
5820         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
5821       Sched<[sched.Folded, sched.ReadAfterFold]>;
5822}
5823
// 128-bit VEX-encoded integer min/max and pmuldq, all via SS48I_binop_rm
// with three-operand assembly (Is2Addr = 0) and the `load` fragment.
// Dword-element forms and vpmuldq are gated on HasAVX with NoVLX.
5824let Predicates = [HasAVX, NoVLX] in {
5825  defm VPMINSD   : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
5826                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5827                                  VEX_4V, VEX_WIG;
5828  defm VPMINUD   : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
5829                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5830                                  VEX_4V, VEX_WIG;
5831  defm VPMAXSD   : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
5832                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5833                                  VEX_4V, VEX_WIG;
5834  defm VPMAXUD   : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
5835                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5836                                  VEX_4V, VEX_WIG;
5837  defm VPMULDQ   : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128,
5838                                  load, i128mem, SchedWriteVecIMul.XMM, 0>,
5839                                  VEX_4V, VEX_WIG;
5840}
// Byte- and word-element forms use the NoVLX_Or_NoBWI predicate instead.
5841let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5842  defm VPMINSB   : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
5843                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5844                                  VEX_4V, VEX_WIG;
5845  defm VPMINUW   : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
5846                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5847                                  VEX_4V, VEX_WIG;
5848  defm VPMAXSB   : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
5849                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5850                                  VEX_4V, VEX_WIG;
5851  defm VPMAXUW   : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
5852                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5853                                  VEX_4V, VEX_WIG;
5854}
5855
5856let Predicates = [HasAVX2, NoVLX] in {
5857  defm VPMINSDY  : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
5858                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5859                                  VEX_4V, VEX_L, VEX_WIG;
5860  defm VPMINUDY  : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
5861                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5862                                  VEX_4V, VEX_L, VEX_WIG;
5863  defm VPMAXSDY  : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
5864                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5865                                  VEX_4V, VEX_L, VEX_WIG;
5866  defm VPMAXUDY  : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
5867                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5868                                  VEX_4V, VEX_L, VEX_WIG;
5869  defm VPMULDQY  : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256,
5870                                  load, i256mem, SchedWriteVecIMul.YMM, 0>,
5871                                  VEX_4V, VEX_L, VEX_WIG;
5872}
5873let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
5874  defm VPMINSBY  : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
5875                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5876                                  VEX_4V, VEX_L, VEX_WIG;
5877  defm VPMINUWY  : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
5878                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5879                                  VEX_4V, VEX_L, VEX_WIG;
5880  defm VPMAXSBY  : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
5881                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5882                                  VEX_4V, VEX_L, VEX_WIG;
5883  defm VPMAXUWY  : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
5884                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5885                                  VEX_4V, VEX_L, VEX_WIG;
5886}
5887
// Legacy-encoded integer min/max and pmuldq: same opcodes as the VEX forms,
// but with $src1 tied to $dst, two-operand assembly (Is2Addr = 1) and the
// `memop` fragment for the memory source. No Predicates are set at this
// site.
5888let Constraints = "$src1 = $dst" in {
5889  defm PMINSB   : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
5890                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5891  defm PMINSD   : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
5892                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5893  defm PMINUD   : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
5894                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5895  defm PMINUW   : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
5896                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5897  defm PMAXSB   : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
5898                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5899  defm PMAXSD   : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
5900                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5901  defm PMAXUD   : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
5902                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5903  defm PMAXUW   : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
5904                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5905  defm PMULDQ   : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128,
5906                                 memop, i128mem, SchedWriteVecIMul.XMM, 1>;
5907}
5908
// VEX-encoded vpmulld (opcode 0x40, `mul`) and vpcmpeqq (opcode 0x29,
// X86pcmpeq), 128-bit then 256-bit. vpmulld is additionally gated on NoVLX;
// vpcmpeqq is gated only on HasAVX / HasAVX2 (NOTE(review): the reason for
// the differing predicates is not visible here -- confirm before changing).
5909let Predicates = [HasAVX, NoVLX] in
5910  defm VPMULLD  : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
5911                                 load, i128mem, SchedWritePMULLD.XMM, 0>,
5912                                 VEX_4V, VEX_WIG;
5913let Predicates = [HasAVX] in
5914  defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
5915                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
5916                                 VEX_4V, VEX_WIG;
5917
5918let Predicates = [HasAVX2, NoVLX] in
5919  defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
5920                                  load, i256mem, SchedWritePMULLD.YMM, 0>,
5921                                  VEX_4V, VEX_L, VEX_WIG;
5922let Predicates = [HasAVX2] in
5923  defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
5924                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5925                                  VEX_4V, VEX_L, VEX_WIG;
5926
// Legacy-encoded pmulld / pcmpeqq: $src1 tied to $dst, two-operand assembly,
// `memop` fragment for the memory source.
5927let Constraints = "$src1 = $dst" in {
5928  defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
5929                                memop, i128mem, SchedWritePMULLD.XMM, 1>;
5930  defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
5931                                memop, i128mem, SchedWriteVecALU.XMM, 1>;
5932}
5933
5934/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
/// whose patterns are expressed through an Intrinsic (IntId) rather than an
/// SDNode; the pattern results carry no explicit value-type cast.
///   opc        - opcode byte forwarded to SS4AIi8.
///   OpcodeStr  - mnemonic; the operand string is concatenated onto it.
///   IntId      - intrinsic applied to ($src1, $src2, timm:$src3).
///   RC         - register class of the destination and register sources.
///   memop_frag - load fragment matching the memory source of the rmi form.
///   x86memop   - memory operand class for the rmi form.
///   Is2Addr    - when 1 the assembly string omits $src1; no default here.
///   sched      - scheduling class; rmi uses Folded/ReadAfterFold.
5935multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
5936                 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
5937                 X86MemOperand x86memop, bit Is2Addr,
5938                 X86FoldableSchedWrite sched> {
  // Only the register-register-immediate form is flagged isCommutable.
5939  let isCommutable = 1 in
5940  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
5941        (ins RC:$src1, RC:$src2, u8imm:$src3),
5942        !if(Is2Addr,
5943            !strconcat(OpcodeStr,
5944                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5945            !strconcat(OpcodeStr,
5946                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5947        [(set RC:$dst, (IntId RC:$src1, RC:$src2, timm:$src3))]>,
5948        Sched<[sched]>;
5949  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
5950        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
5951        !if(Is2Addr,
5952            !strconcat(OpcodeStr,
5953                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5954            !strconcat(OpcodeStr,
5955                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5956        [(set RC:$dst,
5957          (IntId RC:$src1, (memop_frag addr:$src2), timm:$src3))]>,
5958        Sched<[sched.Folded, sched.ReadAfterFold]>;
5959}
5960
5961/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
/// whose patterns use an SDNode (OpNode) and cast the result to OpVT.
///   opc        - opcode byte forwarded to SS4AIi8.
///   OpcodeStr  - mnemonic; the operand string is concatenated onto it.
///   OpNode     - DAG node applied to ($src1, $src2, timm:$src3).
///   OpVT       - value type of the pattern result.
///   RC         - register class of the destination and register sources.
///   memop_frag - load fragment matching the memory source of the rmi form.
///   x86memop   - memory operand class for the rmi form.
///   Is2Addr    - when 1 the assembly string omits $src1; no default here.
///   sched      - scheduling class; rmi uses Folded/ReadAfterFold.
5962multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
5963                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
5964                           X86MemOperand x86memop, bit Is2Addr,
5965                           X86FoldableSchedWrite sched> {
  // Only the register-register-immediate form is flagged isCommutable.
5966  let isCommutable = 1 in
5967  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
5968        (ins RC:$src1, RC:$src2, u8imm:$src3),
5969        !if(Is2Addr,
5970            !strconcat(OpcodeStr,
5971                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5972            !strconcat(OpcodeStr,
5973                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5974        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
5975        Sched<[sched]>;
5976  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
5977        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
5978        !if(Is2Addr,
5979            !strconcat(OpcodeStr,
5980                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5981            !strconcat(OpcodeStr,
5982                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5983        [(set RC:$dst,
5984          (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
5985        Sched<[sched.Folded, sched.ReadAfterFold]>;
5986}
5987
// Commute a 2-bit blend immediate: keep the low two bits and invert them.
// Passed as commuteXForm for the v2f64 blends, where it rewrites the immediate
// when the two blend sources are swapped.
5988def BlendCommuteImm2 : SDNodeXForm<timm, [{
5989  uint8_t Imm = N->getZExtValue() & 0x03;
5990  return getI8Imm(Imm ^ 0x03, SDLoc(N));
5991}]>;
5992
// Commute a 4-bit blend immediate: keep the low four bits and invert them.
// Passed as commuteXForm for the v4f32 and v4f64 blends, where it rewrites the
// immediate when the two blend sources are swapped.
5993def BlendCommuteImm4 : SDNodeXForm<timm, [{
5994  uint8_t Imm = N->getZExtValue() & 0x0f;
5995  return getI8Imm(Imm ^ 0x0f, SDLoc(N));
5996}]>;
5997
// Commute an 8-bit blend immediate: keep the low eight bits and invert them.
// Passed as commuteXForm for the v8f32, v8i16 and v16i16 blends, where it
// rewrites the immediate when the two blend sources are swapped.
5998def BlendCommuteImm8 : SDNodeXForm<timm, [{
5999  uint8_t Imm = N->getZExtValue() & 0xff;
6000  return getI8Imm(Imm ^ 0xff, SDLoc(N));
6001}]>;
6002
6003// Turn a 4-bit blendi immediate to 8-bit for use with pblendw.
// Each set input bit i becomes the bit pair (2*i, 2*i+1) in the result,
// e.g. 0b0101 -> 0b00110011. Input bits above bit 3 are ignored.
6004def BlendScaleImm4 : SDNodeXForm<timm, [{
6005  uint8_t Imm = N->getZExtValue();
6006  uint8_t NewImm = 0;
6007  for (unsigned i = 0; i != 4; ++i) {
6008    if (Imm & (1 << i))
6009      NewImm |= 0x3 << (i * 2);
6010  }
6011  return getI8Imm(NewImm, SDLoc(N));
6012}]>;
6013
6014// Turn a 2-bit blendi immediate to 8-bit for use with pblendw.
// Each set input bit i becomes the nibble at bits 4*i .. 4*i+3 of the result,
// e.g. 0b10 -> 0xf0. Input bits above bit 1 are ignored.
6015def BlendScaleImm2 : SDNodeXForm<timm, [{
6016  uint8_t Imm = N->getZExtValue();
6017  uint8_t NewImm = 0;
6018  for (unsigned i = 0; i != 2; ++i) {
6019    if (Imm & (1 << i))
6020      NewImm |= 0xf << (i * 4);
6021  }
6022  return getI8Imm(NewImm, SDLoc(N));
6023}]>;
6024
6025// Turn a 2-bit blendi immediate to 4-bit for use with pblendd.
// Each set input bit i becomes the bit pair (2*i, 2*i+1) in the result,
// e.g. 0b10 -> 0b1100. Input bits above bit 1 are ignored.
6026def BlendScaleImm2to4 : SDNodeXForm<timm, [{
6027  uint8_t Imm = N->getZExtValue();
6028  uint8_t NewImm = 0;
6029  for (unsigned i = 0; i != 2; ++i) {
6030    if (Imm & (1 << i))
6031      NewImm |= 0x3 << (i * 2);
6032  }
6033  return getI8Imm(NewImm, SDLoc(N));
6034}]>;
6035
6036// Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it.
// Same bit-pair expansion as BlendScaleImm4, followed by an XOR with 0xff,
// e.g. 0b0101 -> 0x33 -> 0xcc. Used when the blend sources are swapped.
6037def BlendScaleCommuteImm4 : SDNodeXForm<timm, [{
6038  uint8_t Imm = N->getZExtValue();
6039  uint8_t NewImm = 0;
6040  for (unsigned i = 0; i != 4; ++i) {
6041    if (Imm & (1 << i))
6042      NewImm |= 0x3 << (i * 2);
6043  }
6044  return getI8Imm(NewImm ^ 0xff, SDLoc(N));
6045}]>;
6046
6047// Turn a 2-bit blendi immediate to 8-bit for use with pblendw and invert it.
// Same nibble expansion as BlendScaleImm2, followed by an XOR with 0xff,
// e.g. 0b10 -> 0xf0 -> 0x0f. Used when the blend sources are swapped.
6048def BlendScaleCommuteImm2 : SDNodeXForm<timm, [{
6049  uint8_t Imm = N->getZExtValue();
6050  uint8_t NewImm = 0;
6051  for (unsigned i = 0; i != 2; ++i) {
6052    if (Imm & (1 << i))
6053      NewImm |= 0xf << (i * 4);
6054  }
6055  return getI8Imm(NewImm ^ 0xff, SDLoc(N));
6056}]>;
6057
6058// Turn a 2-bit blendi immediate to 4-bit for use with pblendd and invert it.
// Same bit-pair expansion as BlendScaleImm2to4, followed by an XOR with 0xf
// (only the low four bits are inverted), e.g. 0b10 -> 0b1100 -> 0b0011.
6059def BlendScaleCommuteImm2to4 : SDNodeXForm<timm, [{
6060  uint8_t Imm = N->getZExtValue();
6061  uint8_t NewImm = 0;
6062  for (unsigned i = 0; i != 2; ++i) {
6063    if (Imm & (1 << i))
6064      NewImm |= 0x3 << (i * 2);
6065  }
6066  return getI8Imm(NewImm ^ 0xf, SDLoc(N));
6067}]>;
6068
// VEX-encoded 128-bit vmpsadbw, vdpps and vdppd, plus the 256-bit vdpps.
// All pass Is2Addr = 0, i.e. the three-operand assembly string.
6069let Predicates = [HasAVX] in {
  // SS41I_binop_rmi_int marks rri commutable; this let is intended to turn
  // that off for vmpsadbw.
6070  let isCommutable = 0 in {
6071    defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
6072                                        VR128, load, i128mem, 0,
6073                                        SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG;
6074  }
6075
// The dot-product instructions read MXCSR and may raise FP exceptions.
6076let Uses = [MXCSR], mayRaiseFPException = 1 in {
6077  let ExeDomain = SSEPackedSingle in
6078  defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
6079                                   VR128, load, f128mem, 0,
6080                                   SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG;
6081  let ExeDomain = SSEPackedDouble in
6082  defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
6083                                   VR128, load, f128mem, 0,
6084                                   SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG;
  // NOTE(review): this uses i256mem while the 128-bit dot products above use
  // f128mem -- confirm whether f256mem was intended.
6085  let ExeDomain = SSEPackedSingle in
6086  defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
6087                                    VR256, load, i256mem, 0,
6088                                    SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG;
6089} // Uses = [MXCSR], mayRaiseFPException = 1
6090} // Predicates = [HasAVX]
6091
// AVX2 256-bit vmpsadbw, selected from int_x86_avx2_mpsadbw on VR256.
6092let Predicates = [HasAVX2] in {
  // As for the 128-bit form, the defm is wrapped in isCommutable = 0.
6093  let isCommutable = 0 in {
6094  defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
6095                                  VR256, load, i256mem, 0,
6096                                  SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, VEX_WIG;
6097  }
6098}
6099
// Non-VEX SSE 4.1 mpsadbw, dpps and dppd. $src1 is tied to $dst, Is2Addr = 1
// selects the two-operand assembly string, and memory forms use memop.
6100let Constraints = "$src1 = $dst" in {
  // The multiclass marks rri commutable; this let is intended to turn that
  // off for mpsadbw.
6101  let isCommutable = 0 in {
6102  defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
6103                                     VR128, memop, i128mem, 1,
6104                                     SchedWriteMPSAD.XMM>;
6105  }
6106
  // NOTE(review): SIMD_EXC presumably supplies the MXCSR use and FP-exception
  // flag that the AVX forms set with an explicit let -- confirm.
6107  let ExeDomain = SSEPackedSingle in
6108  defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
6109                                  VR128, memop, f128mem, 1,
6110                                  SchedWriteDPPS.XMM>, SIMD_EXC;
6111  let ExeDomain = SSEPackedDouble in
6112  defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
6113                                  VR128, memop, f128mem, 1,
6114                                  SchedWriteDPPD.XMM>, SIMD_EXC;
6115}
6116
6117/// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate
///
/// Defines rri (MRMSrcReg) and rmi (MRMSrcMem) records matching OpNode with
/// result type OpVT, plus an anonymous pattern that handles a load in the
/// first source by swapping the operands and rewriting the immediate with
/// commuteXForm.
/// Is2Addr selects both the assembly string and the "$src1 = $dst" tie;
/// d is the execution domain applied to both records.
6118multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
6119                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
6120                           X86MemOperand x86memop, bit Is2Addr, Domain d,
6121                           X86FoldableSchedWrite sched, SDNodeXForm commuteXForm> {
6122let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
  // This let covers rri only.
6123  let isCommutable = 1 in
6124  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
6125        (ins RC:$src1, RC:$src2, u8imm:$src3),
6126        !if(Is2Addr,
6127            !strconcat(OpcodeStr,
6128                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6129            !strconcat(OpcodeStr,
6130                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6131        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
6132        Sched<[sched]>;
6133  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
6134        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
6135        !if(Is2Addr,
6136            !strconcat(OpcodeStr,
6137                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6138            !strconcat(OpcodeStr,
6139                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6140        [(set RC:$dst,
6141          (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
6142        Sched<[sched.Folded, sched.ReadAfterFold]>;
6143} // ExeDomain, Constraints
6144
6145  // Pattern to commute if load is in first source.
  // The register operand becomes $src1 of rmi, the load becomes $src2, and
  // commuteXForm rewrites the immediate to match the swapped sources.
6146  def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, timm:$src3)),
6147            (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
6148                                            (commuteXForm timm:$src3))>;
6149}
6150
// VEX-encoded immediate blends (Is2Addr = 0). The commute transform width
// follows the element count of the vector type: 4 for v4f32/v4f64, 8 for
// v8f32/v8i16, 2 for v2f64.
6151let Predicates = [HasAVX] in {
6152  defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
6153                                  VR128, load, f128mem, 0, SSEPackedSingle,
6154                                  SchedWriteFBlend.XMM, BlendCommuteImm4>,
6155                                  VEX_4V, VEX_WIG;
6156  defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
6157                                   VR256, load, f256mem, 0, SSEPackedSingle,
6158                                   SchedWriteFBlend.YMM, BlendCommuteImm8>,
6159                                   VEX_4V, VEX_L, VEX_WIG;
6160  defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
6161                                  VR128, load, f128mem, 0, SSEPackedDouble,
6162                                  SchedWriteFBlend.XMM, BlendCommuteImm2>,
6163                                  VEX_4V, VEX_WIG;
6164  defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
6165                                   VR256, load, f256mem, 0, SSEPackedDouble,
6166                                   SchedWriteFBlend.YMM, BlendCommuteImm4>,
6167                                   VEX_4V, VEX_L, VEX_WIG;
6168  defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
6169                                  VR128, load, i128mem, 0, SSEPackedInt,
6170                                  SchedWriteBlend.XMM, BlendCommuteImm8>,
6171                                  VEX_4V, VEX_WIG;
6172}
6173
// AVX2 256-bit vpblendw on v16i16. The immediate is still commuted with the
// 8-bit transform.
// NOTE(review): presumably the same 8-bit mask applies to each 128-bit half
// -- confirm against the instruction reference.
6174let Predicates = [HasAVX2] in {
6175  defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
6176                                   VR256, load, i256mem, 0, SSEPackedInt,
6177                                   SchedWriteBlend.YMM, BlendCommuteImm8>,
6178                                   VEX_4V, VEX_L, VEX_WIG;
6179}
6180
6181// Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw.
6182// ExecutionDomainFixPass will cleanup domains later on.
// Each group has three patterns: register/register, load in the second source,
// and load in the first source (operands swapped, immediate commuted).
6183let Predicates = [HasAVX1Only] in {
// v4i64 -> vblendpd (256-bit); the immediate is used unchanged.
6184def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
6185          (VBLENDPDYrri VR256:$src1, VR256:$src2, timm:$src3)>;
6186def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
6187          (VBLENDPDYrmi VR256:$src1, addr:$src2, timm:$src3)>;
6188def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
6189          (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 timm:$src3))>;
6190
6191// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6192// it from becoming movsd via commuting under optsize.
// v2i64 -> vpblendw; the 2-bit immediate is widened to 8 bits.
6193def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
6194          (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
6195def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
6196          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
6197def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
6198          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;
6199
// v8i32 -> vblendps (256-bit); the immediate is used unchanged.
6200def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), timm:$src3),
6201          (VBLENDPSYrri VR256:$src1, VR256:$src2, timm:$src3)>;
6202def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), timm:$src3),
6203          (VBLENDPSYrmi VR256:$src1, addr:$src2, timm:$src3)>;
6204def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, timm:$src3),
6205          (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 timm:$src3))>;
6206
6207// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6208// it from becoming movss via commuting under optsize.
// v4i32 -> vpblendw; the 4-bit immediate is widened to 8 bits.
6209def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
6210          (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
6211def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), timm:$src3),
6212          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
6213def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, timm:$src3),
6214          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
6215} // Predicates = [HasAVX1Only]
6216
// Non-VEX SSE 4.1 immediate blends. Is2Addr = 1 ties $src1 to $dst inside
// SS41I_blend_rmi and selects the two-operand assembly string; memory forms
// use memop.
6217defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
6218                               VR128, memop, f128mem, 1, SSEPackedSingle,
6219                               SchedWriteFBlend.XMM, BlendCommuteImm4>;
6220defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64,
6221                               VR128, memop, f128mem, 1, SSEPackedDouble,
6222                               SchedWriteFBlend.XMM, BlendCommuteImm2>;
6223defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
6224                               VR128, memop, i128mem, 1, SSEPackedInt,
6225                               SchedWriteBlend.XMM, BlendCommuteImm8>;
6226
6227let Predicates = [UseSSE41] in {
6228// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6229// it from becoming movss/movsd via commuting under optsize.
// v2i64 -> pblendw; the 2-bit immediate is widened to 8 bits, and commuted as
// well when the load is the first source.
6230def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
6231          (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
6232def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), timm:$src3),
6233          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
6234def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, timm:$src3),
6235          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;
6236
// v4i32 -> pblendw; the 4-bit immediate is widened to 8 bits, and commuted as
// well when the load is the first source.
6237def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
6238          (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
6239def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), timm:$src3),
6240          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
6241def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, timm:$src3),
6242          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
6243} // Predicates = [UseSSE41]
6244
6245// For insertion into the zero index (low half) of a 256-bit vector, it is
6246// more efficient to generate a blend with immediate instead of an insert*128.
// The 128-bit value is first placed in the low subregister of an otherwise
// undefined 256-bit register (INSERT_SUBREG of IMPLICIT_DEF).
6247let Predicates = [HasAVX] in {
// Register destination: the widened value is the second blend source and the
// immediate has the low-half element bits set (0x3 for 2 x f64, 0xf for
// 4 x f32).
6248def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)),
6249          (VBLENDPDYrri VR256:$src1,
6250                        (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
6251                                       VR128:$src2, sub_xmm), 0x3)>;
6252def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
6253          (VBLENDPSYrri VR256:$src1,
6254                        (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
6255                                       VR128:$src2, sub_xmm), 0xf)>;
6256
// Loaded destination: the widened value is the first blend source and the
// load is the second, so the immediate has the high-half element bits set
// instead (0xc and 0xf0).
6257def : Pat<(insert_subvector (loadv4f64 addr:$src2), (v2f64 VR128:$src1), (iPTR 0)),
6258          (VBLENDPDYrmi (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
6259                                       VR128:$src1, sub_xmm), addr:$src2, 0xc)>;
6260def : Pat<(insert_subvector (loadv8f32 addr:$src2), (v4f32 VR128:$src1), (iPTR 0)),
6261          (VBLENDPSYrmi (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
6262                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
6263}
6264
6265/// SS41I_quaternary_avx - AVX form of SSE 4.1 instructions with 4 operands
///
/// Defines rr (MRMSrcReg) and rm (MRMSrcMem) records taking three explicit
/// sources. The selection pattern passes them to OpNode in reverse order,
/// ($src3, $src2, $src1), so $src3 is the node's first operand.
/// Both records are placed in the SSEPackedInt domain here; users in this file
/// wrap the defm in `let ExeDomain = ...` for the FP variants.
/// NOTE(review): Ii8Reg presumably encodes $src3 in the immediate byte --
/// confirm against the format definition.
6266multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC,
6267                                X86MemOperand x86memop, ValueType VT,
6268                                PatFrag mem_frag, SDNode OpNode,
6269                                X86FoldableSchedWrite sched> {
6270  def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
6271                  (ins RC:$src1, RC:$src2, RC:$src3),
6272                  !strconcat(OpcodeStr,
6273                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
6274                  [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))],
6275                  SSEPackedInt>, TAPD, VEX_4V,
6276                Sched<[sched]>;
6277
6278  def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
6279                  (ins RC:$src1, x86memop:$src2, RC:$src3),
6280                  !strconcat(OpcodeStr,
6281                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
6282                  [(set RC:$dst,
6283                        (OpNode RC:$src3, (mem_frag addr:$src2),
6284                                RC:$src1))], SSEPackedInt>, TAPD, VEX_4V,
6285                Sched<[sched.Folded, sched.ReadAfterFold,
6286                       // x86memop:$src2
6287                       ReadDefault, ReadDefault, ReadDefault, ReadDefault,
6288                       ReadDefault,
6289                       // RC:$src3
6290                       sched.ReadAfterFold]>;
6291}
6292
// VEX-encoded variable blends selected from X86Blendv. The FP variants are
// wrapped in a let that sets their execution domain; vpblendvb is left
// without one.
6293let Predicates = [HasAVX] in {
6294let ExeDomain = SSEPackedDouble in {
6295defm VBLENDVPD  : SS41I_quaternary_avx<0x4B, "vblendvpd", VR128, f128mem,
6296                                       v2f64, loadv2f64, X86Blendv,
6297                                       SchedWriteFVarBlend.XMM>;
6298defm VBLENDVPDY : SS41I_quaternary_avx<0x4B, "vblendvpd", VR256, f256mem,
6299                                       v4f64, loadv4f64, X86Blendv,
6300                                       SchedWriteFVarBlend.YMM>, VEX_L;
6301} // ExeDomain = SSEPackedDouble
6302let ExeDomain = SSEPackedSingle in {
6303defm VBLENDVPS  : SS41I_quaternary_avx<0x4A, "vblendvps", VR128, f128mem,
6304                                       v4f32, loadv4f32, X86Blendv,
6305                                       SchedWriteFVarBlend.XMM>;
6306defm VBLENDVPSY : SS41I_quaternary_avx<0x4A, "vblendvps", VR256, f256mem,
6307                                       v8f32, loadv8f32, X86Blendv,
6308                                       SchedWriteFVarBlend.YMM>, VEX_L;
6309} // ExeDomain = SSEPackedSingle
6310defm VPBLENDVB  : SS41I_quaternary_avx<0x4C, "vpblendvb", VR128, i128mem,
6311                                       v16i8, loadv16i8, X86Blendv,
6312                                       SchedWriteVarBlend.XMM>;
6313} // Predicates = [HasAVX]
6314
// AVX2 256-bit vpblendvb on v32i8, selected from X86Blendv.
6315let Predicates = [HasAVX2] in {
6316defm VPBLENDVBY : SS41I_quaternary_avx<0x4C, "vpblendvb", VR256, i256mem,
6317                                       v32i8, loadv32i8, X86Blendv,
6318                                       SchedWriteVarBlend.YMM>, VEX_L;
6319}
6320
// Select variable blends of 32-bit and 64-bit integer elements with the FP
// blendv instructions (vblendvps for i32, vblendvpd for i64).
// The instruction operands are given as ($src2, $src1, $mask), mirroring the
// reversed operand order used by the SS41I_quaternary_avx pattern.
6321let Predicates = [HasAVX] in {
6322  def : Pat<(v4i32 (X86Blendv (v4i32 VR128:$mask), (v4i32 VR128:$src1),
6323                              (v4i32 VR128:$src2))),
6324            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
6325  def : Pat<(v2i64 (X86Blendv (v2i64 VR128:$mask), (v2i64 VR128:$src1),
6326                              (v2i64 VR128:$src2))),
6327            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
6328  def : Pat<(v8i32 (X86Blendv (v8i32 VR256:$mask), (v8i32 VR256:$src1),
6329                              (v8i32 VR256:$src2))),
6330            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
6331  def : Pat<(v4i64 (X86Blendv (v4i64 VR256:$mask), (v4i64 VR256:$src1),
6332                              (v4i64 VR256:$src2))),
6333            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
6334}
6335
6336// Prefer a movss or movsd over a blendps when optimizing for size. These were
6337// changed to use blends because blends have better throughput on sandybridge
6338// and haswell, but movs[s/d] are 1-2 bytes shorter instructions.
// The blend patterns below are therefore only enabled under OptForSpeed.
6339let Predicates = [HasAVX, OptForSpeed] in {
  // Keep the low 32-bit element of $src and take the rest from V_SET0.
  // The integer form uses vpblendw, where immediate 3 covers the two 16-bit
  // words of the low 32-bit element.
6340  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
6341            (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
6342  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
6343            (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
6344
  // X86Movss as vblendps with immediate 1. When the load is the first
  // operand the sources are swapped and the 4-bit immediate inverted (0xe).
6345  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
6346            (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
6347  def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
6348            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
6349  def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
6350            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
6351
  // X86Movsd as vblendpd with immediate 1. When the load is the first
  // operand the sources are swapped and the 2-bit immediate inverted (2).
6352  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
6353            (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
6354  def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
6355            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
6356  def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
6357            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
6358
6359  // Move low f32 and clear high bits.
  // The 256-bit forms blend on the low xmm subregister and wrap the result
  // with SUBREG_TO_REG.
6360  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
6361            (SUBREG_TO_REG (i32 0),
6362             (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
6363                          (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
6364                          (i8 1))), sub_xmm)>;
6365  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
6366            (SUBREG_TO_REG (i32 0),
6367             (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
6368                          (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
6369                          (i8 3))), sub_xmm)>;
6370}
6371
6372// Prefer a movss or movsd over a blendps when optimizing for size. These were
6373// changed to use blends because blends have better throughput on sandybridge
6374// and haswell, but movs[s/d] are 1-2 bytes shorter instructions.
// The blend patterns below are therefore only enabled under OptForSpeed.
6375let Predicates = [UseSSE41, OptForSpeed] in {
6376  // With SSE41 we can use blends for these patterns.
  // Keep the low 32-bit element of $src and take the rest from V_SET0.
6377  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
6378            (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
6379  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
6380            (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
6381
  // X86Movss as blendps with immediate 1. When the load is the first operand
  // the sources are swapped and the 4-bit immediate inverted (0xe).
6382  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
6383            (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
6384  def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))),
6385            (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
6386  def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
6387            (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
6388
  // X86Movsd as blendpd with immediate 1. When the load is the first operand
  // the sources are swapped and the 2-bit immediate inverted (2).
6389  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
6390            (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
6391  def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))),
6392            (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
6393  def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
6394            (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
6395}
6396
6397
6398/// SS41I_ternary - SSE 4.1 ternary operator
///
/// Non-VEX variable blend whose third operand is the implicit XMM0 register:
/// the enclosing let declares XMM0 as a use and ties $src1 to $dst.
/// rr0 and rm0 match OpNode with XMM0 as the first operand, followed by
/// $src2 and $src1. The assembly string prints xmm0 as a literal operand.
6399let Uses = [XMM0], Constraints = "$src1 = $dst" in {
6400  multiclass SS41I_ternary<bits<8> opc, string OpcodeStr, ValueType VT,
6401                           PatFrag mem_frag, X86MemOperand x86memop,
6402                           SDNode OpNode, X86FoldableSchedWrite sched> {
6403    def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
6404                    (ins VR128:$src1, VR128:$src2),
6405                    !strconcat(OpcodeStr,
6406                     "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6407                    [(set VR128:$dst,
6408                      (VT (OpNode XMM0, VR128:$src2, VR128:$src1)))]>,
6409                    Sched<[sched]>;
6410
6411    def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
6412                    (ins VR128:$src1, x86memop:$src2),
6413                    !strconcat(OpcodeStr,
6414                     "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6415                    [(set VR128:$dst,
6416                      (OpNode XMM0, (mem_frag addr:$src2), VR128:$src1))]>,
6417                    Sched<[sched.Folded, sched.ReadAfterFold]>;
6418  }
6419}
6420
// Non-VEX SSE 4.1 variable blends with the implicit XMM0 operand. The FP
// variants set their execution domain; pblendvb is left without one.
6421let ExeDomain = SSEPackedDouble in
6422defm BLENDVPD : SS41I_ternary<0x15, "blendvpd", v2f64, memopv2f64, f128mem,
6423                              X86Blendv, SchedWriteFVarBlend.XMM>;
6424let ExeDomain = SSEPackedSingle in
6425defm BLENDVPS : SS41I_ternary<0x14, "blendvps", v4f32, memopv4f32, f128mem,
6426                              X86Blendv, SchedWriteFVarBlend.XMM>;
6427defm PBLENDVB : SS41I_ternary<0x10, "pblendvb", v16i8, memopv16i8, i128mem,
6428                              X86Blendv, SchedWriteVarBlend.XMM>;
6429
6430// Aliases with the implicit xmm0 argument
// These accept the two-operand spelling (xmm0 not written) and map it onto
// the rr0/rm0 records.
// NOTE(review): the trailing 0 is presumably the InstAlias emit flag, keeping
// the printer on the explicit-xmm0 form -- confirm.
6431def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
6432                (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>;
6433def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
6434                (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>;
6435def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
6436                (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>;
6437def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
6438                (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>;
6439def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
6440                (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>;
6441def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
6442                (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>;
6443
// Select variable blends of 32-bit and 64-bit integer elements with the FP
// blendv instructions when the mask is in XMM0. The instruction operands are
// given as ($src2, $src1), mirroring the SS41I_ternary pattern order.
6444let Predicates = [UseSSE41] in {
6445  def : Pat<(v4i32 (X86Blendv (v4i32 XMM0), (v4i32 VR128:$src1),
6446                              (v4i32 VR128:$src2))),
6447            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
6448  def : Pat<(v2i64 (X86Blendv (v2i64 XMM0), (v2i64 VR128:$src1),
6449                              (v2i64 VR128:$src2))),
6450            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
6451}
6452
// Non-temporal aligned vector loads ([v]movntdqa, opcode 0x2A).
// The instruction records carry no selection pattern of their own; the Pat
// records below map alignednontemporalload of every listed vector type onto
// the one instruction of matching width.
6453let AddedComplexity = 400 in { // Prefer non-temporal versions
6454
// Instruction records: VEX 128-bit, VEX 256-bit and non-VEX 128-bit.
6455let Predicates = [HasAVX, NoVLX] in
6456def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
6457                        "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
6458                        Sched<[SchedWriteVecMoveLSNT.XMM.RM]>, VEX, VEX_WIG;
6459let Predicates = [HasAVX2, NoVLX] in
6460def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
6461                         "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
6462                         Sched<[SchedWriteVecMoveLSNT.YMM.RM]>, VEX, VEX_L, VEX_WIG;
6463def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
6464                       "movntdqa\t{$src, $dst|$dst, $src}", []>,
6465                       Sched<[SchedWriteVecMoveLSNT.XMM.RM]>;
6466
// 256-bit types -> VMOVNTDQAYrm.
6467let Predicates = [HasAVX2, NoVLX] in {
6468  def : Pat<(v8f32 (alignednontemporalload addr:$src)),
6469            (VMOVNTDQAYrm addr:$src)>;
6470  def : Pat<(v4f64 (alignednontemporalload addr:$src)),
6471            (VMOVNTDQAYrm addr:$src)>;
6472  def : Pat<(v4i64 (alignednontemporalload addr:$src)),
6473            (VMOVNTDQAYrm addr:$src)>;
6474  def : Pat<(v8i32 (alignednontemporalload addr:$src)),
6475            (VMOVNTDQAYrm addr:$src)>;
6476  def : Pat<(v16i16 (alignednontemporalload addr:$src)),
6477            (VMOVNTDQAYrm addr:$src)>;
6478  def : Pat<(v16f16 (alignednontemporalload addr:$src)),
6479            (VMOVNTDQAYrm addr:$src)>;
6480  def : Pat<(v32i8 (alignednontemporalload addr:$src)),
6481            (VMOVNTDQAYrm addr:$src)>;
6482}
6483
// 128-bit types -> VMOVNTDQArm (VEX).
6484let Predicates = [HasAVX, NoVLX] in {
6485  def : Pat<(v4f32 (alignednontemporalload addr:$src)),
6486            (VMOVNTDQArm addr:$src)>;
6487  def : Pat<(v2f64 (alignednontemporalload addr:$src)),
6488            (VMOVNTDQArm addr:$src)>;
6489  def : Pat<(v2i64 (alignednontemporalload addr:$src)),
6490            (VMOVNTDQArm addr:$src)>;
6491  def : Pat<(v4i32 (alignednontemporalload addr:$src)),
6492            (VMOVNTDQArm addr:$src)>;
6493  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
6494            (VMOVNTDQArm addr:$src)>;
6495  def : Pat<(v8f16 (alignednontemporalload addr:$src)),
6496            (VMOVNTDQArm addr:$src)>;
6497  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
6498            (VMOVNTDQArm addr:$src)>;
6499}
6500
// 128-bit types -> MOVNTDQArm (non-VEX).
6501let Predicates = [UseSSE41] in {
6502  def : Pat<(v4f32 (alignednontemporalload addr:$src)),
6503            (MOVNTDQArm addr:$src)>;
6504  def : Pat<(v2f64 (alignednontemporalload addr:$src)),
6505            (MOVNTDQArm addr:$src)>;
6506  def : Pat<(v2i64 (alignednontemporalload addr:$src)),
6507            (MOVNTDQArm addr:$src)>;
6508  def : Pat<(v4i32 (alignednontemporalload addr:$src)),
6509            (MOVNTDQArm addr:$src)>;
6510  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
6511            (MOVNTDQArm addr:$src)>;
6512  def : Pat<(v8f16 (alignednontemporalload addr:$src)),
6513            (MOVNTDQArm addr:$src)>;
6514  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
6515            (MOVNTDQArm addr:$src)>;
6516}
6517
6518} // AddedComplexity
6519
6520//===----------------------------------------------------------------------===//
6521// SSE4.2 - Compare Instructions
6522//===----------------------------------------------------------------------===//
6523
6524/// SS42I_binop_rm - Simple SSE 4.2 binary operator
///
/// Defines rr (MRMSrcReg) and rm (MRMSrcMem) records matching OpNode with
/// result type OpVT; the memory operand is matched through memop_frag.
/// Is2Addr (default 1) selects the assembly string: when set, $src1 is left
/// out of the printed operand list. Neither record is marked commutable.
6525multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
6526                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
6527                          X86MemOperand x86memop, X86FoldableSchedWrite sched,
6528                          bit Is2Addr = 1> {
6529  def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
6530       (ins RC:$src1, RC:$src2),
6531       !if(Is2Addr,
6532           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
6533           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
6534       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
6535       Sched<[sched]>;
6536  def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
6537       (ins RC:$src1, x86memop:$src2),
6538       !if(Is2Addr,
6539           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
6540           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
6541       [(set RC:$dst,
6542         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
6543       Sched<[sched.Folded, sched.ReadAfterFold]>;
6544}
6545
// 64-bit element greater-than compare (opcode 0x37), selected from X86pcmpgt.
// VEX 128-bit form: three-operand syntax (Is2Addr = 0).
6546let Predicates = [HasAVX] in
6547  defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
6548                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
6549                                 VEX_4V, VEX_WIG;
6550
// VEX 256-bit form on v4i64.
6551let Predicates = [HasAVX2] in
6552  defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
6553                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
6554                                  VEX_4V, VEX_L, VEX_WIG;
6555
// Non-VEX form: $src1 tied to $dst, Is2Addr left at its default of 1.
6556let Constraints = "$src1 = $dst" in
6557  defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
6558                                memop, i128mem, SchedWriteVecALU.XMM>;
6559
6560//===----------------------------------------------------------------------===//
6561// SSE4.2 - String/text Processing Instructions
6562//===----------------------------------------------------------------------===//
6563
/// pcmpistrm_SS42AI - implicit-length string compare returning a mask
/// (opcode 0x62). Defines rr and rm records with no explicit outputs and an
/// empty selection pattern; results are modelled by the Defs list at the
/// instantiation site. asm is the mnemonic.
6564multiclass pcmpistrm_SS42AI<string asm> {
6565  def rr : SS42AI<0x62, MRMSrcReg, (outs),
6566    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6567    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6568    []>, Sched<[WritePCmpIStrM]>;
  // With an empty pattern the load has to be declared explicitly.
6569  let mayLoad = 1 in
6570  def rm :SS42AI<0x62, MRMSrcMem, (outs),
6571    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6572    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6573    []>, Sched<[WritePCmpIStrM.Folded, WritePCmpIStrM.ReadAfterFold]>;
6574}
6575
// [v]pcmpistrm: implicitly defines XMM0 and EFLAGS; no other side effects.
// The HasAVX predicate applies to the VEX form only.
6576let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
6577  let Predicates = [HasAVX] in
6578  defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX, VEX_WIG;
6579  defm PCMPISTRM  : pcmpistrm_SS42AI<"pcmpistrm"> ;
6580}
6581
/// SS42AI_pcmpestrm - explicit-length string compare returning a mask
/// (opcode 0x60). Defines rr and rm records with no explicit outputs and an
/// empty selection pattern; implicit registers are modelled by the Defs/Uses
/// lists at the instantiation site. asm is the mnemonic.
/// NOTE(review): the operands are named $src1/$src3/$src5, presumably leaving
/// $src2/$src4 for the implicit length registers -- confirm.
6582multiclass SS42AI_pcmpestrm<string asm> {
6583  def rr : SS42AI<0x60, MRMSrcReg, (outs),
6584    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
6585    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6586    []>, Sched<[WritePCmpEStrM]>;
  // With an empty pattern the load has to be declared explicitly.
6587  let mayLoad = 1 in
6588  def rm : SS42AI<0x60, MRMSrcMem, (outs),
6589    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
6590    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6591    []>, Sched<[WritePCmpEStrM.Folded, WritePCmpEStrM.ReadAfterFold]>;
6592}
6593
6594let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
6595  let Predicates = [HasAVX] in
6596  defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX, VEX_WIG;
6597  defm PCMPESTRM :  SS42AI_pcmpestrm<"pcmpestrm">;
6598}
6599
// PCMPISTRI: register (rr) and memory (rm) forms for opcode 0x63.
// `asm` is the mnemonic. Neither form has an explicit output operand or a
// selection pattern (the pattern list is empty).
6600multiclass SS42AI_pcmpistri<string asm> {
6601  def rr : SS42AI<0x63, MRMSrcReg, (outs),
6602    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6603    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6604    []>, Sched<[WritePCmpIStrI]>;
  // With an empty pattern list the load has to be declared explicitly.
6605  let mayLoad = 1 in
6606  def rm : SS42AI<0x63, MRMSrcMem, (outs),
6607    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6608    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6609    []>, Sched<[WritePCmpIStrI.Folded, WritePCmpIStrI.ReadAfterFold]>;
6610}
6611
// Both encodings implicitly define ECX and EFLAGS. Only the VEX form is gated
// on HasAVX.
6612let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
6613  let Predicates = [HasAVX] in
6614  defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX, VEX_WIG;
6615  defm PCMPISTRI  : SS42AI_pcmpistri<"pcmpistri">;
6616}
6617
// PCMPESTRI: register (rr) and memory (rm) forms for opcode 0x61.
// `asm` is the mnemonic. The explicit operands are named $src1, $src3 and
// $src5; the instantiation below lists EAX and EDX as implicit uses.
// No selection pattern is attached.
6618multiclass SS42AI_pcmpestri<string asm> {
6619  def rr : SS42AI<0x61, MRMSrcReg, (outs),
6620    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
6621    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6622    []>, Sched<[WritePCmpEStrI]>;
  // With an empty pattern list the load has to be declared explicitly.
6623  let mayLoad = 1 in
6624  def rm : SS42AI<0x61, MRMSrcMem, (outs),
6625    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
6626    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6627    []>, Sched<[WritePCmpEStrI.Folded, WritePCmpEStrI.ReadAfterFold]>;
6628}
6629
// Both encodings implicitly define ECX and EFLAGS and implicitly read EAX and
// EDX. Only the VEX form is gated on HasAVX.
6630let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
6631  let Predicates = [HasAVX] in
6632  defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX, VEX_WIG;
6633  defm PCMPESTRI  : SS42AI_pcmpestri<"pcmpestri">;
6634}
6635
6636//===----------------------------------------------------------------------===//
6637// SSE4.2 - CRC Instructions
6638//===----------------------------------------------------------------------===//
6639
6640// No CRC instructions have AVX equivalents
6641
6642// CRC32 intrinsic instructions.
6643// Every form writes a register and reads a register-or-memory source; the
6644// forms differ only in the sizes of the register and the r/m operand.
//
// SS42I_crc32r - register-source form.
//   opc   - opcode byte.
//   asm   - mnemonic (with size suffix alternative).
//   RCOut - register class of $dst and of the accumulator input $src1.
//   RCIn  - register class of the data input $src2.
//   Int   - pattern operator matched as (Int $src1, $src2).
6645class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
6646                   RegisterClass RCIn, SDPatternOperator Int> :
6647  CRC32I<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
6648         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
6649         [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))]>,
6650         Sched<[WriteCRC32]>;
6651
// SS42I_crc32m - memory-source CRC32 form.
//   opc      - opcode byte.
//   asm      - mnemonic (with size suffix alternative).
//   RCOut    - register class of $dst and of the accumulator input $src1.
//   x86memop - memory operand type of the data input $src2.
//   Int      - pattern operator matched as (Int $src1, (load $src2)).
6652class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
6653                   X86MemOperand x86memop, SDPatternOperator Int> :
6654  CRC32I<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
6655         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
6656         [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))]>,
6657         Sched<[WriteCRC32.Folded, WriteCRC32.ReadAfterFold]>;
6658
// Concrete CRC32 instructions. All are two-address ($src1 tied to $dst).
// Opcode 0xF0 is used for the 8-bit source forms and 0xF1 for the 16/32/64-bit
// source forms, which are told apart by OpSize16 / OpSize32 / REX_W.
6659let Constraints = "$src1 = $dst" in {
6660  def CRC32r32m8  : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
6661                                 int_x86_sse42_crc32_32_8>;
6662  def CRC32r32r8  : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
6663                                 int_x86_sse42_crc32_32_8>;
6664  def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
6665                                 int_x86_sse42_crc32_32_16>, OpSize16;
6666  def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
6667                                 int_x86_sse42_crc32_32_16>, OpSize16;
6668  def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
6669                                 int_x86_sse42_crc32_32_32>, OpSize32;
6670  def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
6671                                 int_x86_sse42_crc32_32_32>, OpSize32;
6672  def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
6673                                 int_x86_sse42_crc32_64_64>, REX_W;
6674  def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
6675                                 int_x86_sse42_crc32_64_64>, REX_W;
  // The 64-bit-destination / 8-bit-source forms are given null_frag instead
  // of an intrinsic, so hasSideEffects and mayLoad are spelled out by hand.
6676  let hasSideEffects = 0 in {
6677    let mayLoad = 1 in
6678    def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
6679                                   null_frag>, REX_W;
6680    def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
6681                                   null_frag>, REX_W;
6682  }
6683}
6684
6685//===----------------------------------------------------------------------===//
6686// SHA-NI Instructions
6687//===----------------------------------------------------------------------===//
6688
6689// FIXME: Is there a better scheduler class for SHA than WriteVecIMul?
// SHAI_binop - register (rr) and memory (rm) forms of a two-source SHA
// instruction.
//   Opc       - opcode byte.
//   OpcodeStr - mnemonic.
//   IntId     - intrinsic matched by the selection pattern.
//   sched     - foldable scheduling class (.Folded/.ReadAfterFold used for rm).
//   UsesXMM0  - when set, the asm string prints xmm0 as an extra operand and
//               the pattern passes XMM0 as a third intrinsic argument.
6690multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
6691                      X86FoldableSchedWrite sched, bit UsesXMM0 = 0> {
6692  def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
6693             (ins VR128:$src1, VR128:$src2),
6694             !if(UsesXMM0,
6695                 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6696                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
6697             [!if(UsesXMM0,
6698                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
6699                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>,
6700             T8PS, Sched<[sched]>;
6701
6702  def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
6703             (ins VR128:$src1, i128mem:$src2),
6704             !if(UsesXMM0,
6705                 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6706                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
6707             [!if(UsesXMM0,
6708                  (set VR128:$dst, (IntId VR128:$src1,
6709                    (memop addr:$src2), XMM0)),
6710                  (set VR128:$dst, (IntId VR128:$src1,
6711                    (memop addr:$src2))))]>, T8PS,
6712             Sched<[sched.Folded, sched.ReadAfterFold]>;
6713}
6714
// Concrete SHA instructions. All are two-address ($src1 tied to $dst), gated on
// HasSHA, and scheduled as SchedWriteVecIMul.XMM.
// SHA1RNDS4 takes an extra 8-bit immediate, so it is written out directly
// rather than through SHAI_binop.
6715let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
6716  def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
6717                         (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6718                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6719                         [(set VR128:$dst,
6720                           (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
6721                            (i8 timm:$src3)))]>, TAPS,
6722                         Sched<[SchedWriteVecIMul.XMM]>;
6723  def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
6724                         (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6725                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6726                         [(set VR128:$dst,
6727                           (int_x86_sha1rnds4 VR128:$src1,
6728                            (memop addr:$src2),
6729                            (i8 timm:$src3)))]>, TAPS,
6730                         Sched<[SchedWriteVecIMul.XMM.Folded,
6731                                SchedWriteVecIMul.XMM.ReadAfterFold]>;
6732
6733  defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte,
6734                              SchedWriteVecIMul.XMM>;
6735  defm SHA1MSG1  : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1,
6736                              SchedWriteVecIMul.XMM>;
6737  defm SHA1MSG2  : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2,
6738                              SchedWriteVecIMul.XMM>;
6739
  // SHA256RNDS2 is the only user of UsesXMM0 = 1; XMM0 is also recorded as an
  // implicit use of the instruction.
6740  let Uses=[XMM0] in
6741  defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2,
6742                                SchedWriteVecIMul.XMM, 1>;
6743
6744  defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1,
6745                               SchedWriteVecIMul.XMM>;
6746  defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2,
6747                               SchedWriteVecIMul.XMM>;
6748}
6749
6750// Aliases that accept sha256rnds2 written without the explicit %xmm0 operand.
// The canonical asm strings of SHA256RNDS2rr/rm print xmm0; these two-operand
// spellings map onto the same instructions. The trailing 0 is the last
// InstAlias argument (presumably the emit flag, i.e. parse-only - confirm).
6751def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
6752                (SHA256RNDS2rr VR128:$dst, VR128:$src2), 0>;
6753def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
6754                (SHA256RNDS2rm VR128:$dst, i128mem:$src2), 0>;
6755
6756//===----------------------------------------------------------------------===//
6757// AES-NI Instructions
6758//===----------------------------------------------------------------------===//
6759
// AESI_binop_rm_int - register (rr) and memory (rm) forms of a two-source AES
// round instruction selected from an intrinsic.
//   opc       - opcode byte.
//   OpcodeStr - mnemonic.
//   IntId     - intrinsic matched by the selection pattern.
//   ld_frag   - load fragment folded by the rm form.
//   Is2Addr   - selects the two-operand asm string instead of the
//               three-operand one (default 0 = three-operand).
//   RC/MemOp  - register class and memory operand (default VR128 / i128mem).
// The asm string is supplied through `let AsmString`, so the "" handed to
// AES8I is only a placeholder.
6760multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
6761                             Intrinsic IntId, PatFrag ld_frag,
6762                             bit Is2Addr = 0, RegisterClass RC = VR128,
6763                             X86MemOperand MemOp = i128mem> {
6764  let AsmString = OpcodeStr#
6765                  !if(Is2Addr, "\t{$src2, $dst|$dst, $src2}",
6766                               "\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
6767    def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst),
6768                   (ins RC:$src1, RC:$src2), "",
6769                   [(set RC:$dst, (IntId RC:$src1, RC:$src2))]>,
6770                   Sched<[WriteAESDecEnc]>;
6771    def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst),
6772                   (ins RC:$src1, MemOp:$src2), "",
6773                   [(set RC:$dst, (IntId RC:$src1, (ld_frag addr:$src2)))]>,
6774                   Sched<[WriteAESDecEnc.Folded, WriteAESDecEnc.ReadAfterFold]>;
6775  }
6776}
6777
6778// Perform One Round of an AES Encryption/Decryption Flow
// Opcodes: 0xDC enc, 0xDD enclast, 0xDE dec, 0xDF declast, shared by all three
// groups below.
// VEX 128-bit forms: three-operand asm, fold `load`.
6779let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in {
6780  defm VAESENC          : AESI_binop_rm_int<0xDC, "vaesenc",
6781                         int_x86_aesni_aesenc, load>, VEX_4V, VEX_WIG;
6782  defm VAESENCLAST      : AESI_binop_rm_int<0xDD, "vaesenclast",
6783                         int_x86_aesni_aesenclast, load>, VEX_4V, VEX_WIG;
6784  defm VAESDEC          : AESI_binop_rm_int<0xDE, "vaesdec",
6785                         int_x86_aesni_aesdec, load>, VEX_4V, VEX_WIG;
6786  defm VAESDECLAST      : AESI_binop_rm_int<0xDF, "vaesdeclast",
6787                         int_x86_aesni_aesdeclast, load>, VEX_4V, VEX_WIG;
6788}
6789
// VEX 256-bit forms (VR256 / i256mem, VEX_L) using the *_256 intrinsics.
6790let Predicates = [NoVLX, HasVAES] in {
6791  defm VAESENCY         : AESI_binop_rm_int<0xDC, "vaesenc",
6792                         int_x86_aesni_aesenc_256, load, 0, VR256,
6793                         i256mem>, VEX_4V, VEX_L, VEX_WIG;
6794  defm VAESENCLASTY     : AESI_binop_rm_int<0xDD, "vaesenclast",
6795                         int_x86_aesni_aesenclast_256, load, 0, VR256,
6796                         i256mem>, VEX_4V, VEX_L, VEX_WIG;
6797  defm VAESDECY         : AESI_binop_rm_int<0xDE, "vaesdec",
6798                         int_x86_aesni_aesdec_256, load, 0, VR256,
6799                         i256mem>, VEX_4V, VEX_L, VEX_WIG;
6800  defm VAESDECLASTY     : AESI_binop_rm_int<0xDF, "vaesdeclast",
6801                         int_x86_aesni_aesdeclast_256, load, 0, VR256,
6802                         i256mem>, VEX_4V, VEX_L, VEX_WIG;
6803}
6804
// Legacy forms: two-address ($src1 tied to $dst, Is2Addr = 1), fold `memop`.
6805let Constraints = "$src1 = $dst" in {
6806  defm AESENC          : AESI_binop_rm_int<0xDC, "aesenc",
6807                         int_x86_aesni_aesenc, memop, 1>;
6808  defm AESENCLAST      : AESI_binop_rm_int<0xDD, "aesenclast",
6809                         int_x86_aesni_aesenclast, memop, 1>;
6810  defm AESDEC          : AESI_binop_rm_int<0xDE, "aesdec",
6811                         int_x86_aesni_aesdec, memop, 1>;
6812  defm AESDECLAST      : AESI_binop_rm_int<0xDF, "aesdeclast",
6813                         int_x86_aesni_aesdeclast, memop, 1>;
6814}
6815
6816// Perform the AES InvMixColumn Transformation
// Single-source instruction, opcode 0xDB, selected from int_x86_aesni_aesimc.
// VEX forms are gated on HasAVX+HasAES and fold `load`; the legacy forms
// carry no Predicates override here and fold `memop`.
6817let Predicates = [HasAVX, HasAES] in {
6818  def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
6819      (ins VR128:$src1),
6820      "vaesimc\t{$src1, $dst|$dst, $src1}",
6821      [(set VR128:$dst,
6822        (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
6823      VEX, VEX_WIG;
6824  def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
6825      (ins i128mem:$src1),
6826      "vaesimc\t{$src1, $dst|$dst, $src1}",
6827      [(set VR128:$dst, (int_x86_aesni_aesimc (load addr:$src1)))]>,
6828      Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG;
6829}
6830def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
6831  (ins VR128:$src1),
6832  "aesimc\t{$src1, $dst|$dst, $src1}",
6833  [(set VR128:$dst,
6834    (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>;
6835def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
6836  (ins i128mem:$src1),
6837  "aesimc\t{$src1, $dst|$dst, $src1}",
6838  [(set VR128:$dst, (int_x86_aesni_aesimc (memop addr:$src1)))]>,
6839  Sched<[WriteAESIMC.Folded]>;
6840
6841// AES Round Key Generation Assist
// One vector source plus an 8-bit immediate, opcode 0xDF (AESAI format),
// selected from int_x86_aesni_aeskeygenassist. VEX forms are gated on
// HasAVX+HasAES and fold `load`; the legacy forms fold `memop`.
6842let Predicates = [HasAVX, HasAES] in {
6843  def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
6844      (ins VR128:$src1, u8imm:$src2),
6845      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6846      [(set VR128:$dst,
6847        (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
6848      Sched<[WriteAESKeyGen]>, VEX, VEX_WIG;
6849  def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
6850      (ins i128mem:$src1, u8imm:$src2),
6851      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6852      [(set VR128:$dst,
6853        (int_x86_aesni_aeskeygenassist (load addr:$src1), timm:$src2))]>,
6854      Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG;
6855}
6856def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
6857  (ins VR128:$src1, u8imm:$src2),
6858  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6859  [(set VR128:$dst,
6860    (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
6861  Sched<[WriteAESKeyGen]>;
6862def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
6863  (ins i128mem:$src1, u8imm:$src2),
6864  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6865  [(set VR128:$dst,
6866    (int_x86_aesni_aeskeygenassist (memop addr:$src1), timm:$src2))]>,
6867  Sched<[WriteAESKeyGen.Folded]>;
6868
6869//===----------------------------------------------------------------------===//
6870// PCLMUL Instructions
6871//===----------------------------------------------------------------------===//
6872
6873// Immediate transform to help with commuting.
// Swaps the high and low nibbles of the 8-bit immediate; the (uint8_t) cast
// discards the bits that `Imm << 4` pushes above bit 7 after integer promotion.
// Used by the patterns that fold a load appearing as the first source.
6874def PCLMULCommuteImm : SDNodeXForm<timm, [{
6875  uint8_t Imm = N->getZExtValue();
6876  return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N));
6877}]>;
6878
6879// SSE carry-less Multiplication instructions
// Legacy two-address forms (opcode 0x44), used only when AVX is unavailable.
6880let Predicates = [NoAVX, HasPCLMUL] in {
6881  let Constraints = "$src1 = $dst" in {
    // Only the register form is marked commutable.
6882    let isCommutable = 1 in
6883    def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
6884              (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6885              "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6886              [(set VR128:$dst,
6887                (int_x86_pclmulqdq VR128:$src1, VR128:$src2, timm:$src3))]>,
6888                Sched<[WriteCLMul]>;
6889
6890    def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
6891              (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6892              "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6893              [(set VR128:$dst,
6894                 (int_x86_pclmulqdq VR128:$src1, (memop addr:$src2),
6895                  timm:$src3))]>,
6896              Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
6897  } // Constraints = "$src1 = $dst"
6898
  // A load in the first source is folded by swapping the two sources and
  // rewriting the immediate with PCLMULCommuteImm.
6899  def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1,
6900                                (i8 timm:$src3)),
6901            (PCLMULQDQrm VR128:$src1, addr:$src2,
6902                          (PCLMULCommuteImm timm:$src3))>;
6903} // Predicates = [NoAVX, HasPCLMUL]
6904
6905// SSE aliases
// Generates pclmul{hq,lq}{hq,lq}dq for both the rr and rm forms. The mnemonic
// is "pclmul" # HI # LO # "dq" and the implied immediate is
// (LO == "hq") << 4 | (HI == "hq"), i.e. 0x00, 0x01, 0x10 or 0x11.
6906foreach HI = ["hq","lq"] in
6907foreach LO = ["hq","lq"] in {
6908  def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
6909                  (PCLMULQDQrr VR128:$dst, VR128:$src,
6910                   !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
6911  def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
6912                  (PCLMULQDQrm VR128:$dst, i128mem:$src,
6913                   !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
6914}
6915
6916// AVX carry-less Multiplication instructions
// vpclmulqdq - three-operand rr/rm forms (opcode 0x44) plus the commuted-load
// pattern.
//   RC     - register class of the destination and register sources.
//   MemOp  - memory operand type for the rm form.
//   LdFrag - load fragment folded by the rm form and the commute pattern.
//   IntId  - intrinsic matched by the selection patterns.
6917multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
6918                      PatFrag LdFrag, Intrinsic IntId> {
6919  let isCommutable = 1 in
6920  def rr : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst),
6921            (ins RC:$src1, RC:$src2, u8imm:$src3),
6922            "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
6923            [(set RC:$dst,
6924              (IntId RC:$src1, RC:$src2, timm:$src3))]>,
6925            Sched<[WriteCLMul]>;
6926
6927  def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst),
6928            (ins RC:$src1, MemOp:$src2, u8imm:$src3),
6929            "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
6930            [(set RC:$dst,
6931               (IntId RC:$src1, (LdFrag addr:$src2), timm:$src3))]>,
6932            Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
6933
6934  // We can commute a load in the first operand by swapping the sources and
6935  // rotating the immediate.
  // NAME#"rm" resolves to the rm record produced by this same defm.
6936  def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 timm:$src3)),
6937            (!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2,
6938                                           (PCLMULCommuteImm timm:$src3))>;
6939}
6940
// VEX-encoded carry-less multiply: 128-bit form (VR128, int_x86_pclmulqdq) and
// 256-bit form (VR256, VEX_L, int_x86_pclmulqdq_256), each under its own
// predicate list.
6941let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in
6942defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, load,
6943                             int_x86_pclmulqdq>, VEX_4V, VEX_WIG;
6944
6945let Predicates = [NoVLX, HasVPCLMULQDQ] in
6946defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load,
6947                              int_x86_pclmulqdq_256>, VEX_4V, VEX_L, VEX_WIG;
6948
// vpclmulqdq_aliases_impl - one vpclmul<Hi><Lo>dq alias pair (rr and rm) for
// the instruction whose record-name prefix is InstStr. The implied immediate
// is (Lo == "hq") << 4 | (Hi == "hq").
6949multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC,
6950                                   X86MemOperand MemOp, string Hi, string Lo> {
6951  def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6952                  (!cast<Instruction>(InstStr # "rr") RC:$dst, RC:$src1, RC:$src2,
6953                        !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
6954  def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6955                  (!cast<Instruction>(InstStr # "rm") RC:$dst, RC:$src1, MemOp:$src2,
6956                        !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
6957}
6958
// vpclmulqdq_aliases - expands all four Hi/Lo combinations for one instruction.
6959multiclass vpclmulqdq_aliases<string InstStr, RegisterClass RC,
6960                              X86MemOperand MemOp> {
6961  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "hq">;
6962  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "lq">;
6963  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "hq">;
6964  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "lq">;
6965}
6966
6967// AVX aliases
6968defm : vpclmulqdq_aliases<"VPCLMULQDQ", VR128, i128mem>;
6969defm : vpclmulqdq_aliases<"VPCLMULQDQY", VR256, i256mem>;
6970
6971//===----------------------------------------------------------------------===//
6972// SSE4A Instructions
6973//===----------------------------------------------------------------------===//
6974
// Everything in this block requires HasSSE4A: EXTRQ/INSERTQ (immediate and
// register-mask forms) and the scalar non-temporal stores MOVNTSS/MOVNTSD.
6975let Predicates = [HasSSE4A] in {
6976
6977let ExeDomain = SSEPackedInt in {
// All four EXTRQ/INSERTQ forms are two-address: $src is tied to $dst.
6978let Constraints = "$src = $dst" in {
6979def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
6980                 (ins VR128:$src, u8imm:$len, u8imm:$idx),
6981                 "extrq\t{$idx, $len, $src|$src, $len, $idx}",
6982                 [(set VR128:$dst, (X86extrqi VR128:$src, timm:$len,
6983                                    timm:$idx))]>,
6984                 PD, Sched<[SchedWriteVecALU.XMM]>;
6985def EXTRQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
6986              (ins VR128:$src, VR128:$mask),
6987              "extrq\t{$mask, $src|$src, $mask}",
6988              [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
6989                                 VR128:$mask))]>,
6990              PD, Sched<[SchedWriteVecALU.XMM]>;
6991
6992def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
6993                   (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
6994                   "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
6995                   [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
6996                                      timm:$len, timm:$idx))]>,
6997                   XD, Sched<[SchedWriteVecALU.XMM]>;
6998def INSERTQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
6999                 (ins VR128:$src, VR128:$mask),
7000                 "insertq\t{$mask, $src|$src, $mask}",
7001                 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
7002                                    VR128:$mask))]>,
7003                 XD, Sched<[SchedWriteVecALU.XMM]>;
7004}
7005} // ExeDomain = SSEPackedInt
7006
7007// Non-temporal (unaligned) scalar stores.
7008let AddedComplexity = 400 in { // Prefer non-temporal versions
// The instruction records carry no pattern, so mayStore is set explicitly;
// selection is done by the two Pat records that follow.
7009let hasSideEffects = 0, mayStore = 1, SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in {
7010def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
7011                "movntss\t{$src, $dst|$dst, $src}", []>, XS;
7012
7013def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
7014                "movntsd\t{$src, $dst|$dst, $src}", []>, XD;
7015} // SchedRW
7016
// The instructions take a VR128 source, so the scalar FR32/FR64 value is
// copied into that register class first.
7017def : Pat<(nontemporalstore FR32:$src, addr:$dst),
7018          (MOVNTSS addr:$dst, (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
7019
7020def : Pat<(nontemporalstore FR64:$src, addr:$dst),
7021          (MOVNTSD addr:$dst, (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
7022
7023} // AddedComplexity
7024} // HasSSE4A
7025
7026//===----------------------------------------------------------------------===//
7027// AVX Instructions
7028//===----------------------------------------------------------------------===//
7029
7030//===----------------------------------------------------------------------===//
7031// VBROADCAST - Load from memory and broadcast to all elements of the
7032//              destination operand
7033//
// avx_broadcast_rm - memory-source broadcast instruction.
//   opc        - opcode byte.
//   OpcodeStr  - mnemonic.
//   RC         - destination register class.
//   x86memop   - memory operand type of $src.
//   VT         - result vector type of the selection pattern.
//   bcast_frag - broadcast-load fragment matched on addr:$src.
//   Sched      - scheduling write; note the parameter shares its name with the
//                Sched<...> class it is passed to.
7034class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
7035                           X86MemOperand x86memop, ValueType VT,
7036                           PatFrag bcast_frag, SchedWrite Sched> :
7037  AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
7038        !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7039        [(set RC:$dst, (VT (bcast_frag addr:$src)))]>,
7040        Sched<[Sched]>, VEX;
7041
7042// AVX2 adds register forms
// avx2_broadcast_rr - register-source broadcast; the source is always VR128.
//   opc       - opcode byte.
//   OpcodeStr - mnemonic.
//   RC        - destination register class.
//   ResVT     - result vector type.
//   OpVT      - vector type of the VR128 source fed to X86VBroadcast.
//   Sched     - scheduling write.
7043class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
7044                        ValueType ResVT, ValueType OpVT, SchedWrite Sched> :
7045  AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
7046         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7047         [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
7048         Sched<[Sched]>, VEX;
7049
// Scalar broadcasts. Memory-source forms need HasAVX; register-source forms
// need HasAVX2. All are disabled when VLX is available (NoVLX).
// Opcode 0x18 is vbroadcastss, 0x19 is vbroadcastsd (256-bit only here).
7050let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in {
7051  def VBROADCASTSSrm  : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
7052                                         f32mem, v4f32, X86VBroadcastld32,
7053                                         SchedWriteFShuffle.XMM.Folded>;
7054  def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
7055                                         f32mem, v8f32, X86VBroadcastld32,
7056                                         SchedWriteFShuffle.XMM.Folded>, VEX_L;
7057}
7058let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in
7059def VBROADCASTSDYrm  : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
7060                                        v4f64, X86VBroadcastld64,
7061                                        SchedWriteFShuffle.XMM.Folded>, VEX_L;
7062
7063let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in {
7064  def VBROADCASTSSrr  : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
7065                                          v4f32, v4f32, SchedWriteFShuffle.XMM>;
7066  def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
7067                                          v8f32, v4f32, WriteFShuffle256>, VEX_L;
7068}
7069let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in
7070def VBROADCASTSDYrr  : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
7071                                         v4f64, v2f64, WriteFShuffle256>, VEX_L;
7072
7073//===----------------------------------------------------------------------===//
7074// VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both
7075//                  halves of a 256-bit vector.
7076//
// 128-bit subvector broadcasts from memory into a VR256 destination.
// Both records have an empty pattern list, so mayLoad/hasSideEffects are set
// by hand.
7077let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in
7078def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
7079                           (ins i128mem:$src),
7080                           "vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
7081                           Sched<[WriteShuffleLd]>, VEX, VEX_L;
7082
7083let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX],
7084    ExeDomain = SSEPackedSingle in
7085def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
7086                           (ins f128mem:$src),
7087                           "vbroadcastf128\t{$src, $dst|$dst, $src}", []>,
7088                           Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L;
7089
// Select X86SubVBroadcastld128 to VBROADCASTF128 for every 256-bit vector
// type, floating-point and integer alike.
7090let Predicates = [HasAVX, NoVLX] in {
7091def : Pat<(v4f64 (X86SubVBroadcastld128 addr:$src)),
7092          (VBROADCASTF128 addr:$src)>;
7093def : Pat<(v8f32 (X86SubVBroadcastld128 addr:$src)),
7094          (VBROADCASTF128 addr:$src)>;
7095// NOTE: We're using FP instructions here, but execution domain fixing can
7096// convert to integer when profitable.
7097def : Pat<(v4i64 (X86SubVBroadcastld128 addr:$src)),
7098          (VBROADCASTF128 addr:$src)>;
7099def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)),
7100          (VBROADCASTF128 addr:$src)>;
7101def : Pat<(v16i16 (X86SubVBroadcastld128 addr:$src)),
7102          (VBROADCASTF128 addr:$src)>;
7103def : Pat<(v16f16 (X86SubVBroadcastld128 addr:$src)),
7104          (VBROADCASTF128 addr:$src)>;
7105def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)),
7106          (VBROADCASTF128 addr:$src)>;
7107}
7108
7109//===----------------------------------------------------------------------===//
7110// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
7111//
7112
// VPERM2F128 register and memory forms (opcode 0x06, 256-bit, VEX_L).
// Neither record carries a pattern; selection is handled by separate Pat
// records (vperm2x128_lowering). Only the rr form is marked commutable.
7112let ExeDomain = SSEPackedSingle in {
7113let isCommutable = 1 in
7114def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
7115          (ins VR256:$src1, VR256:$src2, u8imm:$src3),
7116          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
7117          VEX_4V, VEX_L, Sched<[WriteFShuffle256]>;
7118def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
7119          (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
7120          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
7121          VEX_4V, VEX_L, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
7122}
7124
7124// Immediate transform to help with commuting.
// XORs the immediate with 0x22, i.e. toggles bit 1 and bit 5.
// NOTE(review): presumably these are the source-select bits of the two lane
// selectors, so the toggle accounts for swapped sources - confirm against the
// VPERM2F128 immediate encoding.
7125def Perm2XCommuteImm : SDNodeXForm<timm, [{
7126  return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N));
7127}]>;
7129
// vperm2x128_lowering - selection patterns mapping X86VPerm2x128 of type VT to
// the rr/rm records named InstrStr#rr and InstrStr#rm.
//   InstrStr   - instruction record-name prefix.
//   VT         - 256-bit vector type being matched.
//   memop_frag - load fragment for the folded memory source.
7129multiclass vperm2x128_lowering<string InstrStr, ValueType VT, PatFrag memop_frag> {
7130  def : Pat<(VT (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))),
7131            (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR256:$src2, timm:$imm)>;
7132  def : Pat<(VT (X86VPerm2x128 VR256:$src1, (memop_frag addr:$src2), (i8 timm:$imm))),
7133            (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2, timm:$imm)>;
7134  // Pattern with load in other operand.
  // The sources are swapped, so the immediate goes through Perm2XCommuteImm.
7135  def : Pat<(VT (X86VPerm2x128 (memop_frag addr:$src2), VR256:$src1, (i8 timm:$imm))),
7136            (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
7137                                             (Perm2XCommuteImm timm:$imm))>;
7138}
7140
// VPERM2F128 is selected for FP vector types whenever AVX is present, and for
// integer / f16 vector types only under HasAVX1Only.
7140let Predicates = [HasAVX] in {
7141  defm : vperm2x128_lowering<"VPERM2F128", v4f64, loadv4f64>;
7142  defm : vperm2x128_lowering<"VPERM2F128", v8f32, loadv8f32>;
7143}
7144
7145let Predicates = [HasAVX1Only] in {
7146  defm : vperm2x128_lowering<"VPERM2F128", v4i64,  loadv4i64>;
7147  defm : vperm2x128_lowering<"VPERM2F128", v8i32,  loadv8i32>;
7148  defm : vperm2x128_lowering<"VPERM2F128", v16i16, loadv16i16>;
7149  defm : vperm2x128_lowering<"VPERM2F128", v16f16, loadv16f16>;
7150  defm : vperm2x128_lowering<"VPERM2F128", v32i8,  loadv32i8>;
7151}
7153
7154//===----------------------------------------------------------------------===//
7155// VINSERTF128 - Insert packed floating-point values
7156//
// VINSERTF128 register and memory forms (opcode 0x18, 256-bit destination,
// 128-bit second source). Neither record carries a pattern, so hasSideEffects
// and mayLoad are set by hand; selection is handled by vinsert_lowering.
7156let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
7157def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
7158          (ins VR256:$src1, VR128:$src2, u8imm:$src3),
7159          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7160          []>, Sched<[WriteFShuffle256]>, VEX_4V, VEX_L;
7161let mayLoad = 1 in
7162def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
7163          (ins VR256:$src1, f128mem:$src2, u8imm:$src3),
7164          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7165          []>, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
7166}
7168
7168// To create a 256-bit all ones value, we should produce VCMPTRUEPS
7169// with YMM register containing zero.
7170// FIXME: Avoid producing vxorps to clear the fake inputs.
// Both compare inputs are AVX_SET0 and the compare immediate is 0xf.
7171let Predicates = [HasAVX1Only] in {
7172def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>;
7173}
7175
// vinsert_lowering - selection patterns for vinsert128_insert.
//   InstrStr       - record-name prefix of the insert instruction (rr/rm).
//   PermStr        - record-name prefix of the perm2x128 instruction (rm) used
//                    when the 256-bit "To" operand is the one loaded.
//   From / To      - 128-bit source type and 256-bit destination type.
//   frommemop_frag - load fragment for a folded 128-bit source.
//   tomemop_frag   - load fragment for a folded 256-bit destination operand.
7175multiclass vinsert_lowering<string InstrStr, string PermStr,
7176                            ValueType From, ValueType To,
7177                            PatFrag frommemop_frag, PatFrag tomemop_frag> {
7178  def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2),
7179                                   (iPTR imm)),
7180            (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2,
7181                                       (INSERT_get_vinsert128_imm VR256:$ins))>;
7182  def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
7183                                    (From (frommemop_frag addr:$src2)),
7184                                    (iPTR imm)),
7185            (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
7186                                       (INSERT_get_vinsert128_imm VR256:$ins))>;
7187  // Folding "To" vector - convert to perm2x128 and commute inputs.
  // The 128-bit value is placed in the low subregister (sub_xmm) of an
  // otherwise undefined 256-bit register and becomes the register source.
7188  def : Pat<(vinsert128_insert:$ins (To (tomemop_frag addr:$src1)),
7189                                    (From VR128:$src2),
7190                                    (iPTR imm)),
7191            (!cast<Instruction>(PermStr#rm)
7192              (INSERT_SUBREG (To (IMPLICIT_DEF)), VR128:$src2, sub_xmm),
7193              addr:$src1, (INSERT_get_vperm2x128_commutedimm VR256:$ins))>;
7194}
7196
// Floating-point 128-bit inserts use VINSERTF128 / VPERM2F128 whenever AVX is
// present and VLX is not.
7196let Predicates = [HasAVX, NoVLX] in {
7197  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v4f32, v8f32, loadv4f32, loadv8f32>;
7198  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v2f64, v4f64, loadv2f64, loadv4f64>;
7199}
7201
// Integer and f16 128-bit inserts, gated on HasAVX1Only, are lowered to the
// FP-domain VINSERTF128 / VPERM2F128 instructions.
// Exactly one instantiation per (From, To) vector-type pair: each defm emits
// three anonymous selection patterns, so a repeated line would only add
// identical, redundant patterns.
7201let Predicates = [HasAVX1Only] in {
7202  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v2i64, v4i64,  loadv2i64, loadv4i64>;
7203  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v4i32, v8i32,  loadv4i32, loadv8i32>;
7204  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v8i16, v16i16, loadv8i16, loadv16i16>;
7205  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v8f16, v16f16, loadv8f16, loadv16f16>;
7206  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v16i8, v32i8,  loadv16i8, loadv32i8>;
7208}
7210
7211//===----------------------------------------------------------------------===//
7212// VEXTRACTF128 - Extract packed floating-point values
7213//
// Register-destination (rr) and memory-destination (mr) forms of
// vextractf128. Both share opcode 0x19, take a 256-bit source plus an 8-bit
// immediate, and carry no selection pattern of their own ([]).
7214let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
7215def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
7216          (ins VR256:$src1, u8imm:$src2),
7217          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7218          []>, Sched<[WriteFShuffle256]>, VEX, VEX_L;
// The store form has no pattern to infer the flag from, so mayStore is set
// explicitly.
7219let mayStore = 1 in
7220def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
7221          (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
7222          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7223          []>, Sched<[WriteFStoreX]>, VEX, VEX_L;
7224}
7225
/// vextract_lowering - Selection patterns for vextract128_extract of a 128-bit
/// value (type To) out of a 256-bit value (type From).
///   InstrStr - base name of the extract instruction; "rr"/"mr" is pasted on
///              to pick the register-destination or store form.
7226multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
  // Extract into a register.
7227  def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
7228            (To (!cast<Instruction>(InstrStr#rr)
7229                                    (From VR256:$src1),
7230                                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
  // Extract whose result is stored: select the mr form.
7231  def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1),
7232                                                 (iPTR imm))), addr:$dst),
7233            (!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1,
7234             (EXTRACT_get_vextract128_imm VR128:$ext))>;
7235}
7236
7237// AVX1 patterns
// Floating-point element types: instantiated under [HasAVX, NoVLX].
7238let Predicates = [HasAVX, NoVLX] in {
7239  defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>;
7240  defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>;
7241}
7242
// Integer and f16 element types: instantiated only under HasAVX1Only, using
// VEXTRACTF128 as for the floating-point types.
// One instantiation per element type; a verbatim duplicate of the
// v32i8/v16i8 entry was removed because it only produced redundant patterns.
7243let Predicates = [HasAVX1Only] in {
7244  defm : vextract_lowering<"VEXTRACTF128", v4i64,  v2i64>;
7245  defm : vextract_lowering<"VEXTRACTF128", v8i32,  v4i32>;
7246  defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
7247  defm : vextract_lowering<"VEXTRACTF128", v16f16, v8f16>;
7248  defm : vextract_lowering<"VEXTRACTF128", v32i8,  v16i8>;
7250}
7251
7252//===----------------------------------------------------------------------===//
7253// VMASKMOV - Conditional SIMD Packed Loads and Stores
7254//
/// avx_movmask_rm - 128-bit and 256-bit load (rm/Yrm) and store (mr/Ymr)
/// forms of a masked move.
///   opc_rm / opc_mr     - opcodes of the load and store forms.
///   IntLd / IntLd256    - intrinsics matched by the load forms.
///   IntSt / IntSt256    - intrinsics matched by the store forms.
///   schedX / schedY     - scheduling info for the 128-bit / 256-bit forms;
///                         .RM is used for loads and .MR for stores.
7255multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
7256                          Intrinsic IntLd, Intrinsic IntLd256,
7257                          Intrinsic IntSt, Intrinsic IntSt256,
7258                          X86SchedWriteMaskMove schedX,
7259                          X86SchedWriteMaskMove schedY> {
  // Loads: the intrinsic takes the address first and the register operand
  // ($src1) second.
7260  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
7261             (ins VR128:$src1, f128mem:$src2),
7262             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7263             [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
7264             VEX_4V, Sched<[schedX.RM]>;
7265  def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
7266             (ins VR256:$src1, f256mem:$src2),
7267             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7268             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
7269             VEX_4V, VEX_L, Sched<[schedY.RM]>;
  // Stores: no register result; the intrinsic takes the address followed by
  // both register operands.
7270  def mr  : AVX8I<opc_mr, MRMDestMem, (outs),
7271             (ins f128mem:$dst, VR128:$src1, VR128:$src2),
7272             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7273             [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>,
7274             VEX_4V, Sched<[schedX.MR]>;
7275  def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
7276             (ins f256mem:$dst, VR256:$src1, VR256:$src2),
7277             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7278             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
7279             VEX_4V, VEX_L, Sched<[schedY.MR]>;
7280}
7281
// Single-precision masked move: load opcode 0x2C, store opcode 0x2E.
7282let ExeDomain = SSEPackedSingle in
7283defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
7284                                 int_x86_avx_maskload_ps,
7285                                 int_x86_avx_maskload_ps_256,
7286                                 int_x86_avx_maskstore_ps,
7287                                 int_x86_avx_maskstore_ps_256,
7288                                 WriteFMaskMove32, WriteFMaskMove32Y>;
// Double-precision masked move: load opcode 0x2D, store opcode 0x2F.
7289let ExeDomain = SSEPackedDouble in
7290defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
7291                                 int_x86_avx_maskload_pd,
7292                                 int_x86_avx_maskload_pd_256,
7293                                 int_x86_avx_maskstore_pd,
7294                                 int_x86_avx_maskstore_pd_256,
7295                                 WriteFMaskMove64, WriteFMaskMove64Y>;
7296
7297//===----------------------------------------------------------------------===//
7298// AVX_VNNI
7299//===----------------------------------------------------------------------===//
/// avx_vnni_rm - VEX-encoded AVX-VNNI instructions in 128-bit (rr/rm) and
/// 256-bit (Yrr/Yrm) forms.
///   opc          - opcode shared by all four forms.
///   OpcodeStr    - assembly mnemonic.
///   OpNode       - three-operand SelectionDAG node matched by each form.
///   IsCommutable - value assigned to isCommutable on the register forms.
/// $src1 is tied to $dst (Constraints), so it is omitted from the asm string.
/// Scheduling: the 256-bit forms use the YMM write class and the memory forms
/// use the folded-load write plus ReadAfterFold for the two register sources,
/// matching how the other foldable instructions in this file are described.
7300let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI], Constraints = "$src1 = $dst",
7301    ExplicitVEXPrefix = 1, checkVEXPredicate = 1 in
7302multiclass avx_vnni_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
7303                       bit IsCommutable> {
7304  let isCommutable = IsCommutable in
7305  def rr  : AVX8I<opc, MRMSrcReg, (outs VR128:$dst),
7306             (ins VR128:$src1, VR128:$src2, VR128:$src3),
7307             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
7308             [(set VR128:$dst, (v4i32 (OpNode VR128:$src1,
7309                                       VR128:$src2, VR128:$src3)))]>,
7310             VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
7311
7312  def rm  : AVX8I<opc, MRMSrcMem, (outs VR128:$dst),
7313             (ins VR128:$src1, VR128:$src2, i128mem:$src3),
7314             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
7315             [(set VR128:$dst, (v4i32 (OpNode VR128:$src1, VR128:$src2,
7316                                      (loadv4i32 addr:$src3))))]>,
7317             VEX_4V, Sched<[SchedWriteVecIMul.XMM.Folded,
                            SchedWriteVecIMul.XMM.ReadAfterFold,
                            SchedWriteVecIMul.XMM.ReadAfterFold]>;
7318
7319  let isCommutable = IsCommutable in
7320  def Yrr  : AVX8I<opc, MRMSrcReg, (outs VR256:$dst),
7321             (ins VR256:$src1, VR256:$src2, VR256:$src3),
7322             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
7323             [(set VR256:$dst, (v8i32 (OpNode VR256:$src1,
7324                                       VR256:$src2, VR256:$src3)))]>,
7325             VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
7326
7327  def Yrm  : AVX8I<opc, MRMSrcMem, (outs VR256:$dst),
7328             (ins VR256:$src1, VR256:$src2, i256mem:$src3),
7329             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
7330             [(set VR256:$dst, (v8i32 (OpNode VR256:$src1, VR256:$src2,
7331                                      (loadv8i32 addr:$src3))))]>,
7332             VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM.Folded,
                                   SchedWriteVecIMul.YMM.ReadAfterFold,
                                   SchedWriteVecIMul.YMM.ReadAfterFold]>;
7333}
7334
// AVX-VNNI instantiations. The last argument is IsCommutable: the byte forms
// (VPDPBUSD/VPDPBUSDS) pass 0 and the word forms (VPDPWSSD/VPDPWSSDS) pass 1.
7335defm VPDPBUSD   : avx_vnni_rm<0x50, "vpdpbusd", X86Vpdpbusd, 0>;
7336defm VPDPBUSDS  : avx_vnni_rm<0x51, "vpdpbusds", X86Vpdpbusds, 0>;
7337defm VPDPWSSD   : avx_vnni_rm<0x52, "vpdpwssd",  X86Vpdpwssd, 1>;
7338defm VPDPWSSDS  : avx_vnni_rm<0x53, "vpdpwssds", X86Vpdpwssds, 1>;
7339
// X86vpmaddwd restricted to nodes with exactly one use (N->hasOneUse()).
7340def X86vpmaddwd_su : PatFrag<(ops node:$lhs, node:$rhs),
7341                             (X86vpmaddwd node:$lhs, node:$rhs), [{
7342  return N->hasOneUse();
7343}]>;
7344
// Select add(acc, single-use vpmaddwd(a, b)) as VPDPWSSD, with the add's
// first operand as the tied accumulator. Register and folded-load variants
// are provided for both the 256-bit and 128-bit types.
7345let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI] in {
7346  def : Pat<(v8i32 (add VR256:$src1,
7347                        (X86vpmaddwd_su VR256:$src2, VR256:$src3))),
7348            (VPDPWSSDYrr VR256:$src1, VR256:$src2, VR256:$src3)>;
7349  def : Pat<(v8i32 (add VR256:$src1,
7350                        (X86vpmaddwd_su VR256:$src2, (load addr:$src3)))),
7351            (VPDPWSSDYrm VR256:$src1, VR256:$src2, addr:$src3)>;
7352  def : Pat<(v4i32 (add VR128:$src1,
7353                        (X86vpmaddwd_su VR128:$src2, VR128:$src3))),
7354            (VPDPWSSDrr VR128:$src1, VR128:$src2, VR128:$src3)>;
7355  def : Pat<(v4i32 (add VR128:$src1,
7356                        (X86vpmaddwd_su VR128:$src2, (load addr:$src3)))),
7357            (VPDPWSSDrm VR128:$src1, VR128:$src2, addr:$src3)>;
7358}
7359
7360//===----------------------------------------------------------------------===//
7361// VPERMIL - Permute Single and Double Floating-Point Values
7362//
7363
/// avx_permil - Variable-control (rr/rm) and immediate-control (ri/mi) permute
/// forms for one register class.
///   opc_rm / opc_rmi - opcodes of the variable and immediate forms.
///   RC               - register class of the data operands and result.
///   x86memop_f       - memory operand for the data load (mi form).
///   x86memop_i       - memory operand for the control load (rm form).
///   f_vt / i_vt      - value types of the data and of the control vector.
///   sched / varsched - scheduling info for the immediate / variable forms.
7364multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
7365                      RegisterClass RC, X86MemOperand x86memop_f,
7366                      X86MemOperand x86memop_i,
7367                      ValueType f_vt, ValueType i_vt,
7368                      X86FoldableSchedWrite sched,
7369                      X86FoldableSchedWrite varsched> {
7370  let Predicates = [HasAVX, NoVLX] in {
    // Variable forms: the control vector is $src2 (register or loaded).
7371    def rr  : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
7372               (ins RC:$src1, RC:$src2),
7373               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7374               [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V,
7375               Sched<[varsched]>;
7376    def rm  : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
7377               (ins RC:$src1, x86memop_i:$src2),
7378               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7379               [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
7380                              (i_vt (load addr:$src2)))))]>, VEX_4V,
7381               Sched<[varsched.Folded, sched.ReadAfterFold]>;
7382
    // Immediate forms: the control is the 8-bit immediate $src2; in the mi
    // form the data operand is the one loaded from memory.
7383    def ri  : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
7384             (ins RC:$src1, u8imm:$src2),
7385             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7386             [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 timm:$src2))))]>, VEX,
7387             Sched<[sched]>;
7388    def mi  : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
7389             (ins x86memop_f:$src1, u8imm:$src2),
7390             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7391             [(set RC:$dst,
7392               (f_vt (X86VPermilpi (load addr:$src1), (i8 timm:$src2))))]>, VEX,
7393             Sched<[sched.Folded]>;
7394  }// Predicates = [HasAVX, NoVLX]
7395}
7396
// vpermilps: variable opcode 0x0C, immediate opcode 0x04. The Y variant uses
// VR256 with the YMM scheduling classes and adds VEX_L.
7397let ExeDomain = SSEPackedSingle in {
7398  defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
7399                               v4f32, v4i32, SchedWriteFShuffle.XMM,
7400                               SchedWriteFVarShuffle.XMM>;
7401  defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
7402                               v8f32, v8i32, SchedWriteFShuffle.YMM,
7403                               SchedWriteFVarShuffle.YMM>, VEX_L;
7404}
// vpermilpd: variable opcode 0x0D, immediate opcode 0x05.
7405let ExeDomain = SSEPackedDouble in {
7406  defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
7407                               v2f64, v2i64, SchedWriteFShuffle.XMM,
7408                               SchedWriteFVarShuffle.XMM>;
7409  defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
7410                               v4f64, v4i64, SchedWriteFShuffle.YMM,
7411                               SchedWriteFVarShuffle.YMM>, VEX_L;
7412}
7413
7414//===----------------------------------------------------------------------===//
7415// VZERO - Zero YMM registers
7416// Note: These instructions do not affect YMM16-YMM31.
7417//
7418
// VZEROALL and VZEROUPPER share opcode 0x77 and take no operands; in these
// definitions they differ only in VEX_L (set on VZEROALL) and in the
// intrinsic they match. Both are declared as defining YMM0-YMM15 via Defs.
7419let SchedRW = [WriteSystem] in {
7420let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
7421            YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
7422  // Zero All YMM registers
7423  def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
7424                  [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L,
7425                  Requires<[HasAVX]>, VEX_WIG;
7426
7427  // Zero Upper bits of YMM registers
7428  def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
7429                     [(int_x86_avx_vzeroupper)]>, PS, VEX,
7430                     Requires<[HasAVX]>, VEX_WIG;
7431} // Defs
7432} // SchedRW
7433
7434//===----------------------------------------------------------------------===//
7435// Half precision conversion instructions
7436//
7437
/// f16c_ph2ps - Register (rr) and memory (rm) forms of vcvtph2ps producing a
/// result in register class RC from a VR128 or x86memop source.
7438multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop,
7439                      X86FoldableSchedWrite sched> {
7440  def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
7441             "vcvtph2ps\t{$src, $dst|$dst, $src}",
7442             [(set RC:$dst, (X86any_cvtph2ps VR128:$src))]>,
7443             T8PD, VEX, Sched<[sched]>;
  // The memory form has no pattern here ([]), so its load flag is stated
  // explicitly.
7444  let hasSideEffects = 0, mayLoad = 1 in
7445  def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
7446             "vcvtph2ps\t{$src, $dst|$dst, $src}",
7447             []>, T8PD, VEX, Sched<[sched.Folded]>;
7448}
7449
/// f16c_ps2ph - Register-destination (rr) and store (mr) forms of vcvtps2ph
/// converting a source in register class RC, controlled by immediate $src2.
///   RR / MR - scheduling writes for the register and store forms.
7450multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
7451                      SchedWrite RR, SchedWrite MR> {
7452  def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
7453               (ins RC:$src1, i32u8imm:$src2),
7454               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7455               [(set VR128:$dst, (X86any_cvtps2ph RC:$src1, timm:$src2))]>,
7456               TAPD, VEX, Sched<[RR]>;
  // The store form has no pattern here ([]), so its store flag is stated
  // explicitly.
7457  let hasSideEffects = 0, mayStore = 1 in
7458  def mr : Ii8<0x1D, MRMDestMem, (outs),
7459               (ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
7460               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
7461               TAPD, VEX, Sched<[MR]>;
7462}
7463
// F16C instantiations and the patterns that select their memory forms.
// The 128-bit variants use a 64-bit memory operand (f64mem) and the 256-bit
// variants a 128-bit one (f128mem).
7464let Predicates = [HasF16C, NoVLX] in {
7465  defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>, SIMD_EXC;
7466  defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L, SIMD_EXC;
7467  defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH,
7468                               WriteCvtPS2PHSt>, SIMD_EXC;
7469  defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY,
7470                               WriteCvtPS2PHYSt>, VEX_L, SIMD_EXC;
7471
7472  // Pattern match vcvtph2ps of a scalar i64 load.
7473  def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
7474            (VCVTPH2PSrm addr:$src)>;
7475  def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16
7476              (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
7477            (VCVTPH2PSrm addr:$src)>;
  // 256-bit result from a full v8i16 load.
7478  def : Pat<(v8f32 (X86any_cvtph2ps (loadv8i16 addr:$src))),
7479            (VCVTPH2PSYrm addr:$src)>;
7480
  // Store of element 0 of the 128-bit conversion result, viewed as f64 or
  // i64: select the store form directly.
7481  def : Pat<(store (f64 (extractelt
7482                         (bc_v2f64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))),
7483                         (iPTR 0))), addr:$dst),
7484            (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
7485  def : Pat<(store (i64 (extractelt
7486                         (bc_v2i64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))),
7487                         (iPTR 0))), addr:$dst),
7488            (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
  // Store of the whole v8i16 result of the 256-bit conversion.
7489  def : Pat<(store (v8i16 (X86any_cvtps2ph VR256:$src1, timm:$src2)), addr:$dst),
7490            (VCVTPS2PHYmr addr:$dst, VR256:$src1, timm:$src2)>;
7491}
7492
7493//===----------------------------------------------------------------------===//
7494// AVX2 Instructions
7495//===----------------------------------------------------------------------===//
7496
7497/// AVX2_blend_rmi - AVX2 blend with 8-bit immediate
///   OpNode       - blend node matched by both forms.
///   OpVT         - value type of the operands and result.
///   RC/x86memop  - register class and memory operand of the vector operands.
///   commuteXForm - immediate transform applied when the operands are swapped
///                  so that a load in the first source can still be folded.
7498multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
7499                          ValueType OpVT, X86FoldableSchedWrite sched,
7500                          RegisterClass RC,
7501                          X86MemOperand x86memop, SDNodeXForm commuteXForm> {
7502  let isCommutable = 1 in
7503  def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
7504        (ins RC:$src1, RC:$src2, u8imm:$src3),
7505        !strconcat(OpcodeStr,
7506            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
7507        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
7508        Sched<[sched]>, VEX_4V;
7509  def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
7510        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
7511        !strconcat(OpcodeStr,
7512            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
7513        [(set RC:$dst,
7514          (OpVT (OpNode RC:$src1, (load addr:$src2), timm:$src3)))]>,
7515        Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V;
7516
7517  // Pattern to commute if load is in first source.
7518  def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, timm:$src3)),
7519            (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
7520                                            (commuteXForm timm:$src3))>;
7521}
7522
// vpblendd in 128-bit and 256-bit forms, plus patterns that select blends of
// 64-bit-element vectors (v4i64, v2i64) as vpblendd with a transformed
// immediate.
// NOTE(review): BlendScaleImm4/BlendScaleImm2to4 presumably widen each
// 64-bit-lane mask bit to two 32-bit-lane bits, and the *Commute* variants
// also invert the mask for swapped operands - confirm at their definitions.
7523let Predicates = [HasAVX2] in {
7524defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
7525                               SchedWriteBlend.XMM, VR128, i128mem,
7526                               BlendCommuteImm4>;
7527defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
7528                                SchedWriteBlend.YMM, VR256, i256mem,
7529                                BlendCommuteImm8>, VEX_L;
7530
// v4i64 blends: register, load in second source, load in first source.
7531def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
7532          (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 timm:$src3))>;
7533def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
7534          (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
7535def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
7536          (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
7537
// v2i64 blends: register, load in second source, load in first source.
7538def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
7539          (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 timm:$src3))>;
7540def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
7541          (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 timm:$src3))>;
7542def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
7543          (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 timm:$src3))>;
7544}
7545
7546// For insertion into the zero index (low half) of a 256-bit vector, it is
7547// more efficient to generate a blend with immediate instead of an insert*128.
7548// NOTE: We're using FP instructions here, but execution domain fixing should
7549// take care of using integer instructions when profitable.
// Insertion of a 128-bit value at index 0 of a 256-bit vector, selected as
// VBLENDPSY for every integer and f16 element type. The 128-bit value is
// widened into an IMPLICIT_DEF v8i32 via INSERT_SUBREG on sub_xmm.
7550let Predicates = [HasAVX] in {
// Register forms: the widened 128-bit value is the second blend source and
// the immediate is 0xf.
7551def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
7552          (VBLENDPSYrri VR256:$src1,
7553                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7554                                       VR128:$src2, sub_xmm), 0xf)>;
7555def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
7556          (VBLENDPSYrri VR256:$src1,
7557                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7558                                       VR128:$src2, sub_xmm), 0xf)>;
7559def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
7560          (VBLENDPSYrri VR256:$src1,
7561                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7562                                       VR128:$src2, sub_xmm), 0xf)>;
7563def : Pat<(insert_subvector (v16f16 VR256:$src1), (v8f16 VR128:$src2), (iPTR 0)),
7564          (VBLENDPSYrri VR256:$src1,
7565                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7566                                       VR128:$src2, sub_xmm), 0xf)>;
7567def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
7568          (VBLENDPSYrri VR256:$src1,
7569                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7570                                       VR128:$src2, sub_xmm), 0xf)>;
7571
// Memory forms: the 256-bit vector is loaded, so the operands are swapped
// (widened 128-bit value first, load second) and the immediate becomes 0xf0.
7572def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1), (iPTR 0)),
7573          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7574                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7575def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0)),
7576          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7577                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7578def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)),
7579          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7580                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7581def : Pat<(insert_subvector (loadv16f16 addr:$src2), (v8f16 VR128:$src1), (iPTR 0)),
7582          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7583                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7584def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)),
7585          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7586                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7587}
7588
7589//===----------------------------------------------------------------------===//
7590// VPBROADCAST - Load from memory and broadcast to all elements of the
7591//               destination operand
7592//
/// avx2_broadcast - 128-bit (rr/rm) and 256-bit (Yrr/Yrm) broadcast forms.
///   x86memop   - memory operand of the element being broadcast.
///   bcast_frag - broadcast-load fragment matched by the memory forms.
///   OpVT128 / OpVT256 - result types of the 128-bit and 256-bit forms.
///   prd        - extra predicate combined with HasAVX2.
/// The register forms always take a VR128 source, including the 256-bit Yrr.
7593multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
7594                          X86MemOperand x86memop, PatFrag bcast_frag,
7595                          ValueType OpVT128, ValueType OpVT256, Predicate prd> {
7596  let Predicates = [HasAVX2, prd] in {
7597    def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
7598                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7599                  [(set VR128:$dst,
7600                   (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
7601                  Sched<[SchedWriteShuffle.XMM]>, VEX;
7602    def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
7603                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7604                  [(set VR128:$dst,
7605                   (OpVT128 (bcast_frag addr:$src)))]>,
7606                  Sched<[SchedWriteShuffle.XMM.Folded]>, VEX;
7607    def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
7608                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7609                   [(set VR256:$dst,
7610                    (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>,
7611                   Sched<[WriteShuffle256]>, VEX, VEX_L;
7612    def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
7613                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7614                   [(set VR256:$dst,
7615                    (OpVT256 (bcast_frag addr:$src)))]>,
7616                   Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L;
7617
7618    // Provide aliases for broadcast from the same register class that
7619    // automatically does the extract.
7620    def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))),
7621              (!cast<Instruction>(NAME#"Yrr")
7622                  (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>;
7623  }
7624}
7625
// Byte/word/dword/qword broadcasts. The byte and word forms pass
// NoVLX_Or_NoBWI as the extra predicate; the dword and qword forms pass NoVLX.
7626defm VPBROADCASTB  : avx2_broadcast<0x78, "vpbroadcastb", i8mem, X86VBroadcastld8,
7627                                    v16i8, v32i8, NoVLX_Or_NoBWI>;
7628defm VPBROADCASTW  : avx2_broadcast<0x79, "vpbroadcastw", i16mem, X86VBroadcastld16,
7629                                    v8i16, v16i16, NoVLX_Or_NoBWI>;
7630defm VPBROADCASTD  : avx2_broadcast<0x58, "vpbroadcastd", i32mem, X86VBroadcastld32,
7631                                    v4i32, v8i32, NoVLX>;
7632defm VPBROADCASTQ  : avx2_broadcast<0x59, "vpbroadcastq", i64mem, X86VBroadcastld64,
7633                                    v2i64, v4i64, NoVLX>;
7634
// Broadcast of a scalar FP register value: the FR32/FR64 value is copied into
// VR128 and fed to the register form of VBROADCASTSS/VBROADCASTSD.
7635let Predicates = [HasAVX2, NoVLX] in {
7636  // Provide fallback in case the load node that is used in the patterns above
7637  // is used by additional users, which prevents the pattern selection.
7638    def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
7639              (VBROADCASTSSrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
7640    def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
7641              (VBROADCASTSSYrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
7642    def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
7643              (VBROADCASTSDYrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
7644}
7645
// Byte, word and f16 broadcasts selected as VPBROADCASTB/VPBROADCASTW.
7646let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  // GR8 source: placed into an IMPLICIT_DEF i32 via sub_8bit, moved to a
  // vector register with VMOVDI2PDIrr, then broadcast.
7647  def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
7648        (VPBROADCASTBrr (VMOVDI2PDIrr
7649                         (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7650                                             GR8:$src, sub_8bit))))>;
7651  def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
7652        (VPBROADCASTBYrr (VMOVDI2PDIrr
7653                          (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7654                                              GR8:$src, sub_8bit))))>;
7655
  // GR16 source: same scheme through sub_16bit.
7656  def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
7657        (VPBROADCASTWrr (VMOVDI2PDIrr
7658                         (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7659                                             GR16:$src, sub_16bit))))>;
7660  def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
7661        (VPBROADCASTWYrr (VMOVDI2PDIrr
7662                          (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7663                                              GR16:$src, sub_16bit))))>;
7664
  // f16 vectors reuse the 16-bit integer broadcast: from a 16-bit load ...
7665  def : Pat<(v8f16 (X86VBroadcastld16 addr:$src)),
7666            (VPBROADCASTWrm addr:$src)>;
7667  def : Pat<(v16f16 (X86VBroadcastld16 addr:$src)),
7668            (VPBROADCASTWYrm addr:$src)>;
7669
  // ... from a v8f16 vector register ...
7670  def : Pat<(v8f16 (X86VBroadcast (v8f16 VR128:$src))),
7671            (VPBROADCASTWrr VR128:$src)>;
7672  def : Pat<(v16f16 (X86VBroadcast (v8f16 VR128:$src))),
7673            (VPBROADCASTWYrr VR128:$src)>;
7674
  // ... and from a scalar FR16 register copied into VR128.
7675  def : Pat<(v8f16 (X86VBroadcast (f16 FR16:$src))),
7676            (VPBROADCASTWrr (COPY_TO_REGCLASS FR16:$src, VR128))>;
7677  def : Pat<(v16f16 (X86VBroadcast (f16 FR16:$src))),
7678            (VPBROADCASTWYrr (COPY_TO_REGCLASS FR16:$src, VR128))>;
7679}
// Broadcast of a 32-bit or 64-bit GPR: the value is first moved to a vector
// register (VMOVDI2PDIrr / VMOV64toPQIrr) and then broadcast with
// VPBROADCASTD / VPBROADCASTQ.
7680let Predicates = [HasAVX2, NoVLX] in {
7681  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
7682            (VPBROADCASTDrr (VMOVDI2PDIrr GR32:$src))>;
7683  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
7684            (VPBROADCASTDYrr (VMOVDI2PDIrr GR32:$src))>;
7685  def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
7686            (VPBROADCASTQrr (VMOV64toPQIrr GR64:$src))>;
7687  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
7688            (VPBROADCASTQYrr (VMOV64toPQIrr GR64:$src))>;
7689}
7690
7691// AVX1 broadcast patterns
// With AVX1 only, integer broadcast loads are selected as the floating-point
// VBROADCASTSS/VBROADCASTSD memory forms of the same element width.
7692let Predicates = [HasAVX1Only] in {
7693def : Pat<(v8i32 (X86VBroadcastld32 addr:$src)),
7694          (VBROADCASTSSYrm addr:$src)>;
7695def : Pat<(v4i64 (X86VBroadcastld64 addr:$src)),
7696          (VBROADCASTSDYrm addr:$src)>;
7697def : Pat<(v4i32 (X86VBroadcastld32 addr:$src)),
7698          (VBROADCASTSSrm addr:$src)>;
7699}
7700
7701  // Provide fallback in case the load node that is used in the patterns above
7702  // is used by additional users, which prevents the pattern selection.
// v2f64 broadcasts are selected as VMOVDDUP, from a scalar f64 register, a
// 64-bit broadcast load, or a v2f64 vector register.
7703let Predicates = [HasAVX, NoVLX] in {
7704  // 128bit broadcasts:
7705  def : Pat<(v2f64 (X86VBroadcast f64:$src)),
7706            (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
7707  def : Pat<(v2f64 (X86VBroadcastld64 addr:$src)),
7708            (VMOVDDUPrm addr:$src)>;
7709
7710  def : Pat<(v2f64 (X86VBroadcast v2f64:$src)),
7711            (VMOVDDUPrr VR128:$src)>;
7712}
7713
// AVX1-only register broadcasts. 128-bit results come from a single shuffle
// (VPERMILPSri / VMOVDDUPrr / VPSHUFDri). 256-bit results are built by placing
// that shuffled 128-bit value in the low half (INSERT_SUBREG on sub_xmm) and
// inserting the same value again with VINSERTF128rr at index 1.
7714let Predicates = [HasAVX1Only] in {
  // 32-bit FP element: VPERMILPSri with immediate 0.
7715  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
7716            (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>;
7717  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
7718            (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
7719              (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm),
7720              (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>;
7721  def : Pat<(v8f32 (X86VBroadcast v4f32:$src)),
7722            (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
7723              (v4f32 (VPERMILPSri VR128:$src, 0)), sub_xmm),
7724              (v4f32 (VPERMILPSri VR128:$src, 0)), 1)>;
  // 64-bit FP element: VMOVDDUPrr.
7725  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
7726            (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
7727              (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm),
7728              (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>;
7729  def : Pat<(v4f64 (X86VBroadcast v2f64:$src)),
7730            (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
7731              (v2f64 (VMOVDDUPrr VR128:$src)), sub_xmm),
7732              (v2f64 (VMOVDDUPrr VR128:$src)), 1)>;
7733
  // Integer GPR sources: moved to a vector register, then VPSHUFDri with
  // immediate 0 for 32-bit elements and 0x44 for 64-bit elements.
7734  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
7735            (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)>;
7736  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
7737            (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7738              (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), sub_xmm),
7739              (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), 1)>;
7740  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
7741            (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
7742              (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), sub_xmm),
7743              (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), 1)>;
7744
7745  def : Pat<(v2i64 (X86VBroadcast i64:$src)),
7746            (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)>;
  // 64-bit integer broadcast load: selected as the VMOVDDUP memory form.
7747  def : Pat<(v2i64 (X86VBroadcastld64 addr:$src)),
7748            (VMOVDDUPrm addr:$src)>;
7749}
7750
7751//===----------------------------------------------------------------------===//
7752// VPERM - Permute instructions
7753//
7754
/// avx2_perm - 256-bit two-source permute matched against X86VPermv, in
/// register (Yrr) and folded-load (Yrm) forms.
///   OpVT  - value type of the result.
///   Sched - scheduling info; the memory form uses its Folded/ReadAfterFold.
///   memOp - memory operand type of the loaded second source.
7755multiclass avx2_perm<bits<8> opc, string OpcodeStr,
7756                     ValueType OpVT, X86FoldableSchedWrite Sched,
7757                     X86MemOperand memOp> {
7758  let Predicates = [HasAVX2, NoVLX] in {
7759    def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
7760                     (ins VR256:$src1, VR256:$src2),
7761                     !strconcat(OpcodeStr,
7762                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7763                     [(set VR256:$dst,
7764                       (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
7765                     Sched<[Sched]>, VEX_4V, VEX_L;
7766    def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
7767                     (ins VR256:$src1, memOp:$src2),
7768                     !strconcat(OpcodeStr,
7769                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7770                     [(set VR256:$dst,
7771                       (OpVT (X86VPermv VR256:$src1,
7772                              (load addr:$src2))))]>,
7773                     Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX_4V, VEX_L;
7774  }
7775}
7776
// 32-bit-element permutes: vpermd (opcode 0x36) on v8i32 and vpermps
// (opcode 0x16, single-precision domain) on v8f32.
7777defm VPERMD : avx2_perm<0x36, "vpermd", v8i32, WriteVarShuffle256, i256mem>;
7778let ExeDomain = SSEPackedSingle in
7779defm VPERMPS : avx2_perm<0x16, "vpermps", v8f32, WriteFVarShuffle256, f256mem>;
7780
/// avx2_perm_imm - 256-bit permute controlled by an 8-bit immediate, matched
/// against X86VPermi, in register (Yri) and folded-load (Ymi) forms.
///   mem_frag - load fragment for the source in the memory form.
///   OpVT     - value type of the result.
///   Sched    - scheduling info; the memory form uses its Folded/ReadAfterFold.
///   memOp    - memory operand type of the loaded source.
7781multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
7782                         ValueType OpVT, X86FoldableSchedWrite Sched,
7783                         X86MemOperand memOp> {
7784  let Predicates = [HasAVX2, NoVLX] in {
7785    def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
7786                       (ins VR256:$src1, u8imm:$src2),
7787                       !strconcat(OpcodeStr,
7788                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7789                       [(set VR256:$dst,
7790                         (OpVT (X86VPermi VR256:$src1, (i8 timm:$src2))))]>,
7791                       Sched<[Sched]>, VEX, VEX_L;
7792    def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
7793                       (ins memOp:$src1, u8imm:$src2),
7794                       !strconcat(OpcodeStr,
7795                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7796                       [(set VR256:$dst,
7797                         (OpVT (X86VPermi (mem_frag addr:$src1),
7798                                (i8 timm:$src2))))]>,
7799                       Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VEX_L;
7800  }
7801}
7802
// avx2_perm_imm instantiations; both carry VEX_W. VPERMPD is additionally
// placed in the SSEPackedDouble domain.
defm VPERMQ  : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64, WriteShuffle256,
                             i256mem>, VEX_W;
let ExeDomain = SSEPackedDouble in {
  defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
                               WriteFShuffle256, f256mem>, VEX_W;
}
7808
7809//===----------------------------------------------------------------------===//
7810// VPERM2I128 - Permute Integer vector Values in 128-bit chunks
7811//
7812let isCommutable = 1 in
7813def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
7814          (ins VR256:$src1, VR256:$src2, u8imm:$src3),
7815          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
7816          Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
7817def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
7818          (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
7819          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
7820          Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
7821
// vperm2x128_lowering instantiations for VPERM2I128 under HasAVX2: one per
// 256-bit vector type, each paired with the load fragment of the same type.
// A type must be listed only once - a repeated instantiation emits a second,
// identical set of anonymous patterns that can never be selected.
let Predicates = [HasAVX2] in {
  defm : vperm2x128_lowering<"VPERM2I128", v4i64,  loadv4i64>;
  defm : vperm2x128_lowering<"VPERM2I128", v8i32,  loadv8i32>;
  defm : vperm2x128_lowering<"VPERM2I128", v16i16, loadv16i16>;
  defm : vperm2x128_lowering<"VPERM2I128", v16f16, loadv16f16>;
  defm : vperm2x128_lowering<"VPERM2I128", v32i8,  loadv32i8>;
}
7830
7831//===----------------------------------------------------------------------===//
7832// VINSERTI128 - Insert packed integer values
7833//
7834let hasSideEffects = 0 in {
7835def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
7836          (ins VR256:$src1, VR128:$src2, u8imm:$src3),
7837          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7838          []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
7839let mayLoad = 1 in
7840def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
7841          (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
7842          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7843          []>, Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
7844}
7845
// vinsert_lowering instantiations for VINSERTI128 (with VPERM2I128 passed as
// the second instruction name) under [HasAVX2, NoVLX]: one per
// 128-bit / 256-bit vector type pair, each with the matching load fragments.
// A type pair must be listed only once - a repeated instantiation emits a
// second, identical set of anonymous patterns that can never be selected.
let Predicates = [HasAVX2, NoVLX] in {
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v2i64, v4i64,  loadv2i64,  loadv4i64>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v4i32, v8i32,  loadv4i32,  loadv8i32>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8i16, v16i16, loadv8i16,  loadv16i16>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8f16, v16f16, loadv8f16,  loadv16f16>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v16i8, v32i8,  loadv16i8,  loadv32i8>;
}
7854
7855//===----------------------------------------------------------------------===//
7856// VEXTRACTI128 - Extract packed integer values
7857//
7858def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
7859          (ins VR256:$src1, u8imm:$src2),
7860          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
7861          Sched<[WriteShuffle256]>, VEX, VEX_L;
7862let hasSideEffects = 0, mayStore = 1 in
7863def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
7864          (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
7865          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
7866          Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_L;
7867
// vextract_lowering instantiations for VEXTRACTI128 under [HasAVX2, NoVLX]:
// one per 256-bit source type / 128-bit result type pair.
// A type pair must be listed only once - a repeated instantiation emits a
// second, identical set of anonymous patterns that can never be selected.
let Predicates = [HasAVX2, NoVLX] in {
  defm : vextract_lowering<"VEXTRACTI128", v4i64,  v2i64>;
  defm : vextract_lowering<"VEXTRACTI128", v8i32,  v4i32>;
  defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
  defm : vextract_lowering<"VEXTRACTI128", v16f16, v8f16>;
  defm : vextract_lowering<"VEXTRACTI128", v32i8,  v16i8>;
}
7876
7877//===----------------------------------------------------------------------===//
7878// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
7879//
7880multiclass avx2_pmovmask<string OpcodeStr,
7881                         Intrinsic IntLd128, Intrinsic IntLd256,
7882                         Intrinsic IntSt128, Intrinsic IntSt256,
7883                         X86SchedWriteMaskMove schedX,
7884                         X86SchedWriteMaskMove schedY> {
7885  def rm  : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
7886             (ins VR128:$src1, i128mem:$src2),
7887             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7888             [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>,
7889             VEX_4V, Sched<[schedX.RM]>;
7890  def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
7891             (ins VR256:$src1, i256mem:$src2),
7892             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7893             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
7894             VEX_4V, VEX_L, Sched<[schedY.RM]>;
7895  def mr  : AVX28I<0x8e, MRMDestMem, (outs),
7896             (ins i128mem:$dst, VR128:$src1, VR128:$src2),
7897             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7898             [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>,
7899             VEX_4V, Sched<[schedX.MR]>;
7900  def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
7901             (ins i256mem:$dst, VR256:$src1, VR256:$src2),
7902             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7903             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
7904             VEX_4V, VEX_L, Sched<[schedY.MR]>;
7905}
7906
// avx2_pmovmask instantiations. Intrinsic order is load128, load256,
// store128, store256, followed by the 128-bit and 256-bit scheduling
// classes. The Q variant additionally carries VEX_W.
defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
    int_x86_avx2_maskload_d,  int_x86_avx2_maskload_d_256,
    int_x86_avx2_maskstore_d, int_x86_avx2_maskstore_d_256,
    WriteVecMaskMove32, WriteVecMaskMove32Y>;
defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
    int_x86_avx2_maskload_q,  int_x86_avx2_maskload_q_256,
    int_x86_avx2_maskstore_q, int_x86_avx2_maskstore_q_256,
    WriteVecMaskMove64, WriteVecMaskMove64Y>, VEX_W;
7919
/// maskmov_lowering - Map masked_store / masked_load nodes onto the "mr" and
/// "rm" records of the instruction whose name is given by InstrStr.
///   RC     - register class of both the data and the mask operand.
///   VT     - data vector type.
///   MaskVT - mask vector type.
/// Loads are matched only when the pass-through operand is undef or an
/// all-zeros vector (immAllZerosV); both cases select the same instruction.
multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
                          ValueType MaskVT> {
  // masked store -> (mr ptr, mask, value)
  def : Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)),
            (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;

  // masked load, undef pass-through -> (rm mask, ptr)
  def : Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)),
            (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;

  // masked load, all-zeros pass-through -> (rm mask, ptr)
  def : Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask),
                 (VT immAllZerosV))),
            (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
}
// Floating-point element types map onto VMASKMOVPS/PD whenever AVX is
// available.
let Predicates = [HasAVX] in {
  defm : maskmov_lowering<"VMASKMOVPS",  VR128, v4f32, v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD",  VR128, v2f64, v2i64>;
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64>;
}

// With AVX1 only, integer element types reuse the ps/pd instructions:
// load/store i32/i64 not supported use ps/pd version
let Predicates = [HasAVX1Only] in {
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64>;
  defm : maskmov_lowering<"VMASKMOVPS",  VR128, v4i32, v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD",  VR128, v2i64, v2i64>;
}

// With AVX2, integer element types map onto VPMASKMOVD/Q.
let Predicates = [HasAVX2] in {
  defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32>;
  defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64>;
  defm : maskmov_lowering<"VPMASKMOVD",  VR128, v4i32, v4i32>;
  defm : maskmov_lowering<"VPMASKMOVQ",  VR128, v2i64, v2i64>;
}
7951
7952//===----------------------------------------------------------------------===//
7953// Variable Bit Shifts
7954//
7955multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
7956                          ValueType vt128, ValueType vt256> {
7957  def rr  : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
7958             (ins VR128:$src1, VR128:$src2),
7959             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7960             [(set VR128:$dst,
7961               (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
7962             VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>;
7963  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
7964             (ins VR128:$src1, i128mem:$src2),
7965             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7966             [(set VR128:$dst,
7967               (vt128 (OpNode VR128:$src1,
7968                       (vt128 (load addr:$src2)))))]>,
7969             VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded,
7970                            SchedWriteVarVecShift.XMM.ReadAfterFold]>;
7971  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
7972             (ins VR256:$src1, VR256:$src2),
7973             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7974             [(set VR256:$dst,
7975               (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
7976             VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>;
7977  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
7978             (ins VR256:$src1, i256mem:$src2),
7979             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7980             [(set VR256:$dst,
7981               (vt256 (OpNode VR256:$src1,
7982                       (vt256 (load addr:$src2)))))]>,
7983             VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded,
7984                                   SchedWriteVarVecShift.YMM.ReadAfterFold]>;
7985}
7986
// avx2_var_shift instantiations under [HasAVX2, NoVLX]. The D and Q variants
// of a shift share an opcode; the Q variants add VEX_W. Only a D variant is
// defined here for X86vsrav.
let Predicates = [HasAVX2, NoVLX] in {
  // X86vshlv, opcode 0x47.
  defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", X86vshlv, v4i32, v8i32>;
  defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", X86vshlv, v2i64, v4i64>,
                 VEX_W;
  // X86vsrlv, opcode 0x45.
  defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", X86vsrlv, v4i32, v8i32>;
  defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", X86vsrlv, v2i64, v4i64>,
                 VEX_W;
  // X86vsrav, opcode 0x46.
  defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", X86vsrav, v4i32, v8i32>;
}
7994
7995//===----------------------------------------------------------------------===//
7996// VGATHER - GATHER Operations
7997
// FIXME: Improve scheduling of gather instructions.
/// avx2_gather - Gather records with two outputs: the gathered value ($dst)
/// and a written-back mask ($mask_wb).
///   rm  - VR128 data and mask, memory operand of type memop128.
///   Yrm - RC256 data and mask, memory operand of type memop256 (VEX_L).
/// Both records have an empty pattern list, use MRMSrcMem4VOp3 and are
/// marked mayLoad with no side effects. The assembly string lists only
/// $dst, $src2 and $mask; $src1 and $mask_wb do not appear in it.
multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
                       X86MemOperand memop128, X86MemOperand memop256> {
  let hasSideEffects = 0, mayLoad = 1 in {
    def rm  : AVX28I<opc, MRMSrcMem4VOp3,
                     (outs VR128:$dst, VR128:$mask_wb),
                     (ins VR128:$src1, memop128:$src2, VR128:$mask),
                     OpcodeStr#"\t{$mask, $src2, $dst|$dst, $src2, $mask}",
                     []>,
              VEX, Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
    def Yrm : AVX28I<opc, MRMSrcMem4VOp3,
                     (outs RC256:$dst, RC256:$mask_wb),
                     (ins RC256:$src1, memop256:$src2, RC256:$mask),
                     OpcodeStr#"\t{$mask, $src2, $dst|$dst, $src2, $mask}",
                     []>,
              VEX, VEX_L, Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
  }
}
8014
// avx2_gather instantiations under HasAVX2. The constraint string marks $dst
// and $mask_wb as early-clobber and ties $src1 to $dst and $mask to $mask_wb.
// Arguments are: opcode, mnemonic, 256-bit register class, 128-bit memory
// operand, 256-bit memory operand. The *QD / *QPS variants pass VR128 as the
// "256-bit" register class.
let Predicates = [HasAVX2], mayLoad = 1, hasSideEffects = 0,
    Constraints =
  "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
    in {
  // Integer gathers.
  defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq",
                                VR256, vx128mem, vx256mem>, VEX_W;
  defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq",
                                VR256, vx128mem, vy256mem>, VEX_W;
  defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd",
                                VR256, vx128mem, vy256mem>;
  defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd",
                                VR128, vx64mem, vy128mem>;

  // Double-precision gathers.
  let ExeDomain = SSEPackedDouble in {
    defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd",
                                  VR256, vx128mem, vx256mem>, VEX_W;
    defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd",
                                  VR256, vx128mem, vy256mem>, VEX_W;
  }

  // Single-precision gathers.
  let ExeDomain = SSEPackedSingle in {
    defm VGATHERDPS : avx2_gather<0x92, "vgatherdps",
                                  VR256, vx128mem, vy256mem>;
    defm VGATHERQPS : avx2_gather<0x93, "vgatherqps",
                                  VR128, vx64mem, vy128mem>;
  }
}
8043
8044//===----------------------------------------------------------------------===//
8045// GFNI instructions
8046//===----------------------------------------------------------------------===//
8047
/// GF2P8MULB_rm - Register and memory forms of opcode 0xCF (T8PD), selected
/// from the X86GF2P8mulb node in the SSEPackedInt domain.
///   rr - both sources in RC; marked commutable.
///   rm - $src2 read from X86MemOp and matched through MemOpFrag.
/// The assembly string is installed through AsmString (the string passed to
/// PDI is empty): Is2Addr selects the two-operand spelling without $src1,
/// otherwise the three-operand spelling is used.
multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
                        RegisterClass RC, PatFrag MemOpFrag,
                        X86MemOperand X86MemOp, bit Is2Addr = 0> {
  let AsmString = !if(Is2Addr,
        OpcodeStr#"\t{$src2, $dst|$dst, $src2}",
        OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
      ExeDomain = SSEPackedInt in {
    let isCommutable = 1 in {
      def rr : PDI<0xCF, MRMSrcReg,
                   (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
                   [(set RC:$dst,
                      (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
               Sched<[SchedWriteVecALU.XMM]>, T8PD;
    }

    def rm : PDI<0xCF, MRMSrcMem,
                 (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
                 [(set RC:$dst,
                    (OpVT (X86GF2P8mulb RC:$src1, (MemOpFrag addr:$src2))))]>,
             Sched<[SchedWriteVecALU.XMM.Folded,
                    SchedWriteVecALU.XMM.ReadAfterFold]>, T8PD;
  }
}
8066
/// GF2P8AFFINE_rmi - Register and memory forms of a two-source instruction
/// with a trailing u8imm operand, selected from OpNode (SSEPackedInt domain).
///   rri - $src2 in RC.
///   rmi - $src2 read from X86MemOp and matched through MemOpFrag.
/// The assembly string is installed through AsmString (the string passed to
/// Ii8 is empty): Is2Addr selects the spelling without $src1, otherwise all
/// three sources are printed. $src3 is matched as a timm.
multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
                           SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag,
                           X86MemOperand X86MemOp, bit Is2Addr = 0> {
  let AsmString = !if(Is2Addr,
      OpStr#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
      OpStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in {
    def rri : Ii8<Op, MRMSrcReg,
                  (outs RC:$dst),
                  (ins RC:$src1, RC:$src2, u8imm:$src3), "",
                  [(set RC:$dst,
                     (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))],
                  SSEPackedInt>,
              Sched<[SchedWriteVecALU.XMM]>;

    def rmi : Ii8<Op, MRMSrcMem,
                  (outs RC:$dst),
                  (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
                  [(set RC:$dst,
                     (OpVT (OpNode RC:$src1, (MemOpFrag addr:$src2),
                                   timm:$src3)))],
                  SSEPackedInt>,
              Sched<[SchedWriteVecALU.XMM.Folded,
                     SchedWriteVecALU.XMM.ReadAfterFold]>;
  }
}
8085
/// GF2P8AFFINE_common - Instantiate GF2P8AFFINE_rmi three times for one
/// opcode / node pair:
///   NAME      - legacy SSE form, v16i8 in VR128, two-address ($src1 tied to
///               $dst), guarded by [HasGFNI, UseSSE2].
///   V#NAME    - VEX form, v16i8 in VR128 (VEX_4V, VEX_W).
///   V#NAME#Y  - VEX form, v32i8 in VR256 (VEX_4V, VEX_L, VEX_W).
/// The VEX forms are guarded by [HasGFNI, HasAVX, NoVLX_Or_NoBWI].
multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
  // The legacy SSE form folds its memory operand through memop rather than
  // load, matching the legacy SSE GF2P8MULB definition in this file: the
  // non-VEX encoding needs an aligned 128-bit operand, so an arbitrary
  // (possibly unaligned) load must not be folded into it.
  let Constraints = "$src1 = $dst",
      Predicates  = [HasGFNI, UseSSE2] in
  defm NAME         : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
                                      VR128, memop, i128mem, 1>;
  // VEX forms: three-operand and folded through a plain load.
  let Predicates  = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
    defm V#NAME    : GF2P8AFFINE_rmi<Op, "v"#OpStr, v16i8, OpNode, VR128,
                                      load, i128mem>, VEX_4V, VEX_W;
    defm V#NAME#Y : GF2P8AFFINE_rmi<Op, "v"#OpStr, v32i8, OpNode, VR256,
                                      load, i256mem>, VEX_4V, VEX_L, VEX_W;
  }
}
8098
// GF2P8MULB
// The legacy SSE form is two-address ($src1 tied to $dst, Is2Addr = 1) and
// folds its memory operand through memop; the VEX forms keep the default
// three-operand spelling and fold through load.
let Predicates  = [HasGFNI, UseSSE2],
    Constraints = "$src1 = $dst" in {
  defm GF2P8MULB    : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop,
                                   i128mem, 1>;
}
let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
  defm VGF2P8MULB   : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load, i128mem>,
                      VEX_4V;
  defm VGF2P8MULBY  : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load, i256mem>,
                      VEX_4V, VEX_L;
}
// GF2P8AFFINEINVQB, GF2P8AFFINEQB
// Both families are instantiated with isCommutable forced to 0 and carry
// TAPD; they differ only in opcode (0xCF vs 0xCE), mnemonic and node.
let isCommutable = 0 in {
  defm GF2P8AFFINEINVQB :
      GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb", X86GF2P8affineinvqb>, TAPD;
  defm GF2P8AFFINEQB :
      GF2P8AFFINE_common<0xCE, "gf2p8affineqb", X86GF2P8affineqb>, TAPD;
}
8117
8118