xref: /freebsd/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSSE.td (revision 06c3fb2749bda94cb5201f81ffdb8fa6c3161b2e)
1//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file describes the X86 SSE instruction set, defining the instructions,
10// and properties of the instructions which are needed for code generation,
11// machine code emission, and analysis.
12//
13//===----------------------------------------------------------------------===//
14
15//===----------------------------------------------------------------------===//
16// SSE 1 & 2 Instructions Classes
17//===----------------------------------------------------------------------===//
18
19/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
///
/// Each instantiation produces an `rr` (register, register) and an `rm`
/// (register, memory) record sharing the opcode `opc`. Both records are
/// wrapped in isCodeGenOnly = 1; only `rr` is marked isCommutable.
///
/// Is2Addr selects the assembly string: when set, the two-operand spelling
/// that names only $src2 and $dst; otherwise the three-operand spelling that
/// also names $src1.
20multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
21                           RegisterClass RC, X86MemOperand x86memop,
22                           Domain d, X86FoldableSchedWrite sched,
23                           bit Is2Addr = 1> {
24let isCodeGenOnly = 1 in {
  // Register form: selects (OpNode RC:$src1, RC:$src2) and is scheduled with
  // `sched` directly.
25  let isCommutable = 1 in {
26    def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
27       !if(Is2Addr,
28           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
29           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
30       [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], d>,
31       Sched<[sched]>;
32  }
  // Memory form: the second operand is a plain `load` from addr:$src2, and the
  // record uses the folded-load scheduling entries of `sched`.
33  def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
34       !if(Is2Addr,
35           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
36           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
37       [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>,
38       Sched<[sched.Folded, sched.ReadAfterFold]>;
39}
40}
41
42/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
///
/// Produces `rr_Int` and `rm_Int` records built on SI_Int. Both are declared
/// with hasSideEffects = 0, and the selected result is cast to `VT`. The
/// memory operand type (`memopr`) and the load fragment (`mem_frags`) are
/// supplied by the caller. Is2Addr chooses between the two-operand and
/// three-operand assembly strings.
43multiclass sse12_fp_scalar_int<bits<8> opc,
44                               SDPatternOperator OpNode, RegisterClass RC,
45                               ValueType VT, string asm, Operand memopr,
46                               PatFrags mem_frags, Domain d,
47                               X86FoldableSchedWrite sched, bit Is2Addr = 1> {
48let hasSideEffects = 0 in {
  // Register form.
49  def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
50       !if(Is2Addr,
51           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
52           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
53       [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], d>,
54       Sched<[sched]>;
  // Memory form: explicitly flagged mayLoad; $src2 is matched through
  // `mem_frags` and scheduled with the folded-load entries of `sched`.
55  let mayLoad = 1 in
56  def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
57       !if(Is2Addr,
58           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
59           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
60       [(set RC:$dst, (VT (OpNode RC:$src1, (mem_frags addr:$src2))))], d>,
61       Sched<[sched.Folded, sched.ReadAfterFold]>;
62}
63}
64
65/// sse12_fp_packed - SSE 1 & 2 packed instructions class
///
/// Produces `rr` and `rm` records built on PI. `rr` is marked isCommutable;
/// `rm` is marked mayLoad and matches its second operand through `mem_frag`.
/// Is2Addr chooses between the two-operand and three-operand assembly strings.
66multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
67                           RegisterClass RC, ValueType vt,
68                           X86MemOperand x86memop, PatFrag mem_frag,
69                           Domain d, X86FoldableSchedWrite sched,
70                           bit Is2Addr = 1> {
  // Register form: result is cast to `vt`.
71  let isCommutable = 1 in
72    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
73       !if(Is2Addr,
74           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
75           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
76       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>,
77       Sched<[sched]>;
  // Memory form.
  // NOTE(review): unlike `rr`, this pattern does not wrap the result in `vt`;
  // confirm that the difference is intentional.
78  let mayLoad = 1 in
79    def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
80       !if(Is2Addr,
81           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
82           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
83       [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
84          d>,
85       Sched<[sched.Folded, sched.ReadAfterFold]>;
86}
87
88/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
///
/// Same rr/rm shape as the other packed classes, except that the selection
/// patterns are not built here: the caller passes them in as `pat_rr` and
/// `pat_rm`. Both records are declared with hasSideEffects = 0.
89multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
90                                      string OpcodeStr, X86MemOperand x86memop,
91                                      X86FoldableSchedWrite sched,
92                                      list<dag> pat_rr, list<dag> pat_rm,
93                                      bit Is2Addr = 1> {
  // Register form: commutable, uses the caller-supplied `pat_rr`.
94  let isCommutable = 1, hasSideEffects = 0 in
95    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
96       !if(Is2Addr,
97           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
98           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
99       pat_rr, d>,
100       Sched<[sched]>;
  // Memory form: flagged mayLoad, uses the caller-supplied `pat_rm`.
101  let hasSideEffects = 0, mayLoad = 1 in
102  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
103       !if(Is2Addr,
104           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
105           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
106       pat_rm, d>,
107       Sched<[sched.Folded, sched.ReadAfterFold]>;
108}
109
110
111// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
112// This is expanded by ExpandPostRAPseudos.
// Each pseudo below takes no inputs and matches the floating-point zero
// immediate of its register class (fp16imm0, fp32imm0, fp64imm0, fp128imm0).
// All four require NoAVX512 in addition to their SSE level.
113let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
114    isPseudo = 1, SchedRW = [WriteZero] in {
115  def FsFLD0SH : I<0, Pseudo, (outs FR16:$dst), (ins), "",
116                   [(set FR16:$dst, fp16imm0)]>, Requires<[HasSSE2, NoAVX512]>;
117  def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
118                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
119  def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
120                   [(set FR64:$dst, fp64imm0)]>, Requires<[HasSSE2, NoAVX512]>;
121  def FsFLD0F128 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
122                     [(set VR128:$dst, fp128imm0)]>, Requires<[HasSSE1, NoAVX512]>;
123}
124
125//===----------------------------------------------------------------------===//
126// AVX & SSE - Zero/One Vectors
127//===----------------------------------------------------------------------===//
128
129// Alias instruction that maps zero vector to pxor / xorp* for sse.
130// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
131// swizzled by ExecutionDomainFix to pxor.
132// We set canFoldAsLoad because this can be converted to a constant-pool
133// load of an all-zeros value if folding it would be beneficial.
134let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
135    isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
136def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
137               [(set VR128:$dst, (v4f32 immAllZerosV))]>;
138}
139
// V_SET0's own pattern is written for v4f32. Map the all-zeros value of the
// other 128-bit vector types listed here onto the same pseudo.
140let Predicates = [NoAVX512] in {
141def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
142def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
143def : Pat<(v8f16 immAllZerosV), (V_SET0)>;
144def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
145def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
146def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
147}
148
149
150// The same as done above but for AVX.  The 256-bit AVX1 ISA doesn't support PI,
151// and doesn't need it because on sandy bridge the register is set to zero
152// at the rename stage without using any execution unit, so SET0PSY
153// and SET0PDY can be used for vector int instructions without penalty.
// NOTE(review): SET0PSY / SET0PDY are not defined in this part of the file;
// the pseudo actually defined below is AVX_SET0. The names above look stale -
// confirm and update.
154let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
155    isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
156def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
157                 [(set VR256:$dst, (v8i32 immAllZerosV))]>;
158}
159
// AVX_SET0's own pattern is written for v8i32. Map the all-zeros value of the
// other 256-bit vector types listed here onto the same pseudo.
160let Predicates = [NoAVX512] in {
161def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
162def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
163def : Pat<(v16f16 immAllZerosV), (AVX_SET0)>;
164def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
165def : Pat<(v8f32 immAllZerosV), (AVX_SET0)>;
166def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
167}
168
// All-ones vector pseudos: one for VR128 and two for VR256.
169// We set canFoldAsLoad because this can be converted to a constant-pool
170// load of an all-ones value if folding it would be beneficial.
171let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
172    isPseudo = 1, SchedRW = [WriteZero] in {
  // 128-bit all-ones; no Predicates are set on this record here.
173  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
174                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
  // 256-bit all-ones, selectable only with [HasAVX1Only, OptForMinSize].
175  let Predicates = [HasAVX1Only, OptForMinSize] in {
176  def AVX1_SETALLONES: I<0, Pseudo, (outs VR256:$dst), (ins), "",
177                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
178  }
  // 256-bit all-ones for [HasAVX2]; same pattern as the AVX1 variant.
179  let Predicates = [HasAVX2] in
180  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
181                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
182}
183
184//===----------------------------------------------------------------------===//
185// SSE 1 & 2 - Move FP Scalar Instructions
186//
187// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
188// register copies because it's a partial register update; Register-to-register
189// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
190// that the insert be implementable in terms of a copy, and as just mentioned, we
191// don't use movss/movsd for copies.
192//===----------------------------------------------------------------------===//
193
// sse12_move_rr - register-to-register scalar move forms.
//
// Produces `rr` (opcode 0x10, MRMSrcReg) carrying the selection pattern for
// `OpNode`, and `rr_REV` (opcode 0x11, MRMDestReg) with no pattern. The
// assembly string is `base_opc` followed by `asm_opr`, so the caller decides
// between the two-operand and three-operand operand lists.
194multiclass sse12_move_rr<SDNode OpNode, ValueType vt, string base_opc,
195                         string asm_opr, Domain d> {
196  let isCommutable = 1 in
197  def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
198              (ins VR128:$src1, VR128:$src2),
199              !strconcat(base_opc, asm_opr),
200              [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], d>,
201              Sched<[SchedWriteFShuffle.XMM]>;
202
203  // For the disassembler
  // Same operands and assembly string as `rr`, but with the other opcode and
  // operand-encoding form; never selected (empty pattern, isCodeGenOnly).
204  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
205  def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
206                  (ins VR128:$src1, VR128:$src2),
207                  !strconcat(base_opc, asm_opr), []>,
208                  Sched<[SchedWriteFShuffle.XMM]>;
209}
210
// sse12_move - scalar move instruction family for one scalar type.
//
// For a given NAME this emits:
//   V#NAME rr / rr_REV - VEX three-operand register forms
//   V#NAME#mr          - VEX store of the scalar `RC` value
//   NAME rr / rr_REV   - legacy register forms with $src1 tied to $dst
//   NAME#mr            - legacy store of the scalar `RC` value
// plus ".s"-suffixed assembler aliases that map to the rr_REV records.
211multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
212                      X86MemOperand x86memop, string OpcodeStr,
213                      Domain d, Predicate pred> {
214  // AVX
  // Register forms are restricted to [UseAVX, OptForSize].
215  let Predicates = [UseAVX, OptForSize] in
216  defm V#NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
217                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d>,
218                              VEX_4V, VEX_LIG, WIG;
219
220  def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
221                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
222                     [(store RC:$src, addr:$dst)], d>,
223                     VEX, VEX_LIG, Sched<[WriteFStore]>, WIG;
224  // SSE1 & 2
  // The legacy register forms are two-address ($src1 = $dst) and are gated on
  // the caller's `pred` together with NoSSE41_Or_OptForSize.
225  let Constraints = "$src1 = $dst" in {
226    let Predicates = [pred, NoSSE41_Or_OptForSize] in
227    defm NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
228                              "\t{$src2, $dst|$dst, $src2}", d>;
229  }
230
231  def NAME#mr   : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
232                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
233                     [(store RC:$src, addr:$dst)], d>,
234                     Sched<[WriteFStore]>;
235
  // ".s" mnemonic aliases that resolve to the rr_REV records. The legacy alias
  // lists only $dst and $src2, matching its two-operand assembly string.
236  def : InstAlias<"v"#OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
237                  (!cast<Instruction>("V"#NAME#"rr_REV")
238                   VR128:$dst, VR128:$src1, VR128:$src2), 0>;
239  def : InstAlias<OpcodeStr#".s\t{$src2, $dst|$dst, $src2}",
240                  (!cast<Instruction>(NAME#"rr_REV")
241                   VR128:$dst, VR128:$src2), 0>;
242}
243
244// Loading from memory automatically zeroing upper bits.
//
// Emits four load records (opcode 0x10, MRMSrcMem), each in a VEX (V#NAME...)
// and a legacy (NAME...) flavour:
//   rm     - destination is VR128, matched through `vzloadfrag` as type `vt`
//   rm_alt - destination is the scalar class `RC`, matched through `mem_pat`
245multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
246                         PatFrag mem_pat, PatFrag vzloadfrag, string OpcodeStr,
247                         Domain d> {
248  def V#NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
249                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
250                     [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
251                     VEX, VEX_LIG, Sched<[WriteFLoad]>, WIG;
252  def NAME#rm   : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
253                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
254                     [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
255                     Sched<[WriteFLoad]>;
256
257  // _alt version uses FR32/FR64 register class.
  // These share the opcode and assembly string of the rm records above and
  // are marked isCodeGenOnly.
258  let isCodeGenOnly = 1 in {
259  def V#NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
260                         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
261                         [(set RC:$dst, (mem_pat addr:$src))], d>,
262                         VEX, VEX_LIG, Sched<[WriteFLoad]>, WIG;
263  def NAME#rm_alt   : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
264                         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
265                         [(set RC:$dst, (mem_pat addr:$src))], d>,
266                         Sched<[WriteFLoad]>;
267  }
268}
269
// Instantiate the scalar moves. MOVSS uses FR32 / v4f32 / f32mem with the XS
// prefix; MOVSD uses FR64 / v2f64 / f64mem with the XD prefix. The first pair
// of defm lines creates the register-move and store records (sse12_move).
270defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
271                        SSEPackedSingle, UseSSE1>, XS;
272defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
273                        SSEPackedDouble, UseSSE2>, XD;
274
// The second pair adds the load records (sse12_move_rm) under the same names;
// every load is marked canFoldAsLoad and isReMaterializable.
275let canFoldAsLoad = 1, isReMaterializable = 1 in {
276  defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss",
277                             SSEPackedSingle>, XS;
278  defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, X86vzload64, "movsd",
279                             SSEPackedDouble>, XD;
280}
281
282// Patterns
// With UseAVX, a scalar_to_vector of a scalar FP load is selected as the VEX
// scalar load (VMOVSSrm / VMOVSDrm).
283let Predicates = [UseAVX] in {
284  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
285            (VMOVSSrm addr:$src)>;
286  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
287            (VMOVSDrm addr:$src)>;
288
289  // Represent the same patterns above but in the form they appear for
290  // 256-bit types
  // The 128-bit load result is placed in the sub_xmm subregister of the
  // 256-bit value via SUBREG_TO_REG.
291  def : Pat<(v8f32 (X86vzload32 addr:$src)),
292            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
293  def : Pat<(v4f64 (X86vzload64 addr:$src)),
294            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
295}
296
297let Predicates = [UseAVX, OptForSize] in {
298  // Move scalar to XMM zero-extended: zero a VR128 with V_SET0, then use
299  // VMOVSS to move the source's low element into the lower bits.
300  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
301            (VMOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
302  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
303            (VMOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
304
305  // Move low f32 and clear high bits.
  // 256-bit variants: extract the low XMM half, apply the same V_SET0 + VMOVSS
  // sequence, and put the result back into sub_xmm with SUBREG_TO_REG.
306  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
307            (SUBREG_TO_REG (i32 0),
308             (v4f32 (VMOVSSrr (v4f32 (V_SET0)),
309              (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))), sub_xmm)>;
310  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
311            (SUBREG_TO_REG (i32 0),
312             (v4i32 (VMOVSSrr (v4i32 (V_SET0)),
313              (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
314}
315
316let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
317// Move scalar to XMM zero-extended: zero a VR128 with V_SET0, then use
318// MOVSS to move the source's low element into the lower bits.
319def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
320          (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
321def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
322          (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
323}
324
// Legacy-SSE counterparts of the UseAVX scalar_to_vector load patterns:
// a scalar_to_vector of a scalar FP load becomes MOVSDrm / MOVSSrm.
325let Predicates = [UseSSE2] in
326def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
327          (MOVSDrm addr:$src)>;
328
329let Predicates = [UseSSE1] in
330def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
331          (MOVSSrm addr:$src)>;
332
333//===----------------------------------------------------------------------===//
334// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
335//===----------------------------------------------------------------------===//
336
// sse12_mov_packed - packed FP move, register and load forms.
//
//   rr - register copy with no selection pattern; flagged isMoveReg and
//        hasSideEffects = 0, scheduled as sched.RR.
//   rm - load matched through `ld_frag`; flagged canFoldAsLoad and
//        isReMaterializable, scheduled as sched.RM.
// Stores are not part of this multiclass.
337multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
338                            X86MemOperand x86memop, PatFrag ld_frag,
339                            string asm, Domain d,
340                            X86SchedWriteMoveLS sched> {
341let hasSideEffects = 0, isMoveReg = 1 in
342  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
343              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
344           Sched<[sched.RR]>;
345let canFoldAsLoad = 1, isReMaterializable = 1 in
346  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
347              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
348                   [(set RC:$dst, (ld_frag addr:$src))], d>,
349           Sched<[sched.RM]>;
350}
351
// VEX-encoded packed moves (register and load forms), gated on
// [HasAVX, NoVLX]. Opcode 0x28 is used for the aligned (alignedload*) variants
// and 0x10 for the unaligned (load*) variants. The first group operates on
// VR128 / f128mem.
352let Predicates = [HasAVX, NoVLX] in {
353defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
354                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
355                                PS, VEX, WIG;
356defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
357                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
358                                PD, VEX, WIG;
359defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
360                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
361                                PS, VEX, WIG;
362defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
363                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
364                                PD, VEX, WIG;
365
// 256-bit variants: VR256 / f256mem, YMM scheduling class, and VEX_L added.
366defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps",
367                                 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
368                                 PS, VEX, VEX_L, WIG;
369defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd",
370                                 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
371                                 PD, VEX, VEX_L, WIG;
372defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups",
373                                 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
374                                 PS, VEX, VEX_L, WIG;
375defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
376                                 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
377                                 PD, VEX, VEX_L, WIG;
378}
379
// Legacy-encoded packed moves (register and load forms) on VR128 / f128mem.
// The single-precision forms require UseSSE1, the double-precision forms
// UseSSE2.
380let Predicates = [UseSSE1] in {
381defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
382                               SSEPackedSingle, SchedWriteFMoveLS.XMM>,
383                               PS;
384defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
385                               SSEPackedSingle, SchedWriteFMoveLS.XMM>,
386                               PS;
387}
388let Predicates = [UseSSE2] in {
389defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
390                               SSEPackedDouble, SchedWriteFMoveLS.XMM>,
391                               PD;
392defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
393                               SSEPackedDouble, SchedWriteFMoveLS.XMM>,
394                               PD;
395}
396
// VEX-encoded packed stores (MRMDestMem), gated on [HasAVX, NoVLX]. Opcode
// 0x29 is used with `alignedstore` and 0x11 with plain `store`.
397let Predicates = [HasAVX, NoVLX]  in {
// 128-bit stores.
398let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
399def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
400                   "movaps\t{$src, $dst|$dst, $src}",
401                   [(alignedstore (v4f32 VR128:$src), addr:$dst)]>,
402                   VEX, WIG;
403def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
404                   "movapd\t{$src, $dst|$dst, $src}",
405                   [(alignedstore (v2f64 VR128:$src), addr:$dst)]>,
406                   VEX, WIG;
407def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
408                   "movups\t{$src, $dst|$dst, $src}",
409                   [(store (v4f32 VR128:$src), addr:$dst)]>,
410                   VEX, WIG;
411def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
412                   "movupd\t{$src, $dst|$dst, $src}",
413                   [(store (v2f64 VR128:$src), addr:$dst)]>,
414                   VEX, WIG;
415} // SchedRW
416
// 256-bit stores: VR256 / f256mem, YMM scheduling class, and VEX_L added.
417let SchedRW = [SchedWriteFMoveLS.YMM.MR] in {
418def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
419                   "movaps\t{$src, $dst|$dst, $src}",
420                   [(alignedstore (v8f32 VR256:$src), addr:$dst)]>,
421                   VEX, VEX_L, WIG;
422def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
423                   "movapd\t{$src, $dst|$dst, $src}",
424                   [(alignedstore (v4f64 VR256:$src), addr:$dst)]>,
425                   VEX, VEX_L, WIG;
426def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
427                   "movups\t{$src, $dst|$dst, $src}",
428                   [(store (v8f32 VR256:$src), addr:$dst)]>,
429                   VEX, VEX_L, WIG;
430def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
431                   "movupd\t{$src, $dst|$dst, $src}",
432                   [(store (v4f64 VR256:$src), addr:$dst)]>,
433                   VEX, VEX_L, WIG;
434} // SchedRW
435} // Predicate
436
437// For disassembler
// Register-to-register forms that use the store opcodes (0x29 / 0x11) with
// MRMDestReg. They carry no selection pattern and are marked isCodeGenOnly
// and ForceDisassemble.
438let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
439    isMoveReg = 1 in {
// 128-bit forms.
440let SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
441  def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
442                          (ins VR128:$src),
443                          "movaps\t{$src, $dst|$dst, $src}", []>,
444                          VEX, WIG;
445  def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
446                           (ins VR128:$src),
447                           "movapd\t{$src, $dst|$dst, $src}", []>,
448                           VEX, WIG;
449  def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
450                           (ins VR128:$src),
451                           "movups\t{$src, $dst|$dst, $src}", []>,
452                           VEX, WIG;
453  def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
454                           (ins VR128:$src),
455                           "movupd\t{$src, $dst|$dst, $src}", []>,
456                           VEX, WIG;
457} // SchedRW
458
// 256-bit forms: VR256 operands, YMM scheduling class, and VEX_L added.
459let SchedRW = [SchedWriteFMoveLS.YMM.RR] in {
460  def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
461                            (ins VR256:$src),
462                            "movaps\t{$src, $dst|$dst, $src}", []>,
463                            VEX, VEX_L, WIG;
464  def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
465                            (ins VR256:$src),
466                            "movapd\t{$src, $dst|$dst, $src}", []>,
467                            VEX, VEX_L, WIG;
468  def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
469                            (ins VR256:$src),
470                            "movups\t{$src, $dst|$dst, $src}", []>,
471                            VEX, VEX_L, WIG;
472  def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
473                            (ins VR256:$src),
474                            "movupd\t{$src, $dst|$dst, $src}", []>,
475                            VEX, VEX_L, WIG;
476} // SchedRW
477} // isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, isMoveReg = 1
478
479// Reversed version with ".s" suffix for GAS compatibility.
// Each ".s" mnemonic resolves to the matching *_REV record; the same mnemonic
// is listed once for VR128 operands and once for VR256 operands.
// NOTE(review): the trailing 0 is presumably the InstAlias "emit" argument
// (alias not used when printing) - confirm against the InstAlias definition.
480def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
481                (VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
482def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
483                (VMOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
484def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
485                (VMOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
486def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
487                (VMOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
488def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
489                (VMOVAPSYrr_REV VR256:$dst, VR256:$src), 0>;
490def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
491                (VMOVAPDYrr_REV VR256:$dst, VR256:$src), 0>;
492def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
493                (VMOVUPSYrr_REV VR256:$dst, VR256:$src), 0>;
494def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
495                (VMOVUPDYrr_REV VR256:$dst, VR256:$src), 0>;
496
// Legacy-encoded 128-bit packed stores (MRMDestMem). Opcode 0x29 is used with
// `alignedstore` and 0x11 with plain `store`.
497let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
498def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
499                   "movaps\t{$src, $dst|$dst, $src}",
500                   [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
501def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
502                   "movapd\t{$src, $dst|$dst, $src}",
503                   [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
504def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
505                   "movups\t{$src, $dst|$dst, $src}",
506                   [(store (v4f32 VR128:$src), addr:$dst)]>;
507def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
508                   "movupd\t{$src, $dst|$dst, $src}",
509                   [(store (v2f64 VR128:$src), addr:$dst)]>;
510} // SchedRW
511
512// For disassembler
// Legacy register-to-register forms that use the store opcodes (0x29 / 0x11)
// with MRMDestReg. They carry no selection pattern.
513let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
514    isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
515  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
516                         "movaps\t{$src, $dst|$dst, $src}", []>;
517  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
518                         "movapd\t{$src, $dst|$dst, $src}", []>;
519  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
520                         "movups\t{$src, $dst|$dst, $src}", []>;
521  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
522                         "movupd\t{$src, $dst|$dst, $src}", []>;
523}
524
525// Reversed version with ".s" suffix for GAS compatibility.
// Each ".s" mnemonic resolves to the matching legacy *_REV record.
526def : InstAlias<"movaps.s\t{$src, $dst|$dst, $src}",
527                (MOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
528def : InstAlias<"movapd.s\t{$src, $dst|$dst, $src}",
529                (MOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
530def : InstAlias<"movups.s\t{$src, $dst|$dst, $src}",
531                (MOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
532def : InstAlias<"movupd.s\t{$src, $dst|$dst, $src}",
533                (MOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
534
535let Predicates = [HasAVX, NoVLX] in {
536  // 256-bit load/store need to use floating point load/store in case we don't
537  // have AVX2. Execution domain fixing will convert to integer if AVX2 is
538  // available and changing the domain is beneficial.
  // 256-bit integer loads: aligned loads select VMOVAPSYrm, plain loads
  // select VMOVUPSYrm.
539  def : Pat<(alignedloadv4i64 addr:$src),
540            (VMOVAPSYrm addr:$src)>;
541  def : Pat<(alignedloadv8i32 addr:$src),
542            (VMOVAPSYrm addr:$src)>;
543  def : Pat<(alignedloadv16i16 addr:$src),
544            (VMOVAPSYrm addr:$src)>;
545  def : Pat<(alignedloadv32i8 addr:$src),
546            (VMOVAPSYrm addr:$src)>;
547  def : Pat<(loadv4i64 addr:$src),
548            (VMOVUPSYrm addr:$src)>;
549  def : Pat<(loadv8i32 addr:$src),
550            (VMOVUPSYrm addr:$src)>;
551  def : Pat<(loadv16i16 addr:$src),
552            (VMOVUPSYrm addr:$src)>;
553  def : Pat<(loadv32i8 addr:$src),
554            (VMOVUPSYrm addr:$src)>;
555
  // 256-bit integer stores: aligned stores select VMOVAPSYmr, plain stores
  // select VMOVUPSYmr.
556  def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
557            (VMOVAPSYmr addr:$dst, VR256:$src)>;
558  def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
559            (VMOVAPSYmr addr:$dst, VR256:$src)>;
560  def : Pat<(alignedstore (v16i16 VR256:$src), addr:$dst),
561            (VMOVAPSYmr addr:$dst, VR256:$src)>;
562  def : Pat<(alignedstore (v32i8 VR256:$src), addr:$dst),
563            (VMOVAPSYmr addr:$dst, VR256:$src)>;
564  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
565            (VMOVUPSYmr addr:$dst, VR256:$src)>;
566  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
567            (VMOVUPSYmr addr:$dst, VR256:$src)>;
568  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
569            (VMOVUPSYmr addr:$dst, VR256:$src)>;
570  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
571            (VMOVUPSYmr addr:$dst, VR256:$src)>;
572
  // 128-bit v8f16 / v8bf16 loads and stores reuse the VMOVAPS / VMOVUPS
  // records.
573  def : Pat<(alignedloadv8f16 addr:$src),
574            (VMOVAPSrm addr:$src)>;
575  def : Pat<(alignedloadv8bf16 addr:$src),
576            (VMOVAPSrm addr:$src)>;
577  def : Pat<(loadv8f16 addr:$src),
578            (VMOVUPSrm addr:$src)>;
579  def : Pat<(loadv8bf16 addr:$src),
580            (VMOVUPSrm addr:$src)>;
581  def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
582            (VMOVAPSmr addr:$dst, VR128:$src)>;
583  def : Pat<(alignedstore (v8bf16 VR128:$src), addr:$dst),
584            (VMOVAPSmr addr:$dst, VR128:$src)>;
585  def : Pat<(store (v8f16 VR128:$src), addr:$dst),
586            (VMOVUPSmr addr:$dst, VR128:$src)>;
587  def : Pat<(store (v8bf16 VR128:$src), addr:$dst),
588            (VMOVUPSmr addr:$dst, VR128:$src)>;
589
  // 256-bit v16f16 / v16bf16 loads and stores reuse the VMOVAPSY / VMOVUPSY
  // records.
590  def : Pat<(alignedloadv16f16 addr:$src),
591            (VMOVAPSYrm addr:$src)>;
592  def : Pat<(alignedloadv16bf16 addr:$src),
593            (VMOVAPSYrm addr:$src)>;
594  def : Pat<(loadv16f16 addr:$src),
595            (VMOVUPSYrm addr:$src)>;
596  def : Pat<(loadv16bf16 addr:$src),
597            (VMOVUPSYrm addr:$src)>;
598  def : Pat<(alignedstore (v16f16 VR256:$src), addr:$dst),
599            (VMOVAPSYmr addr:$dst, VR256:$src)>;
600  def : Pat<(alignedstore (v16bf16 VR256:$src), addr:$dst),
601            (VMOVAPSYmr addr:$dst, VR256:$src)>;
602  def : Pat<(store (v16f16 VR256:$src), addr:$dst),
603            (VMOVUPSYmr addr:$dst, VR256:$src)>;
604  def : Pat<(store (v16bf16 VR256:$src), addr:$dst),
605            (VMOVUPSYmr addr:$dst, VR256:$src)>;
606}
607
608// Use movaps / movups for SSE integer load / store (one byte shorter).
609// The instructions selected below are then converted to MOVDQA/MOVDQU
610// during the SSE domain pass.
611let Predicates = [UseSSE1] in {
  // Loads: aligned integer-vector loads select MOVAPSrm, plain loads select
  // MOVUPSrm.
612  def : Pat<(alignedloadv2i64 addr:$src),
613            (MOVAPSrm addr:$src)>;
614  def : Pat<(alignedloadv4i32 addr:$src),
615            (MOVAPSrm addr:$src)>;
616  def : Pat<(alignedloadv8i16 addr:$src),
617            (MOVAPSrm addr:$src)>;
618  def : Pat<(alignedloadv16i8 addr:$src),
619            (MOVAPSrm addr:$src)>;
620  def : Pat<(loadv2i64 addr:$src),
621            (MOVUPSrm addr:$src)>;
622  def : Pat<(loadv4i32 addr:$src),
623            (MOVUPSrm addr:$src)>;
624  def : Pat<(loadv8i16 addr:$src),
625            (MOVUPSrm addr:$src)>;
626  def : Pat<(loadv16i8 addr:$src),
627            (MOVUPSrm addr:$src)>;
628
  // Stores: aligned integer-vector stores select MOVAPSmr, plain stores
  // select MOVUPSmr.
629  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
630            (MOVAPSmr addr:$dst, VR128:$src)>;
631  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
632            (MOVAPSmr addr:$dst, VR128:$src)>;
633  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
634            (MOVAPSmr addr:$dst, VR128:$src)>;
635  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
636            (MOVAPSmr addr:$dst, VR128:$src)>;
637  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
638            (MOVUPSmr addr:$dst, VR128:$src)>;
639  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
640            (MOVUPSmr addr:$dst, VR128:$src)>;
641  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
642            (MOVUPSmr addr:$dst, VR128:$src)>;
643  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
644            (MOVUPSmr addr:$dst, VR128:$src)>;
645}
646
// v8f16 vector loads/stores are selected to the same MOVAPS/MOVUPS
// instructions as the integer vectors above, but gated on UseSSE2.
647let Predicates = [UseSSE2] in {
648  def : Pat<(alignedloadv8f16 addr:$src),
649            (MOVAPSrm addr:$src)>;
650  def : Pat<(loadv8f16 addr:$src),
651            (MOVUPSrm addr:$src)>;
652  def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
653            (MOVAPSmr addr:$dst, VR128:$src)>;
654  def : Pat<(store (v8f16 VR128:$src), addr:$dst),
655            (MOVUPSmr addr:$dst, VR128:$src)>;
656}
657
658//===----------------------------------------------------------------------===//
659// SSE 1 & 2 - Move Low packed FP Instructions
660//===----------------------------------------------------------------------===//
661
/// sse12_mov_hilo_packed_base - Load forms (PSrm / PDrm) of a move-low or
/// move-high packed FP instruction.
///   opc      - opcode byte shared by both forms.
///   pdnode   - node matched by the PDrm pattern.
///   base_opc - mnemonic stem; "s" or "d" is appended per form.
///   asm_opr  - operand portion of the assembly string.
/// Both forms take VR128:$src1 plus an f64mem operand and produce VR128:$dst.
662multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDPatternOperator pdnode,
663                                      string base_opc, string asm_opr> {
664  // No pattern, as these need to be special-cased between high and low.
  // With an empty pattern list, the load / side-effect flags are spelled out
  // explicitly; the `let` covers only the PSrm def that follows it.
665  let hasSideEffects = 0, mayLoad = 1 in
666  def PSrm : PI<opc, MRMSrcMem,
667                (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
668                !strconcat(base_opc, "s", asm_opr),
669                [], SSEPackedSingle>, PS,
670                Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
671
  // PDrm matches pdnode applied to $src1 and a scalar_to_vector of a loaded
  // f64, typed as v2f64.
672  def PDrm : PI<opc, MRMSrcMem,
673         (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
674         !strconcat(base_opc, "d", asm_opr),
675     [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
676                              (scalar_to_vector (loadf64 addr:$src2)))))],
677              SSEPackedDouble>, PD,
678     Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
679}
680
/// sse12_mov_hilo_packed - Instantiates sse12_mov_hilo_packed_base twice:
///   * V#NAME: three-operand assembly form under UseAVX (VEX_4V, WIG).
///   * NAME:   two-operand assembly form with $src1 tied to $dst.
681multiclass sse12_mov_hilo_packed<bits<8>opc, SDPatternOperator pdnode,
682                                 string base_opc> {
683  let Predicates = [UseAVX] in
684    defm V#NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
685                                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
686                                    VEX_4V, WIG;
687
688  let Constraints = "$src1 = $dst" in
689    defm NAME : sse12_mov_hilo_packed_base<opc,  pdnode, base_opc,
690                                    "\t{$src2, $dst|$dst, $src2}">;
691}
692
// Load forms of MOVLPS/MOVLPD (opcode 0x12); the PD form matches X86Movsd.
693defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;
694
// Store forms of MOVLPS/MOVLPD (opcode 0x13), all scheduled as WriteFStore.
695let SchedRW = [WriteFStore] in {
696let Predicates = [UseAVX] in {
// The PS store has an empty pattern, so mayStore / hasSideEffects are set
// explicitly; the `let` covers only the single def that follows it.
697let mayStore = 1, hasSideEffects = 0 in
698def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
699                     "movlps\t{$src, $dst|$dst, $src}",
700                     []>,
701                     VEX, WIG;
// The PD store matches a store of element 0 extracted from a v2f64.
702def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
703                     "movlpd\t{$src, $dst|$dst, $src}",
704                     [(store (f64 (extractelt (v2f64 VR128:$src),
705                                   (iPTR 0))), addr:$dst)]>,
706                     VEX, WIG;
707}// UseAVX
// Non-VEX counterparts of the two stores above.
708let mayStore = 1, hasSideEffects = 0 in
709def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
710                   "movlps\t{$src, $dst|$dst, $src}",
711                   []>;
712def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
713                   "movlpd\t{$src, $dst|$dst, $src}",
714                   [(store (f64 (extractelt (v2f64 VR128:$src),
715                                 (iPTR 0))), addr:$dst)]>;
716} // SchedRW
717
// Selection patterns for the pattern-less MOVLPSrm / MOVLPSmr instructions.
718let Predicates = [UseSSE1] in {
719  // This pattern helps select MOVLPS on SSE1 only targets. With SSE2 we'll
720  // end up with a movsd or blend instead of shufp.
721  // No need for aligned load, we're only loading 64-bits.
722  def : Pat<(X86Shufp (v4f32 (simple_load addr:$src2)), VR128:$src1,
723                      (i8 -28)),
724            (MOVLPSrm VR128:$src1, addr:$src2)>;
725  def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)), VR128:$src1, (i8 -28)),
726            (MOVLPSrm VR128:$src1, addr:$src2)>;
727
  // A bare X86vzload64 is selected as MOVLPSrm with V_SET0 as the first
  // source operand.
728  def : Pat<(v4f32 (X86vzload64 addr:$src)),
729            (MOVLPSrm (v4f32 (V_SET0)), addr:$src)>;
  // X86vextractstore64 of a v4f32 is selected as MOVLPSmr.
730  def : Pat<(X86vextractstore64 (v4f32 VR128:$src), addr:$dst),
731            (MOVLPSmr addr:$dst, VR128:$src)>;
732}
733
734//===----------------------------------------------------------------------===//
735// SSE 1 & 2 - Move Hi packed FP Instructions
736//===----------------------------------------------------------------------===//
737
// Load forms of MOVHPS/MOVHPD (opcode 0x16); the PD form matches X86Unpckl.
738defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">;
739
// Store forms of MOVHPS/MOVHPD (opcode 0x17), all scheduled as WriteFStore.
740let SchedRW = [WriteFStore] in {
741// v2f64 extract element 1 is always custom lowered to unpack high to low
742// and extract element 0 so the non-store version isn't too horrible.
743let Predicates = [UseAVX] in {
// The PS store has an empty pattern, so mayStore / hasSideEffects are set
// explicitly; the `let` covers only the single def that follows it.
744let mayStore = 1, hasSideEffects = 0 in
745def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
746                   "movhps\t{$src, $dst|$dst, $src}",
747                   []>, VEX, WIG;
// The PD store matches a store of element 0 of X86Unpckh($src, $src).
748def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
749                   "movhpd\t{$src, $dst|$dst, $src}",
750                   [(store (f64 (extractelt
751                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
752                                 (iPTR 0))), addr:$dst)]>, VEX, WIG;
753} // UseAVX
// Non-VEX counterparts of the two stores above.
754let mayStore = 1, hasSideEffects = 0 in
755def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
756                   "movhps\t{$src, $dst|$dst, $src}",
757                   []>;
758def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
759                   "movhpd\t{$src, $dst|$dst, $src}",
760                   [(store (f64 (extractelt
761                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
762                                 (iPTR 0))), addr:$dst)]>;
763} // SchedRW
764
// Additional AVX selection patterns for VMOVHPD / VMOVLPD.
765let Predicates = [UseAVX] in {
766  // MOVHPD patterns
  // X86Unpckl with an X86vzload64 second operand folds into the load form.
767  def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
768            (VMOVHPDrm VR128:$src1, addr:$src2)>;
769
  // Storing element 0 of X86VPermilpi($src, 1) is selected as the store form.
770  def : Pat<(store (f64 (extractelt
771                          (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
772                          (iPTR 0))), addr:$dst),
773            (VMOVHPDmr addr:$dst, VR128:$src)>;
774
775  // MOVLPD patterns
  // X86Movsd with an X86vzload64 second operand folds into the load form.
776  def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
777            (VMOVLPDrm VR128:$src1, addr:$src2)>;
778}
779
// Selection patterns for the pattern-less MOVHPSrm / MOVHPSmr instructions.
780let Predicates = [UseSSE1] in {
781  // This pattern helps select MOVHPS on SSE1 only targets. With SSE2 we'll
782  // end up with a movsd or blend instead of shufp.
783  // No need for aligned load, we're only loading 64-bits.
784  def : Pat<(X86Movlhps VR128:$src1, (v4f32 (simple_load addr:$src2))),
785            (MOVHPSrm VR128:$src1, addr:$src2)>;
786  def : Pat<(X86Movlhps VR128:$src1, (v4f32 (X86vzload64 addr:$src2))),
787            (MOVHPSrm VR128:$src1, addr:$src2)>;
788
  // X86vextractstore64 of X86Movhlps($src, $src) is selected as MOVHPSmr.
789  def : Pat<(X86vextractstore64 (v4f32 (X86Movhlps VR128:$src, VR128:$src)),
790                                addr:$dst),
791            (MOVHPSmr addr:$dst, VR128:$src)>;
792}
793
// Additional SSE2 selection patterns for MOVHPD / MOVLPD (non-VEX
// counterparts of the UseAVX group; the store pattern here matches X86Shufp
// where the AVX one matches X86VPermilpi).
794let Predicates = [UseSSE2] in {
795  // MOVHPD patterns
796  def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
797            (MOVHPDrm VR128:$src1, addr:$src2)>;
798
  // Storing element 0 of X86Shufp($src, $src, 1) is selected as MOVHPDmr.
799  def : Pat<(store (f64 (extractelt
800                          (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
801                          (iPTR 0))), addr:$dst),
802            (MOVHPDmr addr:$dst, VR128:$src)>;
803
804  // MOVLPD patterns
805  def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
806            (MOVLPDrm VR128:$src1, addr:$src2)>;
807}
808
// Gated on both UseSSE2 and NoSSE41_Or_OptForSize.
809let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in {
810  // Use MOVLPD to load into the low bits from a full vector unless we can use
811  // BLENDPD.
812  def : Pat<(X86Movsd VR128:$src1, (v2f64 (simple_load addr:$src2))),
813            (MOVLPDrm VR128:$src1, addr:$src2)>;
814}
815
816//===----------------------------------------------------------------------===//
817// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
818//===----------------------------------------------------------------------===//
819
// AVX register-register forms: three-operand assembly syntax, VEX_4V
// encoding, scheduled as SchedWriteFShuffle.XMM.
820let Predicates = [UseAVX] in {
  // movlhps (opcode 0x16) matches X86Movlhps on v4f32.
821  def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
822                                       (ins VR128:$src1, VR128:$src2),
823                      "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
824                      [(set VR128:$dst,
825                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
826                      VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, WIG;
  // movhlps (opcode 0x12) matches X86Movhlps; only this def is marked
  // isCommutable.
827  let isCommutable = 1 in
828  def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
829                                       (ins VR128:$src1, VR128:$src2),
830                      "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
831                      [(set VR128:$dst,
832                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
833                      VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, WIG;
834}
// Non-VEX register-register forms: two-operand assembly syntax with $src1
// tied to $dst.
835let Constraints = "$src1 = $dst" in {
  // movlhps (opcode 0x16) matches X86Movlhps on v4f32.
836  def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
837                                       (ins VR128:$src1, VR128:$src2),
838                      "movlhps\t{$src2, $dst|$dst, $src2}",
839                      [(set VR128:$dst,
840                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
841                      Sched<[SchedWriteFShuffle.XMM]>;
  // movhlps (opcode 0x12) matches X86Movhlps; only this def is marked
  // isCommutable.
842  let isCommutable = 1 in
843  def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
844                                       (ins VR128:$src1, VR128:$src2),
845                      "movhlps\t{$src2, $dst|$dst, $src2}",
846                      [(set VR128:$dst,
847                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
848                      Sched<[SchedWriteFShuffle.XMM]>;
849}
850
851//===----------------------------------------------------------------------===//
852// SSE 1 & 2 - Conversion Instructions
853//===----------------------------------------------------------------------===//
854
/// sse12_cvt_s - Scalar conversion with a register (rr) and a memory (rm)
/// form, each carrying a selection pattern.
///   opc        - opcode byte.
///   SrcRC      - register class of the rr source operand.
///   DstRC      - register class of the result.
///   OpNode     - node applied to the source in both patterns.
///   x86memop   - memory operand type of the rm form.
///   ld_frag    - load fragment wrapped around addr:$src in the rm pattern.
///   asm        - mnemonic of the rr form.
///   mem        - mnemonic of the rm form (may carry a size suffix).
///   sched      - scheduling class; the rm form uses sched.Folded.
///   d          - execution domain assigned to both forms.
///   Int2Fpu    - second scheduling entry of the rr form (default ReadDefault).
855multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
856                     SDPatternOperator OpNode, X86MemOperand x86memop, PatFrag ld_frag,
857                     string asm, string mem, X86FoldableSchedWrite sched,
858                     Domain d,
859                     SchedRead Int2Fpu = ReadDefault> {
860  let ExeDomain = d in {
861  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
862              !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
863              [(set DstRC:$dst, (OpNode SrcRC:$src))]>,
864              Sched<[sched, Int2Fpu]>;
865  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
866              mem#"\t{$src, $dst|$dst, $src}",
867              [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
868              Sched<[sched.Folded]>;
869  }
870}
871
/// sse12_cvt_p - Packed conversion matching any_sint_to_fp from SrcTy to
/// DstTy, with a register (rr) and a memory (rm) form.
///   opc      - opcode byte.
///   RC       - register class of both source and result.
///   x86memop - memory operand type of the rm form.
///   DstTy    - result value type of the pattern.
///   SrcTy    - source value type of the pattern.
///   ld_frag  - load fragment wrapped around addr:$src in the rm pattern.
///   asm      - complete assembly string (mnemonic and operands).
///   d        - execution domain passed to the instruction class.
///   sched    - scheduling class; the rm form uses sched.Folded.
/// Both forms use MXCSR and are marked mayRaiseFPException.
872multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
873                       ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
874                       string asm, Domain d, X86FoldableSchedWrite sched> {
875let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in {
876  def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
877             [(set RC:$dst, (DstTy (any_sint_to_fp (SrcTy RC:$src))))], d>,
878             Sched<[sched]>;
879  let mayLoad = 1 in
880  def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
881             [(set RC:$dst, (DstTy (any_sint_to_fp
882                                    (SrcTy (ld_frag addr:$src)))))], d>,
883             Sched<[sched.Folded]>;
884}
885}
886
/// sse12_vcvt_avx - Pattern-less AVX scalar conversion (rr / rm) that takes
/// an extra DstRC:$src1 input ahead of the converted source operand.
///   opc      - opcode byte.
///   SrcRC    - register class of the converted source ($src) in rr.
///   DstRC    - register class of $dst and of the pass-through $src1.
///   x86memop - memory operand type of the rm form.
///   asm      - mnemonic.
///   mem      - suffix emitted as "{mem}" after the mnemonic in the rm form.
///   sched    - scheduling class.
///   d        - execution domain assigned to both forms.
/// All defs are predicated on UseAVX.
887multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
888                          X86MemOperand x86memop, string asm, string mem,
889                          X86FoldableSchedWrite sched, Domain d> {
890let hasSideEffects = 0, Predicates = [UseAVX], ExeDomain = d in {
891  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
892              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
893              Sched<[sched, ReadDefault, ReadInt2Fpu]>;
  // The rm form has no pattern, so mayLoad is set explicitly.
894  let mayLoad = 1 in
895  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
896              (ins DstRC:$src1, x86memop:$src),
897              asm#"{"#mem#"}\t{$src, $src1, $dst|$dst, $src1, $src}", []>,
898           Sched<[sched.Folded, sched.ReadAfterFold]>;
899} // hasSideEffects = 0
900}
901
// AVX codegen-only scalar FP -> integer conversions on FR32/FR64 sources.
// Opcode 0x2C entries match any_fp_to_sint; opcode 0x2D entries match
// lrint (GR32 result) or llrint (GR64 result). The 64-bit result variants
// add REX_W.
902let isCodeGenOnly = 1, Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
903defm VCVTTSS2SI   : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
904                                "cvttss2si", "cvttss2si",
905                                WriteCvtSS2I, SSEPackedSingle>,
906                                XS, VEX, VEX_LIG;
907defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
908                                "cvttss2si", "cvttss2si",
909                                WriteCvtSS2I, SSEPackedSingle>,
910                                XS, VEX, REX_W, VEX_LIG;
911defm VCVTTSD2SI   : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
912                                "cvttsd2si", "cvttsd2si",
913                                WriteCvtSD2I, SSEPackedDouble>,
914                                XD, VEX, VEX_LIG;
915defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
916                                "cvttsd2si", "cvttsd2si",
917                                WriteCvtSD2I, SSEPackedDouble>,
918                                XD, VEX, REX_W, VEX_LIG;
919
920defm VCVTSS2SI   : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
921                               "cvtss2si", "cvtss2si",
922                               WriteCvtSS2I, SSEPackedSingle>,
923                               XS, VEX, VEX_LIG;
924defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
925                               "cvtss2si", "cvtss2si",
926                               WriteCvtSS2I, SSEPackedSingle>,
927                               XS, VEX, REX_W, VEX_LIG;
928defm VCVTSD2SI   : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
929                               "cvtsd2si", "cvtsd2si",
930                               WriteCvtSD2I, SSEPackedDouble>,
931                               XD, VEX, VEX_LIG;
932defm VCVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
933                               "cvtsd2si", "cvtsd2si",
934                               WriteCvtSD2I, SSEPackedDouble>,
935                               XD, VEX, REX_W, VEX_LIG;
936}
937
938// The assembler can recognize rr 64-bit instructions by seeing an rxx
939// register, but the same isn't true when only memory operands are used,
940// so provide explicit "l" and "q" assembly forms to address this
941// where appropriate.
// AVX codegen-only integer -> scalar FP conversions (opcode 0x2A) on
// FR32/FR64. Note that VCVTSI2SD (GR32 -> FR64) is the only entry without
// SIMD_EXC.
942let isCodeGenOnly = 1 in {
943defm VCVTSI2SS   : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss", "l",
944                                  WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V,
945                                  VEX_LIG, SIMD_EXC;
946defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss", "q",
947                                  WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V,
948                                  REX_W, VEX_LIG, SIMD_EXC;
949defm VCVTSI2SD   : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd", "l",
950                                  WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V,
951                                  VEX_LIG;
952defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd", "q",
953                                  WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V,
954                                  REX_W, VEX_LIG, SIMD_EXC;
955} // isCodeGenOnly = 1
956
// Selection patterns for the pattern-less VCVTSI2SS/SD instructions, plus
// extra lrint patterns producing an i64 result.
957let Predicates = [UseAVX] in {
  // Memory source: the extra $src1 operand is filled with IMPLICIT_DEF.
958  def : Pat<(f32 (any_sint_to_fp (loadi32 addr:$src))),
959            (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
960  def : Pat<(f32 (any_sint_to_fp (loadi64 addr:$src))),
961            (VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
962  def : Pat<(f64 (any_sint_to_fp (loadi32 addr:$src))),
963            (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
964  def : Pat<(f64 (any_sint_to_fp (loadi64 addr:$src))),
965            (VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
966
  // Register source: same scheme with GR32/GR64 inputs.
967  def : Pat<(f32 (any_sint_to_fp GR32:$src)),
968            (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
969  def : Pat<(f32 (any_sint_to_fp GR64:$src)),
970            (VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
971  def : Pat<(f64 (any_sint_to_fp GR32:$src)),
972            (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
973  def : Pat<(f64 (any_sint_to_fp GR64:$src)),
974            (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
975
  // lrint with an i64 result selects the 64-bit conversion instructions.
976  def : Pat<(i64 (lrint FR32:$src)), (VCVTSS2SI64rr FR32:$src)>;
977  def : Pat<(i64 (lrint (loadf32 addr:$src))), (VCVTSS2SI64rm addr:$src)>;
978
979  def : Pat<(i64 (lrint FR64:$src)), (VCVTSD2SI64rr FR64:$src)>;
980  def : Pat<(i64 (lrint (loadf64 addr:$src))), (VCVTSD2SI64rm addr:$src)>;
981}
982
// Non-VEX codegen-only scalar conversions on FR32/FR64, all instantiated
// from sse12_cvt_s. Every entry except CVTSI2SD carries SIMD_EXC.
983let isCodeGenOnly = 1 in {
// FP -> integer, opcode 0x2C, matching any_fp_to_sint.
984defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
985                      "cvttss2si", "cvttss2si",
986                      WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC;
987defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
988                      "cvttss2si", "cvttss2si",
989                      WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC;
990defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
991                      "cvttsd2si", "cvttsd2si",
992                      WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC;
993defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
994                      "cvttsd2si", "cvttsd2si",
995                      WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC;
996
// FP -> integer, opcode 0x2D, matching lrint (GR32) / llrint (GR64).
997defm CVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
998                     "cvtss2si", "cvtss2si",
999                     WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC;
1000defm CVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
1001                     "cvtss2si", "cvtss2si",
1002                     WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC;
1003defm CVTSD2SI : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
1004                     "cvtsd2si", "cvtsd2si",
1005                     WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC;
1006defm CVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
1007                     "cvtsd2si", "cvtsd2si",
1008                     WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC;
1009
// Integer -> FP, opcode 0x2A, matching any_sint_to_fp. The memory-form
// mnemonic carries a "{l}" / "{q}" suffix, and ReadInt2Fpu is passed as the
// second scheduling entry of the rr form.
1010defm CVTSI2SS  : sse12_cvt_s<0x2A, GR32, FR32, any_sint_to_fp, i32mem, loadi32,
1011                      "cvtsi2ss", "cvtsi2ss{l}",
1012                      WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, SIMD_EXC;
1013defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, any_sint_to_fp, i64mem, loadi64,
1014                      "cvtsi2ss", "cvtsi2ss{q}",
1015                      WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, REX_W, SIMD_EXC;
1016defm CVTSI2SD  : sse12_cvt_s<0x2A, GR32, FR64, any_sint_to_fp, i32mem, loadi32,
1017                      "cvtsi2sd", "cvtsi2sd{l}",
1018                      WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD;
1019defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, any_sint_to_fp, i64mem, loadi64,
1020                      "cvtsi2sd", "cvtsi2sd{q}",
1021                      WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD, REX_W, SIMD_EXC;
1022} // isCodeGenOnly = 1
1023
// lrint with an i64 result selects the 64-bit conversion instructions
// (f32 source under UseSSE1, f64 source under UseSSE2).
1024let Predicates = [UseSSE1] in {
1025  def : Pat<(i64 (lrint FR32:$src)), (CVTSS2SI64rr FR32:$src)>;
1026  def : Pat<(i64 (lrint (loadf32 addr:$src))), (CVTSS2SI64rm addr:$src)>;
1027}
1028
1029let Predicates = [UseSSE2] in {
1030  def : Pat<(i64 (lrint FR64:$src)), (CVTSD2SI64rr FR64:$src)>;
1031  def : Pat<(i64 (lrint (loadf64 addr:$src))), (CVTSD2SI64rm addr:$src)>;
1032}
1033
1034// Conversion Instructions Intrinsics - Match intrinsics which expect MM
1035// and/or XMM operand(s).
1036
/// sse12_cvt_sint - Conversion defs named rr_Int / rm_Int, each with a
/// selection pattern.
///   opc       - opcode byte.
///   SrcRC     - register class of the rr source operand.
///   DstRC     - register class of the result.
///   DstVT     - value type of the result in the patterns.
///   SrcVT     - value type of the source in the patterns.
///   OpNode    - node applied to the source.
///   memop     - operand type of the rm_Int source.
///   mem_frags - fragments wrapped around addr:$src in the rm_Int pattern.
///   asm       - mnemonic (shared by both forms).
///   sched     - scheduling class; rm_Int uses sched.Folded.
///   d         - execution domain assigned to both forms.
1037multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
1038                          ValueType DstVT, ValueType SrcVT, SDNode OpNode,
1039                          Operand memop, PatFrags mem_frags, string asm,
1040                          X86FoldableSchedWrite sched, Domain d> {
1041let ExeDomain = d in {
1042  def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
1043                  !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
1044                  [(set DstRC:$dst, (DstVT (OpNode (SrcVT SrcRC:$src))))]>,
1045               Sched<[sched]>;
1046  def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
1047                  !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
1048                  [(set DstRC:$dst, (DstVT (OpNode (SrcVT (mem_frags addr:$src)))))]>,
1049               Sched<[sched.Folded]>;
1050}
1051}
1052
/// sse12_cvt_sint_3addr - Pattern-less rr_Int / rm_Int conversion defs that
/// take a DstRC:$src1 input in addition to the converted $src2.
///   opc      - opcode byte.
///   SrcRC    - register class of $src2 in rr_Int.
///   DstRC    - register class of $dst and $src1.
///   x86memop - memory operand type of $src2 in rm_Int.
///   asm      - mnemonic.
///   mem      - suffix emitted as "{mem}" after the mnemonic in rm_Int.
///   sched    - scheduling class.
///   d        - execution domain assigned to both forms.
///   Is2Addr  - 1 (default) selects the two-operand assembly string, 0 the
///              three-operand string that also prints $src1.
1053multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
1054                    RegisterClass DstRC, X86MemOperand x86memop,
1055                    string asm, string mem, X86FoldableSchedWrite sched,
1056                    Domain d, bit Is2Addr = 1> {
1057let hasSideEffects = 0, ExeDomain = d in {
1058  def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
1059                  !if(Is2Addr,
1060                      !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
1061                      !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
1062                  []>, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
  // rm_Int has no pattern, so mayLoad is set explicitly.
1063  let mayLoad = 1 in
1064  def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst),
1065                  (ins DstRC:$src1, x86memop:$src2),
1066                  !if(Is2Addr,
1067                      asm#"{"#mem#"}\t{$src2, $dst|$dst, $src2}",
1068                      asm#"{"#mem#"}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
1069                  []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
1070}
1071}
1072
// cvtsd2si on a VR128 (v2f64) source, matching X86cvts2si. The VEX forms
// are predicated on UseAVX; the 64-bit result forms add REX_W. All use
// MXCSR and are marked mayRaiseFPException.
1073let Uses = [MXCSR], mayRaiseFPException = 1 in {
1074let Predicates = [UseAVX] in {
1075defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64,
1076                  X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
1077                  WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
1078defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64,
1079                    X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
1080                    WriteCvtSD2I, SSEPackedDouble>, XD, VEX, REX_W, VEX_LIG;
1081}
1082defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, X86cvts2si,
1083                 sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I,
1084                 SSEPackedDouble>, XD;
1085defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si,
1086                   sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I,
1087                   SSEPackedDouble>, XD, REX_W;
1088}
1089
// AVX cvtsi2ss/sd on VR128: three-operand assembly form (Is2Addr = 0).
// VCVTSI2SD (GR32 source) is the only entry without SIMD_EXC.
1090let Predicates = [UseAVX] in {
1091defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1092          i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle, 0>,
1093          XS, VEX_4V, VEX_LIG, SIMD_EXC;
1094defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1095          i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle, 0>,
1096          XS, VEX_4V, VEX_LIG, REX_W, SIMD_EXC;
1097defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1098          i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble, 0>,
1099          XD, VEX_4V, VEX_LIG;
1100defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1101          i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble, 0>,
1102          XD, VEX_4V, VEX_LIG, REX_W, SIMD_EXC;
1103}
// Non-VEX cvtsi2ss/sd on VR128: two-operand assembly form (default
// Is2Addr) with $src1 tied to $dst. CVTSI2SD (GR32 source) is the only
// entry without SIMD_EXC.
1104let Constraints = "$src1 = $dst" in {
1105  defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1106                        i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle>,
1107                        XS, SIMD_EXC;
1108  defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1109                        i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle>,
1110                        XS, REX_W, SIMD_EXC;
1111  defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1112                        i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble>,
1113                        XD;
1114  defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1115                        i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble>,
1116                        XD, REX_W, SIMD_EXC;
1117}
1118
// Assembler aliases for (v)cvtsi2ss / (v)cvtsi2sd, all tagged "att".
// NOTE(review): the `0` argument is presumably the alias emit priority
// (parse-only alias); confirm against the InstAlias class definition.

// VEX register forms with an explicit {l} / {q} suffix.
1119def : InstAlias<"vcvtsi2ss{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1120               (VCVTSI2SSrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
1121def : InstAlias<"vcvtsi2ss{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1122               (VCVTSI642SSrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
1123def : InstAlias<"vcvtsi2sd{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1124               (VCVTSI2SDrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
1125def : InstAlias<"vcvtsi2sd{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1126               (VCVTSI642SDrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
1127
// VEX memory forms without a suffix map to the i32mem instructions.
1128def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
1129              (VCVTSI2SSrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
1130def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
1131              (VCVTSI2SDrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
1132
// Non-VEX register forms with an explicit {l} / {q} suffix.
1133def : InstAlias<"cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
1134                (CVTSI2SSrr_Int VR128:$dst, GR32:$src), 0, "att">;
1135def : InstAlias<"cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
1136                (CVTSI642SSrr_Int VR128:$dst, GR64:$src), 0, "att">;
1137def : InstAlias<"cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
1138                (CVTSI2SDrr_Int VR128:$dst, GR32:$src), 0, "att">;
1139def : InstAlias<"cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
1140                (CVTSI642SDrr_Int VR128:$dst, GR64:$src), 0, "att">;
1141
// Non-VEX memory forms without a suffix map to the i32mem instructions.
1142def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
1143                (CVTSI2SSrm_Int VR128:$dst, i32mem:$src), 0, "att">;
1144def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
1145                (CVTSI2SDrm_Int VR128:$dst, i32mem:$src), 0, "att">;
1146
1147/// SSE 1 Only
1148
1149// Aliases for intrinsics
// AVX cvttss2si / cvttsd2si on a VR128 source, matching X86cvtts2Int.
// The SS variants are scheduled as WriteCvtSS2I and the SD variants as
// WriteCvtSD2I, in line with the non-VEX CVTTSS2SI / CVTTSD2SI definitions.
// The 64-bit result variants add REX_W.
1150let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1151defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
1152                                ssmem, sse_load_f32, "cvttss2si",
1153                                WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
1154defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
1155                               X86cvtts2Int, ssmem, sse_load_f32,
1156                               "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
1157                               XS, VEX, VEX_LIG, REX_W;
1158defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
1159                                sdmem, sse_load_f64, "cvttsd2si",
1160                                WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
1161defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
1162                              X86cvtts2Int, sdmem, sse_load_f64,
1163                              "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>,
1164                              XD, VEX, VEX_LIG, REX_W;
1165}
// Non-VEX cvttss2si / cvttsd2si on a VR128 source, matching X86cvtts2Int.
// SS variants are scheduled as WriteCvtSS2I, SD variants as WriteCvtSD2I;
// the 64-bit result variants add REX_W.
1166let Uses = [MXCSR], mayRaiseFPException = 1 in {
1167defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
1168                                    ssmem, sse_load_f32, "cvttss2si",
1169                                    WriteCvtSS2I, SSEPackedSingle>, XS;
1170defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
1171                                   X86cvtts2Int, ssmem, sse_load_f32,
1172                                   "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
1173                                   XS, REX_W;
1174defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
1175                                    sdmem, sse_load_f64, "cvttsd2si",
1176                                    WriteCvtSD2I, SSEPackedDouble>, XD;
1177defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
1178                                  X86cvtts2Int, sdmem, sse_load_f64,
1179                                  "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>,
1180                                  XD, REX_W;
1181}
1182
// Assembler aliases adding explicit {l} / {q} suffixed spellings of
// (v)cvttss2si / (v)cvttsd2si, all tagged "att". Each mnemonic gets a
// register (rr_Int) and a memory (rm_Int) alias.
// NOTE(review): the `0` argument is presumably the alias emit priority
// (parse-only alias); confirm against the InstAlias class definition.

// VEX forms.
1183def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
1184                (VCVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1185def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
1186                (VCVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
1187def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
1188                (VCVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1189def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
1190                (VCVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
1191def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
1192                (VCVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1193def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
1194                (VCVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
1195def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
1196                (VCVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1197def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
1198                (VCVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;
1199
// Non-VEX forms.
1200def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
1201                (CVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1202def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
1203                (CVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
1204def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
1205                (CVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1206def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
1207                (CVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
1208def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
1209                (CVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1210def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
1211                (CVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
1212def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
1213                (CVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1214def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
1215                (CVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;
1216
// AVX cvtss2si on a VR128 (v4f32) source, matching X86cvts2si; the 64-bit
// result variant adds REX_W.
1217let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1218defm VCVTSS2SI   : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
1219                                  ssmem, sse_load_f32, "cvtss2si",
1220                                  WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
1221defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
1222                                  ssmem, sse_load_f32, "cvtss2si",
1223                                  WriteCvtSS2I, SSEPackedSingle>, XS, VEX, REX_W, VEX_LIG;
1224}
// Everything in this scope is marked as reading MXCSR and as possibly raising
// an FP exception. No Predicates are set by the `let` itself; the packed
// (V)CVTDQ2PS records below carry their own Requires<> lists.
1225let Uses = [MXCSR], mayRaiseFPException = 1 in {
// Non-VEX scalar single -> signed integer conversion (opcode 0x2D, XS).
// The 64-bit destination variant adds REX_W.
1226defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
1227                               ssmem, sse_load_f32, "cvtss2si",
1228                               WriteCvtSS2I, SSEPackedSingle>, XS;
1229defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
1230                                 ssmem, sse_load_f32, "cvtss2si",
1231                                 WriteCvtSS2I, SSEPackedSingle>, XS, REX_W;
1232
// Packed doubleword integer -> packed single conversion (opcode 0x5B, PS).
// AVX 128-bit (v4i32 -> v4f32) and 256-bit (v8i32 -> v8f32, VEX_L) forms use
// the `load` fragment and require HasAVX without VLX.
1233defm VCVTDQ2PS   : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load,
1234                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
1235                               SSEPackedSingle, WriteCvtI2PS>,
1236                               PS, VEX, Requires<[HasAVX, NoVLX]>, WIG;
1237defm VCVTDQ2PSY  : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load,
1238                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
1239                               SSEPackedSingle, WriteCvtI2PSY>,
1240                               PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, WIG;
1241
// Non-VEX form: same opcode, uses the `memop` fragment instead of `load` and
// requires UseSSE2.
1242defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop,
1243                            "cvtdq2ps\t{$src, $dst|$dst, $src}",
1244                            SSEPackedSingle, WriteCvtI2PS>,
1245                            PS, Requires<[UseSSE2]>;
1246}
1247
1248// AVX aliases
// AT&T-dialect spellings with an explicit size suffix for the v-prefixed
// (non-truncating) scalar -> integer conversions: "l" selects the GR32 records,
// "q" the GR64 (...64...) records. Memory forms use ssmem / sdmem operands.
1249def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
1250                (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1251def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
1252                (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
1253def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
1254                (VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1255def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
1256                (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
1257def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
1258                (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1259def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
1260                (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
1261def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
1262                (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1263def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
1264                (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
1265
1266// SSE aliases
// Same scheme as the AVX aliases, for the non-VEX records: "l" -> GR32,
// "q" -> GR64, register (rr_Int) and memory (rm_Int, ssmem / sdmem) forms.
1267def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
1268                (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1269def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
1270                (CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
1271def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
1272                (CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1273def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
1274                (CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
1275def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
1276                (CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1277def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
1278                (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
1279def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
1280                (CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1281def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
1282                (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
1283
1284/// SSE 2 Only
1285
1286// Convert scalar double to scalar single
// AVX, codegen-only forms on the scalar FP register classes (FR32 result,
// FR64 source). Both records have an empty pattern list; the rr form is
// selected by the separate any_fpround Pat that follows this scope, which
// feeds IMPLICIT_DEF as $src1.
1287let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX],
1288    ExeDomain = SSEPackedSingle in {
// NOTE(review): the rr form passes "cvtsd2ss" (no "v") through VSDI while the
// rm form spells "vcvtsd2ss" through plain I with an explicit XD; presumably
// VSDI supplies the "v" prefix and the XD prefix itself - confirm against the
// VSDI class definition.
1289def VCVTSD2SSrr  : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
1290                        (ins FR32:$src1, FR64:$src2),
1291                        "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1292                        VEX_4V, VEX_LIG, WIG,
1293                        Sched<[WriteCvtSD2SS]>, SIMD_EXC;
// Memory-source form; mayLoad is set explicitly because there is no pattern
// to infer it from.
1294let mayLoad = 1 in
1295def VCVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst),
1296                     (ins FR32:$src1, f64mem:$src2),
1297                     "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1298                     XD, VEX_4V, VEX_LIG, WIG,
1299                     Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC;
1300}
1301
// Under UseAVX, select an f64 -> f32 any_fpround to VCVTSD2SSrr. The extra
// first source operand ($src1) is given an IMPLICIT_DEF f32 value.
1302def : Pat<(f32 (any_fpround FR64:$src)),
1303            (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
1304          Requires<[UseAVX]>;
1305
// Non-VEX, codegen-only scalar double -> single conversion on FR64/FR32.
// Unlike the AVX forms, these carry their selection patterns inline and have
// a single source operand.
1306let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in {
1307def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
1308                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
1309                      [(set FR32:$dst, (any_fpround FR64:$src))]>,
1310                      Sched<[WriteCvtSD2SS]>, SIMD_EXC;
// The load-folding form is only selectable when OptForSize also holds.
// NOTE(review): this record has one (memory) source, yet it lists
// ReadAfterFold in addition to .Folded; other single-source rm records in this
// file (e.g. CVTPD2PSrm, CVTPS2PDrm) use only .Folded - confirm this is
// intended.
1311def CVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
1312                    "cvtsd2ss\t{$src, $dst|$dst, $src}",
1313                    [(set FR32:$dst, (any_fpround (loadf64 addr:$src)))]>,
1314                    XD, Requires<[UseSSE2, OptForSize]>,
1315                    Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC;
1316}
1317
// Full-vector (VR128) "_Int" forms of scalar double -> single conversion,
// matched through the X86frounds node: the result is a v4f32 built from
// $src1 and the converted v2f64 / scalar-loaded $src2. All four records are
// marked as reading MXCSR and as possibly raising an FP exception.
1318let Uses = [MXCSR], mayRaiseFPException = 1, ExeDomain = SSEPackedSingle in {
// AVX three-operand forms (separate $dst and $src1), gated on UseAVX.
1319def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
1320                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1321                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1322                       [(set VR128:$dst,
1323                         (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
1324                       XD, VEX_4V, VEX_LIG, WIG, Requires<[UseAVX]>,
1325                       Sched<[WriteCvtSD2SS]>;
1326def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
1327                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
1328                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1329                       [(set VR128:$dst,
1330                         (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
1331                       XD, VEX_4V, VEX_LIG, WIG, Requires<[UseAVX]>,
1332                       Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
// Non-VEX two-address forms: $src1 is tied to $dst, so only $src2 and $dst
// appear in the assembly string. Gated on UseSSE2.
1333let Constraints = "$src1 = $dst" in {
1334def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
1335                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1336                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
1337                       [(set VR128:$dst,
1338                         (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
1339                       XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;
1340def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
1341                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
1342                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
1343                       [(set VR128:$dst,
1344                         (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
1345                       XD, Requires<[UseSSE2]>,
1346                       Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
1347}
1348}
1349
1350// Convert scalar single to scalar double
1351// SSE2 instructions with XS prefix
// AVX, codegen-only forms on the scalar FP register classes (FR64 result,
// FR32 source). Both have empty pattern lists and are selected by the
// separate any_fpextend Pats that follow this scope.
1352let isCodeGenOnly = 1, hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
1353def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
1354                    (ins FR64:$src1, FR32:$src2),
1355                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1356                    XS, VEX_4V, VEX_LIG, WIG,
1357                    Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>, SIMD_EXC;
// Memory-source form; additionally requires OptForSize. mayLoad is set
// explicitly because there is no pattern to infer it from.
1358let mayLoad = 1 in
1359def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
1360                    (ins FR64:$src1, f32mem:$src2),
1361                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1362                    XS, VEX_4V, VEX_LIG, WIG,
1363                    Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>,
1364                    Requires<[UseAVX, OptForSize]>, SIMD_EXC;
1365} // isCodeGenOnly = 1, hasSideEffects = 0
1366
// Under UseAVX, select f32 -> f64 any_fpextend to VCVTSS2SDrr, with an
// IMPLICIT_DEF f64 supplied as the extra $src1 operand. The load-folding
// variant (VCVTSS2SDrm) is used only when OptForSize also holds.
1367def : Pat<(f64 (any_fpextend FR32:$src)),
1368    (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
1369def : Pat<(any_fpextend (loadf32 addr:$src)),
1370    (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;
1371
// Non-VEX, codegen-only scalar single -> double conversion on FR32/FR64
// (opcode 0x5A, XS prefix), with selection patterns inline.
1372let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in {
1373def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
1374                   "cvtss2sd\t{$src, $dst|$dst, $src}",
1375                   [(set FR64:$dst, (any_fpextend FR32:$src))]>,
1376                   XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>, SIMD_EXC;
// The load-folding form is only selectable when OptForSize also holds.
// NOTE(review): single-source rm record that lists ReadAfterFold in addition
// to .Folded; other single-source rm records in this file (e.g. CVTPS2PDrm)
// use only .Folded - confirm this is intended.
1377def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
1378                   "cvtss2sd\t{$src, $dst|$dst, $src}",
1379                   [(set FR64:$dst, (any_fpextend (loadf32 addr:$src)))]>,
1380                   XS, Requires<[UseSSE2, OptForSize]>,
1381                   Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>, SIMD_EXC;
1382} // isCodeGenOnly = 1
1383
// Full-vector (VR128) "_Int" forms of scalar single -> double conversion.
// All four records have empty pattern lists, so hasSideEffects / mayLoad are
// stated explicitly; the rr forms are selected by the X86Movsd +
// any_fpextend Pats later in this file. Every record here is marked as
// reading MXCSR and as possibly raising an FP exception.
1384let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1,
1385    ExeDomain = SSEPackedSingle in {
// AVX three-operand forms. Note these require HasAVX (not UseAVX).
1386def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
1387                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1388                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1389                    []>, XS, VEX_4V, VEX_LIG, WIG,
1390                    Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;
1391let mayLoad = 1 in
1392def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
1393                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
1394                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1395                    []>, XS, VEX_4V, VEX_LIG, WIG, Requires<[HasAVX]>,
1396                    Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
// Non-VEX two-address forms: $src1 is tied to $dst and omitted from the
// assembly string. Gated on UseSSE2.
1397let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
1398def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
1399                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1400                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
1401                    []>, XS, Requires<[UseSSE2]>,
1402                    Sched<[WriteCvtSS2SD]>;
1403let mayLoad = 1 in
1404def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
1405                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
1406                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
1407                    []>, XS, Requires<[UseSSE2]>,
1408                    Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
1409}
1410} // hasSideEffects = 0
1411
1412// Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and
1413// (v)cvtss2sd intrinsic sequences from clang which produce unnecessary
1414// vmovs{s,d} instructions
//
// Shape common to every pattern below: an X86Movss / X86Movsd whose second
// operand is a scalar_to_vector of a converted scalar is replaced by the
// matching *_Int conversion, with the movs* first operand ($dst) passed as
// the instruction's first source operand.
1415let Predicates = [UseAVX] in {
// f64 element 0 of $src rounded to f32 -> VCVTSD2SSrr_Int.
1416def : Pat<(v4f32 (X86Movss
1417                   (v4f32 VR128:$dst),
1418                   (v4f32 (scalar_to_vector
1419                     (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
1420          (VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
1421
// f32 element 0 of $src extended to f64 -> VCVTSS2SDrr_Int.
1422def : Pat<(v2f64 (X86Movsd
1423                   (v2f64 VR128:$dst),
1424                   (v2f64 (scalar_to_vector
1425                     (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
1426          (VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
1427
// Signed integer -> f32: 64-bit and 32-bit sources, register and load forms.
1428def : Pat<(v4f32 (X86Movss
1429                   (v4f32 VR128:$dst),
1430                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
1431          (VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
1432
1433def : Pat<(v4f32 (X86Movss
1434                   (v4f32 VR128:$dst),
1435                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
1436          (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>;
1437
1438def : Pat<(v4f32 (X86Movss
1439                   (v4f32 VR128:$dst),
1440                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
1441          (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
1442
1443def : Pat<(v4f32 (X86Movss
1444                   (v4f32 VR128:$dst),
1445                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
1446          (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>;
1447
// Signed integer -> f64: 64-bit and 32-bit sources, register and load forms.
1448def : Pat<(v2f64 (X86Movsd
1449                   (v2f64 VR128:$dst),
1450                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
1451          (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
1452
1453def : Pat<(v2f64 (X86Movsd
1454                   (v2f64 VR128:$dst),
1455                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
1456          (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>;
1457
1458def : Pat<(v2f64 (X86Movsd
1459                   (v2f64 VR128:$dst),
1460                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
1461          (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
1462
1463def : Pat<(v2f64 (X86Movsd
1464                   (v2f64 VR128:$dst),
1465                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
1466          (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>;
1467} // Predicates = [UseAVX]
1468
// Non-VEX counterparts of the UseAVX movss/movsd folding patterns that need
// SSE2: double<->single conversion and signed integer -> f64. The movs* first
// operand ($dst) becomes the first source operand of the selected *_Int
// instruction.
1469let Predicates = [UseSSE2] in {
// f64 element 0 of $src rounded to f32 -> CVTSD2SSrr_Int.
1470def : Pat<(v4f32 (X86Movss
1471                   (v4f32 VR128:$dst),
1472                   (v4f32 (scalar_to_vector
1473                     (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
1474          (CVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
1475
// f32 element 0 of $src extended to f64 -> CVTSS2SDrr_Int.
1476def : Pat<(v2f64 (X86Movsd
1477                   (v2f64 VR128:$dst),
1478                   (v2f64 (scalar_to_vector
1479                     (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
1480          (CVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
1481
// Signed integer -> f64: 64-bit and 32-bit sources, register and load forms.
1482def : Pat<(v2f64 (X86Movsd
1483                   (v2f64 VR128:$dst),
1484                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
1485          (CVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
1486
1487def : Pat<(v2f64 (X86Movsd
1488                   (v2f64 VR128:$dst),
1489                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
1490          (CVTSI642SDrm_Int VR128:$dst, addr:$src)>;
1491
1492def : Pat<(v2f64 (X86Movsd
1493                   (v2f64 VR128:$dst),
1494                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
1495          (CVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
1496
1497def : Pat<(v2f64 (X86Movsd
1498                   (v2f64 VR128:$dst),
1499                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
1500          (CVTSI2SDrm_Int VR128:$dst, addr:$src)>;
1501} // Predicates = [UseSSE2]
1502
// Non-VEX movss folding patterns that only need SSE1: signed integer -> f32
// from 64-bit and 32-bit sources, register and load forms. The movss first
// operand ($dst) becomes the first source operand of the selected *_Int
// instruction.
1503let Predicates = [UseSSE1] in {
1504def : Pat<(v4f32 (X86Movss
1505                   (v4f32 VR128:$dst),
1506                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
1507          (CVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
1508
1509def : Pat<(v4f32 (X86Movss
1510                   (v4f32 VR128:$dst),
1511                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
1512          (CVTSI642SSrm_Int VR128:$dst, addr:$src)>;
1513
1514def : Pat<(v4f32 (X86Movss
1515                   (v4f32 VR128:$dst),
1516                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
1517          (CVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
1518
1519def : Pat<(v4f32 (X86Movss
1520                   (v4f32 VR128:$dst),
1521                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
1522          (CVTSI2SSrm_Int VR128:$dst, addr:$src)>;
1523} // Predicates = [UseSSE1]
1524
// AVX packed single -> packed doubleword conversion (opcode 0x5B) via the
// X86cvtp2Int node, available with HasAVX and without VLX. 128-bit forms map
// v4f32 -> v4i32; 256-bit (Y, VEX_L) forms map v8f32 -> v8i32. Memory forms
// use the loadv* fragments.
// NOTE(review): the asm strings omit the "v" prefix; presumably the VPDI
// class prepends it - confirm against the class definition.
1525let Predicates = [HasAVX, NoVLX] in {
1526// Convert packed single/double fp to doubleword
1527def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1528                       "cvtps2dq\t{$src, $dst|$dst, $src}",
1529                       [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
1530                       VEX, Sched<[WriteCvtPS2I]>, WIG, SIMD_EXC;
1531def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1532                       "cvtps2dq\t{$src, $dst|$dst, $src}",
1533                       [(set VR128:$dst,
1534                         (v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>,
1535                       VEX, Sched<[WriteCvtPS2ILd]>, WIG, SIMD_EXC;
1536def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
1537                        "cvtps2dq\t{$src, $dst|$dst, $src}",
1538                        [(set VR256:$dst,
1539                          (v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>,
1540                        VEX, VEX_L, Sched<[WriteCvtPS2IY]>, WIG, SIMD_EXC;
1541def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
1542                        "cvtps2dq\t{$src, $dst|$dst, $src}",
1543                        [(set VR256:$dst,
1544                          (v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>,
1545                        VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, WIG, SIMD_EXC;
1546}
// Non-VEX packed single -> packed doubleword conversion (opcode 0x5B) via
// X86cvtp2Int, v4f32 -> v4i32. The memory form uses the memopv4f32 fragment
// (the AVX form uses loadv4f32 instead).
1547def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1548                     "cvtps2dq\t{$src, $dst|$dst, $src}",
1549                     [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
1550                     Sched<[WriteCvtPS2I]>, SIMD_EXC;
1551def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1552                     "cvtps2dq\t{$src, $dst|$dst, $src}",
1553                     [(set VR128:$dst,
1554                       (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>,
1555                     Sched<[WriteCvtPS2ILd]>, SIMD_EXC;
1556
1557
1558// Convert Packed Double FP to Packed DW Integers
// AVX forms (opcode 0xE6) via X86cvtp2Int, gated on HasAVX without VLX and
// marked as reading MXCSR / possibly raising an FP exception. The result is
// always a v4i32 in VR128, whether the source is v2f64 (XMM) or v4f64 (YMM).
1559let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1560// The assembler can recognize rr 256-bit instructions by seeing a ymm
1561// register, but the same isn't true when using memory operands instead.
1562// Provide other assembly rr and rm forms to address this explicitly.
1563def VCVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1564                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
1565                       [(set VR128:$dst,
1566                         (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
1567                       VEX, Sched<[WriteCvtPD2I]>, WIG;
1568
1569// XMM only
// Memory form carries an AT&T-only "x" suffix to name the 128-bit source.
1570def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1571                      "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
1572                      [(set VR128:$dst,
1573                        (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
1574                      Sched<[WriteCvtPD2ILd]>, WIG;
1575
1576// YMM only
// Memory form carries an AT&T-only "y" suffix to name the 256-bit source.
1577def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1578                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
1579                       [(set VR128:$dst,
1580                         (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
1581                       VEX, VEX_L, Sched<[WriteCvtPD2IY]>, WIG;
1582def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1583                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
1584                       [(set VR128:$dst,
1585                         (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
1586                       VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, WIG;
1587}
1588
// AT&T-dialect aliases so the explicit "x" / "y" suffixed mnemonics are also
// accepted with register sources (VR128 -> VCVTPD2DQrr, VR256 ->
// VCVTPD2DQYrr); the suffixed spelling is otherwise only on the rm records.
1589def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
1590                (VCVTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
1591def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
1592                (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
1593
// Non-VEX packed double -> packed doubleword conversion (opcode 0xE6) via
// X86cvtp2Int, v2f64 -> v4i32. The memory form uses the memopv2f64 fragment.
1594def CVTPD2DQrm  : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1595                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
1596                      [(set VR128:$dst,
1597                        (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>,
1598                      Sched<[WriteCvtPD2ILd]>, SIMD_EXC;
1599def CVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1600                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
1601                      [(set VR128:$dst,
1602                        (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
1603                      Sched<[WriteCvtPD2I]>, SIMD_EXC;
1604
1605// Convert with truncation packed single/double fp to doubleword
1606// SSE2 packed instructions with XS prefix
// All records in this scope (opcode 0x5B, matched via X86any_cvttp2si) are
// marked as reading MXCSR and as possibly raising an FP exception.
1607let Uses = [MXCSR], mayRaiseFPException = 1 in {
// AVX forms: 128-bit (v4f32 -> v4i32) and 256-bit (Y, VEX_L, v8f32 -> v8i32),
// gated on HasAVX without VLX. Memory forms use the loadv* fragments.
// NOTE(review): asm strings omit the "v" prefix; presumably VS2SI prepends
// it - confirm against the class definition.
1608let Predicates = [HasAVX, NoVLX] in {
1609def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1610                         "cvttps2dq\t{$src, $dst|$dst, $src}",
1611                         [(set VR128:$dst,
1612                           (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
1613                         VEX, Sched<[WriteCvtPS2I]>, WIG;
1614def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1615                         "cvttps2dq\t{$src, $dst|$dst, $src}",
1616                         [(set VR128:$dst,
1617                           (v4i32 (X86any_cvttp2si (loadv4f32 addr:$src))))]>,
1618                         VEX, Sched<[WriteCvtPS2ILd]>, WIG;
1619def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
1620                          "cvttps2dq\t{$src, $dst|$dst, $src}",
1621                          [(set VR256:$dst,
1622                            (v8i32 (X86any_cvttp2si (v8f32 VR256:$src))))]>,
1623                          VEX, VEX_L, Sched<[WriteCvtPS2IY]>, WIG;
1624def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
1625                          "cvttps2dq\t{$src, $dst|$dst, $src}",
1626                          [(set VR256:$dst,
1627                            (v8i32 (X86any_cvttp2si (loadv8f32 addr:$src))))]>,
1628                          VEX, VEX_L,
1629                          Sched<[WriteCvtPS2IYLd]>, WIG;
1630}
1631
// Non-VEX forms; the memory form uses the memopv4f32 fragment.
1632def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1633                       "cvttps2dq\t{$src, $dst|$dst, $src}",
1634                       [(set VR128:$dst,
1635                         (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
1636                       Sched<[WriteCvtPS2I]>;
1637def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1638                       "cvttps2dq\t{$src, $dst|$dst, $src}",
1639                       [(set VR128:$dst,
1640                         (v4i32 (X86any_cvttp2si (memopv4f32 addr:$src))))]>,
1641                       Sched<[WriteCvtPS2ILd]>;
1642}
1643
// AVX truncating packed double -> packed doubleword conversion (opcode 0xE6)
// via X86any_cvttp2si. The result is always v4i32 in VR128, from either a
// v2f64 (XMM) or v4f64 (YMM) source; the memory forms carry an AT&T-only
// "x" / "y" suffix to name the source width.
1644// The assembler can recognize rr 256-bit instructions by seeing a ymm
1645// register, but the same isn't true when using memory operands instead.
1646// Provide other assembly rr and rm forms to address this explicitly.
1647let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1648// XMM only
1649def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1650                        "cvttpd2dq\t{$src, $dst|$dst, $src}",
1651                        [(set VR128:$dst,
1652                          (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
1653                        VEX, Sched<[WriteCvtPD2I]>, WIG;
1654def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1655                        "cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
1656                        [(set VR128:$dst,
1657                          (v4i32 (X86any_cvttp2si (loadv2f64 addr:$src))))]>,
1658                        VEX, Sched<[WriteCvtPD2ILd]>, WIG;
1659
1660// YMM only
1661def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1662                         "cvttpd2dq\t{$src, $dst|$dst, $src}",
1663                         [(set VR128:$dst,
1664                           (v4i32 (X86any_cvttp2si (v4f64 VR256:$src))))]>,
1665                         VEX, VEX_L, Sched<[WriteCvtPD2IY]>, WIG;
1666def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1667                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
1668                         [(set VR128:$dst,
1669                           (v4i32 (X86any_cvttp2si (loadv4f64 addr:$src))))]>,
1670                         VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, WIG;
1671} // Predicates = [HasAVX, NoVLX]
1672
// AT&T-dialect aliases so the explicit "x" / "y" suffixed mnemonics are also
// accepted with register sources (VR128 -> VCVTTPD2DQrr, VR256 ->
// VCVTTPD2DQYrr).
1673def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
1674                (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
1675def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
1676                (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
1677
// Select the generic v4f64 -> v4i32 any_fp_to_sint node to the 256-bit-source
// truncating conversion, for both register and loadv4f64 operands.
1678let Predicates = [HasAVX, NoVLX] in {
1679  def : Pat<(v4i32 (any_fp_to_sint (v4f64 VR256:$src))),
1680            (VCVTTPD2DQYrr VR256:$src)>;
1681  def : Pat<(v4i32 (any_fp_to_sint (loadv4f64 addr:$src))),
1682            (VCVTTPD2DQYrm addr:$src)>;
1683}
1684
// Non-VEX truncating packed double -> packed doubleword conversion
// (opcode 0xE6) via X86any_cvttp2si, v2f64 -> v4i32. The memory form uses the
// memopv2f64 fragment.
1685def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1686                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
1687                      [(set VR128:$dst,
1688                        (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
1689                      Sched<[WriteCvtPD2I]>, SIMD_EXC;
1690def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
1691                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
1692                      [(set VR128:$dst,
1693                        (v4i32 (X86any_cvttp2si (memopv2f64 addr:$src))))]>,
1694                      Sched<[WriteCvtPD2ILd]>, SIMD_EXC;
1695
1696// Convert packed single to packed double
// AVX forms (opcode 0x5A, PS), gated on HasAVX without VLX and marked as
// reading MXCSR / possibly raising an FP exception. The source is half the
// width of the result: the 128-bit form reads a 64-bit memory operand
// (extloadv2f32) and the 256-bit form reads VR128 / a 128-bit memory operand
// (extloadv4f32).
1697let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1698                  // SSE2 instructions without OpSize prefix
// 128-bit result: matched via X86any_vfpext on a v4f32 register source.
1699def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1700                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
1701                    [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
1702                    PS, VEX, Sched<[WriteCvtPS2PD]>, WIG;
1703def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
1704                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
1705                    [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
1706                    PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, WIG;
// 256-bit result: matched via the generic any_fpextend node.
1707def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
1708                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
1709                     [(set VR256:$dst, (v4f64 (any_fpextend (v4f32 VR128:$src))))]>,
1710                     PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, WIG;
1711def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
1712                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
1713                     [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>,
1714                     PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, WIG;
1715}
1716
// Non-VEX packed single -> packed double conversion (opcode 0x5A, PS), gated
// on UseSSE2 and marked as reading MXCSR / possibly raising an FP exception.
// The register form matches X86any_vfpext on v4f32; the memory form reads a
// 64-bit operand via extloadv2f32.
1717let Predicates = [UseSSE2], Uses = [MXCSR], mayRaiseFPException = 1 in {
1718def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1719                   "cvtps2pd\t{$src, $dst|$dst, $src}",
1720                   [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
1721                   PS, Sched<[WriteCvtPS2PD]>;
1722def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
1723                   "cvtps2pd\t{$src, $dst|$dst, $src}",
1724                   [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
1725                   PS, Sched<[WriteCvtPS2PD.Folded]>;
1726}
1727
1728// Convert Packed DW Integers to Packed Double FP
// AVX forms (opcode 0xE6), gated on HasAVX without VLX. The source is half
// the width of the result: i64mem / VR128 (v4i32) for the 128-bit v2f64
// result, i128mem / VR128 for the 256-bit v4f64 result.
1729let Predicates = [HasAVX, NoVLX] in {
// 128-bit memory form: the pattern matches a 64-bit scalar load placed in a
// v2i64 via scalar_to_vector and bitcast to v4i32. The `let` below applies to
// this single def only.
1730let hasSideEffects = 0, mayLoad = 1 in
1731def VCVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
1732                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1733                        [(set VR128:$dst,
1734                          (v2f64 (X86any_VSintToFP
1735                                  (bc_v4i32
1736                                   (v2i64 (scalar_to_vector
1737                                           (loadi64 addr:$src)))))))]>,
1738                        VEX, Sched<[WriteCvtI2PDLd]>, WIG;
1739def VCVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1740                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1741                        [(set VR128:$dst,
1742                          (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
1743                        VEX, Sched<[WriteCvtI2PD]>, WIG;
// 256-bit forms match the generic any_sint_to_fp node (v4i32 -> v4f64).
1744def VCVTDQ2PDYrm  : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
1745                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1746                         [(set VR256:$dst,
1747                           (v4f64 (any_sint_to_fp (loadv4i32 addr:$src))))]>,
1748                         VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>,
1749                         WIG;
1750def VCVTDQ2PDYrr  : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
1751                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1752                         [(set VR256:$dst,
1753                           (v4f64 (any_sint_to_fp (v4i32 VR128:$src))))]>,
1754                         VEX, VEX_L, Sched<[WriteCvtI2PDY]>, WIG;
1755}
1756
// Non-VEX packed doubleword -> packed double conversion (opcode 0xE6) via
// X86any_VSintToFP, v4i32 -> v2f64. The memory form reads a 64-bit operand
// (i64mem): a scalar i64 load placed in a v2i64 and bitcast to v4i32.
// The `let` below applies to the rm def only.
1757let hasSideEffects = 0, mayLoad = 1 in
1758def CVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
1759                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
1760                       [(set VR128:$dst,
1761                         (v2f64 (X86any_VSintToFP
1762                                 (bc_v4i32
1763                                  (v2i64 (scalar_to_vector
1764                                          (loadi64 addr:$src)))))))]>,
1765                       Sched<[WriteCvtI2PDLd]>;
1766def CVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1767                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
1768                       [(set VR128:$dst,
1769                         (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
1770                       Sched<[WriteCvtI2PD]>;
1771
1772// AVX register conversion intrinsics
// Also fold the X86vzload64 form of the 64-bit source (in addition to the
// scalar_to_vector(loadi64) form on the instruction itself) into VCVTDQ2PDrm.
1773let Predicates = [HasAVX, NoVLX] in {
1774  def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
1775            (VCVTDQ2PDrm addr:$src)>;
1776} // Predicates = [HasAVX, NoVLX]
1777
1778// SSE2 register conversion intrinsics
// Also fold the X86vzload64 form of the 64-bit source (in addition to the
// scalar_to_vector(loadi64) form on the instruction itself) into CVTDQ2PDrm.
1779let Predicates = [UseSSE2] in {
1780  def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
1781            (CVTDQ2PDrm addr:$src)>;
1782} // Predicates = [UseSSE2]
1783
1784// Convert packed double to packed single
1785// The assembler can recognize rr 256-bit instructions by seeing a ymm
1786// register, but the same isn't true when using memory operands instead.
1787// Provide other assembly rr and rm forms to address this explicitly.
// AVX forms (opcode 0x5A) via X86any_vfpround. The result is always v4f32 in
// VR128, from either a v2f64 (XMM) or v4f64 (YMM, VEX_L) source; the memory
// forms carry an AT&T-only "x" / "y" suffix to name the source width.
1788let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1789// XMM only
1790def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1791                       "cvtpd2ps\t{$src, $dst|$dst, $src}",
1792                       [(set VR128:$dst, (v4f32 (X86any_vfpround (v2f64 VR128:$src))))]>,
1793                       VEX, Sched<[WriteCvtPD2PS]>, WIG;
1794def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1795                       "cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
1796                       [(set VR128:$dst, (v4f32 (X86any_vfpround (loadv2f64 addr:$src))))]>,
1797                       VEX, Sched<[WriteCvtPD2PS.Folded]>, WIG;
1798
// YMM source forms.
1799def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1800                        "cvtpd2ps\t{$src, $dst|$dst, $src}",
1801                        [(set VR128:$dst, (v4f32 (X86any_vfpround (v4f64 VR256:$src))))]>,
1802                        VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, WIG;
1803def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1804                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
1805                        [(set VR128:$dst, (v4f32 (X86any_vfpround (loadv4f64 addr:$src))))]>,
1806                        VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, WIG;
1807} // Predicates = [HasAVX, NoVLX]
1808
// Explicitly sized mnemonics vcvtpd2psx / vcvtpd2psy mapped onto the
// register forms VCVTPD2PSrr / VCVTPD2PSYrr, restricted to the "att" variant.
// NOTE(review): the literal 0 is presumably the alias emit priority (alias is
// accepted on input but not used for printing) - confirm against InstAlias.
1809def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
1810                (VCVTPD2PSrr VR128:$dst, VR128:$src), 0, "att">;
1811def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
1812                (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0, "att">;
1813
// Non-VEX cvtpd2ps (opcode 0x5A), register and memory forms. Both select
// X86any_vfpround from v2f64 to v4f32 and carry SIMD_EXC. The memory form
// loads through memopv2f64, whereas the VEX form VCVTPD2PSrm uses loadv2f64.
1814def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1815                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
1816                     [(set VR128:$dst, (v4f32 (X86any_vfpround (v2f64 VR128:$src))))]>,
1817                     Sched<[WriteCvtPD2PS]>, SIMD_EXC;
1818def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1819                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
1820                     [(set VR128:$dst, (v4f32 (X86any_vfpround (memopv2f64 addr:$src))))]>,
1821                     Sched<[WriteCvtPD2PS.Folded]>, SIMD_EXC;
1822
1823//===----------------------------------------------------------------------===//
1824// SSE 1 & 2 - Compare Instructions
1825//===----------------------------------------------------------------------===//
1826
1827// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
//
// Emits four records, all opcode 0xC2 with a trailing u8imm $cc operand that
// the patterns match as timm:
//   rr_Int / rm_Int - operate on VR128, with the first operand typed as VT.
//                     The memory form takes a `memop` operand and loads it
//                     through `mem_frags`.
//   rr / rm         - isCodeGenOnly forms on RC. The memory form takes an
//                     `x86memop` operand and loads it through `ld_frag`.
//                     Only rr is marked isCommutable.
// `asm` is used verbatim as the assembly string of every record, and the
// memory forms schedule as sched.Folded plus sched.ReadAfterFold.
1828multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
1829                            Operand memop, SDNode OpNode, ValueType VT,
1830                            PatFrag ld_frag, string asm,
1831                            X86FoldableSchedWrite sched,
1832                            PatFrags mem_frags> {
1833  def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
1834                    (ins VR128:$src1, VR128:$src2, u8imm:$cc), asm,
1835                    [(set VR128:$dst, (OpNode (VT VR128:$src1),
1836                                              VR128:$src2, timm:$cc))]>,
1837           Sched<[sched]>, SIMD_EXC;
1838  let mayLoad = 1 in
1839  def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
1840                    (ins VR128:$src1, memop:$src2, u8imm:$cc), asm,
1841                    [(set VR128:$dst, (OpNode (VT VR128:$src1),
1842                                              (mem_frags addr:$src2), timm:$cc))]>,
1843           Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
1844
1845  let isCodeGenOnly = 1 in {
1846    let isCommutable = 1 in
1847    def rr : SIi8<0xC2, MRMSrcReg,
1848                  (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
1849                  [(set RC:$dst, (OpNode RC:$src1, RC:$src2, timm:$cc))]>,
1850                  Sched<[sched]>, SIMD_EXC;
1851    def rm : SIi8<0xC2, MRMSrcMem,
1852                  (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
1853                  [(set RC:$dst, (OpNode RC:$src1,
1854                                         (ld_frag addr:$src2), timm:$cc))]>,
1855                  Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
1856  }
1857}
1858
// VEX-encoded scalar compares (three-operand asm: $cc, $src2, $src1, $dst).
// VCMPSS uses the XS prefix on FR32/v4f32, VCMPSD the XD prefix on
// FR64/v2f64; both select X86cmps and are VEX_4V, VEX_LIG, WIG.
1859let ExeDomain = SSEPackedSingle in
1860defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
1861                 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1862                 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>,
1863                 XS, VEX_4V, VEX_LIG, WIG;
1864let ExeDomain = SSEPackedDouble in
1865defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
1866                 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1867                 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
1868                 XD, VEX_4V, VEX_LIG, WIG;
1869
// Non-VEX scalar compares. $src1 is tied to $dst, so the asm strings print
// only $cc, $src2 and $dst. Template arguments otherwise mirror
// VCMPSS/VCMPSD.
1870let Constraints = "$src1 = $dst" in {
1871  let ExeDomain = SSEPackedSingle in
1872  defm CMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
1873                  "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1874                  SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
1875  let ExeDomain = SSEPackedDouble in
1876  defm CMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
1877                  "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1878                  SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
1879}
1880
1881// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
//
// Emits an rr and an rm record with no register outputs; each pattern sets
// EFLAGS from (OpNode (vt $src1), $src2). The rm form reads $src2 through
// `ld_frag` and is marked mayLoad. Both records are placed in ExeDomain `d`
// and carry SIMD_EXC; `sched` defaults to WriteFComX.
1882multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDPatternOperator OpNode,
1883                         ValueType vt, X86MemOperand x86memop,
1884                         PatFrag ld_frag, string OpcodeStr, Domain d,
1885                         X86FoldableSchedWrite sched = WriteFComX> {
1886  let ExeDomain = d in {
1887  def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
1888                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1889                     [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
1890          Sched<[sched]>, SIMD_EXC;
1891  let mayLoad = 1 in
1892  def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
1893                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1894                     [(set EFLAGS, (OpNode (vt RC:$src1),
1895                                           (ld_frag addr:$src2)))]>,
1896          Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
1897}
1898}
1899
1900// sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
//
// Same shape as sse12_ord_cmp, but the records are named rr_Int / rm_Int,
// OpNode is a plain SDNode, and the memory form takes a `memop` Operand
// loaded through the `mem_frags` PatFrags instead of an X86MemOperand with a
// single PatFrag.
1901multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
1902                             ValueType vt, Operand memop,
1903                             PatFrags mem_frags, string OpcodeStr,
1904                             Domain d,
1905                             X86FoldableSchedWrite sched = WriteFComX> {
1906let ExeDomain = d in {
1907  def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
1908                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1909                     [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
1910          Sched<[sched]>, SIMD_EXC;
1911let mayLoad = 1 in
1912  def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
1913                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1914                     [(set EFLAGS, (OpNode (vt RC:$src1),
1915                                           (mem_frags addr:$src2)))]>,
1916          Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
1917}
1918}
1919
// (U)COMISS / (U)COMISD instantiations; every record in this group lists
// EFLAGS in Defs. Opcode 0x2E is used for the ucomis* forms and 0x2F for the
// comis* forms. Each name is instantiated twice: once through sse12_ord_cmp
// (FR32/FR64, producing rr/rm) and once through sse12_ord_cmp_int under
// isCodeGenOnly (VR128, producing rr_Int/rm_Int).
1920let Defs = [EFLAGS] in {
  // VEX-encoded forms on FR32/FR64: ucomis* selects X86any_fcmp, comis*
  // selects X86strict_fcmps.
1921  defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
1922                               "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, WIG;
1923  defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
1924                               "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, WIG;
1925  defm VCOMISS  : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
1926                               "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, WIG;
1927  defm VCOMISD  : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
1928                               "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, WIG;
1929
  // VEX-encoded VR128 forms selecting X86ucomi / X86comi.
1930  let isCodeGenOnly = 1 in {
1931    defm VUCOMISS  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
1932                      sse_load_f32, "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, WIG;
1933    defm VUCOMISD  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
1934                      sse_load_f64, "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, WIG;
1935
1936    defm VCOMISS  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
1937                       sse_load_f32, "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, WIG;
1938    defm VCOMISD  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
1939                       sse_load_f64, "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, WIG;
1940  }
  // Non-VEX forms on FR32/FR64 (same template arguments, no VEX modifiers).
1941  defm UCOMISS  : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
1942                                  "ucomiss", SSEPackedSingle>, PS;
1943  defm UCOMISD  : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
1944                                  "ucomisd", SSEPackedDouble>, PD;
1945  defm COMISS   : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
1946                                  "comiss", SSEPackedSingle>, PS;
1947  defm COMISD   : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
1948                                  "comisd", SSEPackedDouble>, PD;
1949
  // Non-VEX VR128 forms selecting X86ucomi / X86comi.
1950  let isCodeGenOnly = 1 in {
1951    defm UCOMISS  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
1952                            sse_load_f32, "ucomiss", SSEPackedSingle>, PS;
1953    defm UCOMISD  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
1954                            sse_load_f64, "ucomisd", SSEPackedDouble>, PD;
1955
1956    defm COMISS  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
1957                                sse_load_f32, "comiss", SSEPackedSingle>, PS;
1958    defm COMISD  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
1959                                    sse_load_f64, "comisd", SSEPackedDouble>, PD;
1960  }
1961} // Defs = [EFLAGS]
1962
1963// sse12_cmp_packed - sse 1 & 2 compare packed instructions
//
// Emits rri (register) and rmi (memory) records, opcode 0xC2, selecting
// X86any_cmpp with result type VT and a u8imm $cc matched as timm. Only rri
// is marked isCommutable. rmi loads $src2 through `ld_frag` and schedules as
// sched.Folded plus sched.ReadAfterFold. Both carry SIMD_EXC and domain `d`.
1964multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
1965                            ValueType VT, string asm,
1966                            X86FoldableSchedWrite sched,
1967                            Domain d, PatFrag ld_frag> {
1968  let isCommutable = 1 in
1969  def rri : PIi8<0xC2, MRMSrcReg,
1970             (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
1971             [(set RC:$dst, (VT (X86any_cmpp RC:$src1, RC:$src2, timm:$cc)))], d>,
1972            Sched<[sched]>, SIMD_EXC;
1973  def rmi : PIi8<0xC2, MRMSrcMem,
1974             (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
1975             [(set RC:$dst,
1976               (VT (X86any_cmpp RC:$src1, (ld_frag addr:$src2), timm:$cc)))], d>,
1977            Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
1978}
1979
// VEX-encoded packed compares: 128-bit (VR128/f128mem) and 256-bit
// (VR256/f256mem, with VEX_L) variants for ps and pd. All use the
// three-operand asm string and the loadv* fragments for the memory operand.
1980defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
1981               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1982               SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, WIG;
1983defm VCMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
1984               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1985               SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, WIG;
1986defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, v8f32,
1987               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1988               SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, WIG;
1989defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, v4f64,
1990               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1991               SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, WIG;
// Non-VEX packed compares: $src1 is tied to $dst, and the memory operand is
// matched through the memopv* fragments rather than loadv*.
1992let Constraints = "$src1 = $dst" in {
1993  defm CMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
1994                 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1995                 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS;
1996  defm CMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
1997                 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1998                 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD;
1999}
2000
// Matches a timm compare immediate whose low three bits (value & 0x7) equal
// 0x00, 0x03, 0x04 or 0x07. The patterns that follow use it to guard swapping
// the compare operands.
// NOTE(review): these presumably correspond to the EQ / UNORD / NEQ / ORD
// predicates - confirm against the CMPPS immediate encoding table.
2001def CommutableCMPCC : PatLeaf<(timm), [{
2002  uint64_t Imm = N->getZExtValue() & 0x7;
2003  return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07);
2004}]>;
2005
2006// Patterns to select compares with loads in first operand.
// Each pattern matches the load as the FIRST compare operand and emits the
// memory-form instruction with the register as $src1 and the address as
// $src2, i.e. with the operands swapped. CommutableCMPCC limits this to the
// immediates it accepts. Covers packed 256/128-bit and scalar f64/f32.
2007let Predicates = [HasAVX] in {
2008  def : Pat<(v4f64 (X86any_cmpp (loadv4f64 addr:$src2), VR256:$src1,
2009                                CommutableCMPCC:$cc)),
2010            (VCMPPDYrmi VR256:$src1, addr:$src2, timm:$cc)>;
2011
2012  def : Pat<(v8f32 (X86any_cmpp (loadv8f32 addr:$src2), VR256:$src1,
2013                                CommutableCMPCC:$cc)),
2014            (VCMPPSYrmi VR256:$src1, addr:$src2, timm:$cc)>;
2015
2016  def : Pat<(v2f64 (X86any_cmpp (loadv2f64 addr:$src2), VR128:$src1,
2017                                CommutableCMPCC:$cc)),
2018            (VCMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;
2019
2020  def : Pat<(v4f32 (X86any_cmpp (loadv4f32 addr:$src2), VR128:$src1,
2021                                CommutableCMPCC:$cc)),
2022            (VCMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;
2023
2024  def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
2025                          CommutableCMPCC:$cc)),
2026            (VCMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
2027
2028  def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
2029                          CommutableCMPCC:$cc)),
2030            (VCMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
2031}
2032
// SSE2 (double-precision) load-in-first-operand compare patterns: the
// operands are swapped into CMPPDrmi / CMPSDrm when the immediate satisfies
// CommutableCMPCC. The packed pattern matches memopv2f64.
2033let Predicates = [UseSSE2] in {
2034  def : Pat<(v2f64 (X86any_cmpp (memopv2f64 addr:$src2), VR128:$src1,
2035                                CommutableCMPCC:$cc)),
2036            (CMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;
2037
2038  def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
2039                          CommutableCMPCC:$cc)),
2040            (CMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
2041}
2042
// SSE1 (single-precision) load-in-first-operand compare patterns: the
// operands are swapped into CMPPSrmi / CMPSSrm when the immediate satisfies
// CommutableCMPCC. The packed pattern matches memopv4f32.
2043let Predicates = [UseSSE1] in {
2044  def : Pat<(v4f32 (X86any_cmpp (memopv4f32 addr:$src2), VR128:$src1,
2045                                CommutableCMPCC:$cc)),
2046            (CMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;
2047
2048  def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
2049                          CommutableCMPCC:$cc)),
2050            (CMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
2051}
2052
2053//===----------------------------------------------------------------------===//
2054// SSE 1 & 2 - Shuffle Instructions
2055//===----------------------------------------------------------------------===//
2056
2057/// sse12_shuffle - sse 1 & 2 fp shuffle instructions
///
/// Emits rmi (memory) and rri (register) records, opcode 0xC6, selecting
/// X86Shufp with result type `vt` and an i8 timm third operand ($src3).
/// rmi loads $src2 through `mem_frag`. `IsCommutable` (default 0) is applied
/// to the rri record only.
2058multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
2059                         ValueType vt, string asm, PatFrag mem_frag,
2060                         X86FoldableSchedWrite sched, Domain d,
2061                         bit IsCommutable = 0> {
2062  def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
2063                   (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
2064                   [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
2065                                       (i8 timm:$src3))))], d>,
2066            Sched<[sched.Folded, sched.ReadAfterFold]>;
2067  let isCommutable = IsCommutable in
2068  def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
2069                 (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
2070                 [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
2071                                     (i8 timm:$src3))))], d>,
2072            Sched<[sched]>;
2073}
2074
// VEX-encoded shufps/shufpd, 128-bit and 256-bit (VEX_L) variants, guarded by
// HasAVX and NoVLX. All leave IsCommutable at its default of 0.
2075let Predicates = [HasAVX, NoVLX] in {
2076  defm VSHUFPS  : sse12_shuffle<VR128, f128mem, v4f32,
2077           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2078           loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>,
2079           PS, VEX_4V, WIG;
2080  defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
2081           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2082           loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>,
2083           PS, VEX_4V, VEX_L, WIG;
2084  defm VSHUFPD  : sse12_shuffle<VR128, f128mem, v2f64,
2085           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2086           loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>,
2087           PD, VEX_4V, WIG;
2088  defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
2089           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2090           loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>,
2091           PD, VEX_4V, VEX_L, WIG;
2092}
// Non-VEX shufps/shufpd with $src1 tied to $dst; memory operands use the
// memopv* fragments. SHUFPD passes IsCommutable = 1 (marking SHUFPDrri
// commutable), while SHUFPS keeps the default of 0.
2093let Constraints = "$src1 = $dst" in {
2094  defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
2095                    "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
2096                    memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2097  defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
2098                    "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
2099                    memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
2100}
2101
2102//===----------------------------------------------------------------------===//
2103// SSE 1 & 2 - Unpack FP Instructions
2104//===----------------------------------------------------------------------===//
2105
2106/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
///
/// Emits rr and rm records for opcode `opc`, selecting (vt (OpNode $src1,
/// $src2)). The rm form loads $src2 through `mem_frag` and schedules as
/// sched.Folded plus sched.ReadAfterFold. `IsCommutable` (default 0) is
/// applied to the rr record only.
2107multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
2108                                   PatFrag mem_frag, RegisterClass RC,
2109                                   X86MemOperand x86memop, string asm,
2110                                   X86FoldableSchedWrite sched, Domain d,
2111                                   bit IsCommutable = 0> {
2112    let isCommutable = IsCommutable in
2113    def rr : PI<opc, MRMSrcReg,
2114                (outs RC:$dst), (ins RC:$src1, RC:$src2),
2115                asm, [(set RC:$dst,
2116                           (vt (OpNode RC:$src1, RC:$src2)))], d>,
2117                Sched<[sched]>;
2118    def rm : PI<opc, MRMSrcMem,
2119                (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
2120                asm, [(set RC:$dst,
2121                           (vt (OpNode RC:$src1,
2122                                       (mem_frag addr:$src2))))], d>,
2123             Sched<[sched.Folded, sched.ReadAfterFold]>;
2124}
2125
// VEX-encoded unpck{h,l}p{s,d}: opcode 0x15 selects X86Unpckh and 0x14
// selects X86Unpckl. Memory operands use the generic `load` fragment. Of the
// eight instantiations only VUNPCKHPD passes IsCommutable = 1.
2126let Predicates = [HasAVX, NoVLX] in {
2127defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load,
2128      VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2129                     SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, WIG;
2130defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load,
2131      VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2132                     SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, WIG;
2133defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load,
2134      VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2135                     SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, WIG;
2136defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load,
2137      VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2138                     SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, WIG;
2139
// 256-bit variants (VR256 / f256mem, VEX_L).
2140defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load,
2141      VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2142                     SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, WIG;
2143defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load,
2144      VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2145                     SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, WIG;
2146defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load,
2147      VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2148                     SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, WIG;
2149defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load,
2150      VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2151                     SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, WIG;
2152}// Predicates = [HasAVX, NoVLX]
2153
// Non-VEX unpck{h,l}p{s,d} with $src1 tied to $dst; memory operands use the
// `memop` fragment. Only UNPCKHPD passes IsCommutable = 1.
2154let Constraints = "$src1 = $dst" in {
2155  defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memop,
2156        VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
2157                       SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2158  defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memop,
2159        VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
2160                       SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
2161  defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop,
2162        VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
2163                       SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2164  defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop,
2165        VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
2166                       SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
2167} // Constraints = "$src1 = $dst"
2168
// With HasAVX1Only, 256-bit integer unpacks are selected onto the FP unpack
// instructions: v8i32 maps to VUNPCK{L,H}PSY and v4i64 to VUNPCK{L,H}PDY, in
// both register and folded-load forms.
2169let Predicates = [HasAVX1Only] in {
2170  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (loadv8i32 addr:$src2))),
2171            (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
2172  def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
2173            (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
2174  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (loadv8i32 addr:$src2))),
2175            (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
2176  def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
2177            (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
2178
2179  def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
2180            (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
2181  def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
2182            (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
2183  def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
2184            (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
2185  def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
2186            (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
2187}
2188
// Under UseSSE2, a v2f64 X86Unpckl whose second operand is a v2f64
// simple_load is selected as MOVHPDrm on the load address.
2189let Predicates = [UseSSE2] in {
2190  // Use MOVHPD if the load isn't aligned enough for UNPCKLPD.
2191  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
2192                              (v2f64 (simple_load addr:$src2)))),
2193            (MOVHPDrm VR128:$src1, addr:$src2)>;
2194}
2195
2196//===----------------------------------------------------------------------===//
2197// SSE 1 & 2 - Extract Floating-Point Sign mask
2198//===----------------------------------------------------------------------===//
2199
2200/// sse12_extr_sign_mask - sse 1 & 2 extract floating-point sign mask
///
/// Emits a single rr record (opcode 0x50) that selects (X86movmsk (vt $src))
/// from register class RC into a GR32orGR64 destination, scheduled with
/// WriteFMOVMSK. `asm` is the bare mnemonic; the operand string is appended
/// here.
2201multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
2202                                string asm, Domain d> {
2203  def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
2204              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
2205              [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], d>,
2206              Sched<[WriteFMOVMSK]>;
2207}
2208
// VEX-encoded movmskps/movmskpd for 128-bit and 256-bit (VEX_L) sources,
// plus patterns mapping integer-typed X86movmsk onto the same instructions
// (v4i32/v8i32 onto the PS forms, v2i64/v4i64 onto the PD forms).
2209let Predicates = [HasAVX] in {
2210  defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
2211                                        SSEPackedSingle>, PS, VEX, WIG;
2212  defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
2213                                        SSEPackedDouble>, PD, VEX, WIG;
2214  defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
2215                                         SSEPackedSingle>, PS, VEX, VEX_L, WIG;
2216  defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
2217                                         SSEPackedDouble>, PD, VEX, VEX_L, WIG;
2218
2219  // Also support integer VTs to avoid a int->fp bitcast in the DAG.
2220  def : Pat<(X86movmsk (v4i32 VR128:$src)),
2221            (VMOVMSKPSrr VR128:$src)>;
2222  def : Pat<(X86movmsk (v2i64 VR128:$src)),
2223            (VMOVMSKPDrr VR128:$src)>;
2224  def : Pat<(X86movmsk (v8i32 VR256:$src)),
2225            (VMOVMSKPSYrr VR256:$src)>;
2226  def : Pat<(X86movmsk (v4i64 VR256:$src)),
2227            (VMOVMSKPDYrr VR256:$src)>;
2228}
2229
// Non-VEX movmskps / movmskpd on VR128 (v4f32 with PS, v2f64 with PD).
2230defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
2231                                     SSEPackedSingle>, PS;
2232defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
2233                                     SSEPackedDouble>, PD;
2234
// SSE2 counterparts of the integer-typed X86movmsk patterns: v4i32 selects
// MOVMSKPSrr and v2i64 selects MOVMSKPDrr.
2235let Predicates = [UseSSE2] in {
2236  // Also support integer VTs to avoid a int->fp bitcast in the DAG.
2237  def : Pat<(X86movmsk (v4i32 VR128:$src)),
2238            (MOVMSKPSrr VR128:$src)>;
2239  def : Pat<(X86movmsk (v2i64 VR128:$src)),
2240            (MOVMSKPDrr VR128:$src)>;
2241}
2242
2243//===---------------------------------------------------------------------===//
2244// SSE2 - Packed Integer Logical Instructions
2245//===---------------------------------------------------------------------===//
2246
2247let ExeDomain = SSEPackedInt in { // SSE integer instructions
2248
2249/// PDI_binop_rm - Simple SSE2 binary operator.
///
/// Emits rr and rm records for opcode `opc`, selecting (OpVT (OpNode $src1,
/// $src2)). `Is2Addr` chooses between the two-operand asm string
/// ($src2, $dst) and the three-operand one ($src2, $src1, $dst).
/// `IsCommutable` is applied to the rr record only. The rm form loads $src2
/// through `memop_frag`.
2250multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
2251                        ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
2252                        X86MemOperand x86memop, X86FoldableSchedWrite sched,
2253                        bit IsCommutable, bit Is2Addr> {
2254  let isCommutable = IsCommutable in
2255  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
2256       (ins RC:$src1, RC:$src2),
2257       !if(Is2Addr,
2258           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2259           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
2260       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
2261       Sched<[sched]>;
2262  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
2263       (ins RC:$src1, x86memop:$src2),
2264       !if(Is2Addr,
2265           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2266           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
2267       [(set RC:$dst, (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
2268       Sched<[sched.Folded, sched.ReadAfterFold]>;
2269}
2270} // ExeDomain = SSEPackedInt
2271
// PDI_binop_all - Instantiates PDI_binop_rm three times for one operation:
//   V#NAME   - 128-bit VEX form (HasAVX + prd), "v"-prefixed mnemonic,
//              three-operand asm (Is2Addr = 0), `load` fragment.
//   NAME     - 128-bit non-VEX form, $src1 tied to $dst (Is2Addr = 1),
//              `memop` fragment.
//   V#NAME#Y - 256-bit VEX form (HasAVX2 + prd) with VEX_L, `load` fragment.
2272multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
2273                         ValueType OpVT128, ValueType OpVT256,
2274                         X86SchedWriteWidths sched, bit IsCommutable,
2275                         Predicate prd> {
2276let Predicates = [HasAVX, prd] in
2277  defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
2278                             VR128, load, i128mem, sched.XMM,
2279                             IsCommutable, 0>, VEX_4V, WIG;
2280
2281let Constraints = "$src1 = $dst" in
2282  defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
2283                           memop, i128mem, sched.XMM, IsCommutable, 1>;
2284
2285let Predicates = [HasAVX2, prd] in
2286  defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
2287                               OpVT256, VR256, load, i256mem, sched.YMM,
2288                               IsCommutable, 0>, VEX_4V, VEX_L, WIG;
2289}
2290
2291// These are ordered here for pattern ordering requirements with the fp versions
// Packed-integer logic ops on v2i64 / v4i64, all under the NoVLX predicate.
// pand, por and pxor are commutable (IsCommutable = 1); pandn, which selects
// X86andnp, is not (IsCommutable = 0).
2292
2293defm PAND  : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
2294                           SchedWriteVecLogic, 1, NoVLX>;
2295defm POR   : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
2296                           SchedWriteVecLogic, 1, NoVLX>;
2297defm PXOR  : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
2298                           SchedWriteVecLogic, 1, NoVLX>;
2299defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
2300                           SchedWriteVecLogic, 0, NoVLX>;
2301
2302//===----------------------------------------------------------------------===//
2303// SSE 1 & 2 - Logical Instructions
2304//===----------------------------------------------------------------------===//
2305
2306/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
2307///
2308/// There are no patterns here because isel prefers integer versions for SSE2
2309/// and later. There are SSE1 v4f32 patterns later.
///
/// Instantiates sse12_fp_packed_logical_rm six times: four VEX forms
/// (256-bit PSY/PDY with VEX_L, 128-bit PS/PD) under HasAVX + NoVLX, and two
/// non-VEX forms (PS/PD) with $src1 tied to $dst. The mnemonic is OpcodeStr
/// with a "ps" or "pd" suffix; the two pattern-list arguments are empty in
/// every instantiation.
/// NOTE(review): the trailing 0 passed only to the VEX forms is presumably
/// that multiclass's Is2Addr flag - confirm against its definition.
2310multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
2311                                   X86SchedWriteWidths sched> {
2312  let Predicates = [HasAVX, NoVLX] in {
2313  defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
2314        !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM,
2315        [], [], 0>, PS, VEX_4V, VEX_L, WIG;
2316
2317  defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
2318        !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM,
2319        [], [], 0>, PD, VEX_4V, VEX_L, WIG;
2320
2321  defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
2322       !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
2323       [], [], 0>, PS, VEX_4V, WIG;
2324
2325  defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
2326       !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
2327       [], [], 0>, PD, VEX_4V, WIG;
2328  }
2329
2330  let Constraints = "$src1 = $dst" in {
2331    defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
2332         !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
2333         [], []>, PS;
2334
2335    defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
2336         !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
2337         [], []>, PD;
2338  }
2339}
2340
// FP logical instruction families (opcodes 0x54 and, 0x56 or, 0x57 xor,
// 0x55 andn), all scheduled with SchedWriteFLogic. ANDN is instantiated
// under an explicit `isCommutable = 0`.
2341defm AND  : sse12_fp_packed_logical<0x54, "and", SchedWriteFLogic>;
2342defm OR   : sse12_fp_packed_logical<0x56, "or", SchedWriteFLogic>;
2343defm XOR  : sse12_fp_packed_logical<0x57, "xor", SchedWriteFLogic>;
2344let isCommutable = 0 in
2345  defm ANDN : sse12_fp_packed_logical<0x55, "andn", SchedWriteFLogic>;
2346
// With HasAVX2 + NoVLX, 256-bit and/or/xor/X86andnp on v32i8, v16i16 and
// v8i32 are selected onto the VPAND/VPOR/VPXOR/VPANDN Y instructions. The
// first half covers register operands, the second half folded loads.
// v4i64 is not listed here; it is the OpVT256 passed to PDI_binop_all for
// these instructions.
2347let Predicates = [HasAVX2, NoVLX] in {
2348  def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
2349            (VPANDYrr VR256:$src1, VR256:$src2)>;
2350  def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
2351            (VPANDYrr VR256:$src1, VR256:$src2)>;
2352  def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
2353            (VPANDYrr VR256:$src1, VR256:$src2)>;
2354
2355  def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
2356            (VPORYrr VR256:$src1, VR256:$src2)>;
2357  def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
2358            (VPORYrr VR256:$src1, VR256:$src2)>;
2359  def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
2360            (VPORYrr VR256:$src1, VR256:$src2)>;
2361
2362  def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
2363            (VPXORYrr VR256:$src1, VR256:$src2)>;
2364  def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
2365            (VPXORYrr VR256:$src1, VR256:$src2)>;
2366  def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
2367            (VPXORYrr VR256:$src1, VR256:$src2)>;
2368
2369  def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
2370            (VPANDNYrr VR256:$src1, VR256:$src2)>;
2371  def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
2372            (VPANDNYrr VR256:$src1, VR256:$src2)>;
2373  def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
2374            (VPANDNYrr VR256:$src1, VR256:$src2)>;
2375
  // Folded-load forms: the load is always the second operand.
2376  def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
2377            (VPANDYrm VR256:$src1, addr:$src2)>;
2378  def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
2379            (VPANDYrm VR256:$src1, addr:$src2)>;
2380  def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
2381            (VPANDYrm VR256:$src1, addr:$src2)>;
2382
2383  def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
2384            (VPORYrm VR256:$src1, addr:$src2)>;
2385  def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
2386            (VPORYrm VR256:$src1, addr:$src2)>;
2387  def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
2388            (VPORYrm VR256:$src1, addr:$src2)>;
2389
2390  def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
2391            (VPXORYrm VR256:$src1, addr:$src2)>;
2392  def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
2393            (VPXORYrm VR256:$src1, addr:$src2)>;
2394  def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
2395            (VPXORYrm VR256:$src1, addr:$src2)>;
2396
2397  def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
2398            (VPANDNYrm VR256:$src1, addr:$src2)>;
2399  def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
2400            (VPANDNYrm VR256:$src1, addr:$src2)>;
2401  def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
2402            (VPANDNYrm VR256:$src1, addr:$src2)>;
2403}
2404
// On AVX1-only targets the 256-bit integer logic instructions do not exist,
// so integer and/or/xor/andnp on YMM registers are selected to the
// floating-point VANDPS/VORPS/VXORPS/VANDNPS instructions instead. All four
// integer element types (v32i8, v16i16, v8i32, v4i64) are handled here.
let Predicates = [HasAVX1Only] in {
  // Register-register forms.
  foreach VT = [v32i8, v16i16, v8i32, v4i64] in
    def : Pat<(VT (and VR256:$src1, VR256:$src2)),
              (VANDPSYrr VR256:$src1, VR256:$src2)>;

  foreach VT = [v32i8, v16i16, v8i32, v4i64] in
    def : Pat<(VT (or VR256:$src1, VR256:$src2)),
              (VORPSYrr VR256:$src1, VR256:$src2)>;

  foreach VT = [v32i8, v16i16, v8i32, v4i64] in
    def : Pat<(VT (xor VR256:$src1, VR256:$src2)),
              (VXORPSYrr VR256:$src1, VR256:$src2)>;

  foreach VT = [v32i8, v16i16, v8i32, v4i64] in
    def : Pat<(VT (X86andnp VR256:$src1, VR256:$src2)),
              (VANDNPSYrr VR256:$src1, VR256:$src2)>;

  // Register-memory forms. The load fragment (loadv32i8, loadv16i16,
  // loadv8i32, loadv4i64) is looked up by name from the vector type.
  foreach VT = [v32i8, v16i16, v8i32, v4i64] in
    def : Pat<(and VR256:$src1,
                   (!cast<PatFrags>(!strconcat("load", !cast<string>(VT)))
                        addr:$src2)),
              (VANDPSYrm VR256:$src1, addr:$src2)>;

  foreach VT = [v32i8, v16i16, v8i32, v4i64] in
    def : Pat<(or VR256:$src1,
                  (!cast<PatFrags>(!strconcat("load", !cast<string>(VT)))
                       addr:$src2)),
              (VORPSYrm VR256:$src1, addr:$src2)>;

  foreach VT = [v32i8, v16i16, v8i32, v4i64] in
    def : Pat<(xor VR256:$src1,
                   (!cast<PatFrags>(!strconcat("load", !cast<string>(VT)))
                        addr:$src2)),
              (VXORPSYrm VR256:$src1, addr:$src2)>;

  foreach VT = [v32i8, v16i16, v8i32, v4i64] in
    def : Pat<(X86andnp VR256:$src1,
                        (!cast<PatFrags>(!strconcat("load", !cast<string>(VT)))
                             addr:$src2)),
              (VANDNPSYrm VR256:$src1, addr:$src2)>;
}
2480
// 128-bit integer bitwise logic for AVX targets without VLX. The v16i8,
// v8i16 and v4i32 forms of and/or/xor/X86andnp are all selected to the same
// XMM VPAND/VPOR/VPXOR/VPANDN instruction. v2i64 is not listed in this block.
let Predicates = [HasAVX, NoVLX] in {
  // Register-register forms.
  foreach VT = [v16i8, v8i16, v4i32] in
    def : Pat<(VT (and VR128:$src1, VR128:$src2)),
              (VPANDrr VR128:$src1, VR128:$src2)>;

  foreach VT = [v16i8, v8i16, v4i32] in
    def : Pat<(VT (or VR128:$src1, VR128:$src2)),
              (VPORrr VR128:$src1, VR128:$src2)>;

  foreach VT = [v16i8, v8i16, v4i32] in
    def : Pat<(VT (xor VR128:$src1, VR128:$src2)),
              (VPXORrr VR128:$src1, VR128:$src2)>;

  foreach VT = [v16i8, v8i16, v4i32] in
    def : Pat<(VT (X86andnp VR128:$src1, VR128:$src2)),
              (VPANDNrr VR128:$src1, VR128:$src2)>;

  // Register-memory forms. The load fragment (loadv16i8, loadv8i16,
  // loadv4i32) is looked up by name from the vector type.
  foreach VT = [v16i8, v8i16, v4i32] in
    def : Pat<(and VR128:$src1,
                   (!cast<PatFrags>(!strconcat("load", !cast<string>(VT)))
                        addr:$src2)),
              (VPANDrm VR128:$src1, addr:$src2)>;

  foreach VT = [v16i8, v8i16, v4i32] in
    def : Pat<(or VR128:$src1,
                  (!cast<PatFrags>(!strconcat("load", !cast<string>(VT)))
                       addr:$src2)),
              (VPORrm VR128:$src1, addr:$src2)>;

  foreach VT = [v16i8, v8i16, v4i32] in
    def : Pat<(xor VR128:$src1,
                   (!cast<PatFrags>(!strconcat("load", !cast<string>(VT)))
                        addr:$src2)),
              (VPXORrm VR128:$src1, addr:$src2)>;

  foreach VT = [v16i8, v8i16, v4i32] in
    def : Pat<(X86andnp VR128:$src1,
                        (!cast<PatFrags>(!strconcat("load", !cast<string>(VT)))
                             addr:$src2)),
              (VPANDNrm VR128:$src1, addr:$src2)>;
}
2538
// 128-bit integer bitwise logic for plain SSE2. The v16i8, v8i16 and v4i32
// forms of and/or/xor/X86andnp are all selected to the same PAND/POR/PXOR/
// PANDN instruction. v2i64 is not listed in this block.
let Predicates = [UseSSE2] in {
  // Register-register forms.
  foreach VT = [v16i8, v8i16, v4i32] in
    def : Pat<(VT (and VR128:$src1, VR128:$src2)),
              (PANDrr VR128:$src1, VR128:$src2)>;

  foreach VT = [v16i8, v8i16, v4i32] in
    def : Pat<(VT (or VR128:$src1, VR128:$src2)),
              (PORrr VR128:$src1, VR128:$src2)>;

  foreach VT = [v16i8, v8i16, v4i32] in
    def : Pat<(VT (xor VR128:$src1, VR128:$src2)),
              (PXORrr VR128:$src1, VR128:$src2)>;

  foreach VT = [v16i8, v8i16, v4i32] in
    def : Pat<(VT (X86andnp VR128:$src1, VR128:$src2)),
              (PANDNrr VR128:$src1, VR128:$src2)>;

  // Register-memory forms. These use the memop fragments (memopv16i8,
  // memopv8i16, memopv4i32), looked up by name from the vector type.
  foreach VT = [v16i8, v8i16, v4i32] in
    def : Pat<(and VR128:$src1,
                   (!cast<PatFrags>(!strconcat("memop", !cast<string>(VT)))
                        addr:$src2)),
              (PANDrm VR128:$src1, addr:$src2)>;

  foreach VT = [v16i8, v8i16, v4i32] in
    def : Pat<(or VR128:$src1,
                  (!cast<PatFrags>(!strconcat("memop", !cast<string>(VT)))
                       addr:$src2)),
              (PORrm VR128:$src1, addr:$src2)>;

  foreach VT = [v16i8, v8i16, v4i32] in
    def : Pat<(xor VR128:$src1,
                   (!cast<PatFrags>(!strconcat("memop", !cast<string>(VT)))
                        addr:$src2)),
              (PXORrm VR128:$src1, addr:$src2)>;

  foreach VT = [v16i8, v8i16, v4i32] in
    def : Pat<(X86andnp VR128:$src1,
                        (!cast<PatFrags>(!strconcat("memop", !cast<string>(VT)))
                             addr:$src2)),
              (PANDNrm VR128:$src1, addr:$src2)>;
}
2596
2597// Patterns for packed operations when we don't have integer type available.
// These patterns are not wrapped in a Predicates list here. They map the
// X86fand/X86for/X86fxor/X86fandn nodes on v4f32 onto the non-VEX
// ANDPS/ORPS/XORPS/ANDNPS instructions.
//
// Register-register forms.
2598def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
2599          (ANDPSrr VR128:$src1, VR128:$src2)>;
2600def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)),
2601          (ORPSrr VR128:$src1, VR128:$src2)>;
2602def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)),
2603          (XORPSrr VR128:$src1, VR128:$src2)>;
2604def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)),
2605          (ANDNPSrr VR128:$src1, VR128:$src2)>;
2606
// Register-memory forms: the second operand is a memopv4f32 load that is
// folded into the instruction.
2607def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)),
2608          (ANDPSrm VR128:$src1, addr:$src2)>;
2609def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)),
2610          (ORPSrm VR128:$src1, addr:$src2)>;
2611def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)),
2612          (XORPSrm VR128:$src1, addr:$src2)>;
2613def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
2614          (ANDNPSrm VR128:$src1, addr:$src2)>;
2615
2616//===----------------------------------------------------------------------===//
2617// SSE 1 & 2 - Arithmetic Instructions
2618//===----------------------------------------------------------------------===//
2619
2620/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
2621/// vector forms.
2622///
2623/// In addition, we also have a special variant of the scalar form here to
2624/// represent the associated intrinsic operation.  This form is unlike the
2625/// plain scalar form, in that it takes an entire vector (instead of a scalar)
2626/// and leaves the top elements unmodified (therefore these cannot be commuted).
2627///
2628/// These three forms can each be reg+reg or reg+mem.
2629///
2630
2631/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
2632/// classes below
/// basic_sse12_fp_binop_p - Packed single/double forms of an SSE 1 & 2 FP
/// binary operation, all built on sse12_fp_packed:
///   * V<NAME>PS  / V<NAME>PD  - VEX-encoded 128-bit forms
///   * V<NAME>PSY / V<NAME>PDY - VEX-encoded 256-bit forms (VEX_L)
///   * PS / PD                 - non-VEX forms with $src1 tied to $dst
/// Every form lists MXCSR in Uses and is flagged mayRaiseFPException.
multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
                                  SDPatternOperator OpNode,
                                  X86SchedWriteSizes sched> {
  // VEX forms, available with AVX when VLX is not. The load fragment is the
  // unrestricted loadvNfM one.
  let Uses = [MXCSR], mayRaiseFPException = 1,
      Predicates = [HasAVX, NoVLX] in {
    defm V#NAME#PS  : sse12_fp_packed<opc, OpcodeStr#"ps", OpNode,
                                      VR128, v4f32, f128mem, loadv4f32,
                                      SSEPackedSingle, sched.PS.XMM, 0>,
                      PS, VEX_4V, WIG;
    defm V#NAME#PD  : sse12_fp_packed<opc, OpcodeStr#"pd", OpNode,
                                      VR128, v2f64, f128mem, loadv2f64,
                                      SSEPackedDouble, sched.PD.XMM, 0>,
                      PD, VEX_4V, WIG;

    defm V#NAME#PSY : sse12_fp_packed<opc, OpcodeStr#"ps", OpNode,
                                      VR256, v8f32, f256mem, loadv8f32,
                                      SSEPackedSingle, sched.PS.YMM, 0>,
                      PS, VEX_4V, VEX_L, WIG;
    defm V#NAME#PDY : sse12_fp_packed<opc, OpcodeStr#"pd", OpNode,
                                      VR256, v4f64, f256mem, loadv4f64,
                                      SSEPackedDouble, sched.PD.YMM, 0>,
                      PD, VEX_4V, VEX_L, WIG;
  }

  // Non-VEX forms: the first source register doubles as the destination,
  // and memory operands go through the memop fragments.
  let Uses = [MXCSR], mayRaiseFPException = 1,
      Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed<opc, OpcodeStr#"ps", OpNode, VR128,
                              v4f32, f128mem, memopv4f32, SSEPackedSingle,
                              sched.PS.XMM>,
              PS;
    defm PD : sse12_fp_packed<opc, OpcodeStr#"pd", OpNode, VR128,
                              v2f64, f128mem, memopv2f64, SSEPackedDouble,
                              sched.PD.XMM>,
              PD;
  }
}
2662
/// basic_sse12_fp_binop_s - Scalar single/double forms of an SSE 1 & 2 FP
/// binary operation on FR32/FR64, built on sse12_fp_scalar:
///   * V<NAME>SS / V<NAME>SD - VEX-encoded forms (Is2Addr = 0)
///   * SS / SD               - non-VEX forms with $src1 tied to $dst
/// Every form lists MXCSR in Uses and is flagged mayRaiseFPException.
multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr,
                                  SDPatternOperator OpNode,
                                  X86SchedWriteSizes sched> {
  // VEX forms use the three-operand assembly syntax.
  let Uses = [MXCSR], mayRaiseFPException = 1 in {
    defm V#NAME#SS : sse12_fp_scalar<opc, OpcodeStr#"ss", OpNode, FR32,
                                     f32mem, SSEPackedSingle, sched.PS.Scl, 0>,
                     XS, VEX_4V, VEX_LIG, WIG;
    defm V#NAME#SD : sse12_fp_scalar<opc, OpcodeStr#"sd", OpNode, FR64,
                                     f64mem, SSEPackedDouble, sched.PD.Scl, 0>,
                     XD, VEX_4V, VEX_LIG, WIG;
  }

  // Non-VEX forms: the first source register doubles as the destination.
  let Uses = [MXCSR], mayRaiseFPException = 1,
      Constraints = "$src1 = $dst" in {
    defm SS : sse12_fp_scalar<opc, OpcodeStr#"ss", OpNode, FR32, f32mem,
                              SSEPackedSingle, sched.PS.Scl>,
              XS;
    defm SD : sse12_fp_scalar<opc, OpcodeStr#"sd", OpNode, FR64, f64mem,
                              SSEPackedDouble, sched.PD.Scl>,
              XD;
  }
}
2683
/// basic_sse12_fp_binop_s_int - Intrinsic-style scalar forms of an SSE 1 & 2
/// FP binary operation. Unlike basic_sse12_fp_binop_s these operate on whole
/// VR128 vectors (v4f32 / v2f64) and take their memory operand through
/// ssmem/sdmem with the sse_load_f32/sse_load_f64 fragments. Built on
/// sse12_fp_scalar_int:
///   * V<NAME>SS / V<NAME>SD - VEX-encoded forms
///   * SS / SD               - non-VEX forms with $src1 tied to $dst
/// Every form lists MXCSR in Uses and is flagged mayRaiseFPException.
multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
                                      SDPatternOperator OpNode,
                                      X86SchedWriteSizes sched> {
  // VEX forms.
  let Uses = [MXCSR], mayRaiseFPException = 1 in {
    defm V#NAME#SS : sse12_fp_scalar_int<opc, OpNode, VR128, v4f32,
                                         OpcodeStr#"ss", ssmem, sse_load_f32,
                                         SSEPackedSingle, sched.PS.Scl, 0>,
                     XS, VEX_4V, VEX_LIG, WIG;
    defm V#NAME#SD : sse12_fp_scalar_int<opc, OpNode, VR128, v2f64,
                                         OpcodeStr#"sd", sdmem, sse_load_f64,
                                         SSEPackedDouble, sched.PD.Scl, 0>,
                     XD, VEX_4V, VEX_LIG, WIG;
  }

  // Non-VEX forms: the first source register doubles as the destination.
  let Uses = [MXCSR], mayRaiseFPException = 1,
      Constraints = "$src1 = $dst" in {
    defm SS : sse12_fp_scalar_int<opc, OpNode, VR128, v4f32,
                                  OpcodeStr#"ss", ssmem, sse_load_f32,
                                  SSEPackedSingle, sched.PS.Scl>,
              XS;
    defm SD : sse12_fp_scalar_int<opc, OpNode, VR128, v2f64,
                                  OpcodeStr#"sd", sdmem, sse_load_f64,
                                  SSEPackedDouble, sched.PD.Scl>,
              XD;
  }
}
2705
2706// Binary Arithmetic instructions
// Each defm combines the packed (_p), scalar (_s) and scalar-intrinsic
// (_s_int) multiclasses for one opcode. ADD/MUL/SUB/DIV select from the
// any_f* nodes and pass null_frag as the OpNode of the _s_int form, while
// MAX/MIN select from X86fmax/X86fmin and pass X86fmaxs/X86fmins to _s_int.
2707defm ADD : basic_sse12_fp_binop_p<0x58, "add", any_fadd, SchedWriteFAddSizes>,
2708           basic_sse12_fp_binop_s<0x58, "add", any_fadd, SchedWriteFAddSizes>,
2709           basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SchedWriteFAddSizes>;
2710defm MUL : basic_sse12_fp_binop_p<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
2711           basic_sse12_fp_binop_s<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
2712           basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SchedWriteFMulSizes>;
// SUB, DIV, MAX and MIN are instantiated with isCommutable forced to 0.
2713let isCommutable = 0 in {
2714  defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
2715             basic_sse12_fp_binop_s<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
2716             basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SchedWriteFAddSizes>;
2717  defm DIV : basic_sse12_fp_binop_p<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
2718             basic_sse12_fp_binop_s<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
2719             basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>;
2720  defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
2721             basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
2722             basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>;
2723  defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
2724             basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
2725             basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>;
2726}
2727
// MAXC/MINC reuse the max/min opcodes (0x5F/0x5D) and mnemonics but select
// from X86fmaxc/X86fminc. They are isCodeGenOnly and have no _s_int form.
// NOTE(review): the "C" suffix presumably marks commutable variants (these
// are not wrapped in isCommutable = 0 like MAX/MIN) - confirm.
2728let isCodeGenOnly = 1 in {
2729  defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>,
2730             basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>;
2731  defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SchedWriteFCmpSizes>,
2732             basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SchedWriteFCmpSizes>;
2733}
2734
2735// Patterns used to select SSE scalar fp arithmetic instructions from
2736// either:
2737//
2738// (1) a scalar fp operation followed by a blend
2739//
2740// The effect is that the backend no longer emits unnecessary vector
2741// insert instructions immediately after SSE scalar fp instructions
2742// like addss or mulss.
2743//
2744// For example, given the following code:
2745//   __m128 foo(__m128 A, __m128 B) {
2746//     A[0] += B[0];
2747//     return A;
2748//   }
2749//
2750// Previously we generated:
2751//   addss %xmm0, %xmm1
2752//   movss %xmm1, %xmm0
2753//
2754// We now generate:
2755//   addss %xmm1, %xmm0
2756//
2757// (2) a vector packed single/double fp operation followed by a vector insert
2758//
2759// The effect is that the backend converts the packed fp instruction
2760// followed by a vector insert into a single SSE scalar fp instruction.
2761//
2762// For example, given the following code:
2763//   __m128 foo(__m128 A, __m128 B) {
2764//     __m128 C = A + B;
2765//     return (__m128) {C[0], A[1], A[2], A[3]};
2766//   }
2767//
2768// Previously we generated:
2769//   addps %xmm0, %xmm1
2770//   movss %xmm1, %xmm0
2771//
2772// We now generate:
2773//   addss %xmm1, %xmm0
2774
2775// TODO: Some canonicalization in lowering would simplify the number of
2776// patterns we have to try to match.
/// scalar_math_patterns - Select the _Int form of a scalar FP instruction
/// from "extract element 0, apply Op, re-insert with Move".
///
///   Op            - scalar operation being matched
///   OpcPrefix     - instruction name stem, e.g. "ADDSS"
///   Move          - node that merges the scalar result back (movss/movsd)
///   VT / EltTy    - vector type and its element type
///   RC            - scalar register class of the second operand
///   ld_frag       - load fragment for the folded-memory variant
///   BasePredicate - predicate guarding the non-AVX patterns
multiclass scalar_math_patterns<SDPatternOperator Op, string OpcPrefix, SDNode Move,
                                ValueType VT, ValueType EltTy,
                                RegisterClass RC, PatFrag ld_frag,
                                Predicate BasePredicate> {
  // The pattern pair is emitted twice: first for the non-AVX instructions
  // (empty name prefix, guarded by BasePredicate) and then for the AVX
  // instructions ("V" name prefix, guarded by UseAVX).
  foreach Enc = ["", "V"] in
  let Predicates = !if(!eq(Enc, "V"), [UseAVX], [BasePredicate]) in {
    // extracted scalar math op with insert via movss/movsd
    // Register operand: the scalar is copied into VR128 for the rr_Int form.
    def : Pat<(VT (Move (VT VR128:$dst),
                        (VT (scalar_to_vector
                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
                                 RC:$src))))),
              (!cast<Instruction>(!strconcat(Enc, OpcPrefix, "rr_Int"))
                   VT:$dst,
                   (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
    // Memory operand: the scalar load is folded into the rm_Int form.
    def : Pat<(VT (Move (VT VR128:$dst),
                        (VT (scalar_to_vector
                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
                                 (ld_frag addr:$src)))))),
              (!cast<Instruction>(!strconcat(Enc, OpcPrefix, "rm_Int"))
                   VT:$dst, addr:$src)>;
  }
}
2812
// Instantiate scalar_math_patterns for add/sub/mul/div.
// Single precision: v4f32 / f32 on FR32, merged with X86Movss, loads via
// loadf32, non-AVX patterns guarded by UseSSE1.
2813defm : scalar_math_patterns<any_fadd, "ADDSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2814defm : scalar_math_patterns<any_fsub, "SUBSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2815defm : scalar_math_patterns<any_fmul, "MULSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2816defm : scalar_math_patterns<any_fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2817
// Double precision: v2f64 / f64 on FR64, merged with X86Movsd, loads via
// loadf64, non-AVX patterns guarded by UseSSE2.
2818defm : scalar_math_patterns<any_fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2819defm : scalar_math_patterns<any_fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2820defm : scalar_math_patterns<any_fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2821defm : scalar_math_patterns<any_fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2822
2823/// Unop Arithmetic
2824/// In addition, we also have a special variant of the scalar form here to
2825/// represent the associated intrinsic operation.  This form is unlike the
2826/// plain scalar form, in that it takes an entire vector (instead of a
2827/// scalar) and leaves the top elements undefined.
2828///
2829/// And, we have a special variant form for a full-vector intrinsic form.
2830
/// sse_fp_unop_s - Scalar form of an SSE unary FP operation (non-AVX).
/// Defines four instructions:
///   r, m         - codegen-only forms on RC, selected from OpNode; the
///                  memory form additionally requires OptForSize
///   r_Int, m_Int - VR128 forms without patterns. $src1 is tied to $dst
///                  because the HW instructions are 2 operand / destructive.
multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
                          X86MemOperand x86memop, Operand intmemop,
                          SDPatternOperator OpNode, Domain d,
                          X86FoldableSchedWrite sched, Predicate target> {
  // Plain scalar register form.
  let isCodeGenOnly = 1, hasSideEffects = 0 in
  def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
            OpcodeStr#"\t{$src1, $dst|$dst, $src1}",
            [(set RC:$dst, (OpNode RC:$src1))], d>,
          Sched<[sched]>, Requires<[target]>;

  // Plain scalar load-folding form, only selected when optimizing for size.
  let isCodeGenOnly = 1, hasSideEffects = 0, mayLoad = 1 in
  def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1),
            OpcodeStr#"\t{$src1, $dst|$dst, $src1}",
            [(set RC:$dst, (OpNode (load addr:$src1)))], d>,
          Sched<[sched.Folded]>, Requires<[target, OptForSize]>;

  // Vector-register forms. Only $src2 appears in the assembly string;
  // $src1 is the tied pass-through operand.
  let hasSideEffects = 0, Constraints = "$src1 = $dst", ExeDomain = d in {
    def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
                  (ins VR128:$src1, VR128:$src2),
                  OpcodeStr#"\t{$src2, $dst|$dst, $src2}", []>,
                Sched<[sched]>;

    let mayLoad = 1 in
    def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
                  (ins VR128:$src1, intmemop:$src2),
                  OpcodeStr#"\t{$src2, $dst|$dst, $src2}", []>,
                Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}
2862
/// sse_fp_unop_s_intr - Selection patterns mapping the intrinsic Intr onto
/// the NAME#r_Int / NAME#m_Int instructions, guarded by `target`.
multiclass sse_fp_unop_s_intr<ValueType vt, PatFrags mem_frags,
                              Intrinsic Intr, Predicate target> {
  // These are unary operations, but they are modeled as having 2 source
  // operands because the high elements of the destination are unchanged in
  // SSE. The single input is therefore passed for both operands.
  let Predicates = [target] in
  def : Pat<(Intr VR128:$src),
            (!cast<Instruction>(NAME#"r_Int") VR128:$src, VR128:$src)>;

  // Scalar loads are only folded into these instructions when optimizing
  // for size. The folded instruction has a partial register update, while
  // the unfolded sequence does not, e.g.
  //   movss mem, %xmm0
  //   rcpss %xmm0, %xmm0
  // which has a clobber before the rcp, vs.
  //   rcpss mem, %xmm0
  let Predicates = [target, OptForSize] in
  def : Pat<(Intr (mem_frags addr:$src2)),
            (!cast<Instruction>(NAME#"m_Int")
                 (vt (IMPLICIT_DEF)), addr:$src2)>;
}
2884
/// avx_fp_unop_s_intr - Selection patterns mapping the intrinsic Intr onto
/// the NAME#r_Int / NAME#m_Int instructions of the AVX variant, guarded by
/// `target`.
multiclass avx_fp_unop_s_intr<ValueType vt, PatFrags mem_frags,
                              Intrinsic Intr, Predicate target> {
  // Register form: the single input is passed for both source operands.
  let Predicates = [target] in
  def : Pat<(Intr VR128:$src),
            (!cast<Instruction>(NAME#"r_Int") VR128:$src, VR128:$src)>;

  // Memory form, only when optimizing for size: the first source is an
  // IMPLICIT_DEF of type vt and the load is folded as the second source.
  let Predicates = [target, OptForSize] in
  def : Pat<(Intr (mem_frags addr:$src2)),
            (!cast<Instruction>(NAME#"m_Int")
                 (vt (IMPLICIT_DEF)), addr:$src2)>;
}
2898
/// avx_fp_unop_s - Scalar form of an AVX unary FP operation. All four
/// instructions take two sources and use the three-operand assembly syntax:
///   r, m         - codegen-only forms on RC, selected via the patterns below
///   r_Int, m_Int - VR128 forms without patterns
multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
                          ValueType ScalarVT, X86MemOperand x86memop,
                          Operand intmemop, SDPatternOperator OpNode, Domain d,
                          X86FoldableSchedWrite sched, Predicate target> {
  let hasSideEffects = 0 in {
    // Plain scalar forms.
    let isCodeGenOnly = 1 in {
      def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
                OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                [], d>,
              Sched<[sched]>;

      let mayLoad = 1 in
      def m : I<opc, MRMSrcMem, (outs RC:$dst),
                (ins RC:$src1, x86memop:$src2),
                OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                [], d>,
              Sched<[sched.Folded, sched.ReadAfterFold]>;
    }

    // Vector-register forms.
    let ExeDomain = d in {
      def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2),
                    OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    []>,
                  Sched<[sched]>;

      let mayLoad = 1 in
      def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, intmemop:$src2),
                    OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    []>,
                  Sched<[sched.Folded, sched.ReadAfterFold]>;
    }
  }

  // Register pattern: the pass-through first source is an IMPLICIT_DEF.
  let Predicates = [target] in
  def : Pat<(OpNode RC:$src),
            (!cast<Instruction>(NAME#"r") (ScalarVT (IMPLICIT_DEF)), RC:$src)>;

  // Scalar loads are only folded into these instructions when optimizing
  // for size. The folded instruction has a partial register update, while
  // the unfolded sequence does not, e.g.
  //   vmovss mem, %xmm0
  //   vrcpss %xmm0, %xmm0, %xmm0
  // which has a clobber before the rcp, vs.
  //   vrcpss mem, %xmm0, %xmm0
  // TODO: In theory, we could fold the load, and avoid the stall caused by
  // the partial register store, either in BreakFalseDeps or with smarter RA.
  let Predicates = [target, OptForSize] in
  def : Pat<(ScalarVT (OpNode (load addr:$src))),
            (!cast<Instruction>(NAME#"m") (ScalarVT (IMPLICIT_DEF)),
                 addr:$src)>;
}
2943
/// sse1_fp_unop_p - Packed-single forms of an SSE1 unary FP operation.
/// Emits VEX-encoded 128-bit and 256-bit register/memory forms guarded by
/// the caller-supplied predicate list `prds`, followed by the non-VEX
/// 128-bit register/memory forms.
multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
                          X86SchedWriteWidths sched, list<Predicate> prds> {
  // VEX forms; the mnemonic gets a "v" prefix and loads use loadvNf32.
  let Predicates = prds in {
    def V#NAME#PSr  : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                          "v"#OpcodeStr#"ps\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
                      VEX, Sched<[sched.XMM]>, WIG;
    def V#NAME#PSm  : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                          "v"#OpcodeStr#"ps\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>,
                      VEX, Sched<[sched.XMM.Folded]>, WIG;
    def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                          "v"#OpcodeStr#"ps\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>,
                      VEX, VEX_L, Sched<[sched.YMM]>, WIG;
    def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                          "v"#OpcodeStr#"ps\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))]>,
                      VEX, VEX_L, Sched<[sched.YMM.Folded]>, WIG;
  }

  // Non-VEX forms; the memory form uses the memopv4f32 fragment.
  def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                OpcodeStr#"ps\t{$src, $dst|$dst, $src}",
                [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
            Sched<[sched.XMM]>;
  def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                OpcodeStr#"ps\t{$src, $dst|$dst, $src}",
                [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>,
            Sched<[sched.XMM.Folded]>;
}
2979
/// sse2_fp_unop_p - Packed-double forms of an SSE2 unary FP operation.
/// Emits VEX-encoded 128-bit and 256-bit register/memory forms guarded by
/// [HasAVX, NoVLX], followed by the non-VEX 128-bit register/memory forms.
multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
                          SDPatternOperator OpNode, X86SchedWriteWidths sched> {
  // VEX forms; the mnemonic gets a "v" prefix and loads use loadvNf64.
  let Predicates = [HasAVX, NoVLX] in {
    def V#NAME#PDr  : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                          "v"#OpcodeStr#"pd\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
                      VEX, Sched<[sched.XMM]>, WIG;
    def V#NAME#PDm  : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                          "v"#OpcodeStr#"pd\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>,
                      VEX, Sched<[sched.XMM.Folded]>, WIG;
    def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                          "v"#OpcodeStr#"pd\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>,
                      VEX, VEX_L, Sched<[sched.YMM]>, WIG;
    def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                          "v"#OpcodeStr#"pd\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))]>,
                      VEX, VEX_L, Sched<[sched.YMM.Folded]>, WIG;
  }

  // Non-VEX forms; the memory form uses the memopv2f64 fragment.
  def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                OpcodeStr#"pd\t{$src, $dst|$dst, $src}",
                [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
            Sched<[sched.XMM]>;
  def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                OpcodeStr#"pd\t{$src, $dst|$dst, $src}",
                [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>,
            Sched<[sched.XMM.Folded]>;
}
3015
/// sse1_fp_unop_s_intr - Scalar single-precision intrinsic forms.
///
/// Instantiates sse_fp_unop_s_intr (as SS, under UseSSE1) and
/// avx_fp_unop_s_intr (as V#NAME#SS, under `AVXTarget`) for the intrinsic
/// named "int_x86_sse_<OpcodeStr>_ss", both with v4f32 / sse_load_f32.
multiclass sse1_fp_unop_s_intr<string OpcodeStr, Predicate AVXTarget> {
  defm SS
      : sse_fp_unop_s_intr<v4f32, sse_load_f32,
                           !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#"_ss"),
                           UseSSE1>,
        XS;
  defm V#NAME#SS
      : avx_fp_unop_s_intr<v4f32, sse_load_f32,
                           !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#"_ss"),
                           AVXTarget>,
        XS, VEX_4V, VEX_LIG, WIG;
}
3025
/// sse1_fp_unop_s - Scalar single-precision ("ss") unary operations.
///
/// Instantiates sse_fp_unop_s as SS (predicate UseSSE1) and avx_fp_unop_s as
/// V#NAME#SS (predicate `AVXTarget`) on FR32 / f32mem / ssmem, both in the
/// SSEPackedSingle domain and scheduled with `sched.Scl`.
multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
                          X86SchedWriteWidths sched, Predicate AVXTarget> {
  defm SS
      : sse_fp_unop_s<opc, OpcodeStr#"ss", FR32, f32mem, ssmem, OpNode,
                      SSEPackedSingle, sched.Scl, UseSSE1>,
        XS;
  defm V#NAME#SS
      : avx_fp_unop_s<opc, "v"#OpcodeStr#"ss", FR32, f32, f32mem, ssmem, OpNode,
                      SSEPackedSingle, sched.Scl, AVXTarget>,
        XS, VEX_4V, VEX_LIG, WIG;
}
3034
/// sse2_fp_unop_s - Scalar double-precision ("sd") unary operations.
///
/// Instantiates sse_fp_unop_s as SD (predicate UseSSE2) and avx_fp_unop_s as
/// V#NAME#SD (predicate `AVXTarget`) on FR64 / f64mem / sdmem, both in the
/// SSEPackedDouble domain and scheduled with `sched.Scl`.
multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
                          X86SchedWriteWidths sched, Predicate AVXTarget> {
  defm SD
      : sse_fp_unop_s<opc, OpcodeStr#"sd", FR64, f64mem, sdmem, OpNode,
                      SSEPackedDouble, sched.Scl, UseSSE2>,
        XD;
  defm V#NAME#SD
      : avx_fp_unop_s<opc, "v"#OpcodeStr#"sd", FR64, f64, f64mem, sdmem, OpNode,
                      SSEPackedDouble, sched.Scl, AVXTarget>,
        XD, VEX_4V, VEX_LIG, WIG;
}
3043
// Square root: scalar and packed, single and double precision (opcode 0x51).
defm SQRT
    : sse1_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, UseAVX>,
      sse1_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>,
      sse2_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64, UseAVX>,
      sse2_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64>,
      SIMD_EXC;

// Reciprocal approximations. These typically need a refinement step before
// the result has suitable precision.
defm RSQRT
    : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
      sse1_fp_unop_s_intr<"rsqrt", HasAVX>,
      sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>;
defm RCP
    : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
      sse1_fp_unop_s_intr<"rcp", HasAVX>,
      sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>;

// There is no f64 version of the reciprocal approximation instructions.
3060
/// scalar_unary_math_patterns - Select the "r_Int" instruction forms for a
/// scalar unary op applied to element 0 of a vector.
///
/// Matches Move($dst, scalar_to_vector(OpNode(extractelt($src, 0)))) and
/// emits <OpcPrefix>r_Int under `BasePredicate`, and V<OpcPrefix>r_Int under
/// UseAVX.
multiclass scalar_unary_math_patterns<SDPatternOperator OpNode, string OpcPrefix, SDNode Move,
                                      ValueType VT, Predicate BasePredicate> {
  let Predicates = [BasePredicate] in
  def : Pat<(VT (Move VT:$dst,
                      (scalar_to_vector (OpNode (extractelt VT:$src, 0))))),
            (!cast<Instruction>(OpcPrefix#"r_Int") VT:$dst, VT:$src)>;

  // Same pattern again, selecting the "V"-prefixed instruction.
  let Predicates = [UseAVX] in
  def : Pat<(VT (Move VT:$dst,
                      (scalar_to_vector (OpNode (extractelt VT:$src, 0))))),
            (!cast<Instruction>("V"#OpcPrefix#"r_Int") VT:$dst, VT:$src)>;
}
3076
// Scalar square-root patterns: single precision (v4f32) and double (v2f64).
defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSS", X86Movss, v4f32,
                                  UseSSE1>;
defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSD", X86Movsd, v2f64,
                                  UseSSE2>;
3079
/// scalar_unary_math_intr_patterns - Select the "r_Int" instruction forms for
/// a scalar intrinsic whose result is merged into $dst by `Move`.
///
/// Matches Move($dst, Intr($src)) and emits <OpcPrefix>r_Int under
/// `BasePredicate`, and V<OpcPrefix>r_Int under HasAVX.
multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix,
                                           SDNode Move, ValueType VT,
                                           Predicate BasePredicate> {
  let Predicates = [BasePredicate] in
  def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
            (!cast<Instruction>(OpcPrefix#"r_Int") VT:$dst, VT:$src)>;

  // Same pattern again, selecting the "V"-prefixed instruction.
  let Predicates = [HasAVX] in
  def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
            (!cast<Instruction>("V"#OpcPrefix#"r_Int") VT:$dst, VT:$src)>;
}
3094
// Intrinsic patterns for the scalar reciprocal approximations.
defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS",
                                       X86Movss, v4f32, UseSSE1>;
defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS",
                                       X86Movss, v4f32, UseSSE1>;

3099
3100
3101//===----------------------------------------------------------------------===//
3102// SSE 1 & 2 - Non-temporal stores
3103//===----------------------------------------------------------------------===//
3104
3105let AddedComplexity = 400 in { // Prefer non-temporal versions
// VEX-encoded aligned non-temporal vector stores.
let Predicates = [HasAVX, NoVLX] in {
  // 128-bit floating-point vectors.
  let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
    def VMOVNTPSmr
        : VPSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
               "movntps\t{$src, $dst|$dst, $src}",
               [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>,
          VEX, WIG;
    def VMOVNTPDmr
        : VPDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
               "movntpd\t{$src, $dst|$dst, $src}",
               [(alignednontemporalstore (v2f64 VR128:$src), addr:$dst)]>,
          VEX, WIG;
  } // SchedRW

  // 256-bit floating-point vectors.
  let SchedRW = [SchedWriteFMoveLSNT.YMM.MR] in {
    def VMOVNTPSYmr
        : VPSI<0x2B, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
               "movntps\t{$src, $dst|$dst, $src}",
               [(alignednontemporalstore (v8f32 VR256:$src), addr:$dst)]>,
          VEX, VEX_L, WIG;
    def VMOVNTPDYmr
        : VPDI<0x2B, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
               "movntpd\t{$src, $dst|$dst, $src}",
               [(alignednontemporalstore (v4f64 VR256:$src), addr:$dst)]>,
          VEX, VEX_L, WIG;
  } // SchedRW

  // Integer vectors (v2i64 / v4i64), in the integer execution domain.
  let ExeDomain = SSEPackedInt in {
    def VMOVNTDQmr
        : VPDI<0xE7, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
               "movntdq\t{$src, $dst|$dst, $src}",
               [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)]>,
          VEX, WIG, Sched<[SchedWriteVecMoveLSNT.XMM.MR]>;
    def VMOVNTDQYmr
        : VPDI<0xE7, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
               "movntdq\t{$src, $dst|$dst, $src}",
               [(alignednontemporalstore (v4i64 VR256:$src), addr:$dst)]>,
          VEX, VEX_L, WIG, Sched<[SchedWriteVecMoveLSNT.YMM.MR]>;
  } // ExeDomain
} // Predicates
3148
// Aligned non-temporal stores of 128-bit floating-point vectors, defined
// without the VEX mixin.
let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
  def MOVNTPSmr
      : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
            "movntps\t{$src, $dst|$dst, $src}",
            [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
  def MOVNTPDmr
      : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
            "movntpd\t{$src, $dst|$dst, $src}",
            [(alignednontemporalstore (v2f64 VR128:$src), addr:$dst)]>;
} // SchedRW
3157
// Aligned non-temporal store of a 128-bit integer vector (v2i64), defined
// without the VEX mixin and placed in the integer execution domain.
// The destination operand is typed i128mem so that it agrees with VMOVNTDQmr
// and with the other integer-vector stores in this file (e.g. MOVDQAmr).
let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLSNT.XMM.MR] in
def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                    "movntdq\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)]>;
3162
// Non-temporal stores from general-purpose registers (GR32 / GR64).
let SchedRW = [WriteStoreNT] in {
  // There is no AVX form for instructions below this point
  def MOVNTImr
      : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
          "movnti{l}\t{$src, $dst|$dst, $src}",
          [(nontemporalstore (i32 GR32:$src), addr:$dst)]>,
        PS, Requires<[HasSSE2]>;
  def MOVNTI_64mr
      : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
           "movnti{q}\t{$src, $dst|$dst, $src}",
           [(nontemporalstore (i64 GR64:$src), addr:$dst)]>,
        PS, Requires<[HasSSE2]>;
} // SchedRW = [WriteStoreNT]
3174
// Route aligned non-temporal stores of the remaining 256-bit and 128-bit
// vector types to the VEX MOVNTDQ instructions.
let Predicates = [HasAVX, NoVLX] in {
  foreach VT = [v8i32, v16i16, v16f16, v32i8] in
    def : Pat<(alignednontemporalstore (VT VR256:$src), addr:$dst),
              (VMOVNTDQYmr addr:$dst, VR256:$src)>;

  foreach VT = [v4i32, v8i16, v8f16, v16i8] in
    def : Pat<(alignednontemporalstore (VT VR128:$src), addr:$dst),
              (VMOVNTDQmr addr:$dst, VR128:$src)>;
}
3194
// Route aligned non-temporal stores of the remaining 128-bit vector types to
// MOVNTDQmr.
let Predicates = [UseSSE2] in {
  foreach VT = [v4i32, v8i16, v8f16, v16i8] in
    def : Pat<(alignednontemporalstore (VT VR128:$src), addr:$dst),
              (MOVNTDQmr addr:$dst, VR128:$src)>;
}
3205
3206} // AddedComplexity
3207
3208//===----------------------------------------------------------------------===//
3209// SSE 1 & 2 - Prefetch and memory fence
3210//===----------------------------------------------------------------------===//
3211
// Prefetch intrinsic.
// All four share opcode 0x18. They differ in the ModRM form (MRM1m, MRM2m,
// MRM3m, MRM0m) and in the third operand of the matched prefetch node
// (3, 2, 1, 0 respectively).
let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in {
  def PREFETCHT0
      : I<0x18, MRM1m, (outs), (ins i8mem:$src), "prefetcht0\t$src",
          [(prefetch addr:$src, imm, (i32 3), (i32 1))]>,
        TB;
  def PREFETCHT1
      : I<0x18, MRM2m, (outs), (ins i8mem:$src), "prefetcht1\t$src",
          [(prefetch addr:$src, imm, (i32 2), (i32 1))]>,
        TB;
  def PREFETCHT2
      : I<0x18, MRM3m, (outs), (ins i8mem:$src), "prefetcht2\t$src",
          [(prefetch addr:$src, imm, (i32 1), (i32 1))]>,
        TB;
  def PREFETCHNTA
      : I<0x18, MRM0m, (outs), (ins i8mem:$src), "prefetchnta\t$src",
          [(prefetch addr:$src, imm, (i32 0), (i32 1))]>,
        TB;
}
3223
// Flush cache.
// FIXME: How should flush instruction be modeled?
let SchedRW = [WriteLoad] in
def CLFLUSH
    : I<0xAE, MRM7m, (outs), (ins i8mem:$src), "clflush\t$src",
        [(int_x86_sse2_clflush addr:$src)]>,
      PS, Requires<[HasCLFLUSH]>;
3231
// Pause. This "instruction" is encoded as "rep; nop", so even though it
// was introduced with SSE2, it's backward compatible.
let SchedRW = [WriteNop] in
def PAUSE
    : I<0x90, RawFrm, (outs), (ins), "pause", [(int_x86_sse2_pause)]>,
      OBXS;
3238
// Load, store, and memory fence
// TODO: As with mfence, we may want to ease the availability of sfence/lfence
// to include any 64-bit target.
let SchedRW = [WriteFence] in {
  def SFENCE
      : I<0xAE, MRM7X, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
        PS, Requires<[HasSSE1]>;
  def LFENCE
      : I<0xAE, MRM5X, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>,
        PS, Requires<[HasSSE2]>;
  def MFENCE
      : I<0xAE, MRM6X, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>,
        PS, Requires<[HasMFence]>;
} // SchedRW

// The X86MFence node is selected to MFENCE as well.
def : Pat<(X86MFence), (MFENCE)>;
3252
3253//===----------------------------------------------------------------------===//
3254// SSE 1 & 2 - Load/Store XCSR register
3255//===----------------------------------------------------------------------===//
3256
// MXCSR load/store. The load forms are marked as defining MXCSR and the
// store forms as using it; all four are flagged as having side effects.
let hasSideEffects = 1 in {
  // VEX forms.
  let mayLoad = 1, Defs = [MXCSR] in
  def VLDMXCSR
      : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src), "ldmxcsr\t$src",
             [(int_x86_sse_ldmxcsr addr:$src)]>,
        VEX, Sched<[WriteLDMXCSR]>, WIG;
  let mayStore = 1, Uses = [MXCSR] in
  def VSTMXCSR
      : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), "stmxcsr\t$dst",
             [(int_x86_sse_stmxcsr addr:$dst)]>,
        VEX, Sched<[WriteSTMXCSR]>, WIG;

  // Forms without the VEX mixin.
  let mayLoad = 1, Defs = [MXCSR] in
  def LDMXCSR
      : I<0xAE, MRM2m, (outs), (ins i32mem:$src), "ldmxcsr\t$src",
          [(int_x86_sse_ldmxcsr addr:$src)]>,
        PS, Sched<[WriteLDMXCSR]>;
  let mayStore = 1, Uses = [MXCSR] in
  def STMXCSR
      : I<0xAE, MRM3m, (outs), (ins i32mem:$dst), "stmxcsr\t$dst",
          [(int_x86_sse_stmxcsr addr:$dst)]>,
        PS, Sched<[WriteSTMXCSR]>;
}
3274
3275//===---------------------------------------------------------------------===//
3276// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
3277//===---------------------------------------------------------------------===//
3278
3279let ExeDomain = SSEPackedInt in { // SSE integer instructions
3280
// VEX register-to-register integer vector moves (opcode 0x6F, MRMSrcReg).
// These carry no selection patterns.
let hasSideEffects = 0 in {
  def VMOVDQArr
      : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
             "movdqa\t{$src, $dst|$dst, $src}", []>,
        Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, WIG;
  def VMOVDQUrr
      : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
             "movdqu\t{$src, $dst|$dst, $src}", []>,
        Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, WIG;
  def VMOVDQAYrr
      : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
             "movdqa\t{$src, $dst|$dst, $src}", []>,
        Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, WIG;
  def VMOVDQUYrr
      : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
             "movdqu\t{$src, $dst|$dst, $src}", []>,
        Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, WIG;
}
3295
// For Disassembler: the same register moves using opcode 0x7F with the
// MRMDestReg form. Marked isCodeGenOnly and ForceDisassemble.
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
  def VMOVDQArr_REV
      : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
             "movdqa\t{$src, $dst|$dst, $src}", []>,
        Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, WIG;
  def VMOVDQAYrr_REV
      : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
             "movdqa\t{$src, $dst|$dst, $src}", []>,
        Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, WIG;
  def VMOVDQUrr_REV
      : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
             "movdqu\t{$src, $dst|$dst, $src}", []>,
        Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, WIG;
  def VMOVDQUYrr_REV
      : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
             "movdqu\t{$src, $dst|$dst, $src}", []>,
        Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, WIG;
}
3315
// VEX integer vector loads. Only the 128-bit forms carry a pattern here
// (alignedloadv2i64 / loadv2i64); the 256-bit forms have an empty pattern
// list.
let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
    hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
  def VMOVDQArm
      : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
             "movdqa\t{$src, $dst|$dst, $src}",
             [(set VR128:$dst, (alignedloadv2i64 addr:$src))]>,
        Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, WIG;
  def VMOVDQAYrm
      : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
             "movdqa\t{$src, $dst|$dst, $src}", []>,
        Sched<[SchedWriteVecMoveLS.YMM.RM]>, VEX, VEX_L, WIG;
  def VMOVDQUrm
      : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
          "vmovdqu\t{$src, $dst|$dst, $src}",
          [(set VR128:$dst, (loadv2i64 addr:$src))]>,
        Sched<[SchedWriteVecMoveLS.XMM.RM]>, XS, VEX, WIG;
  def VMOVDQUYrm
      : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
          "vmovdqu\t{$src, $dst|$dst, $src}", []>,
        Sched<[SchedWriteVecMoveLS.YMM.RM]>, XS, VEX, VEX_L, WIG;
}
3336
// VEX integer vector stores. Only the 128-bit forms carry a pattern here
// (alignedstore / store of v2i64); the 256-bit forms have an empty pattern
// list.
let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
  def VMOVDQAmr
      : VPDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
             "movdqa\t{$src, $dst|$dst, $src}",
             [(alignedstore (v2i64 VR128:$src), addr:$dst)]>,
        Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, WIG;
  def VMOVDQAYmr
      : VPDI<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
             "movdqa\t{$src, $dst|$dst, $src}", []>,
        Sched<[SchedWriteVecMoveLS.YMM.MR]>, VEX, VEX_L, WIG;
  def VMOVDQUmr
      : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
          "vmovdqu\t{$src, $dst|$dst, $src}",
          [(store (v2i64 VR128:$src), addr:$dst)]>,
        Sched<[SchedWriteVecMoveLS.XMM.MR]>, XS, VEX, WIG;
  def VMOVDQUYmr
      : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
          "vmovdqu\t{$src, $dst|$dst, $src}", []>,
        Sched<[SchedWriteVecMoveLS.YMM.MR]>, XS, VEX, VEX_L, WIG;
}
3355
// Register-to-register integer vector moves defined without the VEX mixin.
// All four share the same scheduling class and have no side effects.
let SchedRW = [SchedWriteVecMoveLS.XMM.RR], hasSideEffects = 0 in {
  def MOVDQArr
      : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
            "movdqa\t{$src, $dst|$dst, $src}", []>;
  def MOVDQUrr
      : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
          "movdqu\t{$src, $dst|$dst, $src}", []>,
        XS, Requires<[UseSSE2]>;

  // For Disassembler
  let isCodeGenOnly = 1, ForceDisassemble = 1 in {
    def MOVDQArr_REV
        : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
              "movdqa\t{$src, $dst|$dst, $src}", []>;
    def MOVDQUrr_REV
        : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
            "movdqu\t{$src, $dst|$dst, $src}", []>,
          XS, Requires<[UseSSE2]>;
  }
} // SchedRW
3376
// Integer vector loads defined without the VEX mixin. Their selection
// patterns are commented out, so both pattern lists are effectively empty.
let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
    hasSideEffects = 0, SchedRW = [SchedWriteVecMoveLS.XMM.RM] in {
  def MOVDQArm
      : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
            "movdqa\t{$src, $dst|$dst, $src}",
            [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>;
  def MOVDQUrm
      : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
          "movdqu\t{$src, $dst|$dst, $src}",
          [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>,
        XS, Requires<[UseSSE2]>;
}
3387
// Integer vector stores defined without the VEX mixin. Their selection
// patterns are commented out, so both pattern lists are effectively empty.
let mayStore = 1, hasSideEffects = 0,
    SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
  def MOVDQAmr
      : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
            "movdqa\t{$src, $dst|$dst, $src}",
            [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>;
  def MOVDQUmr
      : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
          "movdqu\t{$src, $dst|$dst, $src}",
          [/*(store (v2i64 VR128:$src), addr:$dst)*/]>,
        XS, Requires<[UseSSE2]>;
}
3398
3399} // ExeDomain = SSEPackedInt
3400
// Reversed version with ".s" suffix for GAS compatibility.
// Each alias maps onto the matching VEX *_REV instruction.
def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
                (VMOVDQArr_REV  VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
                (VMOVDQAYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
                (VMOVDQUrr_REV  VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
                (VMOVDQUYrr_REV VR256:$dst, VR256:$src), 0>;
3410
// Reversed version with ".s" suffix for GAS compatibility.
// Each alias maps onto the matching *_REV instruction.
def : InstAlias<"movdqa.s\t{$src, $dst|$dst, $src}",
                (MOVDQArr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}",
                (MOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
3416
// Additional patterns for other integer sizes: route the remaining 128-bit
// vector loads and stores to the VEX MOVDQA / MOVDQU instructions.
let Predicates = [HasAVX, NoVLX] in {
  // Aligned loads.
  foreach Ld = [alignedloadv4i32, alignedloadv8i16,
                alignedloadv8f16, alignedloadv16i8] in
    def : Pat<(Ld addr:$src), (VMOVDQArm addr:$src)>;

  // Loads matched by the plain load fragments.
  foreach Ld = [loadv4i32, loadv8i16, loadv8f16, loadv16i8] in
    def : Pat<(Ld addr:$src), (VMOVDQUrm addr:$src)>;

  // Aligned stores.
  foreach VT = [v4i32, v8i16, v8f16, v16i8] in
    def : Pat<(alignedstore (VT VR128:$src), addr:$dst),
              (VMOVDQAmr addr:$dst, VR128:$src)>;

  // Stores matched by the plain store node.
  foreach VT = [v4i32, v8i16, v8f16, v16i8] in
    def : Pat<(store (VT VR128:$src), addr:$dst),
              (VMOVDQUmr addr:$dst, VR128:$src)>;
}
3453
3454//===---------------------------------------------------------------------===//
3455// SSE2 - Packed Integer Arithmetic Instructions
3456//===---------------------------------------------------------------------===//
3457
let ExeDomain = SSEPackedInt in { // SSE integer instructions

/// PDI_binop_rm2 - Simple SSE2 binary operator whose source type (SrcVT)
/// differs from its result type (DstVT).
///
/// Emits a commutable register-register form (rr) and a register-memory form
/// (rm) whose second operand is loaded through `memop_frag`. When `Is2Addr`
/// is set the assembly string names two operands, otherwise three.
multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType DstVT, ValueType SrcVT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr
      : PDI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
            OpcodeStr#!if(Is2Addr,
                          "\t{$src2, $dst|$dst, $src2}",
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
        Sched<[sched]>;

  def rm
      : PDI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
            OpcodeStr#!if(Is2Addr,
                          "\t{$src2, $dst|$dst, $src2}",
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
                                          (memop_frag addr:$src2))))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
}
} // ExeDomain = SSEPackedInt
3483
// Packed integer arithmetic, one PDI_binop_all instantiation per row.
// Columns: opcode, mnemonic, node, 128-bit type, 256-bit type, scheduling
// class, a 0/1 flag, predicate. The flag is 1 for the add/mul/min/max/avg
// rows and 0 for the subtract rows -- presumably "is commutable"; confirm
// against the PDI_binop_all definition.

// Additions (wrapping, signed-saturating, unsigned-saturating).
defm PADDB   : PDI_binop_all<0xFC, "paddb",   add,        v16i8, v32i8,  SchedWriteVecALU,  1, NoVLX_Or_NoBWI>;
defm PADDW   : PDI_binop_all<0xFD, "paddw",   add,        v8i16, v16i16, SchedWriteVecALU,  1, NoVLX_Or_NoBWI>;
defm PADDD   : PDI_binop_all<0xFE, "paddd",   add,        v4i32, v8i32,  SchedWriteVecALU,  1, NoVLX>;
defm PADDQ   : PDI_binop_all<0xD4, "paddq",   add,        v2i64, v4i64,  SchedWriteVecALU,  1, NoVLX>;
defm PADDSB  : PDI_binop_all<0xEC, "paddsb",  saddsat,    v16i8, v32i8,  SchedWriteVecALU,  1, NoVLX_Or_NoBWI>;
defm PADDSW  : PDI_binop_all<0xED, "paddsw",  saddsat,    v8i16, v16i16, SchedWriteVecALU,  1, NoVLX_Or_NoBWI>;
defm PADDUSB : PDI_binop_all<0xDC, "paddusb", uaddsat,    v16i8, v32i8,  SchedWriteVecALU,  1, NoVLX_Or_NoBWI>;
defm PADDUSW : PDI_binop_all<0xDD, "paddusw", uaddsat,    v8i16, v16i16, SchedWriteVecALU,  1, NoVLX_Or_NoBWI>;

// 16-bit multiplies (mul, mulhu, mulhs).
defm PMULLW  : PDI_binop_all<0xD5, "pmullw",  mul,        v8i16, v16i16, SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu,      v8i16, v16i16, SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
defm PMULHW  : PDI_binop_all<0xE5, "pmulhw",  mulhs,      v8i16, v16i16, SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;

// Subtractions (wrapping, signed-saturating, unsigned-saturating).
defm PSUBB   : PDI_binop_all<0xF8, "psubb",   sub,        v16i8, v32i8,  SchedWriteVecALU,  0, NoVLX_Or_NoBWI>;
defm PSUBW   : PDI_binop_all<0xF9, "psubw",   sub,        v8i16, v16i16, SchedWriteVecALU,  0, NoVLX_Or_NoBWI>;
defm PSUBD   : PDI_binop_all<0xFA, "psubd",   sub,        v4i32, v8i32,  SchedWriteVecALU,  0, NoVLX>;
defm PSUBQ   : PDI_binop_all<0xFB, "psubq",   sub,        v2i64, v4i64,  SchedWriteVecALU,  0, NoVLX>;
defm PSUBSB  : PDI_binop_all<0xE8, "psubsb",  ssubsat,    v16i8, v32i8,  SchedWriteVecALU,  0, NoVLX_Or_NoBWI>;
defm PSUBSW  : PDI_binop_all<0xE9, "psubsw",  ssubsat,    v8i16, v16i16, SchedWriteVecALU,  0, NoVLX_Or_NoBWI>;
defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", usubsat,    v16i8, v32i8,  SchedWriteVecALU,  0, NoVLX_Or_NoBWI>;
defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", usubsat,    v8i16, v16i16, SchedWriteVecALU,  0, NoVLX_Or_NoBWI>;

// Minimum / maximum.
defm PMINUB  : PDI_binop_all<0xDA, "pminub",  umin,       v16i8, v32i8,  SchedWriteVecALU,  1, NoVLX_Or_NoBWI>;
defm PMINSW  : PDI_binop_all<0xEA, "pminsw",  smin,       v8i16, v16i16, SchedWriteVecALU,  1, NoVLX_Or_NoBWI>;
defm PMAXUB  : PDI_binop_all<0xDE, "pmaxub",  umax,       v16i8, v32i8,  SchedWriteVecALU,  1, NoVLX_Or_NoBWI>;
defm PMAXSW  : PDI_binop_all<0xEE, "pmaxsw",  smax,       v8i16, v16i16, SchedWriteVecALU,  1, NoVLX_Or_NoBWI>;

// Averages (avgceilu).
defm PAVGB   : PDI_binop_all<0xE0, "pavgb",   avgceilu,   v16i8, v32i8,  SchedWriteVecALU,  1, NoVLX_Or_NoBWI>;
defm PAVGW   : PDI_binop_all<0xE3, "pavgw",   avgceilu,   v8i16, v16i16, SchedWriteVecALU,  1, NoVLX_Or_NoBWI>;

// X86pmuludq on 64-bit element vectors.
defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64,  SchedWriteVecIMul, 1, NoVLX>;
3536
// PMADDWD (opcode 0xF5): X86vpmaddwd from 16-bit element vectors to 32-bit
// element vectors. The VEX forms pass Is2Addr = 0; the last form ties $src1
// to $dst and keeps the default Is2Addr.
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm VPMADDWD
    : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128, load,
                    i128mem, SchedWriteVecIMul.XMM, 0>,
      VEX_4V, WIG;

let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm VPMADDWDY
    : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16, VR256, load,
                    i256mem, SchedWriteVecIMul.YMM, 0>,
      VEX_4V, VEX_L, WIG;

let Constraints = "$src1 = $dst" in
defm PMADDWD
    : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128, memop,
                    i128mem, SchedWriteVecIMul.XMM>;

// PSADBW (opcode 0xF6): X86psadbw from 8-bit element vectors to 64-bit
// element vectors, with the same three-way split as above.
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm VPSADBW
    : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128, load,
                    i128mem, SchedWritePSADBW.XMM, 0>,
      VEX_4V, WIG;

let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm VPSADBWY
    : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256, load,
                    i256mem, SchedWritePSADBW.YMM, 0>,
      VEX_4V, VEX_L, WIG;

let Constraints = "$src1 = $dst" in
defm PSADBW
    : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128, memop,
                    i128mem, SchedWritePSADBW.XMM>;
3561
3562//===---------------------------------------------------------------------===//
3563// SSE2 - Packed Integer Logical Instructions
3564//===---------------------------------------------------------------------===//
3565
/// PDI_binop_rmi - Binary operator with register (rr), memory (rm) and 8-bit
/// immediate (ri) forms for its second operand.
///
/// rr and rm use opcode `opc` and node `OpNode`; ri uses opcode `opc2`,
/// format `ImmForm` and node `OpNode2`. When `Is2Addr` is set the assembly
/// string names two operands, otherwise three.
multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
                         string OpcodeStr, SDNode OpNode,
                         SDNode OpNode2, RegisterClass RC,
                         X86FoldableSchedWrite sched,
                         X86FoldableSchedWrite schedImm,
                         ValueType DstVT, ValueType SrcVT,
                         PatFrag ld_frag, bit Is2Addr = 1> {
  // src2 is always 128-bit
  def rr
      : PDI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, VR128:$src2),
            OpcodeStr#!if(Is2Addr,
                          "\t{$src2, $dst|$dst, $src2}",
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))]>,
        Sched<[sched]>;

  def rm
      : PDI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, i128mem:$src2),
            OpcodeStr#!if(Is2Addr,
                          "\t{$src2, $dst|$dst, $src2}",
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(set RC:$dst, (DstVT (OpNode RC:$src1,
                                          (SrcVT (ld_frag addr:$src2)))))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;

  // Immediate form: the second operand is matched as an i8 timm.
  def ri
      : PDIi8<opc2, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
              OpcodeStr#!if(Is2Addr,
                            "\t{$src2, $dst|$dst, $src2}",
                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 timm:$src2))))]>,
        Sched<[schedImm]>;
}
3597
/// PDI_binop_rmi_all - Instantiate PDI_binop_rmi three times:
///   V<NAME>   - 128-bit form under [HasAVX, prd], "v"-prefixed mnemonic,
///   V<NAME>Y  - 256-bit form under [HasAVX2, prd], "v"-prefixed mnemonic,
///   <NAME>    - 128-bit form with $src1 tied to $dst.
multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
                             string OpcodeStr, SDNode OpNode,
                             SDNode OpNode2, ValueType DstVT128,
                             ValueType DstVT256, ValueType SrcVT,
                             X86SchedWriteWidths sched,
                             X86SchedWriteWidths schedImm, Predicate prd> {
  // Mnemonic shared by the two "V"-named variants.
  defvar VexOpcodeStr = !strconcat("v", OpcodeStr);

  // Three-operand forms (Is2Addr = 0) that fold memory through plain `load`.
  let Predicates = [HasAVX, prd] in
  defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, VexOpcodeStr,
                              OpNode, OpNode2, VR128,
                              sched.XMM, schedImm.XMM,
                              DstVT128, SrcVT, load, 0>,
                VEX_4V, WIG;

  let Predicates = [HasAVX2, prd] in
  defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, VexOpcodeStr,
                                OpNode, OpNode2, VR256,
                                sched.YMM, schedImm.YMM,
                                DstVT256, SrcVT, load, 0>,
                  VEX_4V, VEX_L, WIG;

  // Two-operand form (default Is2Addr) that folds memory through `memop`.
  let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr,
                            OpNode, OpNode2, VR128,
                            sched.XMM, schedImm.XMM,
                            DstVT128, SrcVT, memop>;
}
3618
/// PDI_binop_ri - Packed-integer operation that only has a register/imm8
/// form. The result has type VT and is selected from
/// (OpNode RC:$src1, imm8).
multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
                        SDNode OpNode, RegisterClass RC, ValueType VT,
                        X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  def ri : PDIi8<opc, ImmForm,
                 (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
                 // Two-operand text when Is2Addr is set, three-operand
                 // text otherwise.
                 !strconcat(OpcodeStr,
                            !if(Is2Addr,
                                "\t{$src2, $dst|$dst, $src2}",
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set RC:$dst,
                   (VT (OpNode RC:$src1, (i8 timm:$src2))))]>,
           Sched<[sched]>;
}
3629
/// PDI_binop_ri_all - Instantiate PDI_binop_ri for byte vectors three times:
///   V<NAME>   - v16i8 on VR128 under [HasAVX, NoVLX_Or_NoBWI],
///   V<NAME>Y  - v32i8 on VR256 under [HasAVX2, NoVLX_Or_NoBWI],
///   <NAME>    - v16i8 on VR128 with $src1 tied to $dst.
multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr,
                            SDNode OpNode, X86SchedWriteWidths sched> {
  // Mnemonic shared by the two "V"-named variants.
  defvar VexOpcodeStr = !strconcat("v", OpcodeStr);

  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
  defm V#NAME : PDI_binop_ri<opc, ImmForm, VexOpcodeStr, OpNode,
                             VR128, v16i8, sched.XMM, 0>,
                VEX_4V, WIG;

  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
  defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, VexOpcodeStr, OpNode,
                               VR256, v32i8, sched.YMM, 0>,
                  VEX_4V, VEX_L, WIG;

  let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode,
                           VR128, v16i8, sched.XMM>;
}
3643
let ExeDomain = SSEPackedInt in {
  // Left shifts by a vector count (X86vshl) or an immediate (X86vshli).
  // The immediate forms share opcodes 0x71/0x72/0x73 and use MRM6r.
  defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw",
                                 X86vshl, X86vshli,
                                 v8i16, v16i16, v8i16,
                                 SchedWriteVecShift, SchedWriteVecShiftImm,
                                 NoVLX_Or_NoBWI>;
  defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld",
                                 X86vshl, X86vshli,
                                 v4i32, v8i32, v4i32,
                                 SchedWriteVecShift, SchedWriteVecShiftImm,
                                 NoVLX>;
  defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq",
                                 X86vshl, X86vshli,
                                 v2i64, v4i64, v2i64,
                                 SchedWriteVecShift, SchedWriteVecShiftImm,
                                 NoVLX>;

  // Right shifts selected from X86vsrl / X86vsrli; immediate forms use MRM2r.
  defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw",
                                 X86vsrl, X86vsrli,
                                 v8i16, v16i16, v8i16,
                                 SchedWriteVecShift, SchedWriteVecShiftImm,
                                 NoVLX_Or_NoBWI>;
  defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld",
                                 X86vsrl, X86vsrli,
                                 v4i32, v8i32, v4i32,
                                 SchedWriteVecShift, SchedWriteVecShiftImm,
                                 NoVLX>;
  defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq",
                                 X86vsrl, X86vsrli,
                                 v2i64, v4i64, v2i64,
                                 SchedWriteVecShift, SchedWriteVecShiftImm,
                                 NoVLX>;

  // Right shifts selected from X86vsra / X86vsrai; immediate forms use MRM4r.
  // Only word and doubleword element sizes are defined here.
  defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw",
                                 X86vsra, X86vsrai,
                                 v8i16, v16i16, v8i16,
                                 SchedWriteVecShift, SchedWriteVecShiftImm,
                                 NoVLX_Or_NoBWI>;
  defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad",
                                 X86vsra, X86vsrai,
                                 v4i32, v8i32, v4i32,
                                 SchedWriteVecShift, SchedWriteVecShiftImm,
                                 NoVLX>;

  // Immediate-only whole-register shifts; both use opcode 0x73 and differ
  // only in the ModRM form (MRM7r vs. MRM3r).
  defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq,
                                 SchedWriteShuffle>;
  defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq,
                                 SchedWriteShuffle>;
} // ExeDomain = SSEPackedInt
3677
3678//===---------------------------------------------------------------------===//
3679// SSE2 - Packed Integer Comparison Instructions
3680//===---------------------------------------------------------------------===//
3681
// Equality compares (X86pcmpeq) for byte / word / doubleword elements.
// NOTE(review): the `1` / `0` argument is presumably PDI_binop_all's
// commutable flag (set for pcmpeq, clear for pcmpgt) - confirm against the
// multiclass definition.
defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq,
                             v16i8, v32i8,
                             SchedWriteVecALU, 1, TruePredicate>;
defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq,
                             v8i16, v16i16,
                             SchedWriteVecALU, 1, TruePredicate>;
defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq,
                             v4i32, v8i32,
                             SchedWriteVecALU, 1, TruePredicate>;

// Greater-than compares (X86pcmpgt) for byte / word / doubleword elements.
defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt,
                             v16i8, v32i8,
                             SchedWriteVecALU, 0, TruePredicate>;
defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt,
                             v8i16, v16i16,
                             SchedWriteVecALU, 0, TruePredicate>;
defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt,
                             v4i32, v8i32,
                             SchedWriteVecALU, 0, TruePredicate>;
3694
3695//===---------------------------------------------------------------------===//
3696// SSE2 - Packed Integer Shuffle Instructions
3697//===---------------------------------------------------------------------===//
3698
let ExeDomain = SSEPackedInt in {
/// sse2_pshuffle - Shuffle of a single source controlled by an 8-bit
/// immediate (opcode 0x70). Defines register (ri) and memory (mi) source
/// forms for three variants:
///   V<NAME>ri/mi   - 128-bit, under [HasAVX, prd],
///   V<NAME>Yri/Ymi - 256-bit, under [HasAVX2, prd],
///   ri/mi          - 128-bit, under [UseSSE2].
multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
                         SDNode OpNode, X86SchedWriteWidths sched,
                         Predicate prd> {
  // Every form prints the same three operands; only the mnemonic changes.
  defvar OperandText = "\t{$src2, $src1, $dst|$dst, $src1, $src2}";
  defvar VexAsmText  = !strconcat("v", OpcodeStr, OperandText);
  defvar SseAsmText  = !strconcat(OpcodeStr, OperandText);

  let Predicates = [HasAVX, prd] in {
    def V#NAME#ri : Ii8<0x70, MRMSrcReg,
                        (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
                        VexAsmText,
                        [(set VR128:$dst,
                          (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
                    VEX, Sched<[sched.XMM]>, WIG;
    def V#NAME#mi : Ii8<0x70, MRMSrcMem,
                        (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
                        VexAsmText,
                        [(set VR128:$dst,
                          (vt128 (OpNode (load addr:$src1),
                                         (i8 timm:$src2))))]>,
                    VEX, Sched<[sched.XMM.Folded]>, WIG;
  }

  let Predicates = [HasAVX2, prd] in {
    def V#NAME#Yri : Ii8<0x70, MRMSrcReg,
                         (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2),
                         VexAsmText,
                         [(set VR256:$dst,
                           (vt256 (OpNode VR256:$src1, (i8 timm:$src2))))]>,
                     VEX, VEX_L, Sched<[sched.YMM]>, WIG;
    def V#NAME#Ymi : Ii8<0x70, MRMSrcMem,
                         (outs VR256:$dst), (ins i256mem:$src1, u8imm:$src2),
                         VexAsmText,
                         [(set VR256:$dst,
                           (vt256 (OpNode (load addr:$src1),
                                          (i8 timm:$src2))))]>,
                     VEX, VEX_L, Sched<[sched.YMM.Folded]>, WIG;
  }

  // The SSE2 memory form folds through `memop` rather than plain `load`.
  let Predicates = [UseSSE2] in {
    def ri : Ii8<0x70, MRMSrcReg,
                 (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
                 SseAsmText,
                 [(set VR128:$dst,
                   (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
             Sched<[sched.XMM]>;
    def mi : Ii8<0x70, MRMSrcMem,
                 (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
                 SseAsmText,
                 [(set VR128:$dst,
                   (vt128 (OpNode (memop addr:$src1),
                                  (i8 timm:$src2))))]>,
             Sched<[sched.XMM.Folded]>;
  }
}
} // ExeDomain = SSEPackedInt
3758
// The three shuffles share opcode 0x70 (see sse2_pshuffle) and are told
// apart by the prefix class mixed in here: PD, XS or XD.
defm PSHUFD  : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd,
                             SchedWriteShuffle, NoVLX>,
               PD;
defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw,
                             SchedWriteShuffle, NoVLX_Or_NoBWI>,
               XS;
defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw,
                             SchedWriteShuffle, NoVLX_Or_NoBWI>,
               XD;
3765
3766//===---------------------------------------------------------------------===//
3767// Packed Integer Pack Instructions (SSE & AVX)
3768//===---------------------------------------------------------------------===//
3769
let ExeDomain = SSEPackedInt in {
/// sse2_pack - Pack instruction built on the PDI class. Both sources have
/// element type ArgVT and the result has type OutVT. Defines a
/// register/register (rr) and a register/memory (rm) form.
multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                     ValueType ArgVT, SDNode OpNode, RegisterClass RC,
                     X86MemOperand x86memop, X86FoldableSchedWrite sched,
                     PatFrag ld_frag, bit Is2Addr = 1> {
  // Two-operand text when Is2Addr is set, three-operand text otherwise.
  defvar AsmText =
    !strconcat(OpcodeStr,
               !if(Is2Addr,
                   "\t{$src2, $dst|$dst, $src2}",
                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"));

  def rr : PDI<opc, MRMSrcReg,
               (outs RC:$dst), (ins RC:$src1, RC:$src2),
               AsmText,
               [(set RC:$dst,
                 (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
           Sched<[sched]>;
  def rm : PDI<opc, MRMSrcMem,
               (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
               AsmText,
               [(set RC:$dst,
                 (OutVT (OpNode (ArgVT RC:$src1), (ld_frag addr:$src2))))]>,
           Sched<[sched.Folded, sched.ReadAfterFold]>;
}

/// sse4_pack - Same shape as sse2_pack, but built on the SS48I class.
multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                     ValueType ArgVT, SDNode OpNode, RegisterClass RC,
                     X86MemOperand x86memop, X86FoldableSchedWrite sched,
                     PatFrag ld_frag, bit Is2Addr = 1> {
  // Two-operand text when Is2Addr is set, three-operand text otherwise.
  defvar AsmText =
    !strconcat(OpcodeStr,
               !if(Is2Addr,
                   "\t{$src2, $dst|$dst, $src2}",
                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"));

  def rr : SS48I<opc, MRMSrcReg,
                 (outs RC:$dst), (ins RC:$src1, RC:$src2),
                 AsmText,
                 [(set RC:$dst,
                   (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
           Sched<[sched]>;
  def rm : SS48I<opc, MRMSrcMem,
                 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                 AsmText,
                 [(set RC:$dst,
                   (OutVT (OpNode (ArgVT RC:$src1), (ld_frag addr:$src2))))]>,
           Sched<[sched.Folded, sched.ReadAfterFold]>;
}

// 128-bit three-operand forms.
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss,
                             VR128, i128mem, SchedWriteShuffle.XMM, load, 0>,
                   VEX_4V, WIG;
  defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss,
                             VR128, i128mem, SchedWriteShuffle.XMM, load, 0>,
                   VEX_4V, WIG;

  defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus,
                             VR128, i128mem, SchedWriteShuffle.XMM, load, 0>,
                   VEX_4V, WIG;
  defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus,
                             VR128, i128mem, SchedWriteShuffle.XMM, load, 0>,
                   VEX_4V, WIG;
}

// 256-bit three-operand forms.
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss,
                              VR256, i256mem, SchedWriteShuffle.YMM, load, 0>,
                    VEX_4V, VEX_L, WIG;
  defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss,
                              VR256, i256mem, SchedWriteShuffle.YMM, load, 0>,
                    VEX_4V, VEX_L, WIG;

  defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus,
                              VR256, i256mem, SchedWriteShuffle.YMM, load, 0>,
                    VEX_4V, VEX_L, WIG;
  defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus,
                              VR256, i256mem, SchedWriteShuffle.YMM, load, 0>,
                    VEX_4V, VEX_L, WIG;
}

// Two-operand forms: $src1 is tied to $dst and memory is folded via memop.
let Constraints = "$src1 = $dst" in {
  defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss,
                            VR128, i128mem, SchedWriteShuffle.XMM, memop>;
  defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss,
                            VR128, i128mem, SchedWriteShuffle.XMM, memop>;

  defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus,
                            VR128, i128mem, SchedWriteShuffle.XMM, memop>;

  defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus,
                            VR128, i128mem, SchedWriteShuffle.XMM, memop>;
}
} // ExeDomain = SSEPackedInt
3866
3867//===---------------------------------------------------------------------===//
3868// SSE2 - Packed Integer Unpack Instructions
3869//===---------------------------------------------------------------------===//
3870
let ExeDomain = SSEPackedInt in {
/// sse2_unpack - Two-source unpack on vectors of type vt, selected from
/// OpNode. Defines a register/register (rr) and a register/memory (rm) form;
/// the memory operand is read through ld_frag.
multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
                       SDNode OpNode, RegisterClass RC, X86MemOperand x86memop,
                       X86FoldableSchedWrite sched, PatFrag ld_frag,
                       bit Is2Addr = 1> {
  // Two-operand text when Is2Addr is set, three-operand text otherwise.
  defvar AsmText =
    !strconcat(OpcodeStr,
               !if(Is2Addr,
                   "\t{$src2, $dst|$dst, $src2}",
                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"));

  def rr : PDI<opc, MRMSrcReg,
               (outs RC:$dst), (ins RC:$src1, RC:$src2),
               AsmText,
               [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
           Sched<[sched]>;
  def rm : PDI<opc, MRMSrcMem,
               (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
               AsmText,
               [(set RC:$dst,
                 (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
           Sched<[sched.Folded, sched.ReadAfterFold]>;
}

// 128-bit three-operand forms, byte and word elements.
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPUNPCKLBW  : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl,
                                 VR128, i128mem, SchedWriteShuffle.XMM,
                                 load, 0>,
                     VEX_4V, WIG;
  defm VPUNPCKLWD  : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl,
                                 VR128, i128mem, SchedWriteShuffle.XMM,
                                 load, 0>,
                     VEX_4V, WIG;
  defm VPUNPCKHBW  : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh,
                                 VR128, i128mem, SchedWriteShuffle.XMM,
                                 load, 0>,
                     VEX_4V, WIG;
  defm VPUNPCKHWD  : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh,
                                 VR128, i128mem, SchedWriteShuffle.XMM,
                                 load, 0>,
                     VEX_4V, WIG;
}

// 128-bit three-operand forms, doubleword and quadword elements.
let Predicates = [HasAVX, NoVLX] in {
  defm VPUNPCKLDQ  : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl,
                                 VR128, i128mem, SchedWriteShuffle.XMM,
                                 load, 0>,
                     VEX_4V, WIG;
  defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl,
                                 VR128, i128mem, SchedWriteShuffle.XMM,
                                 load, 0>,
                     VEX_4V, WIG;
  defm VPUNPCKHDQ  : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh,
                                 VR128, i128mem, SchedWriteShuffle.XMM,
                                 load, 0>,
                     VEX_4V, WIG;
  defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh,
                                 VR128, i128mem, SchedWriteShuffle.XMM,
                                 load, 0>,
                     VEX_4V, WIG;
}

// 256-bit three-operand forms, byte and word elements.
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPUNPCKLBWY  : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl,
                                  VR256, i256mem, SchedWriteShuffle.YMM,
                                  load, 0>,
                      VEX_4V, VEX_L, WIG;
  defm VPUNPCKLWDY  : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl,
                                  VR256, i256mem, SchedWriteShuffle.YMM,
                                  load, 0>,
                      VEX_4V, VEX_L, WIG;
  defm VPUNPCKHBWY  : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh,
                                  VR256, i256mem, SchedWriteShuffle.YMM,
                                  load, 0>,
                      VEX_4V, VEX_L, WIG;
  defm VPUNPCKHWDY  : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh,
                                  VR256, i256mem, SchedWriteShuffle.YMM,
                                  load, 0>,
                      VEX_4V, VEX_L, WIG;
}

// 256-bit three-operand forms, doubleword and quadword elements.
let Predicates = [HasAVX2, NoVLX] in {
  defm VPUNPCKLDQY  : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl,
                                  VR256, i256mem, SchedWriteShuffle.YMM,
                                  load, 0>,
                      VEX_4V, VEX_L, WIG;
  defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl,
                                  VR256, i256mem, SchedWriteShuffle.YMM,
                                  load, 0>,
                      VEX_4V, VEX_L, WIG;
  defm VPUNPCKHDQY  : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh,
                                  VR256, i256mem, SchedWriteShuffle.YMM,
                                  load, 0>,
                      VEX_4V, VEX_L, WIG;
  defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh,
                                  VR256, i256mem, SchedWriteShuffle.YMM,
                                  load, 0>,
                      VEX_4V, VEX_L, WIG;
}

// Two-operand forms: $src1 is tied to $dst and memory is folded via memop.
let Constraints = "$src1 = $dst" in {
  defm PUNPCKLBW  : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl,
                                VR128, i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKLWD  : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl,
                                VR128, i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKLDQ  : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl,
                                VR128, i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl,
                                VR128, i128mem, SchedWriteShuffle.XMM, memop>;

  defm PUNPCKHBW  : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh,
                                VR128, i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKHWD  : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh,
                                VR128, i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKHDQ  : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh,
                                VR128, i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh,
                                VR128, i128mem, SchedWriteShuffle.XMM, memop>;
}
} // ExeDomain = SSEPackedInt
3972
3973//===---------------------------------------------------------------------===//
3974// SSE2 - Packed Integer Extract and Insert
3975//===---------------------------------------------------------------------===//
3976
let ExeDomain = SSEPackedInt in {
/// sse2_pinsrw - Word insert (opcode 0xC4) selected from X86pinsrw. The
/// inserted value comes from a GR32orGR64 register (rr) or from a 16-bit
/// memory location read with extloadi16 (rm); $src3 is the imm8 operand.
multiclass sse2_pinsrw<bit Is2Addr = 1> {
  // Is2Addr picks both the mnemonic and the operand list: "pinsrw" with
  // two register operands, or "vpinsrw" with three.
  defvar AsmText =
    !if(Is2Addr,
        "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
        "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}");

  def rr : Ii8<0xC4, MRMSrcReg,
               (outs VR128:$dst),
               (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
               AsmText,
               [(set VR128:$dst,
                 (X86pinsrw VR128:$src1, GR32orGR64:$src2, timm:$src3))]>,
           Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  def rm : Ii8<0xC4, MRMSrcMem,
               (outs VR128:$dst),
               (ins VR128:$src1, i16mem:$src2, u8imm:$src3),
               AsmText,
               [(set VR128:$dst,
                 (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
                            timm:$src3))]>,
           Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}

// Extract
let Predicates = [HasAVX, NoBWI] in
def VPEXTRWrr : Ii8<0xC5, MRMSrcReg,
                    (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
                    "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set GR32orGR64:$dst,
                      (X86pextrw (v8i16 VR128:$src1), timm:$src2))]>,
                PD, VEX, WIG, Sched<[WriteVecExtract]>;
def PEXTRWrr : PDIi8<0xC5, MRMSrcReg,
                     (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
                     "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set GR32orGR64:$dst,
                       (X86pextrw (v8i16 VR128:$src1), timm:$src2))]>,
               Sched<[WriteVecExtract]>;

// Insert
let Predicates = [HasAVX, NoBWI] in
defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V, WIG;

let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
defm PINSRW : sse2_pinsrw, PD;

} // ExeDomain = SSEPackedInt
4023
// Always select FP16 instructions if available.
// The patterns below carry AddedComplexity = -10 and implement f16
// load/store/bitcast through PINSRW/PEXTRW on word 0 of a vector register.
let Predicates = [UseSSE2], AddedComplexity = -10 in {
  // f16 load: insert the loaded word into element 0 of an undefined vector.
  def : Pat<(f16 (load addr:$src)),
            (COPY_TO_REGCLASS
              (PINSRWrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0),
              FR16)>;

  // f16 store: extract element 0 to a GPR and store its low 16 bits.
  def : Pat<(store f16:$src, addr:$dst),
            (MOV16mr addr:$dst,
              (EXTRACT_SUBREG
                (PEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0),
                sub_16bit))>;

  // f16 -> i16 bitcast: extract element 0 and take the 16-bit subregister.
  def : Pat<(i16 (bitconvert f16:$src)),
            (EXTRACT_SUBREG
              (PEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0),
              sub_16bit)>;

  // i16 -> f16 bitcast: widen the GR16 via INSERT_SUBREG, then insert it
  // into element 0.
  def : Pat<(f16 (bitconvert i16:$src)),
            (COPY_TO_REGCLASS
              (PINSRWrr (v8i16 (IMPLICIT_DEF)),
                        (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit),
                        0),
              FR16)>;
}
4031
// f16 load and bitcast patterns built on VPINSRW/VPEXTRW, used under
// [HasAVX, NoBWI]. No f16 store pattern is defined in this group.
let Predicates = [HasAVX, NoBWI] in {
  // f16 load: insert the loaded word into element 0 of an undefined vector.
  def : Pat<(f16 (load addr:$src)),
            (COPY_TO_REGCLASS
              (VPINSRWrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0),
              FR16)>;

  // f16 -> i16 bitcast: extract element 0 and take the 16-bit subregister.
  def : Pat<(i16 (bitconvert f16:$src)),
            (EXTRACT_SUBREG
              (VPEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0),
              sub_16bit)>;

  // i16 -> f16 bitcast: widen the GR16 via INSERT_SUBREG, then insert it
  // into element 0.
  def : Pat<(f16 (bitconvert i16:$src)),
            (COPY_TO_REGCLASS
              (VPINSRWrr (v8i16 (IMPLICIT_DEF)),
                         (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit),
                         0),
              FR16)>;
}
4037
4038//===---------------------------------------------------------------------===//
4039// SSE2 - Packed Mask Creation
4040//===---------------------------------------------------------------------===//
4041
let ExeDomain = SSEPackedInt in {

// X86movmsk of a v16i8 source into a general-purpose register, VEX-encoded.
def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg,
                       (outs GR32orGR64:$dst), (ins VR128:$src),
                       "pmovmskb\t{$src, $dst|$dst, $src}",
                       [(set GR32orGR64:$dst,
                         (X86movmsk (v16i8 VR128:$src)))]>,
                  Sched<[WriteVecMOVMSK]>, VEX, WIG;

// Same operation on a 256-bit v32i8 source.
let Predicates = [HasAVX2] in {
def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg,
                        (outs GR32orGR64:$dst), (ins VR256:$src),
                        "pmovmskb\t{$src, $dst|$dst, $src}",
                        [(set GR32orGR64:$dst,
                          (X86movmsk (v32i8 VR256:$src)))]>,
                   Sched<[WriteVecMOVMSKY]>, VEX, VEX_L, WIG;
}

// Form without the VEX mixin.
def PMOVMSKBrr : PDI<0xD7, MRMSrcReg,
                     (outs GR32orGR64:$dst), (ins VR128:$src),
                     "pmovmskb\t{$src, $dst|$dst, $src}",
                     [(set GR32orGR64:$dst,
                       (X86movmsk (v16i8 VR128:$src)))]>,
                 Sched<[WriteVecMOVMSK]>;

} // ExeDomain = SSEPackedInt
4064
4065//===---------------------------------------------------------------------===//
4066// SSE2 - Conditional Store
4067//===---------------------------------------------------------------------===//
4068
let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
// Each definition below lists its address register (RDI or EDI) in Uses
// and passes it as the third operand of int_x86_sse2_maskmov_dqu.
//
// As VEX does not have separate instruction contexts for address size
// overrides, VMASKMOVDQU and VMASKMOVDQU64 would have a decode conflict.
// Prefer VMASKMOVDQU64.
let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg,
                         (outs), (ins VR128:$src, VR128:$mask),
                         "maskmovdqu\t{$mask, $src|$src, $mask}",
                         [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask,
                                                    RDI)]>,
                    VEX, WIG;
// isAsmParserOnly = 1 is set on this EDI variant only.
let Uses = [EDI], Predicates = [HasAVX], isAsmParserOnly = 1 in
def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg,
                       (outs), (ins VR128:$src, VR128:$mask),
                       "maskmovdqu\t{$mask, $src|$src, $mask}",
                       [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask,
                                                  EDI)]>,
                  VEX, WIG;

let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg,
                       (outs), (ins VR128:$src, VR128:$mask),
                       "maskmovdqu\t{$mask, $src|$src, $mask}",
                       [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask,
                                                  RDI)]>;
let Uses = [EDI], Predicates = [UseSSE2] in
def MASKMOVDQU : PDI<0xF7, MRMSrcReg,
                     (outs), (ins VR128:$src, VR128:$mask),
                     "maskmovdqu\t{$mask, $src|$src, $mask}",
                     [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask,
                                                EDI)]>;

} // ExeDomain = SSEPackedInt
4096
4097//===---------------------------------------------------------------------===//
4098// SSE2 - Move Doubleword/Quadword
4099//===---------------------------------------------------------------------===//
4100
4101//===---------------------------------------------------------------------===//
4102// Move Int Doubleword to Packed Double Int
4103//
let ExeDomain = SSEPackedInt in {
// Definitions carrying the VEX mixin.

// 32-bit GPR / memory -> element 0 of a v4i32 (scalar_to_vector).
def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg,
                        (outs VR128:$dst), (ins GR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (scalar_to_vector GR32:$src)))]>,
                   VEX, Sched<[WriteVecMoveFromGpr]>;
def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem,
                        (outs VR128:$dst), (ins i32mem:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
                   VEX, Sched<[WriteVecLoad]>;

// 64-bit GPR -> element 0 of a v2i64.
def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg,
                          (outs VR128:$dst), (ins GR64:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst,
                            (v2i64 (scalar_to_vector GR64:$src)))]>,
                    VEX, Sched<[WriteVecMoveFromGpr]>;
// Memory form has no selection pattern, so mayLoad / hasSideEffects are
// stated explicitly.
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem,
                          (outs VR128:$dst), (ins i64mem:$src),
                          "movq\t{$src, $dst|$dst, $src}", []>,
                    VEX, Sched<[WriteVecLoad]>;
// 64-bit GPR -> FR64 bitcast.
let isCodeGenOnly = 1 in
def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg,
                         (outs FR64:$dst), (ins GR64:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(set FR64:$dst, (bitconvert GR64:$src))]>,
                   VEX, Sched<[WriteVecMoveFromGpr]>;

// Counterparts without the VEX mixin, in the same order as above.

def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg,
                      (outs VR128:$dst), (ins GR32:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector GR32:$src)))]>,
                  Sched<[WriteVecMoveFromGpr]>;
def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem,
                      (outs VR128:$dst), (ins i32mem:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
                  Sched<[WriteVecLoad]>;
def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg,
                        (outs VR128:$dst), (ins GR64:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v2i64 (scalar_to_vector GR64:$src)))]>,
                   Sched<[WriteVecMoveFromGpr]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem,
                        (outs VR128:$dst), (ins i64mem:$src),
                        "movq\t{$src, $dst|$dst, $src}", []>,
                   Sched<[WriteVecLoad]>;
let isCodeGenOnly = 1 in
def MOV64toSDrr : RS2I<0x6E, MRMSrcReg,
                       (outs FR64:$dst), (ins GR64:$src),
                       "movq\t{$src, $dst|$dst, $src}",
                       [(set FR64:$dst, (bitconvert GR64:$src))]>,
                  Sched<[WriteVecMoveFromGpr]>;
} // ExeDomain = SSEPackedInt
4155
4156//===---------------------------------------------------------------------===//
4157// Move Int Doubleword to Single Scalar
4158//
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
  // GR32 -> FR32 bitcast, printed as "movd"; definition with the VEX mixin.
  def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg,
                         (outs FR32:$dst), (ins GR32:$src),
                         "movd\t{$src, $dst|$dst, $src}",
                         [(set FR32:$dst, (bitconvert GR32:$src))]>,
                    VEX, Sched<[WriteVecMoveFromGpr]>;

  // Same bitcast without the VEX mixin.
  def MOVDI2SSrr : S2I<0x6E, MRMSrcReg,
                       (outs FR32:$dst), (ins GR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set FR32:$dst, (bitconvert GR32:$src))]>,
                   Sched<[WriteVecMoveFromGpr]>;

} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4171
4172//===---------------------------------------------------------------------===//
4173// Move Packed Doubleword Int to Packed Double Int
4174//
// movd XMM -> GR32 / m32 (opcode 0x7E). Every form reads element 0 of the
// source viewed as v4i32.
let ExeDomain = SSEPackedInt in {
  // VEX-encoded, register destination.
  def VMOVPDI2DIrr
      : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
             "movd\t{$src, $dst|$dst, $src}",
             [(set GR32:$dst,
                   (extractelt (v4i32 VR128:$src), (iPTR 0)))]>,
        VEX, Sched<[WriteVecMoveToGpr]>;

  // VEX-encoded, store form.
  def VMOVPDI2DImr
      : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
             "movd\t{$src, $dst|$dst, $src}",
             [(store (i32 (extractelt (v4i32 VR128:$src), (iPTR 0))),
                     addr:$dst)]>,
        VEX, Sched<[WriteVecStore]>;

  // Non-VEX, register destination.
  def MOVPDI2DIrr
      : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
            "movd\t{$src, $dst|$dst, $src}",
            [(set GR32:$dst,
                  (extractelt (v4i32 VR128:$src), (iPTR 0)))]>,
        Sched<[WriteVecMoveToGpr]>;

  // Non-VEX, store form.
  def MOVPDI2DImr
      : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
            "movd\t{$src, $dst|$dst, $src}",
            [(store (i32 (extractelt (v4i32 VR128:$src), (iPTR 0))),
                    addr:$dst)]>,
        Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt
4198
4199//===---------------------------------------------------------------------===//
4200// Move Packed Doubleword Int first element to Doubleword Int
4201//
// movq XMM -> GR64 / m64 using opcode 0x7E.
let ExeDomain = SSEPackedInt in {
  // Register forms: extract element 0 of the source viewed as v2i64.
  let SchedRW = [WriteVecMoveToGpr] in {
    def VMOVPQIto64rr
        : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                "movq\t{$src, $dst|$dst, $src}",
                [(set GR64:$dst,
                      (extractelt (v2i64 VR128:$src), (iPTR 0)))]>,
          VEX;

    def MOVPQIto64rr
        : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
               "movq\t{$src, $dst|$dst, $src}",
               [(set GR64:$dst,
                     (extractelt (v2i64 VR128:$src), (iPTR 0)))]>;
  } // SchedRW

  // Store forms carry no selection pattern, so mayStore/hasSideEffects are
  // given explicitly. They are hidden from the asm matcher but still decoded.
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
      mayStore = 1 in {
    def VMOVPQIto64mr
        : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                "movq\t{$src, $dst|$dst, $src}", []>,
          VEX, Sched<[WriteVecStore]>;

    def MOVPQIto64mr
        : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
               "movq\t{$src, $dst|$dst, $src}", []>,
          Sched<[WriteVecStore]>;
  }
} // ExeDomain = SSEPackedInt
4226
4227//===---------------------------------------------------------------------===//
4228// Bitcast FR64 <-> GR64
4229//
// Codegen-only FR64 -> GR64 bitconverts (opcode 0x7E), VEX and non-VEX.
let isCodeGenOnly = 1, ExeDomain = SSEPackedInt in {
  def VMOVSDto64rr
      : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
              "movq\t{$src, $dst|$dst, $src}",
              [(set GR64:$dst, (bitconvert FR64:$src))]>,
        VEX, Sched<[WriteVecMoveToGpr]>;

  def MOVSDto64rr
      : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
             "movq\t{$src, $dst|$dst, $src}",
             [(set GR64:$dst, (bitconvert FR64:$src))]>,
        Sched<[WriteVecMoveToGpr]>;
} // isCodeGenOnly = 1, ExeDomain = SSEPackedInt
4241
4242//===---------------------------------------------------------------------===//
4243// Move Scalar Single to Double Int
4244//
// Codegen-only FR32 -> GR32 bitconverts (opcode 0x7E), VEX and non-VEX.
let isCodeGenOnly = 1, ExeDomain = SSEPackedInt in {
  def VMOVSS2DIrr
      : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
             "movd\t{$src, $dst|$dst, $src}",
             [(set GR32:$dst, (bitconvert FR32:$src))]>,
        VEX, Sched<[WriteVecMoveToGpr]>;

  def MOVSS2DIrr
      : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
            "movd\t{$src, $dst|$dst, $src}",
            [(set GR32:$dst, (bitconvert FR32:$src))]>,
        Sched<[WriteVecMoveToGpr]>;
} // isCodeGenOnly = 1, ExeDomain = SSEPackedInt
4255
// Extra selection patterns that map onto the VEX movd/movq forms.
let Predicates = [UseAVX] in {
  // An any-extended GR8 is placed into an undefined i32 via INSERT_SUBREG
  // and then moved with VMOVDI2PDIrr.
  def : Pat<
      (v4i32 (scalar_to_vector (i32 (anyext GR8:$src)))),
      (VMOVDI2PDIrr
          (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, sub_8bit)))>;

  // X86vzmovl of a GPR scalar_to_vector selects the plain GPR->XMM move.
  def : Pat<
      (v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
      (VMOVDI2PDIrr GR32:$src)>;

  def : Pat<
      (v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
      (VMOV64toPQIrr GR64:$src)>;

  // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
  // These instructions also write zeros in the high part of a 256-bit
  // register, so the v8i32 result is formed with SUBREG_TO_REG.
  def : Pat<
      (v4i32 (X86vzload32 addr:$src)),
      (VMOVDI2PDIrm addr:$src)>;
  def : Pat<
      (v8i32 (X86vzload32 addr:$src)),
      (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
}
4273
// Non-VEX counterparts: X86vzmovl of a GPR scalar_to_vector and the 32-bit
// zero-extending vector load select the non-VEX movd/movq forms.
let Predicates = [UseSSE2] in {
  def : Pat<
      (v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
      (MOVDI2PDIrr GR32:$src)>;

  def : Pat<
      (v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
      (MOV64toPQIrr GR64:$src)>;

  def : Pat<
      (v4i32 (X86vzload32 addr:$src)),
      (MOVDI2PDIrm addr:$src)>;
}
4283
// Before the MC layer of LLVM existed, clang emitted "movd" assembly instead
// of "movq" because of a macOS assembler parsing limitation. These aliases
// let such old assembly still be parsed. The trailing 0 marks them as
// parse-only aliases (they are not used when printing).
def : InstAlias<
    "movd\t{$src, $dst|$dst, $src}",
    (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<
    "movd\t{$src, $dst|$dst, $src}",
    (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;

// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
def : InstAlias<
    "vmovd\t{$src, $dst|$dst, $src}",
    (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<
    "vmovd\t{$src, $dst|$dst, $src}",
    (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;
4296
4297//===---------------------------------------------------------------------===//
4298// SSE2 - Move Quadword
4299//===---------------------------------------------------------------------===//
4300
4301//===---------------------------------------------------------------------===//
4302// Move Quadword Int to Packed Quadword Int
4303//
4304
// movq m64 -> XMM using the XS-prefixed 0x7E encoding. Selected for a v2i64
// scalar_to_vector of a 64-bit load.
let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in {
  def VMOVQI2PQIrm
      : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
          "vmovq\t{$src, $dst|$dst, $src}",
          [(set VR128:$dst,
                (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
        XS, VEX, Requires<[UseAVX]>, WIG;

  // SSE2 instruction with XS prefix.
  def MOVQI2PQIrm
      : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
          "movq\t{$src, $dst|$dst, $src}",
          [(set VR128:$dst,
                (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
        XS, Requires<[UseSSE2]>;
} // ExeDomain, SchedRW
4317
4318//===---------------------------------------------------------------------===//
4319// Move Packed Quadword Int to Quadword Int
4320//
// movq XMM -> m64 (opcode 0xD6): stores element 0 of the source viewed as
// v2i64.
let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in {
  def VMOVPQI2QImr
      : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
             "movq\t{$src, $dst|$dst, $src}",
             [(store (i64 (extractelt (v2i64 VR128:$src), (iPTR 0))),
                     addr:$dst)]>,
        VEX, WIG;

  def MOVPQI2QImr
      : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
            "movq\t{$src, $dst|$dst, $src}",
            [(store (i64 (extractelt (v2i64 VR128:$src), (iPTR 0))),
                    addr:$dst)]>;
} // ExeDomain, SchedRW
4332
// For disassembler only: register-to-register movq using the 0xD6
// (MRMDestReg) encoding. No selection pattern is attached.
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [SchedWriteVecLogic.XMM] in {
  def VMOVPQI2QIrr
      : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
             "movq\t{$src, $dst|$dst, $src}", []>,
        VEX, WIG;

  def MOVPQI2QIrr
      : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
            "movq\t{$src, $dst|$dst, $src}", []>;
}
4341
// The ".s" mnemonic suffix selects the 0xD6 register-register movq encodings
// when parsing; the aliases are parse-only (emit flag 0).
def : InstAlias<
    "vmovq.s\t{$src, $dst|$dst, $src}",
    (VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
def : InstAlias<
    "movq.s\t{$src, $dst|$dst, $src}",
    (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
4346
// VEX movq selection for 64-bit zero-extending vector loads and 64-bit
// extract-stores.
let Predicates = [UseAVX] in {
  def : Pat<
      (v2i64 (X86vzload64 addr:$src)),
      (VMOVQI2PQIrm addr:$src)>;

  // 256-bit result: wrap the 128-bit load in SUBREG_TO_REG.
  def : Pat<
      (v4i64 (X86vzload64 addr:$src)),
      (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;

  def : Pat<
      (X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
      (VMOVPQI2QImr addr:$dst, VR128:$src)>;
}
4356
// Non-VEX movq selection for the 64-bit zero-extending vector load and the
// 64-bit extract-store.
let Predicates = [UseSSE2] in {
  def : Pat<
      (v2i64 (X86vzload64 addr:$src)),
      (MOVQI2PQIrm addr:$src)>;

  def : Pat<
      (X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
      (MOVPQI2QImr addr:$dst, VR128:$src)>;
}
4363
4364//===---------------------------------------------------------------------===//
4365// Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
4366// IA32 document. movq xmm1, xmm2 does clear the high bits.
4367//
// XMM -> XMM movq (XS-prefixed 0x7E, register source), selected for
// X86vzmovl on v2i64.
let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
  def VMOVZPQILo2PQIrr
      : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
          "vmovq\t{$src, $dst|$dst, $src}",
          [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
        XS, VEX, Requires<[UseAVX]>, WIG;

  def MOVZPQILo2PQIrr
      : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
          "movq\t{$src, $dst|$dst, $src}",
          [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
        XS, Requires<[UseSSE2]>;
} // ExeDomain, SchedRW
4378
// X86vzmovl on v2f64 reuses the integer movq XMM->XMM instructions.
let Predicates = [UseAVX] in {
  def : Pat<
      (v2f64 (X86vzmovl (v2f64 VR128:$src))),
      (VMOVZPQILo2PQIrr VR128:$src)>;
}
let Predicates = [UseSSE2] in {
  def : Pat<
      (v2f64 (X86vzmovl (v2f64 VR128:$src))),
      (MOVZPQILo2PQIrr VR128:$src)>;
}
4387
// 256-bit X86vzmovl: take the low XMM subregister, run it through
// VMOVZPQILo2PQIrr, and rebuild the YMM value with SUBREG_TO_REG.
let Predicates = [UseAVX] in {
  def : Pat<
      (v4f64 (X86vzmovl (v4f64 VR256:$src))),
      (SUBREG_TO_REG
          (i32 0),
          (v2f64 (VMOVZPQILo2PQIrr
                     (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
          sub_xmm)>;

  def : Pat<
      (v4i64 (X86vzmovl (v4i64 VR256:$src))),
      (SUBREG_TO_REG
          (i32 0),
          (v2i64 (VMOVZPQILo2PQIrr
                     (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
          sub_xmm)>;
}
4400
4401//===---------------------------------------------------------------------===//
4402// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
4403//===---------------------------------------------------------------------===//
4404
/// sse3_replicate_sfp - Unary single-FP replicate instruction pair.
///   op       - opcode byte shared by both forms.
///   OpNode   - DAG node matched by both patterns.
///   vt       - result type asserted on the register form.
///   RC       - register class of source and destination.
///   mem_frag - load fragment used by the memory form.
///   x86memop - memory operand of the memory form.
///   sched    - scheduling class; the memory form uses sched.Folded.
multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
                              ValueType vt, RegisterClass RC, PatFrag mem_frag,
                              X86MemOperand x86memop, X86FoldableSchedWrite sched> {
  // Register source.
  def rr
      : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
             !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
             [(set RC:$dst, (vt (OpNode RC:$src)))]>,
        Sched<[sched]>;

  // Memory source.
  def rm
      : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
             !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
             [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>,
        Sched<[sched.Folded]>;
}
4417
// VEX-encoded MOVSHDUP (0x16) / MOVSLDUP (0x12) at 128 and 256 bits.
let Predicates = [HasAVX, NoVLX] in {
  defm VMOVSHDUP
      : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", v4f32, VR128,
                           loadv4f32, f128mem, SchedWriteFShuffle.XMM>,
        VEX, WIG;
  defm VMOVSLDUP
      : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", v4f32, VR128,
                           loadv4f32, f128mem, SchedWriteFShuffle.XMM>,
        VEX, WIG;
  defm VMOVSHDUPY
      : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", v8f32, VR256,
                           loadv8f32, f256mem, SchedWriteFShuffle.YMM>,
        VEX, VEX_L, WIG;
  defm VMOVSLDUPY
      : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", v8f32, VR256,
                           loadv8f32, f256mem, SchedWriteFShuffle.YMM>,
        VEX, VEX_L, WIG;
}
// Non-VEX MOVSHDUP / MOVSLDUP; the memory forms use the memopv4f32 fragment.
defm MOVSHDUP
    : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
                         memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
defm MOVSLDUP
    : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
                         memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
4436
// Integer-typed X86Movshdup / X86Movsldup nodes reuse the FP instructions
// (VEX forms).
let Predicates = [HasAVX, NoVLX] in {
  // 128-bit.
  def : Pat<(v4i32 (X86Movshdup VR128:$src)), (VMOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (load addr:$src))), (VMOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)), (VMOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (load addr:$src))), (VMOVSLDUPrm addr:$src)>;

  // 256-bit.
  def : Pat<(v8i32 (X86Movshdup VR256:$src)), (VMOVSHDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movshdup (load addr:$src))), (VMOVSHDUPYrm addr:$src)>;
  def : Pat<(v8i32 (X86Movsldup VR256:$src)), (VMOVSLDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movsldup (load addr:$src))), (VMOVSLDUPYrm addr:$src)>;
}
4455
// Integer-typed X86Movshdup / X86Movsldup nodes on the non-VEX instructions;
// memory operands are matched through memop.
let Predicates = [UseSSE3] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)), (MOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (memop addr:$src))), (MOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)), (MOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (memop addr:$src))), (MOVSLDUPrm addr:$src)>;
}
4466
4467//===---------------------------------------------------------------------===//
4468// SSE3 - Replicate Double FP - MOVDDUP
4469//===---------------------------------------------------------------------===//
4470
/// sse3_replicate_dfp - 128-bit MOVDDUP pair (opcode 0x12).
///   sched - scheduling-width bundle; the XMM entry is used, and the memory
///           form uses its Folded variant.
multiclass sse3_replicate_dfp<string OpcodeStr, X86SchedWriteWidths sched> {
  // Register source.
  def rr
      : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
             !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
             [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))]>,
        Sched<[sched.XMM]>;

  // Memory source: the operand is only f64mem, so the pattern matches a
  // scalar f64 load wrapped in scalar_to_vector.
  def rm
      : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
             !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
             [(set VR128:$dst,
                   (v2f64 (X86Movddup
                              (scalar_to_vector (loadf64 addr:$src)))))]>,
        Sched<[sched.XMM.Folded]>;
}
4483
// FIXME: Merge with above classes when there are patterns for the ymm version
/// sse3_replicate_dfp_y - 256-bit MOVDDUP pair (opcode 0x12); uses the YMM
/// entry of the scheduling bundle and a full loadv4f64 for the memory form.
multiclass sse3_replicate_dfp_y<string OpcodeStr, X86SchedWriteWidths sched> {
  // Register source.
  def rr
      : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
             !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
             [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
        Sched<[sched.YMM]>;

  // Memory source.
  def rm
      : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
             !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
             [(set VR256:$dst,
                   (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
        Sched<[sched.YMM.Folded]>;
}
4496
// VEX-encoded MOVDDUP at 128 and 256 bits.
let Predicates = [HasAVX, NoVLX] in {
  defm VMOVDDUP
      : sse3_replicate_dfp<"vmovddup", SchedWriteFShuffle>,
        VEX, WIG;
  defm VMOVDDUPY
      : sse3_replicate_dfp_y<"vmovddup", SchedWriteFShuffle>,
        VEX, VEX_L, WIG;
}
4503
// Non-VEX MOVDDUP (128-bit only).
defm MOVDDUP
    : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;
4505
4506
// Fold a 64-bit zero-extending vector load into the memory form of VMOVDDUP.
// The predicate list comes solely from the enclosing `let`: a top-level `let`
// is applied after superclass fields are inherited, so a trailing Requires<>
// on the pattern would be overridden by it and is therefore not attached.
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
            (VMOVDDUPrm addr:$src)>;
}
4511
// Fold a 64-bit zero-extending vector load into the memory form of the
// non-VEX MOVDDUP.
let Predicates = [UseSSE3] in {
  def : Pat<
      (X86Movddup (v2f64 (X86vzload64 addr:$src))),
      (MOVDDUPrm addr:$src)>;
}
4516
4517//===---------------------------------------------------------------------===//
4518// SSE3 - Move Unaligned Integer
4519//===---------------------------------------------------------------------===//
4520
// VEX-encoded LDDQU (opcode 0xF0, memory source only), selected from the
// ldu_dq intrinsics at 128 and 256 bits.
let Predicates = [HasAVX] in {
  def VLDDQUrm
      : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
             "vlddqu\t{$src, $dst|$dst, $src}",
             [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
        Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, WIG;

  def VLDDQUYrm
      : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
             "vlddqu\t{$src, $dst|$dst, $src}",
             [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
        Sched<[SchedWriteVecMoveLS.YMM.RM]>, VEX, VEX_L, WIG;
} // Predicates
4531
// Non-VEX LDDQU, selected from int_x86_sse3_ldu_dq.
def LDDQUrm
    : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
           "lddqu\t{$src, $dst|$dst, $src}",
           [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
      Sched<[SchedWriteVecMoveLS.XMM.RM]>;
4536
4537//===---------------------------------------------------------------------===//
4538// SSE3 - Arithmetic
4539//===---------------------------------------------------------------------===//
4540
/// sse3_addsub - ADDSUB instruction pair (opcode 0xD0) matching X86Addsub.
///   vt       - vector type of the operation.
///   RC       - register class of all register operands.
///   x86memop - memory operand of the folded-load form.
///   sched    - scheduling class; the rm form uses Folded + ReadAfterFold.
///   ld_frag  - load fragment for the memory operand.
///   Is2Addr  - when set, the asm string omits $src1 (two-operand syntax);
///              otherwise the three-operand syntax is used.
/// Both forms read MXCSR and may raise FP exceptions.
multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC,
                       X86MemOperand x86memop, X86FoldableSchedWrite sched,
                       PatFrag ld_frag, bit Is2Addr = 1> {
  let mayRaiseFPException = 1, Uses = [MXCSR] in {
    def rr
        : I<0xD0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
            !if(Is2Addr,
                !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
            [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))]>,
          Sched<[sched]>;

    def rm
        : I<0xD0, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
            !if(Is2Addr,
                !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
            [(set RC:$dst,
                  (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>,
          Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}
4561
// VEX-encoded ADDSUBPS/ADDSUBPD at 128 and 256 bits, instantiated with
// Is2Addr = 0.
let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VADDSUBPS
        : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem,
                      SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>,
          XD, VEX_4V, WIG;
    defm VADDSUBPSY
        : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem,
                      SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>,
          XD, VEX_4V, VEX_L, WIG;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VADDSUBPD
        : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem,
                      SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>,
          PD, VEX_4V, WIG;
    defm VADDSUBPDY
        : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem,
                      SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>,
          PD, VEX_4V, VEX_L, WIG;
  }
}
// Non-VEX ADDSUBPS/ADDSUBPD: $dst is tied to $src1 and the default
// (two-operand) asm syntax is used.
let Predicates = [UseSSE3], Constraints = "$src1 = $dst" in {
  let ExeDomain = SSEPackedSingle in
  defm ADDSUBPS
      : sse3_addsub<"addsubps", v4f32, VR128, f128mem,
                    SchedWriteFAddSizes.PS.XMM, memopv4f32>,
        XD;

  let ExeDomain = SSEPackedDouble in
  defm ADDSUBPD
      : sse3_addsub<"addsubpd", v2f64, VR128, f128mem,
                    SchedWriteFAddSizes.PD.XMM, memopv2f64>,
        PD;
}
4588
4589//===---------------------------------------------------------------------===//
4590// SSE3 Instructions
4591//===---------------------------------------------------------------------===//
4592
// Horizontal ops
/// S3D_Int - Binary FP instruction pair built on the S3DI format class.
///   o        - opcode byte.
///   vt       - vector type of the operation.
///   RC       - register class of all register operands.
///   x86memop - memory operand of the folded-load form.
///   OpNode   - DAG node matched by both patterns.
///   sched    - scheduling class; the rm form uses Folded + ReadAfterFold.
///   ld_frag  - load fragment for the memory operand.
///   Is2Addr  - selects the two-operand asm string when set, otherwise the
///              three-operand one.
/// Both forms read MXCSR and may raise FP exceptions.
multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                   X86MemOperand x86memop, SDNode OpNode,
                   X86FoldableSchedWrite sched, PatFrag ld_frag,
                   bit Is2Addr = 1> {
  let mayRaiseFPException = 1, Uses = [MXCSR] in {
    def rr
        : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
               !if(Is2Addr,
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
          Sched<[sched]>;

    def rm
        : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
               !if(Is2Addr,
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst,
                     (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
          Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}
/// S3_Int - Same shape as S3D_Int, but built on the S3I format class.
/// Parameters have the same roles: opcode, asm mnemonic, vector type,
/// register class, memory operand, DAG node, scheduling class, load
/// fragment, and the Is2Addr asm-syntax selector. Both forms read MXCSR and
/// may raise FP exceptions.
multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                  X86MemOperand x86memop, SDNode OpNode,
                  X86FoldableSchedWrite sched, PatFrag ld_frag,
                  bit Is2Addr = 1> {
  let mayRaiseFPException = 1, Uses = [MXCSR] in {
    def rr
        : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr,
                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
          Sched<[sched]>;

    def rm
        : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr,
                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set RC:$dst,
                    (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
          Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}
4634
// VEX-encoded horizontal add (0x7C) / subtract (0x7D). The PS variants use
// S3D_Int and the PD variants use S3_Int; all pass Is2Addr = 0.
let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VHADDPS
        : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
                  X86fhadd, WriteFHAdd, loadv4f32, 0>,
          VEX_4V, WIG;
    defm VHSUBPS
        : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
                  X86fhsub, WriteFHAdd, loadv4f32, 0>,
          VEX_4V, WIG;
    defm VHADDPSY
        : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
                  X86fhadd, WriteFHAddY, loadv8f32, 0>,
          VEX_4V, VEX_L, WIG;
    defm VHSUBPSY
        : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
                  X86fhsub, WriteFHAddY, loadv8f32, 0>,
          VEX_4V, VEX_L, WIG;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VHADDPD
        : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem,
                 X86fhadd, WriteFHAdd, loadv2f64, 0>,
          VEX_4V, WIG;
    defm VHSUBPD
        : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem,
                 X86fhsub, WriteFHAdd, loadv2f64, 0>,
          VEX_4V, WIG;
    defm VHADDPDY
        : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem,
                 X86fhadd, WriteFHAddY, loadv4f64, 0>,
          VEX_4V, VEX_L, WIG;
    defm VHSUBPDY
        : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem,
                 X86fhsub, WriteFHAddY, loadv4f64, 0>,
          VEX_4V, VEX_L, WIG;
  }
}
4657
// Non-VEX horizontal add / subtract: $dst is tied to $src1 and the memory
// forms use the memop fragments.
let Constraints = "$src1 = $dst" in {
  let ExeDomain = SSEPackedSingle in {
    defm HADDPS
        : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
                  WriteFHAdd, memopv4f32>;
    defm HSUBPS
        : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
                  WriteFHAdd, memopv4f32>;
  }
  let ExeDomain = SSEPackedDouble in {
    defm HADDPD
        : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
                 WriteFHAdd, memopv2f64>;
    defm HSUBPD
        : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
                 WriteFHAdd, memopv2f64>;
  }
}
4672
4673//===---------------------------------------------------------------------===//
4674// SSSE3 - Packed Absolute Instructions
4675//===---------------------------------------------------------------------===//
4676
/// SS3I_unop_rm - Simple 128-bit SSSE3 unary op whose type can be
/// v*{i8,i16,i32}.
///   opc     - opcode byte.
///   vt      - vector type asserted on the result.
///   OpNode  - DAG node matched by both patterns.
///   sched   - scheduling-width bundle; the XMM entry is used, with its
///             Folded variant for the memory form.
///   ld_frag - load fragment for the memory form.
multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
                        SDNode OpNode, X86SchedWriteWidths sched, PatFrag ld_frag> {
  // Register source.
  def rr
      : SS38I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set VR128:$dst, (vt (OpNode VR128:$src)))]>,
        Sched<[sched.XMM]>;

  // Memory source.
  def rm
      : SS38I<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set VR128:$dst, (vt (OpNode (ld_frag addr:$src))))]>,
        Sched<[sched.XMM.Folded]>;
}
4693
/// SS3I_unop_rm_y - Simple 256-bit SSSE3-style unary op whose type can be
/// v*{i8,i16,i32}. Defines Yrr/Yrm on VR256, uses the YMM entry of the
/// scheduling bundle, and always matches the memory operand with `load`.
multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
                          SDNode OpNode, X86SchedWriteWidths sched> {
  // Register source.
  def Yrr
      : SS38I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
        Sched<[sched.YMM]>;

  // Memory source.
  def Yrm
      : SS38I<opc, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set VR256:$dst, (vt (OpNode (load addr:$src))))]>,
        Sched<[sched.YMM.Folded]>;
}
4710
// VEX-encoded PABSB/PABSW/PABSD. The byte/word variants are predicated on
// NoVLX_Or_NoBWI, the dword variants on NoVLX. The 128-bit defms produce
// rr/rm records; the 256-bit defms reuse the same prefix and produce Yrr/Yrm.
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPABSB
      : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU, load>,
        VEX, WIG;
  defm VPABSW
      : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU, load>,
        VEX, WIG;
}
let Predicates = [HasAVX, NoVLX] in {
  defm VPABSD
      : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU, load>,
        VEX, WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPABSB
      : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>,
        VEX, VEX_L, WIG;
  defm VPABSW
      : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SchedWriteVecALU>,
        VEX, VEX_L, WIG;
}
let Predicates = [HasAVX2, NoVLX] in {
  defm VPABSD
      : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SchedWriteVecALU>,
        VEX, VEX_L, WIG;
}
4731
// Non-VEX PABSB/PABSW/PABSD; memory operands are matched through memop.
defm PABSB
    : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU, memop>;
defm PABSW
    : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU, memop>;
defm PABSD
    : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU, memop>;
4738
4739//===---------------------------------------------------------------------===//
4740// SSSE3 - Packed Binary Operator Instructions
4741//===---------------------------------------------------------------------===//
4742
/// SS3I_binop_rm - Simple SSSE3 binary op matched through an SDNode.
///   opc        - opcode byte.
///   OpNode     - DAG node matched by both patterns.
///   DstVT      - result vector type.
///   OpVT       - type asserted on the first source operand.
///   RC         - register class of all register operands.
///   memop_frag - load fragment for the memory operand.
///   x86memop   - memory operand of the folded-load form.
///   sched      - scheduling class; the rm form uses Folded + ReadAfterFold.
///   Is2Addr    - selects the two-operand asm string when set, otherwise
///                the three-operand one.
/// The rr form is marked commutable inside the multiclass.
multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType DstVT, ValueType OpVT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr
      : SS38I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr,
                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))]>,
        Sched<[sched]>;

  def rm
      : SS38I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr,
                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set RC:$dst,
                    (DstVT (OpNode (OpVT RC:$src1),
                                   (memop_frag addr:$src2))))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
}
4765
/// SS3I_binop_rm_int - Simple 128-bit SSSE3 binary op matched through an
/// intrinsic; its type can be v*{i8,i16,i32}.
///   opc       - opcode byte.
///   IntId128  - intrinsic matched by both patterns.
///   sched     - scheduling class; the rm form uses Folded + ReadAfterFold.
///   ld_frag   - load fragment for the memory operand.
///   Is2Addr   - selects the two-operand asm string when set, otherwise the
///               three-operand one.
/// The rr form is marked commutable inside the multiclass.
multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
                             Intrinsic IntId128, X86FoldableSchedWrite sched,
                             PatFrag ld_frag, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr
      : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src1, VR128:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr,
                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
        Sched<[sched]>;

  def rm
      : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
              (ins VR128:$src1, i128mem:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr,
                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set VR128:$dst,
                    (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
}
4787
/// SS3I_binop_rm_int_y - 256-bit counterpart of the intrinsic bin op.
/// Defines Yrr and Yrm on VR256 / i256mem, both selected from IntId256.
/// There is no Is2Addr switch: only the three-operand asm string is emitted,
/// and the memory operand of Yrm is always matched with `load`.
4788multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
4789                               Intrinsic IntId256,
4790                               X86FoldableSchedWrite sched> {
4791  let isCommutable = 1 in
4792  def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
4793       (ins VR256:$src1, VR256:$src2),
4794       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4795       [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
4796       Sched<[sched]>;
4797  def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
4798       (ins VR256:$src1, i256mem:$src2),
4799       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4800       [(set VR256:$dst,
4801         (IntId256 VR256:$src1, (load addr:$src2)))]>,
4802       Sched<[sched.Folded, sched.ReadAfterFold]>;
4803}
4804
// 128-bit VEX forms of pshufb / pmaddubsw / pmulhrsw, predicated on HasAVX and
// NoVLX_Or_NoBWI.  The trailing 0 is Is2Addr, i.e. the three-operand asm string.
4805let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
// SS3I_binop_rm marks its rr form commutable; the two defms below are wrapped
// with isCommutable = 0, while VPMULHRSW is left outside this wrapper.
4806let isCommutable = 0 in {
4807  defm VPSHUFB    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
4808                                  VR128, load, i128mem,
4809                                  SchedWriteVarShuffle.XMM, 0>, VEX_4V, WIG;
4810  defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
4811                                  v16i8, VR128, load, i128mem,
4812                                  SchedWriteVecIMul.XMM, 0>, VEX_4V, WIG;
4813}
4814defm VPMULHRSW    : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
4815                                  VR128, load, i128mem,
4816                                  SchedWriteVecIMul.XMM, 0>, VEX_4V, WIG;
4817}
4818
// 128-bit VEX forms of the horizontal add/sub and sign instructions,
// predicated on HasAVX only.  phadd{w,d}/phsub{w,d} are selected from the
// X86hadd / X86hsub nodes; psign* and the saturating phaddsw/phsubsw are
// selected from their int_x86_ssse3_* intrinsics.  All use Is2Addr = 0
// (three-operand asm string) and `load` for the memory form.
4819let ImmT = NoImm, Predicates = [HasAVX] in {
4820let isCommutable = 0 in {
4821  defm VPHADDW    : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
4822                                  load, i128mem,
4823                                  SchedWritePHAdd.XMM, 0>, VEX_4V, WIG;
4824  defm VPHADDD    : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
4825                                  load, i128mem,
4826                                  SchedWritePHAdd.XMM, 0>, VEX_4V, WIG;
4827  defm VPHSUBW    : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
4828                                  load, i128mem,
4829                                  SchedWritePHAdd.XMM, 0>, VEX_4V, WIG;
4830  defm VPHSUBD    : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
4831                                  load, i128mem,
4832                                  SchedWritePHAdd.XMM, 0>, VEX_4V, WIG;
4833  defm VPSIGNB    : SS3I_binop_rm_int<0x08, "vpsignb",
4834                                      int_x86_ssse3_psign_b_128,
4835                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, WIG;
4836  defm VPSIGNW    : SS3I_binop_rm_int<0x09, "vpsignw",
4837                                      int_x86_ssse3_psign_w_128,
4838                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, WIG;
4839  defm VPSIGND    : SS3I_binop_rm_int<0x0A, "vpsignd",
4840                                      int_x86_ssse3_psign_d_128,
4841                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, WIG;
4842  defm VPHADDSW   : SS3I_binop_rm_int<0x03, "vphaddsw",
4843                                      int_x86_ssse3_phadd_sw_128,
4844                                      SchedWritePHAdd.XMM, load, 0>, VEX_4V, WIG;
4845  defm VPHSUBSW   : SS3I_binop_rm_int<0x07, "vphsubsw",
4846                                      int_x86_ssse3_phsub_sw_128,
4847                                      SchedWritePHAdd.XMM, load, 0>, VEX_4V, WIG;
4848}
4849}
4850
// 256-bit (VR256 / i256mem, VEX_L) forms of pshufb / pmaddubsw / pmulhrsw,
// predicated on HasAVX2 and NoVLX_Or_NoBWI.  Same opcodes and mnemonics as the
// 128-bit VEX forms; the record names carry a Y suffix.
4851let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4852let isCommutable = 0 in {
4853  defm VPSHUFBY   : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
4854                                  VR256, load, i256mem,
4855                                  SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, WIG;
4856  defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
4857                                   v32i8, VR256, load, i256mem,
4858                                   SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, WIG;
4859}
4860defm VPMULHRSWY   : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
4861                                  VR256, load, i256mem,
4862                                  SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, WIG;
4863}
4864
// 256-bit forms of the horizontal add/sub and sign instructions, predicated
// on HasAVX2.  The SS3I_binop_rm instantiations carry the Y suffix in the defm
// name; the SS3I_binop_rm_int_y instantiations reuse the 128-bit defm names
// (VPSIGNB, ...) because that multiclass itself names its records Yrr / Yrm.
4865let ImmT = NoImm, Predicates = [HasAVX2] in {
4866let isCommutable = 0 in {
4867  defm VPHADDWY   : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
4868                                  VR256, load, i256mem,
4869                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, WIG;
4870  defm VPHADDDY   : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
4871                                  load, i256mem,
4872                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, WIG;
4873  defm VPHSUBWY   : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
4874                                  VR256, load, i256mem,
4875                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, WIG;
4876  defm VPHSUBDY   : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
4877                                  load, i256mem,
4878                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, WIG;
4879  defm VPSIGNB   : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
4880                                       SchedWriteVecALU.YMM>, VEX_4V, VEX_L, WIG;
4881  defm VPSIGNW   : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
4882                                       SchedWriteVecALU.YMM>, VEX_4V, VEX_L, WIG;
4883  defm VPSIGND   : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
4884                                       SchedWriteVecALU.YMM>, VEX_4V, VEX_L, WIG;
4885  defm VPHADDSW  : SS3I_binop_rm_int_y<0x03, "vphaddsw",
4886                                       int_x86_avx2_phadd_sw,
4887                                       SchedWritePHAdd.YMM>, VEX_4V, VEX_L, WIG;
4888  defm VPHSUBSW  : SS3I_binop_rm_int_y<0x07, "vphsubsw",
4889                                       int_x86_avx2_phsub_sw,
4890                                       SchedWritePHAdd.YMM>, VEX_4V, VEX_L, WIG;
4891}
4892}
4893
4894// None of these have i8 immediate fields.
// Legacy (non-VEX) SSSE3 forms.  Is2Addr is left at its default of 1, so the
// two-operand asm string is used and $src1 is tied to $dst by the Constraints
// below.  Memory forms are matched with `memop` rather than `load`.
4895let ImmT = NoImm, Constraints = "$src1 = $dst" in {
4896let isCommutable = 0 in {
4897  defm PHADDW    : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
4898                                 memop, i128mem, SchedWritePHAdd.XMM>;
4899  defm PHADDD    : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
4900                                 memop, i128mem, SchedWritePHAdd.XMM>;
4901  defm PHSUBW    : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
4902                                 memop, i128mem, SchedWritePHAdd.XMM>;
4903  defm PHSUBD    : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
4904                                 memop, i128mem, SchedWritePHAdd.XMM>;
4905  defm PSIGNB    : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
4906                                     SchedWriteVecALU.XMM, memop>;
4907  defm PSIGNW    : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
4908                                     SchedWriteVecALU.XMM, memop>;
4909  defm PSIGND    : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
4910                                     SchedWriteVecALU.XMM, memop>;
4911  defm PSHUFB    : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
4912                                 memop, i128mem, SchedWriteVarShuffle.XMM>;
4913  defm PHADDSW   : SS3I_binop_rm_int<0x03, "phaddsw",
4914                                     int_x86_ssse3_phadd_sw_128,
4915                                     SchedWritePHAdd.XMM, memop>;
4916  defm PHSUBSW   : SS3I_binop_rm_int<0x07, "phsubsw",
4917                                     int_x86_ssse3_phsub_sw_128,
4918                                     SchedWritePHAdd.XMM, memop>;
4919  defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
4920                                 v16i8, VR128, memop, i128mem,
4921                                 SchedWriteVecIMul.XMM>;
4922}
// PMULHRSW sits outside the isCommutable = 0 wrapper above.
4923defm PMULHRSW    : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
4924                                 VR128, memop, i128mem, SchedWriteVecIMul.XMM>;
4925}
4926
4927//===---------------------------------------------------------------------===//
4928// SSSE3 - Packed Align Instruction Patterns
4929//===---------------------------------------------------------------------===//
4930
/// ssse3_palignr - Defines the register (rri) and memory (rmi) forms of the
/// packed-align instruction, opcode 0x0F, selected from X86PAlignr with an
/// i8 target-immediate third operand.
///   asm        - mnemonic prepended to the asm operand string.
///   VT         - result vector type of the matched X86PAlignr node.
///   RC         - register class for $dst, $src1 and (rri) $src2.
///   memop_frag - fragment used to match the memory operand of rmi.
///   x86memop   - memory operand type of rmi.
///   sched      - scheduling class; rmi uses its Folded / ReadAfterFold members.
///   Is2Addr    - 1 selects the asm string without $src1, 0 the one with it.
4931multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
4932                         PatFrag memop_frag, X86MemOperand x86memop,
4933                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
4934  let hasSideEffects = 0 in {
4935  def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst),
4936      (ins RC:$src1, RC:$src2, u8imm:$src3),
4937      !if(Is2Addr,
4938        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
4939        !strconcat(asm,
4940                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
4941      [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 timm:$src3))))]>,
4942      Sched<[sched]>;
4943  let mayLoad = 1 in
4944  def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst),
4945      (ins RC:$src1, x86memop:$src2, u8imm:$src3),
4946      !if(Is2Addr,
4947        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
4948        !strconcat(asm,
4949                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
4950      [(set RC:$dst, (VT (X86PAlignr RC:$src1,
4951                                     (memop_frag addr:$src2),
4952                                     (i8 timm:$src3))))]>,
4953      Sched<[sched.Folded, sched.ReadAfterFold]>;
4954  }
4955}
4956
// Instantiate palignr three ways: 128-bit VEX (VPALIGNR), 256-bit VEX
// (VPALIGNRY) and legacy SSSE3 (PALIGNR).  The VEX forms pass Is2Addr = 0 and
// match memory with `load`; the legacy form keeps Is2Addr = 1, ties $src1 to
// $dst and matches memory with `memop`.
4957let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
4958  defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem,
4959                                SchedWriteShuffle.XMM, 0>, VEX_4V, WIG;
4960let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
4961  defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem,
4962                                 SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, WIG;
4963let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
4964  defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem,
4965                               SchedWriteShuffle.XMM>;
4966
4967//===---------------------------------------------------------------------===//
4968// SSE3 - Thread synchronization
4969//===---------------------------------------------------------------------===//
4970
// monitor / mwait.  Neither instruction has explicit operands: the registers
// they read are listed through `Uses`.  monitor has no selection pattern and
// comes in two records that differ only in the address register (EAX vs RAX)
// and the mode predicate; mwait is selected from int_x86_sse3_mwait.
4971let SchedRW = [WriteSystem] in {
4972let Uses = [EAX, ECX, EDX] in
4973def MONITOR32rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
4974                     TB, Requires<[HasSSE3, Not64BitMode]>;
4975let Uses = [RAX, ECX, EDX] in
4976def MONITOR64rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
4977                     TB, Requires<[HasSSE3, In64BitMode]>;
4978
4979let Uses = [ECX, EAX] in
4980def MWAITrr   : I<0x01, MRM_C9, (outs), (ins), "mwait",
4981                  [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
4982} // SchedRW
4983
// Assembler aliases that accept mwait / monitor written with their implicit
// registers spelled out, in both AT&T and Intel operand order, and map them
// onto the operand-less records.  32-bit and 64-bit register spellings are
// gated on Not64BitMode / In64BitMode respectively.
4984def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
4985def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;
4986
4987def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITOR32rrr)>,
4988      Requires<[Not64BitMode]>;
4989def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITOR64rrr)>,
4990      Requires<[In64BitMode]>;
4991
4992//===----------------------------------------------------------------------===//
4993// SSE4.1 - Packed Move with Sign/Zero Extend
4994// NOTE: Any Extend is promoted to Zero Extend in X86ISelDAGToDAG.cpp
4995//===----------------------------------------------------------------------===//
4996
/// SS41I_pmovx_rrrm - Defines the rr and rm records of one packed
/// sign/zero-extend instruction.  Both records have an empty pattern list;
/// selection patterns are supplied separately via `def : Pat`.
///   MemOp       - memory operand type of the rm form.
///   OutRC/InRC  - destination / source register classes.
///   sched       - scheduling class; rm uses sched.Folded only.
4997multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
4998                            RegisterClass OutRC, RegisterClass InRC,
4999                            X86FoldableSchedWrite sched> {
5000  def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
5001                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
5002                 Sched<[sched]>;
5003
5004  def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
5005                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
5006                 Sched<[sched.Folded]>;
5007}
5008
/// SS41I_pmovx_rm_all - Expands one extend instruction into three families:
///   NAME      - legacy SSE4.1 form, VR128 -> VR128, memory operand MemOp.
///   V#NAME    - VEX form under [HasAVX, prd], VR128 -> VR128, MemOp.
///   V#NAME#Y  - VEX_L form under [HasAVX2, prd], VR128 -> VR256, MemYOp.
/// `prd` is the extra predicate appended to the VEX forms.
5009multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
5010                              X86MemOperand MemOp, X86MemOperand MemYOp,
5011                              Predicate prd> {
  // NOTE(review): the legacy form uses SchedWriteShuffle.XMM while both VEX
  // forms below use SchedWriteVecExtend -- confirm the difference is intended.
5012  defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128,
5013                               SchedWriteShuffle.XMM>;
5014  let Predicates = [HasAVX, prd] in
5015    defm V#NAME   : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
5016                                     VR128, VR128, SchedWriteVecExtend.XMM>,
5017                                     VEX, WIG;
5018  let Predicates = [HasAVX2, prd] in
5019    defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
5020                                     VR256, VR128, SchedWriteVecExtend.YMM>,
5021                                     VEX, VEX_L, WIG;
5022}
5023
/// SS41I_pmovx_rm - Instantiates both the sign-extend (PMOVSX#NAME, opcode
/// `opc`) and zero-extend (PMOVZX#NAME, opcode `opc + 0x10`) families for one
/// element-size suffix given in OpcodeStr.
5024multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
5025                          X86MemOperand MemYOp, Predicate prd> {
5026  defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
5027                                        MemOp, MemYOp, prd>;
5028  defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
5029                                        !strconcat("pmovzx", OpcodeStr),
5030                                        MemOp, MemYOp, prd>;
5031}
5032
// One instantiation per source/destination element-size pair.  The defm name
// becomes the record-name suffix (PMOVSXBW, VPMOVZXDQY, ...).  The two memory
// operand arguments are the 128-bit-result and 256-bit-result source sizes;
// they are grouped below by that size.  Only BW passes NoVLX_Or_NoBWI.
5033defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
5034defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>;
5035defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>;
5036
5037defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>;
5038defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;
5039
5040defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;
5041
5042// AVX2 Patterns
/// SS41I_pmovx_avx2_patterns - Selection patterns for the 256-bit (Y) extend
/// records.
///   OpcPrefix - record-name prefix; the pattern casts OpcPrefix#<suffix> to I.
///   ExtTy     - letter prepended to "extloadvi8/16/32" to name the
///               extending-load fragment.
///   ExtOp     - node used where source and result have the same element
///               count (v16i8->v16i16, v8i16->v8i32, v4i32->v4i64).
///   InVecOp   - node used where the source vector has more elements than the
///               result (e.g. v16i8->v8i32).
5043multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy,
5044                                     SDNode ExtOp, SDNode InVecOp> {
5045  // Register-Register patterns
5046  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
5047  def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
5048            (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
5049  }
5050  let Predicates = [HasAVX2, NoVLX] in {
5051  def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))),
5052            (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
5053  def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))),
5054            (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;
5055
5056  def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
5057            (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
5058  def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))),
5059            (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;
5060
5061  def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
5062            (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
5063  }
5064
5065  // Simple Register-Memory patterns
5066  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
5067  def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5068            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
5069
5070  def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
5071            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
5072  }
5073
5074  let Predicates = [HasAVX2, NoVLX] in {
5075  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5076            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5077  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5078            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5079
5080  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5081            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
5082  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5083            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5084
5085  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
5086            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
5087  }
5088
5089  // AVX2 Register-Memory patterns
  // These fold an explicit load (full-vector load, scalar_to_vector of a
  // scalar load, or X86vzload) feeding the extend node into the rm record.
5090  let Predicates = [HasAVX2, NoVLX] in {
5091  def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
5092            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
5093
5094  def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5095            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5096  def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5097            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5098  def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
5099            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5100
5101  def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
5102            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
5103
5104  def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5105            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
  // NOTE(review): X86vzload32 is typed v2i64 here, whereas the 128-bit
  // patterns in SS41I_pmovx_patterns type it v4i32 -- confirm intended.
5106  def : Pat<(v4i64 (InVecOp (bc_v16i8 (v2i64 (X86vzload32 addr:$src))))),
5107            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5108
5109  def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5110            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5111  def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5112            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5113  def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
5114            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5115  }
5116}
5117
// 256-bit patterns: plain sext/zext for equal element counts, the *_invec
// nodes when only the low source elements are consumed.
5118defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>;
5119defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>;
5120
5121// SSE4.1/AVX patterns.
/// SS41I_pmovx_patterns - Selection patterns for the 128-bit extend records.
///   OpcPrefix - record-name prefix; the pattern casts OpcPrefix#<suffix> to I.
///   ExtTy     - letter prepended to "extloadvi8/16/32" to name the
///               extending-load fragment.
///   ExtOp     - extend node matched by every register and folded-load pattern.
/// The Predicates written inside name HasAVX; one set of instantiations
/// wraps the defm in its own `let Predicates = [UseSSE41]`.
5122multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
5123                                SDNode ExtOp> {
  // Register-register patterns.
5124  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5125  def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
5126            (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
5127  }
5128  let Predicates = [HasAVX, NoVLX] in {
5129  def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
5130            (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
5131  def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
5132            (!cast<I>(OpcPrefix#BQrr) VR128:$src)>;
5133
5134  def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
5135            (!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
5136  def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
5137            (!cast<I>(OpcPrefix#WQrr) VR128:$src)>;
5138
5139  def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
5140            (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;
5141  }
  // Extending-load fragments map directly onto the rm records.
5142  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5143  def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5144            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5145  }
5146  let Predicates = [HasAVX, NoVLX] in {
5147  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5148            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5149  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5150            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5151
5152  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5153            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5154  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5155            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5156
5157  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
5158            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5159  }
  // Fold an explicit load feeding ExtOp: scalar load via scalar_to_vector,
  // X86vzload, or a full 128-bit vector load.
5160  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5161  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5162            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5163  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5164            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5165  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
5166            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5167  def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))),
5168            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5169  }
5170  let Predicates = [HasAVX, NoVLX] in {
5171  def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5172            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5173  def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
5174            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5175  def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))),
5176            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5177
  // The BQ scalar form matches a 16-bit load any-extended to i32.
5178  def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
5179            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5180  def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))),
5181            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5182
5183  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5184            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5185  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5186            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5187  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
5188            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5189  def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))),
5190            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5191
5192  def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5193            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5194  def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))),
5195            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5196  def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))),
5197            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5198
5199  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5200            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5201  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5202            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5203  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
5204            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5205  def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))),
5206            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5207  }
5208}
5209
// 128-bit patterns for the VEX records; every pattern uses the *_invec node.
5210defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec>;
5211defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec>;
5212
// Same patterns for the legacy records.
// NOTE(review): the multiclass body sets HasAVX-based Predicates internally;
// this outer let is presumably meant to replace them with UseSSE41 -- confirm.
5213let Predicates = [UseSSE41] in {
5214  defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec>;
5215  defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec>;
5216}
5217
5218//===----------------------------------------------------------------------===//
5219// SSE4.1 - Extract Instructions
5220//===----------------------------------------------------------------------===//
5221
5222/// SS41I_extract8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
/// rr writes the X86pextrb result to a GR32orGR64 register; mr stores the
/// truncated i8 result to an i8mem destination.  $src2 is the u8 immediate.
5223multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
5224  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5225                 (ins VR128:$src1, u8imm:$src2),
5226                 !strconcat(OpcodeStr,
5227                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5228                 [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
5229                                         timm:$src2))]>,
5230                  Sched<[WriteVecExtract]>;
5231  let hasSideEffects = 0, mayStore = 1 in
5232  def mr : SS4AIi8<opc, MRMDestMem, (outs),
5233                 (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
5234                 !strconcat(OpcodeStr,
5235                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5236                 [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), timm:$src2))),
5237                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5238}
5239
// pextrb: VEX form under [HasAVX, NoBWI], then the legacy form with no extra
// Predicates set here.
5240let Predicates = [HasAVX, NoBWI] in
5241  defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX, WIG;
5242
5243defm PEXTRB      : SS41I_extract8<0x14, "pextrb">;
5244
5245
5246/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
/// Also defines rr_REV, a register-destination record with this opcode that
/// has no selection pattern and is marked isCodeGenOnly + ForceDisassemble.
5247multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
5248  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
5249  def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5250                   (ins VR128:$src1, u8imm:$src2),
5251                   !strconcat(OpcodeStr,
5252                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
5253                   Sched<[WriteVecExtract]>;
5254
  // Memory form: store the truncated i16 X86pextrw result to i16mem.
5255  let hasSideEffects = 0, mayStore = 1 in
5256  def mr : SS4AIi8<opc, MRMDestMem, (outs),
5257                 (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
5258                 !strconcat(OpcodeStr,
5259                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5260                 [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), timm:$src2))),
5261                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5262}
5263
// pextrw (SSE4.1 encoding): VEX form under [HasAVX, NoBWI], then legacy form.
5264let Predicates = [HasAVX, NoBWI] in
5265  defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX, WIG;
5266
5267defm PEXTRW      : SS41I_extract16<0x15, "pextrw">;
5268
// Select a scalar f16 store as a pextrw-to-memory of element 0, after copying
// the FR16 value into the VR128 register class.
5269let Predicates = [UseSSE41] in
5270  def : Pat<(store f16:$src, addr:$dst), (PEXTRWmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>;
5271
5272let Predicates = [HasAVX, NoBWI] in
5273  def : Pat<(store f16:$src, addr:$dst), (VPEXTRWmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>;
5274
5275
5276/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
/// Both forms are selected from `extractelt` on a v4i32 source with an
/// immediate index: rr writes GR32, mr stores to i32mem.
5277multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
5278  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
5279                 (ins VR128:$src1, u8imm:$src2),
5280                 !strconcat(OpcodeStr,
5281                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5282                 [(set GR32:$dst,
5283                  (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
5284                  Sched<[WriteVecExtract]>;
5285  def mr : SS4AIi8<opc, MRMDestMem, (outs),
5286                 (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
5287                 !strconcat(OpcodeStr,
5288                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5289                 [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
5290                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5291}
5292
// pextrd: VEX form under [HasAVX, NoDQI], then the legacy form.
5293let Predicates = [HasAVX, NoDQI] in
5294  defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;
5295
5296defm PEXTRD      : SS41I_extract32<0x16, "pextrd">;
5297
5298/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
/// Both forms are selected from `extractelt` on a v2i64 source with an
/// immediate index: rr writes GR64, mr stores to i64mem.
5299multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
5300  def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
5301                 (ins VR128:$src1, u8imm:$src2),
5302                 !strconcat(OpcodeStr,
5303                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5304                 [(set GR64:$dst,
5305                  (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
5306                  Sched<[WriteVecExtract]>;
5307  def mr : SS4AIi8<opc, MRMDestMem, (outs),
5308                 (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
5309                 !strconcat(OpcodeStr,
5310                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5311                 [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
5312                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5313}
5314
// pextrq: same opcode (0x16) as pextrd, distinguished by the REX_W modifier.
5315let Predicates = [HasAVX, NoDQI] in
5316  defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, REX_W;
5317
5318defm PEXTRQ      : SS41I_extract64<0x16, "pextrq">, REX_W;
5319
5320/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
5321/// destination
/// The patterns bitcast the v4f32 source to v4i32 (bc_v4i32) and extract an
/// integer element; rr writes GR32orGR64, mr stores to f32mem.
5322multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
5323  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5324                   (ins VR128:$src1, u8imm:$src2),
5325                   !strconcat(OpcodeStr,
5326                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5327                   [(set GR32orGR64:$dst,
5328                      (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
5329                   Sched<[WriteVecExtract]>;
5330  def mr : SS4AIi8<opc, MRMDestMem, (outs),
5331                   (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
5332                   !strconcat(OpcodeStr,
5333                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5334                   [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
5335                            addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5336}
5337
// extractps: both forms are placed in the SSEPackedSingle execution domain;
// the VEX form is additionally gated on UseAVX.
5338let ExeDomain = SSEPackedSingle in {
5339  let Predicates = [UseAVX] in
5340    defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, WIG;
5341  defm EXTRACTPS   : SS41I_extractf32<0x17, "extractps">;
5342}
5343
5344//===----------------------------------------------------------------------===//
5345// SSE4.1 - Insert Instructions
5346//===----------------------------------------------------------------------===//
5347
/// SS41I_insert8 - Insert an 8-bit value into a VR128 at the position given
/// by the u8 immediate $src3, selected from X86pinsrb.  rr takes the value
/// from a GR32orGR64 register, rm from an i8mem operand matched as extloadi8.
///   Is2Addr - 1 selects the asm string without $src1, 0 the one with it.
5348multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
5349  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5350      (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
5351      !if(Is2Addr,
5352        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5353        !strconcat(asm,
5354                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5355      [(set VR128:$dst,
5356        (X86pinsrb VR128:$src1, GR32orGR64:$src2, timm:$src3))]>,
5357      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
5358  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5359      (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
5360      !if(Is2Addr,
5361        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5362        !strconcat(asm,
5363                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5364      [(set VR128:$dst,
5365        (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), timm:$src3))]>,
5366                   Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
5367}
5368
// pinsrb: VEX form (three-operand asm string) under [HasAVX, NoBWI], plus a
// pattern for an any-extended GR8 source: the GR8 is placed in the sub_8bit
// subregister of an IMPLICIT_DEF i32 and fed to VPINSRBrr.
5369let Predicates = [HasAVX, NoBWI] in {
5370  defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V, WIG;
5371  def : Pat<(X86pinsrb VR128:$src1, (i32 (anyext (i8 GR8:$src2))), timm:$src3),
5372            (VPINSRBrr VR128:$src1, (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
5373                       GR8:$src2, sub_8bit), timm:$src3)>;
5374}
5375
// Legacy form: two-operand asm string with $src1 tied to $dst.
5376let Constraints = "$src1 = $dst" in
5377  defm PINSRB  : SS41I_insert8<0x20, "pinsrb">;
5378
/// SS41I_insert32 - Insert a 32-bit value into a v4i32 VR128 at the immediate
/// index $src3, selected from `insertelt`.  rr takes the value from GR32,
/// rm from an i32mem operand matched as loadi32.
///   Is2Addr - 1 selects the asm string without $src1, 0 the one with it.
5379multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
5380  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5381      (ins VR128:$src1, GR32:$src2, u8imm:$src3),
5382      !if(Is2Addr,
5383        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5384        !strconcat(asm,
5385                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5386      [(set VR128:$dst,
5387        (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
5388      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
5389  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5390      (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
5391      !if(Is2Addr,
5392        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5393        !strconcat(asm,
5394                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5395      [(set VR128:$dst,
5396        (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), imm:$src3)))]>,
5397                   Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
5398}
5399
// Dword insert, opcode 0x22.
// VPINSRD: VEX_4V form with the three-operand assembly string (Is2Addr = 0),
//          guarded by [HasAVX, NoDQI].
// PINSRD:  default two-operand assembly string with $dst tied to $src1.
5400let Predicates = [HasAVX, NoDQI] in
5401  defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
5402let Constraints = "$src1 = $dst" in
5403  defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
5404
// SS41I_insert64 - Insert a 64-bit scalar into a v2i64 vector at an immediate
// element index. Mirrors SS41I_insert32 with GR64 / i64mem / loadi64.
//   opc      - opcode byte shared by both forms.
//   asm      - mnemonic prepended to the operand string.
//   Is2Addr  - 1 (default): two-operand assembly string; 0: the string also
//              names $src1.
// Defines:
//   rr - scalar comes from a GR64 register.
//   rm - scalar is loaded (loadi64) from an i64mem operand.
5405multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
  // Register source form.
5406  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5407      (ins VR128:$src1, GR64:$src2, u8imm:$src3),
5408      !if(Is2Addr,
5409        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5410        !strconcat(asm,
5411                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5412      [(set VR128:$dst,
5413        (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
5414      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  // Memory source form; uses the folded-load variants of WriteVecInsert.
5415  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5416      (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
5417      !if(Is2Addr,
5418        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5419        !strconcat(asm,
5420                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5421      [(set VR128:$dst,
5422        (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), imm:$src3)))]>,
5423                   Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
5424}
5425
// Qword insert. Uses the same opcode byte (0x22) as the dword insert; both
// records here additionally carry the REX_W mixin.
// VPINSRQ: VEX_4V form with the three-operand assembly string (Is2Addr = 0),
//          guarded by [HasAVX, NoDQI].
// PINSRQ:  default two-operand assembly string with $dst tied to $src1.
5426let Predicates = [HasAVX, NoDQI] in
5427  defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, REX_W;
5428let Constraints = "$src1 = $dst" in
5429  defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
5430
5431// insertps has a few different modes, there's the first two here below which
5432// are optimized inserts that won't zero arbitrary elements in the destination
5433// vector. The next one matches the intrinsic and could zero arbitrary elements
5434// in the target vector.
// NOTE(review): the multiclass below defines only two records (rr and rm),
// both matching X86insertps; the "first two"/"next one" wording above may
// describe an older layout - confirm before relying on it.
//
// SS41I_insertf32 - insertps register and memory forms.
//   opc      - opcode byte shared by both forms.
//   asm      - mnemonic prepended to the operand string.
//   Is2Addr  - 1 (default): two-operand assembly string; 0: the string also
//              names $src1.
5435multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
  // Register form; isCommutable applies to this record only.
5436  let isCommutable = 1 in
5437  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5438      (ins VR128:$src1, VR128:$src2, u8imm:$src3),
5439      !if(Is2Addr,
5440        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5441        !strconcat(asm,
5442                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5443      [(set VR128:$dst,
5444        (X86insertps VR128:$src1, VR128:$src2, timm:$src3))]>,
5445      Sched<[SchedWriteFShuffle.XMM]>;
  // Memory form: the second source is an f32 load wrapped in scalar_to_vector
  // so it has the v4f32 type X86insertps is matched with.
5446  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5447      (ins VR128:$src1, f32mem:$src2, u8imm:$src3),
5448      !if(Is2Addr,
5449        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5450        !strconcat(asm,
5451                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5452      [(set VR128:$dst,
5453        (X86insertps VR128:$src1,
5454                   (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
5455                    timm:$src3))]>,
5456      Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
5457}
5458
// insertps instantiations, opcode 0x21, both in the SSEPackedSingle domain.
// VINSERTPS: three-operand assembly string (Is2Addr = 0), VEX_4V/WIG mixins,
//            guarded by [UseAVX].
// INSERTPS:  two-operand assembly string (Is2Addr = 1) with $dst tied to
//            $src1.
5459let ExeDomain = SSEPackedSingle in {
5460  let Predicates = [UseAVX] in
5461    defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>,
5462                     VEX_4V, WIG;
5463  let Constraints = "$src1 = $dst" in
5464    defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>;
5465}
5466
5467//===----------------------------------------------------------------------===//
5468// SSE4.1 - Round Instructions
5469//===----------------------------------------------------------------------===//
5470
// sse41_fp_unop_p - Packed FP unary operation that takes an 8-bit immediate.
//   opc       - opcode byte shared by both forms.
//   OpcodeStr - mnemonic prepended to the operand string.
//   x86memop  - memory operand class used by the memory form.
//   RC        - register class of the source and destination.
//   VT        - value type the selection pattern is typed with.
//   mem_frag  - load fragment matched by the memory form.
//   OpNode    - DAG operator matched as (OpNode src, timm).
//   sched     - scheduling class; the memory form uses sched.Folded.
// Both records read MXCSR and are marked as able to raise an FP exception.
5471multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
5472                           X86MemOperand x86memop, RegisterClass RC,
5473                           ValueType VT, PatFrag mem_frag, SDPatternOperator OpNode,
5474                           X86FoldableSchedWrite sched> {
5475  // Both forms below match OpNode with an immediate operand.
5476  // Vector intrinsic operation, reg
5477let Uses = [MXCSR], mayRaiseFPException = 1 in {
5478  def r : SS4AIi8<opc, MRMSrcReg,
5479                  (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
5480                  !strconcat(OpcodeStr,
5481                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5482                  [(set RC:$dst, (VT (OpNode RC:$src1, timm:$src2)))]>,
5483                  Sched<[sched]>;
5484
5485  // Vector intrinsic operation, mem
5486  def m : SS4AIi8<opc, MRMSrcMem,
5487                  (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
5488                  !strconcat(OpcodeStr,
5489                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5490                  [(set RC:$dst,
5491                        (VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>,
5492                  Sched<[sched.Folded]>;
5493} // Uses = [MXCSR], mayRaiseFPException = 1
5494}
5495
// avx_fp_unop_rm - Pattern-less, codegen-only scalar forms operating on
// FR32/FR64 with an extra first source operand and a trailing immediate.
//   opcss     - opcode byte for the "ss" records.
//   opcsd     - opcode byte for the "sd" records.
//   OpcodeStr - mnemonic stem; "ss"/"sd" is appended in the asm string.
//   sched     - scheduling class; memory forms use the folded variants.
// All four records have an empty pattern list ([]), so they are only selected
// through explicit Pat<> definitions elsewhere. This multiclass does not set
// Uses/mayRaiseFPException itself; the visible VROUND instantiation appends
// SIMD_EXC instead.
5496multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
5497                          string OpcodeStr, X86FoldableSchedWrite sched> {
5498let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
5499  def SSr : SS4AIi8<opcss, MRMSrcReg,
5500        (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
5501        !strconcat(OpcodeStr,
5502            "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5503      []>, Sched<[sched]>;
5504
  // No pattern is attached, so the load has to be declared explicitly.
5505  let mayLoad = 1 in
5506  def SSm : SS4AIi8<opcss, MRMSrcMem,
5507        (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
5508        !strconcat(OpcodeStr,
5509             "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5510        []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5511} // ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1
5512
5513let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
5514  def SDr : SS4AIi8<opcsd, MRMSrcReg,
5515        (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
5516        !strconcat(OpcodeStr,
5517              "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5518        []>, Sched<[sched]>;
5519
  // No pattern is attached, so the load has to be declared explicitly.
5520  let mayLoad = 1 in
5521  def SDm : SS4AIi8<opcsd, MRMSrcMem,
5522        (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
5523        !strconcat(OpcodeStr,
5524             "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5525        []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5526} // ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1
5527}
5528
// sse41_fp_unop_s - Pattern-less, codegen-only scalar forms operating on
// FR32/FR64 with a single source operand and a trailing immediate.
//   opcss     - opcode byte for the "ss" records.
//   opcsd     - opcode byte for the "sd" records.
//   OpcodeStr - mnemonic stem; "ss"/"sd" is appended in the asm string.
//   sched     - scheduling class; memory forms use the folded variants.
// All records read MXCSR, are marked as able to raise an FP exception, and
// carry an empty pattern list ([]); selection relies on separate Pat<>
// definitions.
5529multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
5530                           string OpcodeStr, X86FoldableSchedWrite sched> {
5531let Uses = [MXCSR], mayRaiseFPException = 1 in {
5532let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
5533  def SSr : SS4AIi8<opcss, MRMSrcReg,
5534                    (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
5535                    !strconcat(OpcodeStr,
5536                               "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5537                    []>, Sched<[sched]>;
5538
  // No pattern is attached, so the load has to be declared explicitly.
5539  let mayLoad = 1 in
5540  def SSm : SS4AIi8<opcss, MRMSrcMem,
5541                    (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
5542                    !strconcat(OpcodeStr,
5543                               "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5544                    []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5545} // ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1
5546
5547let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
5548  def SDr : SS4AIi8<opcsd, MRMSrcReg,
5549                    (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
5550                    !strconcat(OpcodeStr,
5551                               "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5552                    []>, Sched<[sched]>;
5553
  // No pattern is attached, so the load has to be declared explicitly.
5554  let mayLoad = 1 in
5555  def SDm : SS4AIi8<opcsd, MRMSrcMem,
5556                    (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
5557                    !strconcat(OpcodeStr,
5558                               "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5559                    []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5560} // ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1
5561} // Uses = [MXCSR], mayRaiseFPException = 1
5562} // multiclass sse41_fp_unop_s
5563
// sse41_fp_binop_s - Scalar FP operation on full VR128 operands ("_Int"
// records) with two sources and a trailing immediate.
//   opcss     - opcode byte for the "ss" records.
//   opcsd     - opcode byte for the "sd" records.
//   OpcodeStr - mnemonic stem; "ss"/"sd" is appended in the asm string.
//   sched     - scheduling class; memory forms use the folded variants.
//   VT32/VT64 - value types wrapped around the register-form patterns.
//   OpNode    - DAG node matched as (OpNode src1, src2, timm).
//   Is2Addr   - 1 (default): two-operand assembly string; 0: the string also
//               names $src1.
// All records read MXCSR and are marked as able to raise an FP exception.
// The memory forms match sse_load_f32 / sse_load_f64 on ssmem / sdmem.
5564multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
5565                            string OpcodeStr, X86FoldableSchedWrite sched,
5566                            ValueType VT32, ValueType VT64,
5567                            SDNode OpNode, bit Is2Addr = 1> {
5568let Uses = [MXCSR], mayRaiseFPException = 1 in {
5569let ExeDomain = SSEPackedSingle in {
5570  def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
5571        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
5572        !if(Is2Addr,
5573            !strconcat(OpcodeStr,
5574                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5575            !strconcat(OpcodeStr,
5576                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5577        [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
5578        Sched<[sched]>;
5579
5580  def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
5581        (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
5582        !if(Is2Addr,
5583            !strconcat(OpcodeStr,
5584                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5585            !strconcat(OpcodeStr,
5586                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5587        [(set VR128:$dst,
5588             (OpNode VR128:$src1, (sse_load_f32 addr:$src2), timm:$src3))]>,
5589        Sched<[sched.Folded, sched.ReadAfterFold]>;
5590} // ExeDomain = SSEPackedSingle
5591
5592let ExeDomain = SSEPackedDouble in {
5593  def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
5594        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
5595        !if(Is2Addr,
5596            !strconcat(OpcodeStr,
5597                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5598            !strconcat(OpcodeStr,
5599                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5600        [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
5601        Sched<[sched]>;
5602
5603  def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
5604        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
5605        !if(Is2Addr,
5606            !strconcat(OpcodeStr,
5607                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5608            !strconcat(OpcodeStr,
5609                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5610        [(set VR128:$dst,
5611              (OpNode VR128:$src1, (sse_load_f64 addr:$src2), timm:$src3))]>,
5612        Sched<[sched.Folded, sched.ReadAfterFold]>;
5613} // ExeDomain = SSEPackedDouble
5614} // Uses = [MXCSR], mayRaiseFPException = 1
5615} // multiclass sse41_fp_binop_s
5616
5617// FP round - roundss, roundps, roundsd, roundpd
// Packed VEX forms, guarded by [HasAVX, NoVLX]. The 128-bit records use
// VR128/f128mem with SchedWriteFRnd.XMM; the "Y" records use VR256/f256mem with
// SchedWriteFRnd.YMM and add VEX_L. All match X86any_VRndScale.
// NOTE(review): sse41_fp_unop_p already sets Uses = [MXCSR] and
// mayRaiseFPException = 1 internally, so repeating them in the lets below
// looks redundant - confirm before removing.
5618let Predicates = [HasAVX, NoVLX] in {
5619  let ExeDomain = SSEPackedSingle, Uses = [MXCSR], mayRaiseFPException = 1 in {
5620    // Intrinsic form
5621    defm VROUNDPS  : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32,
5622                                     loadv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>,
5623                                   VEX, WIG;
5624    defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32,
5625                                     loadv8f32, X86any_VRndScale, SchedWriteFRnd.YMM>,
5626                                   VEX, VEX_L, WIG;
5627  }
5628
5629  let ExeDomain = SSEPackedDouble, Uses = [MXCSR], mayRaiseFPException = 1 in {
5630    defm VROUNDPD  : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64,
5631                                     loadv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>,
5632                                   VEX, WIG;
5633    defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64,
5634                                     loadv4f64, X86any_VRndScale, SchedWriteFRnd.YMM>,
5635                                   VEX, VEX_L, WIG;
5636  }
5637}
// Scalar VEX round records, guarded by [UseAVX]. The same "VROUND" prefix is
// instantiated twice:
//   - sse41_fp_binop_s yields the VR128 "_Int" records matching X86RndScales
//     (Is2Addr = 0, three-operand assembly string);
//   - avx_fp_unop_rm yields the pattern-less FR32/FR64 records.
// Both carry the VEX_4V, VEX_LIG, WIG and SIMD_EXC mixins.
5638let Predicates = [UseAVX] in {
5639  defm VROUND  : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
5640                                  v4f32, v2f64, X86RndScales, 0>,
5641                                  VEX_4V, VEX_LIG, WIG, SIMD_EXC;
5642  defm VROUND  : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>,
5643                                VEX_4V, VEX_LIG, WIG, SIMD_EXC;
5644}
5645
// Selection patterns mapping scalar X86any_VRndScale onto the pattern-less
// VROUNDSS/VROUNDSD records. The extra first source operand of those records
// is filled with IMPLICIT_DEF (an undefined value).
5646let Predicates = [UseAVX] in {
5647  def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
5648            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>;
5649  def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
5650            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>;
5651}
5652
// Load-folding variants; only enabled together with OptForSize.
5653let Predicates = [UseAVX, OptForSize] in {
5654  def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
5655            (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
5656  def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
5657            (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
5658}
5659
// Non-VEX round records.
// ROUNDPS/ROUNDPD: packed forms on VR128/f128mem using the memopv4f32 /
//                  memopv2f64 fragments, matching X86any_VRndScale.
5660let ExeDomain = SSEPackedSingle in
5661defm ROUNDPS  : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32,
5662                                memopv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>;
5663let ExeDomain = SSEPackedDouble in
5664defm ROUNDPD  : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64,
5665                                memopv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>;
5666
// Pattern-less FR32/FR64 scalar records (ROUNDSSr/m, ROUNDSDr/m).
5667defm ROUND  : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>;
5668
// VR128 "_Int" scalar records matching X86RndScales; $dst is tied to $src1 and
// the default two-operand assembly string is used.
5669let Constraints = "$src1 = $dst" in
5670defm ROUND  : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
5671                               v4f32, v2f64, X86RndScales>;
5672
// Selection patterns mapping scalar X86any_VRndScale onto the pattern-less
// ROUNDSS/ROUNDSD records. These records take a single source, so no
// IMPLICIT_DEF operand is needed here.
5673let Predicates = [UseSSE41] in {
5674  def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
5675            (ROUNDSSr FR32:$src1, timm:$src2)>;
5676  def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
5677            (ROUNDSDr FR64:$src1, timm:$src2)>;
5678}
5679
// Load-folding variants; only enabled together with OptForSize.
5680let Predicates = [UseSSE41, OptForSize] in {
5681  def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
5682            (ROUNDSSm addr:$src1, timm:$src2)>;
5683  def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
5684            (ROUNDSDm addr:$src1, timm:$src2)>;
5685}
5686
5687//===----------------------------------------------------------------------===//
5688// SSE4.1 - Packed Bit Test
5689//===----------------------------------------------------------------------===//
5690
5691// X86ISelLowering lowers to the ptest node matched by these records,
5692// primarily from the corresponding Intel intrinsic.
// VEX forms, guarded by [HasAVX]. The records have no register outputs; their
// only result is EFLAGS, set from X86ptest.
// NOTE(review): VPTESTrm uses f128mem while VPTESTYrm uses i256mem; the operand
// class families differ between the two widths - confirm whether intentional.
5693let Defs = [EFLAGS], Predicates = [HasAVX] in {
5694def VPTESTrr  : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
5695                "vptest\t{$src2, $src1|$src1, $src2}",
5696                [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
5697                Sched<[SchedWriteVecTest.XMM]>, VEX, WIG;
5698def VPTESTrm  : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
5699                "vptest\t{$src2, $src1|$src1, $src2}",
5700                [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
5701                Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>,
5702                VEX, WIG;
5703
5704def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
5705                "vptest\t{$src2, $src1|$src1, $src2}",
5706                [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
5707                Sched<[SchedWriteVecTest.YMM]>, VEX, VEX_L, WIG;
5708def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
5709                "vptest\t{$src2, $src1|$src1, $src2}",
5710                [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
5711                Sched<[SchedWriteVecTest.YMM.Folded, SchedWriteVecTest.YMM.ReadAfterFold]>,
5712                VEX, VEX_L, WIG;
5713}
5714
// Non-VEX ptest, opcode 0x17. No register outputs; the only result is EFLAGS,
// set from X86ptest. The memory form matches the memopv2i64 fragment (the VEX
// records above use loadv2i64 instead).
5715let Defs = [EFLAGS] in {
5716def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
5717              "ptest\t{$src2, $src1|$src1, $src2}",
5718              [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
5719              Sched<[SchedWriteVecTest.XMM]>;
5720def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
5721              "ptest\t{$src2, $src1|$src1, $src2}",
5722              [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
5723              Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>;
5724}
5725
5726// The bit test instructions below are AVX only
// avx_bittest - Register/register and register/memory records that produce
// only EFLAGS from X86testp. Both records carry the VEX mixin.
//   opc       - opcode byte shared by both forms.
//   OpcodeStr - full mnemonic.
//   RC        - register class of both vector operands.
//   x86memop  - memory operand class for the rm form.
//   mem_frag  - load fragment matched by the rm form.
//   vt        - value type the second register operand is typed with.
//   sched     - scheduling class; rm uses the folded variants.
// The patterns set EFLAGS, but Defs = [EFLAGS] is not set here; the visible
// instantiations supply it through an enclosing let.
5727multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
5728                       X86MemOperand x86memop, PatFrag mem_frag, ValueType vt,
5729                       X86FoldableSchedWrite sched> {
5730  def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
5731            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
5732            [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
5733            Sched<[sched]>, VEX;
5734  def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
5735            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
5736            [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
5737            Sched<[sched.Folded, sched.ReadAfterFold]>, VEX;
5738}
5739
// vtestps (opcode 0x0E) and vtestpd (opcode 0x0F) instantiations, guarded by
// [HasAVX] and defining EFLAGS. The "Y" records use VR256/f256mem with the YMM
// scheduling class and add VEX_L.
5740let Defs = [EFLAGS], Predicates = [HasAVX] in {
5741let ExeDomain = SSEPackedSingle in {
5742defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32,
5743                            SchedWriteFTest.XMM>;
5744defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32,
5745                            SchedWriteFTest.YMM>, VEX_L;
5746} // ExeDomain = SSEPackedSingle
5747let ExeDomain = SSEPackedDouble in {
5748defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64,
5749                            SchedWriteFTest.XMM>;
5750defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64,
5751                            SchedWriteFTest.YMM>, VEX_L;
5752} // ExeDomain = SSEPackedDouble
5753} // Defs = [EFLAGS], Predicates = [HasAVX]
5754
5755//===----------------------------------------------------------------------===//
5756// SSE4.1 - Misc Instructions
5757//===----------------------------------------------------------------------===//
5758
// popcnt for 16-, 32- and 64-bit general-purpose registers, guarded by
// [HasPOPCNT]. Every record uses opcode 0xB8 with the XS mixin and matches the
// generic ctpop node; EFLAGS is listed as an implicit definition in each
// pattern. The 16/32-bit records select operand size with OpSize16/OpSize32,
// while the 64-bit records use the RI class instead of I.
5759let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
  // 16-bit forms.
5760  def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
5761                     "popcnt{w}\t{$src, $dst|$dst, $src}",
5762                     [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
5763                     Sched<[WritePOPCNT]>, OpSize16, XS;
5764  def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
5765                     "popcnt{w}\t{$src, $dst|$dst, $src}",
5766                     [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
5767                      (implicit EFLAGS)]>,
5768                      Sched<[WritePOPCNT.Folded]>, OpSize16, XS;
5769
  // 32-bit forms.
5770  def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
5771                     "popcnt{l}\t{$src, $dst|$dst, $src}",
5772                     [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>,
5773                     Sched<[WritePOPCNT]>, OpSize32, XS;
5774
5775  def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
5776                     "popcnt{l}\t{$src, $dst|$dst, $src}",
5777                     [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
5778                      (implicit EFLAGS)]>,
5779                      Sched<[WritePOPCNT.Folded]>, OpSize32, XS;
5780
  // 64-bit forms.
5781  def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
5782                      "popcnt{q}\t{$src, $dst|$dst, $src}",
5783                      [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>,
5784                      Sched<[WritePOPCNT]>, XS;
5785  def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
5786                      "popcnt{q}\t{$src, $dst|$dst, $src}",
5787                      [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
5788                       (implicit EFLAGS)]>,
5789                       Sched<[WritePOPCNT.Folded]>, XS;
5790}
5791
5792// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
//   opc       - opcode byte shared by both forms.
//   OpcodeStr - full mnemonic.
//   OpNode    - DAG node applied to the v8i16 source.
//   ld_frag   - load fragment matched by the rm form.
//   Sched     - scheduling class; rm uses Sched.Folded. Note that this
//               template argument has the same name as the Sched<...> mixin
//               it is passed to.
5793multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
5794                                 SDNode OpNode, PatFrag ld_frag,
5795                                 X86FoldableSchedWrite Sched> {
5796  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
5797                 (ins VR128:$src),
5798                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5799                 [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>,
5800                 Sched<[Sched]>;
5801  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
5802                  (ins i128mem:$src),
5803                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5804                  [(set VR128:$dst,
5805                    (v8i16 (OpNode (ld_frag addr:$src))))]>,
5806                 Sched<[Sched.Folded]>;
5807}
5808
5809// PHMIN has the same profile as PSAD, thus we use the same scheduling
5810// model, although the naming is misleading.
// NOTE(review): both records below pass WritePHMINPOS, a class named after
// this instruction, so the remark above about reusing the PSAD model looks
// outdated - confirm and drop it if so.
// VPHMINPOSUW: VEX form using the `load` fragment, guarded by [HasAVX].
// PHMINPOSUW:  non-VEX form using the `memop` fragment.
5811let Predicates = [HasAVX] in
5812defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw",
5813                                         X86phminpos, load,
5814                                         WritePHMINPOS>, VEX, WIG;
5815defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
5816                                         X86phminpos, memop,
5817                                         WritePHMINPOS>;
5818
5819/// SS48I_binop_rm - Simple SSE41 binary operator.
///   opc        - opcode byte shared by both forms.
///   OpcodeStr  - full mnemonic.
///   OpNode     - DAG node matched as (OpNode src1, src2).
///   OpVT       - value type the result is typed with.
///   RC         - register class of sources and destination.
///   memop_frag - load fragment matched by the rm form.
///   x86memop   - memory operand class for the rm form.
///   sched      - scheduling class; rm uses the folded variants.
///   Is2Addr    - 1 (default): two-operand assembly string; 0: the string also
///                names $src1.
/// Only the rr record is marked commutable.
5820multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
5821                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
5822                          X86MemOperand x86memop, X86FoldableSchedWrite sched,
5823                          bit Is2Addr = 1> {
5824  let isCommutable = 1 in
5825  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
5826       (ins RC:$src1, RC:$src2),
5827       !if(Is2Addr,
5828           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5829           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5830       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
5831       Sched<[sched]>;
5832  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
5833       (ins RC:$src1, x86memop:$src2),
5834       !if(Is2Addr,
5835           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5836           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5837       [(set RC:$dst,
5838         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
5839       Sched<[sched.Folded, sched.ReadAfterFold]>;
5840}
5841
// VEX min/max/pmuldq records on VR128/i128mem. All use the `load` fragment,
// Is2Addr = 0 and the VEX_4V/WIG mixins.
// Dword min/max and pmuldq are guarded by [HasAVX, NoVLX].
5842let Predicates = [HasAVX, NoVLX] in {
5843  defm VPMINSD   : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
5844                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5845                                  VEX_4V, WIG;
5846  defm VPMINUD   : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
5847                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5848                                  VEX_4V, WIG;
5849  defm VPMAXSD   : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
5850                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5851                                  VEX_4V, WIG;
5852  defm VPMAXUD   : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
5853                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5854                                  VEX_4V, WIG;
5855  defm VPMULDQ   : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128,
5856                                  load, i128mem, SchedWriteVecIMul.XMM, 0>,
5857                                  VEX_4V, WIG;
5858}
// Byte and word min/max use the [HasAVX, NoVLX_Or_NoBWI] guard instead.
5859let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5860  defm VPMINSB   : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
5861                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5862                                  VEX_4V, WIG;
5863  defm VPMINUW   : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
5864                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5865                                  VEX_4V, WIG;
5866  defm VPMAXSB   : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
5867                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5868                                  VEX_4V, WIG;
5869  defm VPMAXUW   : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
5870                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5871                                  VEX_4V, WIG;
5872}
5873
// VEX min/max/pmuldq records on VR256/i256mem. They reuse the opcodes and
// mnemonics of the VR128 records, switch to the YMM scheduling classes and add
// VEX_L.
// Dword min/max and pmuldq are guarded by [HasAVX2, NoVLX].
5874let Predicates = [HasAVX2, NoVLX] in {
5875  defm VPMINSDY  : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
5876                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5877                                  VEX_4V, VEX_L, WIG;
5878  defm VPMINUDY  : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
5879                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5880                                  VEX_4V, VEX_L, WIG;
5881  defm VPMAXSDY  : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
5882                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5883                                  VEX_4V, VEX_L, WIG;
5884  defm VPMAXUDY  : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
5885                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5886                                  VEX_4V, VEX_L, WIG;
5887  defm VPMULDQY  : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256,
5888                                  load, i256mem, SchedWriteVecIMul.YMM, 0>,
5889                                  VEX_4V, VEX_L, WIG;
5890}
// Byte and word min/max use the [HasAVX2, NoVLX_Or_NoBWI] guard instead.
5891let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
5892  defm VPMINSBY  : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
5893                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5894                                  VEX_4V, VEX_L, WIG;
5895  defm VPMINUWY  : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
5896                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5897                                  VEX_4V, VEX_L, WIG;
5898  defm VPMAXSBY  : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
5899                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5900                                  VEX_4V, VEX_L, WIG;
5901  defm VPMAXUWY  : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
5902                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5903                                  VEX_4V, VEX_L, WIG;
5904}
5905
// Non-VEX min/max/pmuldq records. $dst is tied to $src1, the two-operand
// assembly string is used (Is2Addr = 1) and memory forms match the `memop`
// fragment rather than `load`.
5906let Constraints = "$src1 = $dst" in {
5907  defm PMINSB   : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
5908                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5909  defm PMINSD   : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
5910                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5911  defm PMINUD   : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
5912                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5913  defm PMINUW   : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
5914                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5915  defm PMAXSB   : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
5916                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5917  defm PMAXSD   : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
5918                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5919  defm PMAXUD   : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
5920                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5921  defm PMAXUW   : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
5922                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5923  defm PMULDQ   : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128,
5924                                 memop, i128mem, SchedWriteVecIMul.XMM, 1>;
5925}
5926
// VEX vpmulld (opcode 0x40, generic `mul`) and vpcmpeqq (opcode 0x29,
// X86pcmpeq). The vpmulld records additionally require NoVLX; the vpcmpeqq
// records are guarded only by HasAVX / HasAVX2.
5927let Predicates = [HasAVX, NoVLX] in
5928  defm VPMULLD  : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
5929                                 load, i128mem, SchedWritePMULLD.XMM, 0>,
5930                                 VEX_4V, WIG;
5931let Predicates = [HasAVX] in
5932  defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
5933                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
5934                                 VEX_4V, WIG;
5935
// VR256/i256mem counterparts; same opcodes and mnemonics, plus VEX_L.
5936let Predicates = [HasAVX2, NoVLX] in
5937  defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
5938                                  load, i256mem, SchedWritePMULLD.YMM, 0>,
5939                                  VEX_4V, VEX_L, WIG;
5940let Predicates = [HasAVX2] in
5941  defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
5942                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5943                                  VEX_4V, VEX_L, WIG;
5944
// Non-VEX pmulld / pcmpeqq: $dst tied to $src1, two-operand assembly string
// (Is2Addr = 1), memory forms matching the `memop` fragment.
5945let Constraints = "$src1 = $dst" in {
5946  defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
5947                                memop, i128mem, SchedWritePMULLD.XMM, 1>;
5948  defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
5949                                memop, i128mem, SchedWriteVecALU.XMM, 1>;
5950}
5951
5952/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
///   opc        - opcode byte shared by both forms.
///   OpcodeStr  - full mnemonic.
///   IntId      - intrinsic matched as (IntId src1, src2, timm).
///   RC         - register class of sources and destination.
///   memop_frag - load fragment matched by the rmi form.
///   x86memop   - memory operand class for the rmi form.
///   Is2Addr    - 1: two-operand assembly string; 0: the string also names
///                $src1. No default is provided here.
///   sched      - scheduling class; rmi uses the folded variants.
/// Only the rri record is marked commutable.
5953multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
5954                 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
5955                 X86MemOperand x86memop, bit Is2Addr,
5956                 X86FoldableSchedWrite sched> {
5957  let isCommutable = 1 in
5958  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
5959        (ins RC:$src1, RC:$src2, u8imm:$src3),
5960        !if(Is2Addr,
5961            !strconcat(OpcodeStr,
5962                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5963            !strconcat(OpcodeStr,
5964                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5965        [(set RC:$dst, (IntId RC:$src1, RC:$src2, timm:$src3))]>,
5966        Sched<[sched]>;
5967  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
5968        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
5969        !if(Is2Addr,
5970            !strconcat(OpcodeStr,
5971                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5972            !strconcat(OpcodeStr,
5973                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5974        [(set RC:$dst,
5975          (IntId RC:$src1, (memop_frag addr:$src2), timm:$src3))]>,
5976        Sched<[sched.Folded, sched.ReadAfterFold]>;
5977}
5978
5979/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
/// Same shape as SS41I_binop_rmi_int, but matches an SDNode and wraps the
/// result in an explicit value type.
///   opc        - opcode byte shared by both forms.
///   OpcodeStr  - full mnemonic.
///   OpNode     - DAG node matched as (OpNode src1, src2, timm).
///   OpVT       - value type the result is typed with.
///   RC         - register class of sources and destination.
///   memop_frag - load fragment matched by the rmi form.
///   x86memop   - memory operand class for the rmi form.
///   Is2Addr    - 1: two-operand assembly string; 0: the string also names
///                $src1. No default is provided here.
///   sched      - scheduling class; rmi uses the folded variants.
/// Only the rri record is marked commutable.
5980multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
5981                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
5982                           X86MemOperand x86memop, bit Is2Addr,
5983                           X86FoldableSchedWrite sched> {
5984  let isCommutable = 1 in
5985  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
5986        (ins RC:$src1, RC:$src2, u8imm:$src3),
5987        !if(Is2Addr,
5988            !strconcat(OpcodeStr,
5989                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5990            !strconcat(OpcodeStr,
5991                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5992        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
5993        Sched<[sched]>;
5994  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
5995        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
5996        !if(Is2Addr,
5997            !strconcat(OpcodeStr,
5998                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5999            !strconcat(OpcodeStr,
6000                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6001        [(set RC:$dst,
6002          (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
6003        Sched<[sched.Folded, sched.ReadAfterFold]>;
6004}
6005
// Immediate transform for a 2-bit blend mask: keeps only the low 2 bits of
// the immediate and inverts them (XOR with 0x03).  Passed as commuteXForm to
// SS41I_blend_rmi, which applies it when the two sources are swapped.
6006def BlendCommuteImm2 : SDNodeXForm<timm, [{
6007  uint8_t Imm = N->getZExtValue() & 0x03;
6008  return getI8Imm(Imm ^ 0x03, SDLoc(N));
6009}]>;
6010
// Immediate transform for a 4-bit blend mask: keeps only the low 4 bits of
// the immediate and inverts them (XOR with 0x0f).  Used where a blend's two
// sources are swapped so that a load ends up in the memory operand.
6011def BlendCommuteImm4 : SDNodeXForm<timm, [{
6012  uint8_t Imm = N->getZExtValue() & 0x0f;
6013  return getI8Imm(Imm ^ 0x0f, SDLoc(N));
6014}]>;
6015
// Immediate transform for an 8-bit blend mask: keeps the low 8 bits of the
// immediate and inverts all of them (XOR with 0xff).  Used where a blend's
// two sources are swapped so that a load ends up in the memory operand.
6016def BlendCommuteImm8 : SDNodeXForm<timm, [{
6017  uint8_t Imm = N->getZExtValue() & 0xff;
6018  return getI8Imm(Imm ^ 0xff, SDLoc(N));
6019}]>;
6020
6021// Turn a 4-bit blendi immediate to 8-bit for use with pblendw.
// Only input bits 0..3 are examined; each set bit i becomes the adjacent bit
// pair (0x3) at position 2*i in the result, e.g. 0b0101 -> 0b00110011.
6022def BlendScaleImm4 : SDNodeXForm<timm, [{
6023  uint8_t Imm = N->getZExtValue();
6024  uint8_t NewImm = 0;
6025  for (unsigned i = 0; i != 4; ++i) {
6026    if (Imm & (1 << i))
6027      NewImm |= 0x3 << (i * 2);
6028  }
6029  return getI8Imm(NewImm, SDLoc(N));
6030}]>;
6031
6032// Turn a 2-bit blendi immediate to 8-bit for use with pblendw.
// Only input bits 0..1 are examined; each set bit i becomes a nibble (0xf)
// at position 4*i in the result, e.g. 0b10 -> 0xf0.
6033def BlendScaleImm2 : SDNodeXForm<timm, [{
6034  uint8_t Imm = N->getZExtValue();
6035  uint8_t NewImm = 0;
6036  for (unsigned i = 0; i != 2; ++i) {
6037    if (Imm & (1 << i))
6038      NewImm |= 0xf << (i * 4);
6039  }
6040  return getI8Imm(NewImm, SDLoc(N));
6041}]>;
6042
6043// Turn a 2-bit blendi immediate to 4-bit for use with pblendd.
// Only input bits 0..1 are examined; each set bit i becomes the bit pair
// (0x3) at position 2*i in the result, e.g. 0b10 -> 0b1100.
6044def BlendScaleImm2to4 : SDNodeXForm<timm, [{
6045  uint8_t Imm = N->getZExtValue();
6046  uint8_t NewImm = 0;
6047  for (unsigned i = 0; i != 2; ++i) {
6048    if (Imm & (1 << i))
6049      NewImm |= 0x3 << (i * 2);
6050  }
6051  return getI8Imm(NewImm, SDLoc(N));
6052}]>;
6053
6054// Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it.
// Performs the same widening as BlendScaleImm4 (bit i -> bit pair at 2*i),
// then XORs the 8-bit result with 0xff.  Used by the patterns that swap the
// blend sources to fold a load.
6055def BlendScaleCommuteImm4 : SDNodeXForm<timm, [{
6056  uint8_t Imm = N->getZExtValue();
6057  uint8_t NewImm = 0;
6058  for (unsigned i = 0; i != 4; ++i) {
6059    if (Imm & (1 << i))
6060      NewImm |= 0x3 << (i * 2);
6061  }
6062  return getI8Imm(NewImm ^ 0xff, SDLoc(N));
6063}]>;
6064
6065// Turn a 2-bit blendi immediate to 8-bit for use with pblendw and invert it.
// Performs the same widening as BlendScaleImm2 (bit i -> nibble at 4*i),
// then XORs the 8-bit result with 0xff.  Used by the patterns that swap the
// blend sources to fold a load.
6066def BlendScaleCommuteImm2 : SDNodeXForm<timm, [{
6067  uint8_t Imm = N->getZExtValue();
6068  uint8_t NewImm = 0;
6069  for (unsigned i = 0; i != 2; ++i) {
6070    if (Imm & (1 << i))
6071      NewImm |= 0xf << (i * 4);
6072  }
6073  return getI8Imm(NewImm ^ 0xff, SDLoc(N));
6074}]>;
6075
6076// Turn a 2-bit blendi immediate to 4-bit for use with pblendd and invert it.
// Performs the same widening as BlendScaleImm2to4 (bit i -> bit pair at 2*i),
// then XORs the result with 0xf, so only the low 4 bits are inverted.
6077def BlendScaleCommuteImm2to4 : SDNodeXForm<timm, [{
6078  uint8_t Imm = N->getZExtValue();
6079  uint8_t NewImm = 0;
6080  for (unsigned i = 0; i != 2; ++i) {
6081    if (Imm & (1 << i))
6082      NewImm |= 0x3 << (i * 2);
6083  }
6084  return getI8Imm(NewImm ^ 0xf, SDLoc(N));
6085}]>;
6086
// HasAVX instantiations of SS41I_binop_rmi_int: VMPSADBW, VDPPS and VDPPD on
// VR128 plus VDPPSY on VR256.  All pass Is2Addr = 0 (three-operand asm
// string), use the plain load fragment, and are tagged VEX_4V.
6087let Predicates = [HasAVX] in {
  // NOTE(review): SS41I_binop_rmi_int marks rri commutable; this outer let is
  // intended to make VMPSADBW non-commutable - confirm let precedence.
6088  let isCommutable = 0 in {
6089    defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
6090                                        VR128, load, i128mem, 0,
6091                                        SchedWriteMPSAD.XMM>, VEX_4V, WIG;
6092  }
6093
// The dot-product instructions read MXCSR and may raise FP exceptions.
6094let Uses = [MXCSR], mayRaiseFPException = 1 in {
6095  let ExeDomain = SSEPackedSingle in
6096  defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
6097                                   VR128, load, f128mem, 0,
6098                                   SchedWriteDPPS.XMM>, VEX_4V, WIG;
6099  let ExeDomain = SSEPackedDouble in
6100  defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
6101                                   VR128, load, f128mem, 0,
6102                                   SchedWriteDPPD.XMM>, VEX_4V, WIG;
6103  let ExeDomain = SSEPackedSingle in
6104  defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
6105                                    VR256, load, i256mem, 0,
6106                                    SchedWriteDPPS.YMM>, VEX_4V, VEX_L, WIG;
6107}
6108}
6109
// HasAVX2 instantiation: VMPSADBWY on VR256 (tagged VEX_L), three-operand
// asm string (Is2Addr = 0), bound to the int_x86_avx2_mpsadbw intrinsic.
6110let Predicates = [HasAVX2] in {
  // NOTE(review): intended to override the commutable rri from the
  // multiclass - confirm let precedence.
6111  let isCommutable = 0 in {
6112  defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
6113                                  VR256, load, i256mem, 0,
6114                                  SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, WIG;
6115  }
6116}
6117
// Non-VEX instantiations of SS41I_binop_rmi_int: MPSADBW, DPPS and DPPD on
// VR128.  The destination is tied to $src1, Is2Addr = 1 selects the
// two-operand asm string, and the memop fragment is used for the load form.
6118let Constraints = "$src1 = $dst" in {
  // NOTE(review): intended to override the commutable rri from the
  // multiclass - confirm let precedence.
6119  let isCommutable = 0 in {
6120  defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
6121                                     VR128, memop, i128mem, 1,
6122                                     SchedWriteMPSAD.XMM>;
6123  }
6124
6125  let ExeDomain = SSEPackedSingle in
6126  defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
6127                                  VR128, memop, f128mem, 1,
6128                                  SchedWriteDPPS.XMM>, SIMD_EXC;
6129  let ExeDomain = SSEPackedDouble in
6130  defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
6131                                  VR128, memop, f128mem, 1,
6132                                  SchedWriteDPPD.XMM>, SIMD_EXC;
6133}
6134
6135/// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate
///
/// Defines rri (MRMSrcReg) and rmi (MRMSrcMem) records that select
/// (OpVT (OpNode $src1, $src2, timm:$src3)), plus an anonymous pattern that
/// handles a load appearing as the first source.
///
/// Is2Addr selects the two-operand asm string and also ties $dst to $src1;
/// with Is2Addr = 0 the three-operand string is used and no constraint is set.
/// d is the execution domain applied to both records.  commuteXForm is the
/// immediate transform used by the swapped-operand pattern.
6136multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
6137                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
6138                           X86MemOperand x86memop, bit Is2Addr, Domain d,
6139                           X86FoldableSchedWrite sched, SDNodeXForm commuteXForm> {
6140let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
  // Register-register form; only this form is marked commutable.
6141  let isCommutable = 1 in
6142  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
6143        (ins RC:$src1, RC:$src2, u8imm:$src3),
6144        !if(Is2Addr,
6145            !strconcat(OpcodeStr,
6146                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6147            !strconcat(OpcodeStr,
6148                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6149        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
6150        Sched<[sched]>;
  // Register-memory form; the load must be the second source.
6151  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
6152        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
6153        !if(Is2Addr,
6154            !strconcat(OpcodeStr,
6155                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6156            !strconcat(OpcodeStr,
6157                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6158        [(set RC:$dst,
6159          (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
6160        Sched<[sched.Folded, sched.ReadAfterFold]>;
6161}
6162
6163  // Pattern to commute if load is in first source.
  // The register operand becomes $src1 of rmi, the load address becomes
  // $src2, and the immediate is rewritten through commuteXForm.
6164  def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, timm:$src3)),
6165            (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
6166                                            (commuteXForm timm:$src3))>;
6167}
6168
// HasAVX immediate blends (three-operand form, Is2Addr = 0).  The commute
// transform width matches the element count of the value type: 4 bits for
// v4f32 and v4f64, 8 bits for v8f32 and v8i16, 2 bits for v2f64.
6169let Predicates = [HasAVX] in {
6170  defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
6171                                  VR128, load, f128mem, 0, SSEPackedSingle,
6172                                  SchedWriteFBlend.XMM, BlendCommuteImm4>,
6173                                  VEX_4V, WIG;
6174  defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
6175                                   VR256, load, f256mem, 0, SSEPackedSingle,
6176                                   SchedWriteFBlend.YMM, BlendCommuteImm8>,
6177                                   VEX_4V, VEX_L, WIG;
6178  defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
6179                                  VR128, load, f128mem, 0, SSEPackedDouble,
6180                                  SchedWriteFBlend.XMM, BlendCommuteImm2>,
6181                                  VEX_4V, WIG;
6182  defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
6183                                   VR256, load, f256mem, 0, SSEPackedDouble,
6184                                   SchedWriteFBlend.YMM, BlendCommuteImm4>,
6185                                   VEX_4V, VEX_L, WIG;
6186  defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
6187                                  VR128, load, i128mem, 0, SSEPackedInt,
6188                                  SchedWriteBlend.XMM, BlendCommuteImm8>,
6189                                  VEX_4V, WIG;
6190}
6191
// HasAVX2 256-bit word blend: v16i16 on VR256, three-operand form, with the
// 8-bit commute transform.
6192let Predicates = [HasAVX2] in {
6193  defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
6194                                   VR256, load, i256mem, 0, SSEPackedInt,
6195                                   SchedWriteBlend.YMM, BlendCommuteImm8>,
6196                                   VEX_4V, VEX_L, WIG;
6197}
6198
6199// Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw.
6200// ExecutionDomainFixPass will clean up domains later on.
// Each type gets three patterns: register/register, load as second source,
// and load as first source (operands swapped, immediate commuted).
6201let Predicates = [HasAVX1Only] in {
// v4i64 -> VBLENDPDY, immediate used unchanged (4 elements, 4 bits).
6202def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
6203          (VBLENDPDYrri VR256:$src1, VR256:$src2, timm:$src3)>;
6204def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
6205          (VBLENDPDYrmi VR256:$src1, addr:$src2, timm:$src3)>;
6206def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
6207          (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 timm:$src3))>;
6208
6209// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6210// it from becoming movsd via commuting under optsize.
// The 2-bit immediate is widened to pblendw's 8-bit mask.
6211def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
6212          (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
6213def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
6214          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
6215def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
6216          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;
6217
// v8i32 -> VBLENDPSY, immediate used unchanged (8 elements, 8 bits).
6218def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), timm:$src3),
6219          (VBLENDPSYrri VR256:$src1, VR256:$src2, timm:$src3)>;
6220def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), timm:$src3),
6221          (VBLENDPSYrmi VR256:$src1, addr:$src2, timm:$src3)>;
6222def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, timm:$src3),
6223          (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 timm:$src3))>;
6224
6225// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6226// it from becoming movss via commuting under optsize.
// The 4-bit immediate is widened to pblendw's 8-bit mask.
6227def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
6228          (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
6229def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), timm:$src3),
6230          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
6231def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, timm:$src3),
6232          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
6233}
6234
// Non-VEX immediate blends on VR128.  Is2Addr = 1, so SS41I_blend_rmi ties
// $dst to $src1 and uses the two-operand asm string; the memop fragment is
// used for the load form.
6235defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
6236                               VR128, memop, f128mem, 1, SSEPackedSingle,
6237                               SchedWriteFBlend.XMM, BlendCommuteImm4>;
6238defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64,
6239                               VR128, memop, f128mem, 1, SSEPackedDouble,
6240                               SchedWriteFBlend.XMM, BlendCommuteImm2>;
6241defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
6242                               VR128, memop, i128mem, 1, SSEPackedInt,
6243                               SchedWriteBlend.XMM, BlendCommuteImm8>;
6244
6245let Predicates = [UseSSE41] in {
6246// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6247// it from becoming movsd/movss via commuting under optsize.
// v2i64: the 2-bit immediate is widened to pblendw's 8-bit mask.
6248def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
6249          (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
6250def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), timm:$src3),
6251          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
6252def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, timm:$src3),
6253          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;
6254
// v4i32: the 4-bit immediate is widened to pblendw's 8-bit mask.
6255def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
6256          (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
6257def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), timm:$src3),
6258          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
6259def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, timm:$src3),
6260          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
6261}
6262
6263// For insertion into the zero index (low half) of a 256-bit vector, it is
6264// more efficient to generate a blend with immediate instead of an insert*128.
6265let Predicates = [HasAVX] in {
// Register base vector: the 128-bit value is widened with INSERT_SUBREG into
// an undefined 256-bit register and blended in as the second source; the
// immediate selects the low half of the elements (0x3 of 4, 0xf of 8).
6266def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)),
6267          (VBLENDPDYrri VR256:$src1,
6268                        (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
6269                                       VR128:$src2, sub_xmm), 0x3)>;
6270def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
6271          (VBLENDPSYrri VR256:$src1,
6272                        (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
6273                                       VR128:$src2, sub_xmm), 0xf)>;
6274
// Loaded base vector: the operands are swapped so the load is the memory
// (second) source, hence the immediate selects the high half instead
// (0xc of 4, 0xf0 of 8).
6275def : Pat<(insert_subvector (loadv4f64 addr:$src2), (v2f64 VR128:$src1), (iPTR 0)),
6276          (VBLENDPDYrmi (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
6277                                       VR128:$src1, sub_xmm), addr:$src2, 0xc)>;
6278def : Pat<(insert_subvector (loadv8f32 addr:$src2), (v4f32 VR128:$src1), (iPTR 0)),
6279          (VBLENDPSYrmi (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
6280                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
6281}
6282
6283/// SS41I_quaternary_avx - AVX SSE 4.1 with 4 operands
///
/// Defines rr (MRMSrcReg) and rm (MRMSrcMem) records with one result and
/// three sources, all tagged TAPD and VEX_4V and placed in SSEPackedInt.
/// The selection pattern passes the sources to OpNode in reverse order:
/// (OpNode $src3, $src2, $src1).  In the rm form $src2 is loaded via mem_frag.
6284multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC,
6285                                X86MemOperand x86memop, ValueType VT,
6286                                PatFrag mem_frag, SDNode OpNode,
6287                                X86FoldableSchedWrite sched> {
6288  def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
6289                  (ins RC:$src1, RC:$src2, RC:$src3),
6290                  !strconcat(OpcodeStr,
6291                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
6292                  [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))],
6293                  SSEPackedInt>, TAPD, VEX_4V,
6294                Sched<[sched]>;
6295
  // The Sched list below has one read entry per operand: $src1, five
  // ReadDefault entries for the memory operand, then $src3.
6296  def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
6297                  (ins RC:$src1, x86memop:$src2, RC:$src3),
6298                  !strconcat(OpcodeStr,
6299                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
6300                  [(set RC:$dst,
6301                        (OpNode RC:$src3, (mem_frag addr:$src2),
6302                                RC:$src1))], SSEPackedInt>, TAPD, VEX_4V,
6303                Sched<[sched.Folded, sched.ReadAfterFold,
6304                       // x86memop:$src2
6305                       ReadDefault, ReadDefault, ReadDefault, ReadDefault,
6306                       ReadDefault,
6307                       // RC::$src3
6308                       sched.ReadAfterFold]>;
6309}
6310
// HasAVX variable blends built from SS41I_quaternary_avx and X86Blendv.
// The let ExeDomain wrappers are intended to replace the SSEPackedInt domain
// the multiclass passes by default for the PD and PS forms; VPBLENDVB keeps it.
6311let Predicates = [HasAVX] in {
6312let ExeDomain = SSEPackedDouble in {
6313defm VBLENDVPD  : SS41I_quaternary_avx<0x4B, "vblendvpd", VR128, f128mem,
6314                                       v2f64, loadv2f64, X86Blendv,
6315                                       SchedWriteFVarBlend.XMM>;
6316defm VBLENDVPDY : SS41I_quaternary_avx<0x4B, "vblendvpd", VR256, f256mem,
6317                                       v4f64, loadv4f64, X86Blendv,
6318                                       SchedWriteFVarBlend.YMM>, VEX_L;
6319} // ExeDomain = SSEPackedDouble
6320let ExeDomain = SSEPackedSingle in {
6321defm VBLENDVPS  : SS41I_quaternary_avx<0x4A, "vblendvps", VR128, f128mem,
6322                                       v4f32, loadv4f32, X86Blendv,
6323                                       SchedWriteFVarBlend.XMM>;
6324defm VBLENDVPSY : SS41I_quaternary_avx<0x4A, "vblendvps", VR256, f256mem,
6325                                       v8f32, loadv8f32, X86Blendv,
6326                                       SchedWriteFVarBlend.YMM>, VEX_L;
6327} // ExeDomain = SSEPackedSingle
6328defm VPBLENDVB  : SS41I_quaternary_avx<0x4C, "vpblendvb", VR128, i128mem,
6329                                       v16i8, loadv16i8, X86Blendv,
6330                                       SchedWriteVarBlend.XMM>;
6331}
6332
// HasAVX2 256-bit byte variable blend (v32i8 on VR256, tagged VEX_L).
6333let Predicates = [HasAVX2] in {
6334defm VPBLENDVBY : SS41I_quaternary_avx<0x4C, "vpblendvb", VR256, i256mem,
6335                                       v32i8, loadv32i8, X86Blendv,
6336                                       SchedWriteVarBlend.YMM>, VEX_L;
6337}
6338
// Select 32- and 64-bit integer X86Blendv nodes with the FP variable blends
// (VBLENDVPS for i32 elements, VBLENDVPD for i64 elements).  The node's
// operands ($mask, $src1, $src2) are passed to the instruction reversed,
// matching the operand order used by SS41I_quaternary_avx's own pattern.
6339let Predicates = [HasAVX] in {
6340  def : Pat<(v4i32 (X86Blendv (v4i32 VR128:$mask), (v4i32 VR128:$src1),
6341                              (v4i32 VR128:$src2))),
6342            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
6343  def : Pat<(v2i64 (X86Blendv (v2i64 VR128:$mask), (v2i64 VR128:$src1),
6344                              (v2i64 VR128:$src2))),
6345            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
6346  def : Pat<(v8i32 (X86Blendv (v8i32 VR256:$mask), (v8i32 VR256:$src1),
6347                              (v8i32 VR256:$src2))),
6348            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
6349  def : Pat<(v4i64 (X86Blendv (v4i64 VR256:$mask), (v4i64 VR256:$src1),
6350                              (v4i64 VR256:$src2))),
6351            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
6352}
6353
6354// Prefer a movss or movsd over a blendps when optimizing for size. These were
6355// changed to use blends because blends have better throughput on sandybridge
6356// and haswell, but movs[s/d] are 1-2 byte shorter instructions.
// The patterns below are the blend forms; they are enabled under OptForSpeed.
6357let Predicates = [HasAVX, OptForSpeed] in {
  // Zero-extending move of the low element: blend the source into a zeroed
  // register.  The v4i32 form uses VPBLENDW, where the low 32-bit element is
  // two 16-bit lanes (immediate 3).
6358  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
6359            (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
6360  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
6361            (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
6362
  // X86Movss as a blend with immediate 1.  When the load is the first
  // operand, the sources are swapped and the 4-bit immediate becomes 0xe.
6363  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
6364            (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
6365  def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
6366            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
6367  def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
6368            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
6369
  // X86Movsd as a blend with immediate 1.  When the load is the first
  // operand, the sources are swapped and the 2-bit immediate becomes 2.
6370  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
6371            (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
6372  def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
6373            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
6374  def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
6375            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
6376
6377  // Move low f32 and clear high bits.
  // The 256-bit forms run the 128-bit blend on the sub_xmm subregister and
  // wrap the result with SUBREG_TO_REG.
6378  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
6379            (SUBREG_TO_REG (i32 0),
6380             (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
6381                          (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
6382                          (i8 1))), sub_xmm)>;
6383  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
6384            (SUBREG_TO_REG (i32 0),
6385             (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
6386                          (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
6387                          (i8 3))), sub_xmm)>;
6388}
6389
6390// Prefer a movss or movsd over a blendps when optimizing for size. These were
6391// changed to use blends because blends have better throughput on sandybridge
6392// and haswell, but movs[s/d] are 1-2 byte shorter instructions.
// The patterns below are the blend forms; they are enabled under OptForSpeed.
6393let Predicates = [UseSSE41, OptForSpeed] in {
6394  // With SSE41 we can use blends for these patterns.
  // Zero-extending move of the low element: blend the source into a zeroed
  // register (PBLENDW immediate 3 covers the two low 16-bit lanes).
6395  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
6396            (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
6397  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
6398            (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
6399
  // X86Movss as a blend; load-first form swaps sources and uses 0xe.
6400  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
6401            (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
6402  def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))),
6403            (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
6404  def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
6405            (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
6406
  // X86Movsd as a blend; load-first form swaps sources and uses 2.
6407  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
6408            (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
6409  def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))),
6410            (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
6411  def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
6412            (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
6413}
6414
6415
6416/// SS41I_ternary - SSE 4.1 ternary operator
///
/// Defines rr0 (MRMSrcReg) and rm0 (MRMSrcMem) records on VR128 whose third
/// input is the implicit register XMM0 (declared through Uses) and whose
/// destination is tied to $src1.  The selection pattern is
/// (OpNode XMM0, $src2, $src1); in rm0, $src2 is loaded via mem_frag.
/// The asm string prints xmm0 explicitly.
6417let Uses = [XMM0], Constraints = "$src1 = $dst" in {
6418  multiclass SS41I_ternary<bits<8> opc, string OpcodeStr, ValueType VT,
6419                           PatFrag mem_frag, X86MemOperand x86memop,
6420                           SDNode OpNode, X86FoldableSchedWrite sched> {
6421    def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
6422                    (ins VR128:$src1, VR128:$src2),
6423                    !strconcat(OpcodeStr,
6424                     "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6425                    [(set VR128:$dst,
6426                      (VT (OpNode XMM0, VR128:$src2, VR128:$src1)))]>,
6427                    Sched<[sched]>;
6428
6429    def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
6430                    (ins VR128:$src1, x86memop:$src2),
6431                    !strconcat(OpcodeStr,
6432                     "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6433                    [(set VR128:$dst,
6434                      (OpNode XMM0, (mem_frag addr:$src2), VR128:$src1))]>,
6435                    Sched<[sched.Folded, sched.ReadAfterFold]>;
6436  }
6437}
6438
// Non-VEX variable blends with the implicit XMM0 operand, built from
// SS41I_ternary and X86Blendv.  PBLENDVB has no ExeDomain override here.
6439let ExeDomain = SSEPackedDouble in
6440defm BLENDVPD : SS41I_ternary<0x15, "blendvpd", v2f64, memopv2f64, f128mem,
6441                              X86Blendv, SchedWriteFVarBlend.XMM>;
6442let ExeDomain = SSEPackedSingle in
6443defm BLENDVPS : SS41I_ternary<0x14, "blendvps", v4f32, memopv4f32, f128mem,
6444                              X86Blendv, SchedWriteFVarBlend.XMM>;
6445defm PBLENDVB : SS41I_ternary<0x10, "pblendvb", v16i8, memopv16i8, i128mem,
6446                              X86Blendv, SchedWriteVarBlend.XMM>;
6447
6448// Aliases with the implicit xmm0 argument
// These accept the two-operand spelling that omits xmm0 and map it to the
// rr0/rm0 records.
// NOTE(review): the trailing 0 presumably stops the printer from emitting
// the alias form - confirm against the InstAlias class definition.
6449def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
6450                (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>;
6451def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
6452                (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>;
6453def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
6454                (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>;
6455def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
6456                (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>;
6457def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
6458                (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>;
6459def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
6460                (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>;
6461
// Select v4i32 / v2i64 X86Blendv nodes whose mask is in XMM0 with the FP
// variable blends.  The node's $src2 becomes the instruction's tied first
// operand and $src1 its second, matching SS41I_ternary's own pattern order.
6462let Predicates = [UseSSE41] in {
6463  def : Pat<(v4i32 (X86Blendv (v4i32 XMM0), (v4i32 VR128:$src1),
6464                              (v4i32 VR128:$src2))),
6465            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
6466  def : Pat<(v2i64 (X86Blendv (v2i64 XMM0), (v2i64 VR128:$src1),
6467                              (v2i64 VR128:$src2))),
6468            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
6469}
6470
// Non-temporal aligned vector loads (MOVNTDQA).  The instruction records
// carry no selection pattern of their own; the Pat records below map
// alignednontemporalload of every listed vector type onto them.
6471let AddedComplexity = 400 in { // Prefer non-temporal versions
6472
// 128-bit VEX form.
6473let Predicates = [HasAVX, NoVLX] in
6474def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
6475                        "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
6476                        Sched<[SchedWriteVecMoveLSNT.XMM.RM]>, VEX, WIG;
// 256-bit VEX form.
6477let Predicates = [HasAVX2, NoVLX] in
6478def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
6479                         "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
6480                         Sched<[SchedWriteVecMoveLSNT.YMM.RM]>, VEX, VEX_L, WIG;
// 128-bit non-VEX form; no predicate is attached to the record itself.
6481def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
6482                       "movntdqa\t{$src, $dst|$dst, $src}", []>,
6483                       Sched<[SchedWriteVecMoveLSNT.XMM.RM]>;
6484
// 256-bit types -> VMOVNTDQAYrm.
6485let Predicates = [HasAVX2, NoVLX] in {
6486  def : Pat<(v8f32 (alignednontemporalload addr:$src)),
6487            (VMOVNTDQAYrm addr:$src)>;
6488  def : Pat<(v4f64 (alignednontemporalload addr:$src)),
6489            (VMOVNTDQAYrm addr:$src)>;
6490  def : Pat<(v4i64 (alignednontemporalload addr:$src)),
6491            (VMOVNTDQAYrm addr:$src)>;
6492  def : Pat<(v8i32 (alignednontemporalload addr:$src)),
6493            (VMOVNTDQAYrm addr:$src)>;
6494  def : Pat<(v16i16 (alignednontemporalload addr:$src)),
6495            (VMOVNTDQAYrm addr:$src)>;
6496  def : Pat<(v16f16 (alignednontemporalload addr:$src)),
6497            (VMOVNTDQAYrm addr:$src)>;
6498  def : Pat<(v32i8 (alignednontemporalload addr:$src)),
6499            (VMOVNTDQAYrm addr:$src)>;
6500}
6501
// 128-bit types -> VMOVNTDQArm.
6502let Predicates = [HasAVX, NoVLX] in {
6503  def : Pat<(v4f32 (alignednontemporalload addr:$src)),
6504            (VMOVNTDQArm addr:$src)>;
6505  def : Pat<(v2f64 (alignednontemporalload addr:$src)),
6506            (VMOVNTDQArm addr:$src)>;
6507  def : Pat<(v2i64 (alignednontemporalload addr:$src)),
6508            (VMOVNTDQArm addr:$src)>;
6509  def : Pat<(v4i32 (alignednontemporalload addr:$src)),
6510            (VMOVNTDQArm addr:$src)>;
6511  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
6512            (VMOVNTDQArm addr:$src)>;
6513  def : Pat<(v8f16 (alignednontemporalload addr:$src)),
6514            (VMOVNTDQArm addr:$src)>;
6515  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
6516            (VMOVNTDQArm addr:$src)>;
6517}
6518
// 128-bit types -> MOVNTDQArm.
6519let Predicates = [UseSSE41] in {
6520  def : Pat<(v4f32 (alignednontemporalload addr:$src)),
6521            (MOVNTDQArm addr:$src)>;
6522  def : Pat<(v2f64 (alignednontemporalload addr:$src)),
6523            (MOVNTDQArm addr:$src)>;
6524  def : Pat<(v2i64 (alignednontemporalload addr:$src)),
6525            (MOVNTDQArm addr:$src)>;
6526  def : Pat<(v4i32 (alignednontemporalload addr:$src)),
6527            (MOVNTDQArm addr:$src)>;
6528  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
6529            (MOVNTDQArm addr:$src)>;
6530  def : Pat<(v8f16 (alignednontemporalload addr:$src)),
6531            (MOVNTDQArm addr:$src)>;
6532  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
6533            (MOVNTDQArm addr:$src)>;
6534}
6535
6536} // AddedComplexity
6537
6538//===----------------------------------------------------------------------===//
6539// SSE4.2 - Compare Instructions
6540//===----------------------------------------------------------------------===//
6541
6542/// SS42I_binop_rm - Simple SSE 4.2 binary operator
///
/// Defines rr (MRMSrcReg) and rm (MRMSrcMem) records that select
/// (OpVT (OpNode $src1, $src2)); in rm, $src2 is loaded via memop_frag.
/// Is2Addr (default 1) only chooses the asm string: two-operand without
/// $src1, or three-operand with it.  Neither record is marked commutable and
/// no "$src1 = $dst" constraint is set inside the multiclass.
6543multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
6544                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
6545                          X86MemOperand x86memop, X86FoldableSchedWrite sched,
6546                          bit Is2Addr = 1> {
6547  def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
6548       (ins RC:$src1, RC:$src2),
6549       !if(Is2Addr,
6550           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
6551           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
6552       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
6553       Sched<[sched]>;
6554  def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
6555       (ins RC:$src1, x86memop:$src2),
6556       !if(Is2Addr,
6557           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
6558           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
6559       [(set RC:$dst,
6560         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
6561       Sched<[sched.Folded, sched.ReadAfterFold]>;
6562}
6563
// 64-bit element greater-than compare (X86pcmpgt), opcode 0x37.
// VEX 128-bit form: three-operand asm string (Is2Addr = 0).
6564let Predicates = [HasAVX] in
6565  defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
6566                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
6567                                 VEX_4V, WIG;
6568
// VEX 256-bit form (tagged VEX_L).
6569let Predicates = [HasAVX2] in
6570  defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
6571                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
6572                                  VEX_4V, VEX_L, WIG;
6573
// Non-VEX form: destination tied to $src1, default Is2Addr = 1, memop load.
6574let Constraints = "$src1 = $dst" in
6575  defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
6576                                memop, i128mem, SchedWriteVecALU.XMM>;
6577
6578//===----------------------------------------------------------------------===//
6579// SSE4.2 - String/text Processing Instructions
6580//===----------------------------------------------------------------------===//
6581
// Defines the rr and rm records for opcode 0x62 with the given mnemonic.
// Neither record has explicit outputs or a selection pattern; because the
// pattern list is empty, mayLoad is set by hand on the memory form.
6582multiclass pcmpistrm_SS42AI<string asm> {
6583  def rr : SS42AI<0x62, MRMSrcReg, (outs),
6584    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6585    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6586    []>, Sched<[WritePCmpIStrM]>;
6587  let mayLoad = 1 in
6588  def rm :SS42AI<0x62, MRMSrcMem, (outs),
6589    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6590    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6591    []>, Sched<[WritePCmpIStrM.Folded, WritePCmpIStrM.ReadAfterFold]>;
6592}
6593
// PCMPISTRM / VPCMPISTRM: declared to implicitly define XMM0 and EFLAGS and
// to have no unmodelled side effects.  Only the VEX form carries HasAVX.
6594let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
6595  let Predicates = [HasAVX] in
6596  defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX, WIG;
6597  defm PCMPISTRM  : pcmpistrm_SS42AI<"pcmpistrm"> ;
6598}
6599
// Defines the rr and rm records for opcode 0x60 with the given mnemonic.
// Neither record has explicit outputs or a selection pattern; mayLoad is set
// by hand on the memory form.
// NOTE(review): operand names skip $src2 and $src4 - presumably those slots
// stand for the implicit EAX/EDX inputs declared at the instantiation site;
// confirm.
6600multiclass SS42AI_pcmpestrm<string asm> {
6601  def rr : SS42AI<0x60, MRMSrcReg, (outs),
6602    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
6603    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6604    []>, Sched<[WritePCmpEStrM]>;
6605  let mayLoad = 1 in
6606  def rm : SS42AI<0x60, MRMSrcMem, (outs),
6607    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
6608    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6609    []>, Sched<[WritePCmpEStrM.Folded, WritePCmpEStrM.ReadAfterFold]>;
6610}
6611
// PCMPESTRM / VPCMPESTRM: declared to implicitly define XMM0 and EFLAGS and
// implicitly use EAX and EDX, with no unmodelled side effects.  Only the VEX
// form carries HasAVX.
6612let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
6613  let Predicates = [HasAVX] in
6614  defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX, WIG;
6615  defm PCMPESTRM :  SS42AI_pcmpestrm<"pcmpestrm">;
6616}
6617
// Defines the rr and rm records for opcode 0x63 with the given mnemonic.
// Neither record has explicit outputs or a selection pattern; mayLoad is set
// by hand on the memory form.
6618multiclass SS42AI_pcmpistri<string asm> {
6619  def rr : SS42AI<0x63, MRMSrcReg, (outs),
6620    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6621    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6622    []>, Sched<[WritePCmpIStrI]>;
6623  let mayLoad = 1 in
6624  def rm : SS42AI<0x63, MRMSrcMem, (outs),
6625    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6626    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6627    []>, Sched<[WritePCmpIStrI.Folded, WritePCmpIStrI.ReadAfterFold]>;
6628}
6629
// VEX-encoded and legacy PCMPISTRI. Every record in this scope implicitly
// defines ECX and EFLAGS.
let hasSideEffects = 0, Defs = [ECX, EFLAGS] in {
  let Predicates = [HasAVX] in {
    defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX, WIG;
  }
  defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">;
}
6635
// Register (rr) and memory (rm) forms of opcode 0x61 for the mnemonic given in
// asm. The explicit operands are named $src1, $src3 and $src5; no outputs or
// selection patterns are declared here.
multiclass SS42AI_pcmpestri<string asm> {
  // Both vector sources are XMM registers.
  def rr : SS42AI<0x61, MRMSrcReg, (outs),
                  (ins VR128:$src1, VR128:$src3, u8imm:$src5),
                  asm # "\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>,
           Sched<[WritePCmpEStrI]>;

  // Second vector source comes from memory; mayLoad is set by hand because
  // there is no pattern to derive it from.
  let mayLoad = 1 in {
    def rm : SS42AI<0x61, MRMSrcMem, (outs),
                    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
                    asm # "\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>,
             Sched<[WritePCmpEStrI.Folded, WritePCmpEStrI.ReadAfterFold]>;
  }
}
6647
// VEX-encoded and legacy PCMPESTRI. In addition to defining ECX and EFLAGS,
// these records implicitly read EAX and EDX.
let hasSideEffects = 0, Uses = [EAX, EDX], Defs = [ECX, EFLAGS] in {
  let Predicates = [HasAVX] in {
    defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX, WIG;
  }
  defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">;
}
6653
6654//===----------------------------------------------------------------------===//
6655// SSE4.2 - CRC Instructions
6656//===----------------------------------------------------------------------===//
6657
6658// No CRC instructions have AVX equivalents
6659
6660// crc intrinsic instruction
// These instructions come in register-source (r) and memory-source (m) forms;
// the variants differ only in the width of the destination register and of
// the source operand.
// Register-source form: the pattern applies Int to the RCOut-class input
// $src1 and the RCIn-class register $src2, and writes the result to $dst.
class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
                   RegisterClass RCIn, SDPatternOperator Int>
    : CRC32I<opc, MRMSrcReg, (outs RCOut:$dst),
             (ins RCOut:$src1, RCIn:$src2),
             asm # "\t{$src2, $src1|$src1, $src2}",
             [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))]>,
      Sched<[WriteCRC32]>;
6669
// Memory-source form: identical to the register form except that the second
// operand of Int is a value loaded from the x86memop address $src2.
class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
                   X86MemOperand x86memop, SDPatternOperator Int>
    : CRC32I<opc, MRMSrcMem, (outs RCOut:$dst),
             (ins RCOut:$src1, x86memop:$src2),
             asm # "\t{$src2, $src1|$src1, $src2}",
             [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))]>,
      Sched<[WriteCRC32.Folded, WriteCRC32.ReadAfterFold]>;
6676
// Concrete CRC32 instructions. Every form ties the input $src1 to the
// destination register.
let Constraints = "$src1 = $dst" in {
  // 32-bit destination, 8-bit source.
  def CRC32r32m8 : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
                                int_x86_sse42_crc32_32_8>;
  def CRC32r32r8 : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
                                int_x86_sse42_crc32_32_8>;

  // 32-bit destination, 16-bit source.
  def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
                                 int_x86_sse42_crc32_32_16>,
                    OpSize16;
  def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
                                 int_x86_sse42_crc32_32_16>,
                    OpSize16;

  // 32-bit destination, 32-bit source.
  def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
                                 int_x86_sse42_crc32_32_32>,
                    OpSize32;
  def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
                                 int_x86_sse42_crc32_32_32>,
                    OpSize32;

  // 64-bit destination, 64-bit source.
  def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
                                 int_x86_sse42_crc32_64_64>,
                    REX_W;
  def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
                                 int_x86_sse42_crc32_64_64>,
                    REX_W;

  // 64-bit destination, 8-bit source. These are instantiated with null_frag
  // rather than an intrinsic, so hasSideEffects (and mayLoad for the memory
  // form) are spelled out explicitly.
  let hasSideEffects = 0, mayLoad = 1 in
  def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem, null_frag>,
                   REX_W;
  let hasSideEffects = 0 in
  def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8, null_frag>,
                   REX_W;
}
6702
6703//===----------------------------------------------------------------------===//
6704// SHA-NI Instructions
6705//===----------------------------------------------------------------------===//
6706
6707// FIXME: Is there a better scheduler class for SHA than WriteVecIMul?
// Two-source SHA instruction with a register (rr) and a memory (rm) form.
// When UsesXMM0 is set, the assembly string names xmm0 as an additional
// operand and the intrinsic is passed XMM0 as its third argument; otherwise
// only $src1 and $src2 are used.
multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
                      X86FoldableSchedWrite sched, bit UsesXMM0 = 0> {
  def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2),
             !if(UsesXMM0,
                 OpcodeStr # "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                 OpcodeStr # "\t{$src2, $dst|$dst, $src2}"),
             [!if(UsesXMM0,
                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>,
           T8PS, Sched<[sched]>;

  // Same operation with the second source read through memop.
  def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !if(UsesXMM0,
                 OpcodeStr # "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                 OpcodeStr # "\t{$src2, $dst|$dst, $src2}"),
             [!if(UsesXMM0,
                  (set VR128:$dst,
                       (IntId VR128:$src1, (memop addr:$src2), XMM0)),
                  (set VR128:$dst,
                       (IntId VR128:$src1, (memop addr:$src2))))]>,
           T8PS, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
6732
// SHA-NI instruction definitions. All of them tie $src1 to $dst and are
// predicated on HasSHA.
let Predicates = [HasSHA], Constraints = "$src1 = $dst" in {
  // SHA1RNDS4 carries an extra 8-bit immediate, so it is defined directly
  // instead of going through SHAI_binop.
  def SHA1RNDS4rri
      : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
            (ins VR128:$src1, VR128:$src2, u8imm:$src3),
            "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
            [(set VR128:$dst,
                  (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
                                     (i8 timm:$src3)))]>,
        TAPS, Sched<[SchedWriteVecIMul.XMM]>;
  def SHA1RNDS4rmi
      : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
            (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
            "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
            [(set VR128:$dst,
                  (int_x86_sha1rnds4 VR128:$src1, (memop addr:$src2),
                                     (i8 timm:$src3)))]>,
        TAPS, Sched<[SchedWriteVecIMul.XMM.Folded,
                     SchedWriteVecIMul.XMM.ReadAfterFold]>;

  // SHA1 helpers: plain two-operand forms.
  defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte,
                              SchedWriteVecIMul.XMM>;
  defm SHA1MSG1 : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1,
                             SchedWriteVecIMul.XMM>;
  defm SHA1MSG2 : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2,
                             SchedWriteVecIMul.XMM>;

  // SHA256RNDS2 is the only form instantiated with UsesXMM0 = 1, and it also
  // lists XMM0 as an implicit use.
  let Uses = [XMM0] in {
    defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2,
                                  SchedWriteVecIMul.XMM, 1>;
  }

  // SHA256 message-schedule helpers: plain two-operand forms.
  defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1,
                               SchedWriteVecIMul.XMM>;
  defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2,
                               SchedWriteVecIMul.XMM>;
}
6767
// SHA256RNDS2 is defined with xmm0 written out in its assembly string. These
// aliases additionally accept the two-operand spelling that leaves xmm0
// implicit. The trailing 0 is passed unchanged as InstAlias's final argument.
def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
                (SHA256RNDS2rr VR128:$dst, VR128:$src2),
                0>;
def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
                (SHA256RNDS2rm VR128:$dst, i128mem:$src2),
                0>;
6773
6774//===----------------------------------------------------------------------===//
6775// AES-NI Instructions
6776//===----------------------------------------------------------------------===//
6777
// AES round instruction with a register (rr) and a memory (rm) source form.
//   IntId   - intrinsic used in both selection patterns.
//   ld_frag - load fragment used to fold the memory operand in the rm form.
//   Is2Addr - selects the two-operand assembly syntax instead of the
//             three-operand one.
//   RC/MemOp - register class and memory operand type of the vector operands.
// Both records are created with an empty asm string; the real syntax is
// supplied through the enclosing AsmString override.
multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
                             Intrinsic IntId, PatFrag ld_frag,
                             bit Is2Addr = 0, RegisterClass RC = VR128,
                             X86MemOperand MemOp = i128mem> {
  let AsmString =
      !if(Is2Addr,
          OpcodeStr # "\t{$src2, $dst|$dst, $src2}",
          OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
    def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst),
                   (ins RC:$src1, RC:$src2), "",
                   [(set RC:$dst, (IntId RC:$src1, RC:$src2))]>,
             Sched<[WriteAESDecEnc]>;

    def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst),
                   (ins RC:$src1, MemOp:$src2), "",
                   [(set RC:$dst, (IntId RC:$src1, (ld_frag addr:$src2)))]>,
             Sched<[WriteAESDecEnc.Folded, WriteAESDecEnc.ReadAfterFold]>;
  }
}
6795
// Perform One Round of an AES Encryption/Decryption Flow
//
// 128-bit VEX-encoded forms. They use the three-operand syntax (Is2Addr is
// left at its default of 0) and fold memory operands with `load`.
let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in {
  defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc",
                                   int_x86_aesni_aesenc, load>,
                 VEX_4V, WIG;
  defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast",
                                       int_x86_aesni_aesenclast, load>,
                     VEX_4V, WIG;
  defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec",
                                   int_x86_aesni_aesdec, load>,
                 VEX_4V, WIG;
  defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast",
                                       int_x86_aesni_aesdeclast, load>,
                     VEX_4V, WIG;
}
6807
// 256-bit VEX-encoded AES round instructions: the same opcodes and mnemonics
// as the 128-bit forms, instantiated on VR256/i256mem with the *_256
// intrinsics and marked VEX_L.
let Predicates = [NoVLX, HasVAES] in {
  defm VAESENCY : AESI_binop_rm_int<0xDC, "vaesenc",
                                    int_x86_aesni_aesenc_256, load, 0,
                                    VR256, i256mem>,
                  VEX_4V, VEX_L, WIG;
  defm VAESENCLASTY : AESI_binop_rm_int<0xDD, "vaesenclast",
                                        int_x86_aesni_aesenclast_256, load, 0,
                                        VR256, i256mem>,
                      VEX_4V, VEX_L, WIG;
  defm VAESDECY : AESI_binop_rm_int<0xDE, "vaesdec",
                                    int_x86_aesni_aesdec_256, load, 0,
                                    VR256, i256mem>,
                  VEX_4V, VEX_L, WIG;
  defm VAESDECLASTY : AESI_binop_rm_int<0xDF, "vaesdeclast",
                                        int_x86_aesni_aesdeclast_256, load, 0,
                                        VR256, i256mem>,
                      VEX_4V, VEX_L, WIG;
}
6822
// Legacy-encoded AES round instructions. These use the two-operand syntax
// (Is2Addr = 1) with $src1 tied to $dst, and fold memory operands with
// `memop` rather than `load`.
let Constraints = "$src1 = $dst" in {
  defm AESENC : AESI_binop_rm_int<0xDC, "aesenc",
                                  int_x86_aesni_aesenc, memop, 1>;
  defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast",
                                      int_x86_aesni_aesenclast, memop, 1>;
  defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec",
                                  int_x86_aesni_aesdec, memop, 1>;
  defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast",
                                      int_x86_aesni_aesdeclast, memop, 1>;
}
6833
// Perform the AES InvMixColumn Transformation
//
// Single-source instruction selected from int_x86_aesni_aesimc. The VEX forms
// fold their memory operand with `load`, the legacy forms with `memop`.
let Predicates = [HasAVX, HasAES] in {
  def VAESIMCrr
      : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1),
              "vaesimc\t{$src1, $dst|$dst, $src1}",
              [(set VR128:$dst, (int_x86_aesni_aesimc VR128:$src1))]>,
        Sched<[WriteAESIMC]>, VEX, WIG;
  def VAESIMCrm
      : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1),
              "vaesimc\t{$src1, $dst|$dst, $src1}",
              [(set VR128:$dst,
                    (int_x86_aesni_aesimc (load addr:$src1)))]>,
        Sched<[WriteAESIMC.Folded]>, VEX, WIG;
}

// Legacy-encoded forms; no Predicates override is applied here.
def AESIMCrr
    : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1),
            "aesimc\t{$src1, $dst|$dst, $src1}",
            [(set VR128:$dst, (int_x86_aesni_aesimc VR128:$src1))]>,
      Sched<[WriteAESIMC]>;
def AESIMCrm
    : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1),
            "aesimc\t{$src1, $dst|$dst, $src1}",
            [(set VR128:$dst,
                  (int_x86_aesni_aesimc (memop addr:$src1)))]>,
      Sched<[WriteAESIMC.Folded]>;
6858
// AES Round Key Generation Assist
//
// One vector source plus an 8-bit immediate, selected from
// int_x86_aesni_aeskeygenassist. The VEX forms fold their memory operand with
// `load`, the legacy forms with `memop`.
let Predicates = [HasAVX, HasAES] in {
  def VAESKEYGENASSIST128rr
      : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src1, u8imm:$src2),
              "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
              [(set VR128:$dst,
                    (int_x86_aesni_aeskeygenassist VR128:$src1,
                                                   timm:$src2))]>,
        Sched<[WriteAESKeyGen]>, VEX, WIG;
  def VAESKEYGENASSIST128rm
      : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
              (ins i128mem:$src1, u8imm:$src2),
              "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
              [(set VR128:$dst,
                    (int_x86_aesni_aeskeygenassist (load addr:$src1),
                                                   timm:$src2))]>,
        Sched<[WriteAESKeyGen.Folded]>, VEX, WIG;
}

// Legacy-encoded forms; no Predicates override is applied here.
def AESKEYGENASSIST128rr
    : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
            (ins VR128:$src1, u8imm:$src2),
            "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
            [(set VR128:$dst,
                  (int_x86_aesni_aeskeygenassist VR128:$src1,
                                                 timm:$src2))]>,
      Sched<[WriteAESKeyGen]>;
def AESKEYGENASSIST128rm
    : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
            (ins i128mem:$src1, u8imm:$src2),
            "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
            [(set VR128:$dst,
                  (int_x86_aesni_aeskeygenassist (memop addr:$src1),
                                                 timm:$src2))]>,
      Sched<[WriteAESKeyGen.Folded]>;
6886
6887//===----------------------------------------------------------------------===//
6888// PCLMUL Instructions
6889//===----------------------------------------------------------------------===//
6890
// Immediate transform to help with commuting.
// Exchanges the low and high nibbles of the 8-bit immediate, so that a
// selector written for (a, b) addresses the same halves once the operands
// are swapped to (b, a).
def PCLMULCommuteImm : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  // The shifts are evaluated after integer promotion; the cast truncates the
  // result back to eight bits.
  uint8_t Swapped = (uint8_t)((Imm << 4) | (Imm >> 4));
  return getI8Imm(Swapped, SDLoc(N));
}]>;
6896
// SSE carry-less Multiplication instructions
let Predicates = [NoAVX, HasPCLMUL] in {
  // Both instruction forms are destructive: $src1 is tied to $dst.
  let Constraints = "$src1 = $dst" in {
    // Only the register form is flagged commutable.
    let isCommutable = 1 in {
      def PCLMULQDQrr
          : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                      "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                      [(set VR128:$dst,
                            (int_x86_pclmulqdq VR128:$src1, VR128:$src2,
                                               timm:$src3))]>,
            Sched<[WriteCLMul]>;
    }

    def PCLMULQDQrm
        : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
                    "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                    [(set VR128:$dst,
                          (int_x86_pclmulqdq VR128:$src1, (memop addr:$src2),
                                             timm:$src3))]>,
          Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
  } // Constraints = "$src1 = $dst"

  // A load appearing as the first intrinsic operand is still folded: the
  // operands are swapped and the immediate is rewritten by PCLMULCommuteImm.
  def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1,
                               (i8 timm:$src3)),
            (PCLMULQDQrm VR128:$src1, addr:$src2,
                         (PCLMULCommuteImm timm:$src3))>;
} // Predicates = [NoAVX, HasPCLMUL]
6922
// SSE aliases
// Generates pclmul{hq,lq}{hq,lq}dq for both the register and memory forms.
// The first mnemonic component sets immediate bit 0 when it is "hq"; the
// second sets bit 4 when it is "hq".
foreach Hi = ["hq", "lq"] in {
  foreach Lo = ["hq", "lq"] in {
    def : InstAlias<"pclmul" # Hi # Lo # "dq\t{$src, $dst|$dst, $src}",
                    (PCLMULQDQrr VR128:$dst, VR128:$src,
                     !add(!shl(!eq(Lo, "hq"), 4), !eq(Hi, "hq"))),
                    0>;
    def : InstAlias<"pclmul" # Hi # Lo # "dq\t{$src, $dst|$dst, $src}",
                    (PCLMULQDQrm VR128:$dst, i128mem:$src,
                     !add(!shl(!eq(Lo, "hq"), 4), !eq(Hi, "hq"))),
                    0>;
  }
}
6933
// AVX carry-less Multiplication instructions
//   RC / MemOp - register class and memory operand type of the vectors.
//   LdFrag     - load fragment used when folding a memory operand.
//   IntId      - intrinsic matched by the selection patterns.
multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
                      PatFrag LdFrag, Intrinsic IntId> {
  // Only the register form is flagged commutable.
  let isCommutable = 1 in {
    def rr
        : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst),
                    (ins RC:$src1, RC:$src2, u8imm:$src3),
                    "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                    [(set RC:$dst, (IntId RC:$src1, RC:$src2, timm:$src3))]>,
          Sched<[WriteCLMul]>;
  }

  def rm
      : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, MemOp:$src2, u8imm:$src3),
                  "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                  [(set RC:$dst,
                        (IntId RC:$src1, (LdFrag addr:$src2), timm:$src3))]>,
        Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;

  // We can commute a load in the first operand by swapping the sources and
  // rotating the immediate.
  def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 timm:$src3)),
            (!cast<Instruction>(NAME # "rm") RC:$src1, addr:$src2,
                                             (PCLMULCommuteImm timm:$src3))>;
}
6958
// 128-bit VEX-encoded form.
let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in {
  defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, load, int_x86_pclmulqdq>,
                    VEX_4V, WIG;
}

// 256-bit VEX-encoded form.
let Predicates = [NoVLX, HasVPCLMULQDQ] in {
  defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load, int_x86_pclmulqdq_256>,
                     VEX_4V, VEX_L, WIG;
}
6966
// Emits the register and memory alias for one vpclmul<Hi><Lo>dq mnemonic of
// the instruction named by InstStr. The immediate has bit 0 set when Hi is
// "hq" and bit 4 set when Lo is "hq".
multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC,
                                   X86MemOperand MemOp, string Hi, string Lo> {
  def : InstAlias<"vpclmul" # Hi # Lo #
                      "dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  (!cast<Instruction>(InstStr # "rr")
                       RC:$dst, RC:$src1, RC:$src2,
                       !add(!shl(!eq(Lo, "hq"), 4), !eq(Hi, "hq"))),
                  0>;
  def : InstAlias<"vpclmul" # Hi # Lo #
                      "dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  (!cast<Instruction>(InstStr # "rm")
                       RC:$dst, RC:$src1, MemOp:$src2,
                       !add(!shl(!eq(Lo, "hq"), 4), !eq(Hi, "hq"))),
                  0>;
}
6976
// Emits all four hq/lq alias combinations for the instruction named by
// InstStr, in the order hq/hq, hq/lq, lq/hq, lq/lq.
multiclass vpclmulqdq_aliases<string InstStr, RegisterClass RC,
                              X86MemOperand MemOp> {
  foreach Hi = ["hq", "lq"] in
    foreach Lo = ["hq", "lq"] in
      defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, Hi, Lo>;
}
6984
// AVX aliases
// One set for the 128-bit instruction and one for the 256-bit instruction.
defm : vpclmulqdq_aliases<"VPCLMULQDQ",
                          VR128, i128mem>;
defm : vpclmulqdq_aliases<"VPCLMULQDQY",
                          VR256, i256mem>;
6988
6989//===----------------------------------------------------------------------===//
6990// SSE4A Instructions
6991//===----------------------------------------------------------------------===//
6992
// EXTRQ/INSERTQ and the scalar non-temporal stores, all predicated on
// HasSSE4A.
let Predicates = [HasSSE4A] in {

// Bit-field extract/insert. Every instruction here ties $src to $dst.
let ExeDomain = SSEPackedInt, Constraints = "$src = $dst" in {
  // Extract, with length and index given as two 8-bit immediates.
  def EXTRQI
      : Ii8<0x78, MRMXr, (outs VR128:$dst),
            (ins VR128:$src, u8imm:$len, u8imm:$idx),
            "extrq\t{$idx, $len, $src|$src, $len, $idx}",
            [(set VR128:$dst,
                  (X86extrqi VR128:$src, timm:$len, timm:$idx))]>,
        PD, Sched<[SchedWriteVecALU.XMM]>;

  // Extract, with the control taken from the $mask register.
  def EXTRQ
      : I<0x79, MRMSrcReg, (outs VR128:$dst),
          (ins VR128:$src, VR128:$mask),
          "extrq\t{$mask, $src|$src, $mask}",
          [(set VR128:$dst,
                (int_x86_sse4a_extrq VR128:$src, VR128:$mask))]>,
        PD, Sched<[SchedWriteVecALU.XMM]>;

  // Insert, with length and index given as two 8-bit immediates.
  def INSERTQI
      : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
            (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
            "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
            [(set VR128:$dst,
                  (X86insertqi VR128:$src, VR128:$src2,
                               timm:$len, timm:$idx))]>,
        XD, Sched<[SchedWriteVecALU.XMM]>;

  // Insert, with the control taken from the $mask register.
  def INSERTQ
      : I<0x79, MRMSrcReg, (outs VR128:$dst),
          (ins VR128:$src, VR128:$mask),
          "insertq\t{$mask, $src|$src, $mask}",
          [(set VR128:$dst,
                (int_x86_sse4a_insertq VR128:$src, VR128:$mask))]>,
        XD, Sched<[SchedWriteVecALU.XMM]>;
} // ExeDomain = SSEPackedInt, Constraints = "$src = $dst"

// Non-temporal (unaligned) scalar stores.
let AddedComplexity = 400 in { // Prefer non-temporal versions
  // The instructions themselves carry no pattern, so mayStore and
  // hasSideEffects are stated explicitly.
  let hasSideEffects = 0, mayStore = 1,
      SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in {
    def MOVNTSS : I<0x2B, MRMDestMem, (outs),
                    (ins f32mem:$dst, VR128:$src),
                    "movntss\t{$src, $dst|$dst, $src}", []>,
                  XS;

    def MOVNTSD : I<0x2B, MRMDestMem, (outs),
                    (ins f64mem:$dst, VR128:$src),
                    "movntsd\t{$src, $dst|$dst, $src}", []>,
                  XD;
  } // SchedRW

  // Scalar FP values are copied into VR128 before being stored, because the
  // store instructions take a VR128 source.
  def : Pat<(nontemporalstore FR32:$src, addr:$dst),
            (MOVNTSS addr:$dst,
                     (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;

  def : Pat<(nontemporalstore FR64:$src, addr:$dst),
            (MOVNTSD addr:$dst,
                     (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
} // AddedComplexity

} // HasSSE4A
7043
7044//===----------------------------------------------------------------------===//
7045// AVX Instructions
7046//===----------------------------------------------------------------------===//
7047
7048//===----------------------------------------------------------------------===//
7049// VBROADCAST - Load from memory and broadcast to all elements of the
7050//              destination operand
7051//
// Memory-source broadcast: selects (VT (bcast_frag addr:$src)) into a single
// instruction that reads x86memop and writes RC.
class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, ValueType VT,
                       PatFrag bcast_frag, SchedWrite Sched>
    : AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
            OpcodeStr # "\t{$src, $dst|$dst, $src}",
            [(set RC:$dst, (VT (bcast_frag addr:$src)))]>,
      Sched<[Sched]>, VEX;
7059
// AVX2 adds register forms
// The source is always a VR128 register viewed as OpVT; the result is RC
// viewed as ResVT, produced by X86VBroadcast.
class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
                        ValueType ResVT, ValueType OpVT, SchedWrite Sched>
    : AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
             OpcodeStr # "\t{$src, $dst|$dst, $src}",
             [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
      Sched<[Sched]>, VEX;
7067
// Memory-source scalar broadcasts, available from AVX onwards.
// Single precision: 128-bit and 256-bit destinations.
let Predicates = [HasAVX, NoVLX], ExeDomain = SSEPackedSingle in {
  def VBROADCASTSSrm
      : avx_broadcast_rm<0x18, "vbroadcastss", VR128, f32mem, v4f32,
                         X86VBroadcastld32, SchedWriteFShuffle.XMM.Folded>;
  def VBROADCASTSSYrm
      : avx_broadcast_rm<0x18, "vbroadcastss", VR256, f32mem, v8f32,
                         X86VBroadcastld32, SchedWriteFShuffle.XMM.Folded>,
        VEX_L;
}

// Double precision: only a 256-bit destination is defined here.
let Predicates = [HasAVX, NoVLX], ExeDomain = SSEPackedDouble in {
  def VBROADCASTSDYrm
      : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem, v4f64,
                         X86VBroadcastld64, SchedWriteFShuffle.XMM.Folded>,
        VEX_L;
}
7080
// Register-source scalar broadcasts, which require AVX2.
// Single precision: 128-bit and 256-bit destinations.
let Predicates = [HasAVX2, NoVLX], ExeDomain = SSEPackedSingle in {
  def VBROADCASTSSrr
      : avx2_broadcast_rr<0x18, "vbroadcastss", VR128, v4f32, v4f32,
                          SchedWriteFShuffle.XMM>;
  def VBROADCASTSSYrr
      : avx2_broadcast_rr<0x18, "vbroadcastss", VR256, v8f32, v4f32,
                          WriteFShuffle256>,
        VEX_L;
}

// Double precision: only a 256-bit destination is defined here.
let Predicates = [HasAVX2, NoVLX], ExeDomain = SSEPackedDouble in {
  def VBROADCASTSDYrr
      : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256, v4f64, v2f64,
                          WriteFShuffle256>,
        VEX_L;
}
7090
7091//===----------------------------------------------------------------------===//
7092// VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both
7093//                  halves of a 256-bit vector.
7094//
// Integer-domain 128-bit subvector broadcast (AVX2). Defined without a
// pattern, so mayLoad and hasSideEffects are given explicitly.
let Predicates = [HasAVX2], hasSideEffects = 0, mayLoad = 1 in {
  def VBROADCASTI128
      : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
              "vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
        Sched<[WriteShuffleLd]>, VEX, VEX_L;
}
7100
// FP-domain 128-bit subvector broadcast (AVX). Defined without a pattern, so
// mayLoad and hasSideEffects are given explicitly.
let Predicates = [HasAVX], ExeDomain = SSEPackedSingle,
    hasSideEffects = 0, mayLoad = 1 in {
  def VBROADCASTF128
      : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
              "vbroadcastf128\t{$src, $dst|$dst, $src}", []>,
        Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L;
}
7107
// Select every 256-bit subvector-broadcast load to VBROADCASTF128, for FP and
// integer element types alike.
// NOTE: We're using FP instructions here, but execution domain fixing can
// convert to integer when profitable.
let Predicates = [HasAVX, NoVLX] in {
  foreach VT = [v4f64, v8f32, v4i64, v8i32, v16i16, v16f16, v32i8] in {
    def : Pat<(VT (X86SubVBroadcastld128 addr:$src)),
              (VBROADCASTF128 addr:$src)>;
  }
}
7126
7127//===----------------------------------------------------------------------===//
7128// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
7129//
7130
// Instruction definitions only; the selection patterns are attached
// separately through vperm2x128_lowering.
let ExeDomain = SSEPackedSingle in {
  // Only the register form is flagged commutable.
  let isCommutable = 1 in {
    def VPERM2F128rr
        : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
                  (ins VR256:$src1, VR256:$src2, u8imm:$src3),
                  "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                  []>,
          VEX_4V, VEX_L, Sched<[WriteFShuffle256]>;
  }

  def VPERM2F128rm
      : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
                (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
                "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                []>,
        VEX_4V, VEX_L,
        Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
}
7142
// Immediate transform to help with commuting.
// Toggles bits 1 and 5 of the immediate (mask 0x22) and leaves every other
// bit untouched. NOTE(review): these appear to be the per-half source-select
// bits of the VPERM2x128 control byte -- confirm against the ISA reference.
def Perm2XCommuteImm : SDNodeXForm<timm, [{
  uint64_t Imm = N->getZExtValue();
  return getI8Imm(Imm ^ 0x22, SDLoc(N));
}]>;
7147
// Selection patterns mapping X86VPerm2x128 on type VT to the "rr" and "rm"
// records of the instruction named by InstrStr. memop_frag is the load
// fragment that may be folded into the "rm" form.
multiclass vperm2x128_lowering<string InstrStr, ValueType VT,
                               PatFrag memop_frag> {
  // Both sources in registers.
  def : Pat<(VT (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))),
            (!cast<Instruction>(InstrStr # "rr")
                 VR256:$src1, VR256:$src2, timm:$imm)>;

  // Load in the second source: fold it directly.
  def : Pat<(VT (X86VPerm2x128 VR256:$src1, (memop_frag addr:$src2),
                               (i8 timm:$imm))),
            (!cast<Instruction>(InstrStr # "rm")
                 VR256:$src1, addr:$src2, timm:$imm)>;

  // Pattern with load in other operand.
  // The sources are swapped so the load can still be folded, and the
  // immediate is adjusted by Perm2XCommuteImm to compensate.
  def : Pat<(VT (X86VPerm2x128 (memop_frag addr:$src2), VR256:$src1,
                               (i8 timm:$imm))),
            (!cast<Instruction>(InstrStr # "rm")
                 VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
}
7158
// FP vector types are lowered to VPERM2F128 whenever AVX is available.
let Predicates = [HasAVX] in {
  defm : vperm2x128_lowering<"VPERM2F128",
                             v4f64, loadv4f64>;
  defm : vperm2x128_lowering<"VPERM2F128",
                             v8f32, loadv8f32>;
}

// Integer and f16 vector types are lowered to VPERM2F128 only under
// HasAVX1Only.
let Predicates = [HasAVX1Only] in {
  defm : vperm2x128_lowering<"VPERM2F128",
                             v4i64, loadv4i64>;
  defm : vperm2x128_lowering<"VPERM2F128",
                             v8i32, loadv8i32>;
  defm : vperm2x128_lowering<"VPERM2F128",
                             v16i16, loadv16i16>;
  defm : vperm2x128_lowering<"VPERM2F128",
                             v16f16, loadv16f16>;
  defm : vperm2x128_lowering<"VPERM2F128",
                             v32i8, loadv32i8>;
}
7171
7172//===----------------------------------------------------------------------===//
7173// VINSERTF128 - Insert packed floating-point values
7174//
// Instruction definitions only; the selection patterns are attached
// separately through vinsert_lowering.
let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in {
  // 128-bit value to insert comes from a register.
  def VINSERTF128rr
      : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
                (ins VR256:$src1, VR128:$src2, u8imm:$src3),
                "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                []>,
        Sched<[WriteFShuffle256]>, VEX_4V, VEX_L;

  // 128-bit value to insert comes from memory.
  let mayLoad = 1 in {
    def VINSERTF128rm
        : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
                  (ins VR256:$src1, f128mem:$src2, u8imm:$src3),
                  "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                  []>,
          Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>,
          VEX_4V, VEX_L;
  }
}
7186
// To create a 256-bit all ones value, we should produce VCMPTRUEPS
// with YMM register containing zero.
// Both compare inputs are AVX_SET0 and the compare immediate is 0xf.
// FIXME: Avoid producing vxorps to clear the fake inputs.
let Predicates = [HasAVX1Only] in
def : Pat<(v8i32 immAllOnesV),
          (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>;
7193
// Selection patterns for inserting a 128-bit vector of type From into a
// 256-bit vector of type To.
//   InstrStr       - insert instruction; its "rr" and "rm" records are used.
//   PermStr        - permute instruction; its "rm" record is used when the
//                    256-bit operand is the one being loaded.
//   frommemop_frag - load fragment for the 128-bit operand.
//   tomemop_frag   - load fragment for the 256-bit operand.
multiclass vinsert_lowering<string InstrStr, string PermStr,
                            ValueType From, ValueType To,
                            PatFrag frommemop_frag, PatFrag tomemop_frag> {
  // Both operands in registers.
  def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2),
                                    (iPTR imm)),
            (!cast<Instruction>(InstrStr # "rr")
                 VR256:$src1, VR128:$src2,
                 (INSERT_get_vinsert128_imm VR256:$ins))>;

  // 128-bit operand loaded from memory: fold the load into the insert.
  def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
                                    (From (frommemop_frag addr:$src2)),
                                    (iPTR imm)),
            (!cast<Instruction>(InstrStr # "rm")
                 VR256:$src1, addr:$src2,
                 (INSERT_get_vinsert128_imm VR256:$ins))>;

  // Folding "To" vector - convert to perm2x128 and commute inputs.
  // The 128-bit register is widened with INSERT_SUBREG over an IMPLICIT_DEF
  // so it can act as the permute's register source.
  def : Pat<(vinsert128_insert:$ins (To (tomemop_frag addr:$src1)),
                                    (From VR128:$src2),
                                    (iPTR imm)),
            (!cast<Instruction>(PermStr # "rm")
                 (INSERT_SUBREG (To (IMPLICIT_DEF)), VR128:$src2, sub_xmm),
                 addr:$src1,
                 (INSERT_get_vperm2x128_commutedimm VR256:$ins))>;
}
7214
// FP vector types use VINSERTF128/VPERM2F128 whenever AVX is available and
// VLX is not.
let Predicates = [HasAVX, NoVLX] in {
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128",
                          v4f32, v8f32, loadv4f32, loadv8f32>;
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128",
                          v2f64, v4f64, loadv2f64, loadv4f64>;
}
7219
// Under HasAVX1Only the integer and f16 vector types are also lowered with
// the FP-domain VINSERTF128/VPERM2F128 instructions. Each element type is
// instantiated exactly once; a second, identical v16i8/v32i8 instantiation
// only produced duplicate selection patterns and has been dropped.
let Predicates = [HasAVX1Only] in {
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v2i64, v4i64,  loadv2i64, loadv4i64>;
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v4i32, v8i32,  loadv4i32, loadv8i32>;
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v8i16, v16i16, loadv8i16, loadv16i16>;
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v8f16, v16f16, loadv8f16, loadv16f16>;
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v16i8, v32i8,  loadv16i8, loadv32i8>;
}
7228
7229//===----------------------------------------------------------------------===//
7230// VEXTRACTF128 - Extract packed floating-point values
7231//
// Instruction definitions only; the selection patterns are attached
// separately through vextract_lowering.
let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in {
  // Extract into a register.
  def VEXTRACTF128rr
      : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
                (ins VR256:$src1, u8imm:$src2),
                "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                []>,
        Sched<[WriteFShuffle256]>, VEX, VEX_L;

  // Extract straight to memory.
  let mayStore = 1 in {
    def VEXTRACTF128mr
        : AVXAIi8<0x19, MRMDestMem, (outs),
                  (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
                  "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  []>,
          Sched<[WriteFStoreX]>, VEX, VEX_L;
  }
}
7243
/// vextract_lowering - Selection patterns for 128-bit subvector extraction
/// from a 256-bit register.
///   InstrStr - base instruction name; "rr" and "mr" are appended to pick
///              the register and store forms.
///   From     - value type applied to the VR256 source.
///   To       - value type of the extracted result.
7244multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
  // Extract to a register; the immediate is derived from the matched
  // vextract128_extract node ($ext) by EXTRACT_get_vextract128_imm.
7245  def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
7246            (To (!cast<Instruction>(InstrStr#rr)
7247                                    (From VR256:$src1),
7248                                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
  // Extract followed by a store folds into the memory-destination form.
7249  def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1),
7250                                                 (iPTR imm))), addr:$dst),
7251            (!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1,
7252             (EXTRACT_get_vextract128_imm VR128:$ext))>;
7253}
7254
7255// AVX1 patterns
// Floating-point extract patterns (v8f32 -> v4f32, v4f64 -> v2f64) using
// VEXTRACTF128. Guarded by HasAVX and NoVLX.
7256let Predicates = [HasAVX, NoVLX] in {
7257  defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>;
7258  defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>;
7259}
7260
// Integer and f16 extract patterns using the FP-domain VEXTRACTF128.
// Guarded by HasAVX1Only. One instantiation per element type; the
// v32i8/v16i8 entry used to be listed twice and the redundant copy has been
// dropped.
7261let Predicates = [HasAVX1Only] in {
7262  defm : vextract_lowering<"VEXTRACTF128", v4i64,  v2i64>;
7263  defm : vextract_lowering<"VEXTRACTF128", v8i32,  v4i32>;
7264  defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
7265  defm : vextract_lowering<"VEXTRACTF128", v16f16, v8f16>;
7266  defm : vextract_lowering<"VEXTRACTF128", v32i8,  v16i8>;
7268}
7269
7270//===----------------------------------------------------------------------===//
7271// VMASKMOV - Conditional SIMD Packed Loads and Stores
7272//
/// avx_movmask_rm - Defines the four VMASKMOV forms: 128-bit and 256-bit
/// loads (rm, Yrm) on opcode opc_rm, and 128-bit and 256-bit stores
/// (mr, Ymr) on opcode opc_mr.
///   IntLd / IntLd256 - intrinsics matched by the load forms.
///   IntSt / IntSt256 - intrinsics matched by the store forms.
///   schedX / schedY  - scheduling info for the 128-bit / 256-bit forms
///                      (.RM used for loads, .MR for stores).
7273multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
7274                          Intrinsic IntLd, Intrinsic IntLd256,
7275                          Intrinsic IntSt, Intrinsic IntSt256,
7276                          X86SchedWriteMaskMove schedX,
7277                          X86SchedWriteMaskMove schedY> {
  // Load forms: the intrinsic takes the address first, then $src1.
7278  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
7279             (ins VR128:$src1, f128mem:$src2),
7280             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7281             [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
7282             VEX_4V, Sched<[schedX.RM]>;
7283  def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
7284             (ins VR256:$src1, f256mem:$src2),
7285             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7286             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
7287             VEX_4V, VEX_L, Sched<[schedY.RM]>;
  // Store forms: no register result; the intrinsic takes the address,
  // then $src1, then $src2.
7288  def mr  : AVX8I<opc_mr, MRMDestMem, (outs),
7289             (ins f128mem:$dst, VR128:$src1, VR128:$src2),
7290             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7291             [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>,
7292             VEX_4V, Sched<[schedX.MR]>;
7293  def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
7294             (ins f256mem:$dst, VR256:$src1, VR256:$src2),
7295             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7296             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
7297             VEX_4V, VEX_L, Sched<[schedY.MR]>;
7298}
7299
// VMASKMOVPS: load opcode 0x2C, store opcode 0x2E, single-precision domain,
// bound to the int_x86_avx_maskload_ps / maskstore_ps intrinsics.
7300let ExeDomain = SSEPackedSingle in
7301defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
7302                                 int_x86_avx_maskload_ps,
7303                                 int_x86_avx_maskload_ps_256,
7304                                 int_x86_avx_maskstore_ps,
7305                                 int_x86_avx_maskstore_ps_256,
7306                                 WriteFMaskMove32, WriteFMaskMove32Y>;
// VMASKMOVPD: load opcode 0x2D, store opcode 0x2F, double-precision domain,
// bound to the int_x86_avx_maskload_pd / maskstore_pd intrinsics.
7307let ExeDomain = SSEPackedDouble in
7308defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
7309                                 int_x86_avx_maskload_pd,
7310                                 int_x86_avx_maskload_pd_256,
7311                                 int_x86_avx_maskstore_pd,
7312                                 int_x86_avx_maskstore_pd_256,
7313                                 WriteFMaskMove64, WriteFMaskMove64Y>;
7314
7315//===----------------------------------------------------------------------===//
7316// AVX_VNNI
7317//===----------------------------------------------------------------------===//
/// avx_vnni_rm - VEX-encoded AVX-VNNI three-source instructions in 128-bit
/// (rr, rm) and 256-bit (Yrr, Yrm) forms.
///   opc          - opcode byte shared by all four forms.
///   OpNode       - SelectionDAG node matched with ($src1, $src2, $src3).
///   IsCommutable - value assigned to isCommutable on the register forms
///                  only; the memory forms are left untouched.
/// $src1 is tied to $dst by the Constraints string, which is why the
/// assembly strings print only $dst, $src2 and $src3.
7318let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI], Constraints = "$src1 = $dst",
7319    ExplicitVEXPrefix = 1, checkVEXPredicate = 1 in
7320multiclass avx_vnni_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
7321                       bit IsCommutable> {
7322  let isCommutable = IsCommutable in
7323  def rr  : AVX8I<opc, MRMSrcReg, (outs VR128:$dst),
7324             (ins VR128:$src1, VR128:$src2, VR128:$src3),
7325             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
7326             [(set VR128:$dst, (v4i32 (OpNode VR128:$src1,
7327                                       VR128:$src2, VR128:$src3)))]>,
7328             VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
7329
  // Memory form: $src3 is loaded as v4i32. Two ReadAfterFold entries follow
  // the folded write in the Sched list.
7330  def rm  : AVX8I<opc, MRMSrcMem, (outs VR128:$dst),
7331             (ins VR128:$src1, VR128:$src2, i128mem:$src3),
7332             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
7333             [(set VR128:$dst, (v4i32 (OpNode VR128:$src1, VR128:$src2,
7334                                      (loadv4i32 addr:$src3))))]>,
7335             VEX_4V, Sched<[SchedWriteVecIMul.XMM.Folded,
7336                            SchedWriteVecIMul.XMM.ReadAfterFold,
7337                            SchedWriteVecIMul.XMM.ReadAfterFold]>;
7338
7339  let isCommutable = IsCommutable in
7340  def Yrr  : AVX8I<opc, MRMSrcReg, (outs VR256:$dst),
7341             (ins VR256:$src1, VR256:$src2, VR256:$src3),
7342             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
7343             [(set VR256:$dst, (v8i32 (OpNode VR256:$src1,
7344                                       VR256:$src2, VR256:$src3)))]>,
7345             VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
7346
  // 256-bit memory form: $src3 is loaded as v8i32.
7347  def Yrm  : AVX8I<opc, MRMSrcMem, (outs VR256:$dst),
7348             (ins VR256:$src1, VR256:$src2, i256mem:$src3),
7349             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
7350             [(set VR256:$dst, (v8i32 (OpNode VR256:$src1, VR256:$src2,
7351                                      (loadv8i32 addr:$src3))))]>,
7352             VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM.Folded,
7353                                   SchedWriteVecIMul.YMM.ReadAfterFold,
7354                                   SchedWriteVecIMul.YMM.ReadAfterFold]>;
7355}
7356
// The four AVX-VNNI instructions, opcodes 0x50-0x53. The byte forms
// (VPDPBUSD/VPDPBUSDS) pass IsCommutable = 0; the word forms
// (VPDPWSSD/VPDPWSSDS) pass IsCommutable = 1.
7357defm VPDPBUSD   : avx_vnni_rm<0x50, "vpdpbusd", X86Vpdpbusd, 0>;
7358defm VPDPBUSDS  : avx_vnni_rm<0x51, "vpdpbusds", X86Vpdpbusds, 0>;
7359defm VPDPWSSD   : avx_vnni_rm<0x52, "vpdpwssd",  X86Vpdpwssd, 1>;
7360defm VPDPWSSDS  : avx_vnni_rm<0x53, "vpdpwssds", X86Vpdpwssds, 1>;
7361
// X86vpmaddwd restricted to nodes with exactly one use (checked with
// N->hasOneUse()).
7362def X86vpmaddwd_su : PatFrag<(ops node:$lhs, node:$rhs),
7363                             (X86vpmaddwd node:$lhs, node:$rhs), [{
7364  return N->hasOneUse();
7365}]>;
7366
// Select an add of an accumulator with a single-use vpmaddwd as one
// VPDPWSSD instruction: 256-bit and 128-bit, each with a register and a
// folded-load variant for the last multiplicand.
7367let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI] in {
7368  def : Pat<(v8i32 (add VR256:$src1,
7369                        (X86vpmaddwd_su VR256:$src2, VR256:$src3))),
7370            (VPDPWSSDYrr VR256:$src1, VR256:$src2, VR256:$src3)>;
7371  def : Pat<(v8i32 (add VR256:$src1,
7372                        (X86vpmaddwd_su VR256:$src2, (load addr:$src3)))),
7373            (VPDPWSSDYrm VR256:$src1, VR256:$src2, addr:$src3)>;
7374  def : Pat<(v4i32 (add VR128:$src1,
7375                        (X86vpmaddwd_su VR128:$src2, VR128:$src3))),
7376            (VPDPWSSDrr VR128:$src1, VR128:$src2, VR128:$src3)>;
7377  def : Pat<(v4i32 (add VR128:$src1,
7378                        (X86vpmaddwd_su VR128:$src2, (load addr:$src3)))),
7379            (VPDPWSSDrm VR128:$src1, VR128:$src2, addr:$src3)>;
7380}
7381
7382//===----------------------------------------------------------------------===//
7383// VPERMIL - Permute Single and Double Floating-Point Values
7384//
7385
/// avx_permil - VPERMILPS/VPERMILPD in four forms:
///   rr / rm - variable control (X86VPermilpv) on opcode opc_rm, with the
///             control taken from a register or loaded through x86memop_i.
///   ri / mi - immediate control (X86VPermilpi) on opcode opc_rmi, with the
///             data taken from a register or loaded through x86memop_f.
///   f_vt / i_vt - value types of the data and of the variable control.
///   sched / varsched - scheduling info for the immediate / variable forms.
7386multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
7387                      RegisterClass RC, X86MemOperand x86memop_f,
7388                      X86MemOperand x86memop_i,
7389                      ValueType f_vt, ValueType i_vt,
7390                      X86FoldableSchedWrite sched,
7391                      X86FoldableSchedWrite varsched> {
7392  let Predicates = [HasAVX, NoVLX] in {
7393    def rr  : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
7394               (ins RC:$src1, RC:$src2),
7395               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7396               [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V,
7397               Sched<[varsched]>;
7398    def rm  : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
7399               (ins RC:$src1, x86memop_i:$src2),
7400               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7401               [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
7402                              (i_vt (load addr:$src2)))))]>, VEX_4V,
7403               Sched<[varsched.Folded, sched.ReadAfterFold]>;
7404
7405    def ri  : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
7406             (ins RC:$src1, u8imm:$src2),
7407             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7408             [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 timm:$src2))))]>, VEX,
7409             Sched<[sched]>;
7410    def mi  : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
7411             (ins x86memop_f:$src1, u8imm:$src2),
7412             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7413             [(set RC:$dst,
7414               (f_vt (X86VPermilpi (load addr:$src1), (i8 timm:$src2))))]>, VEX,
7415             Sched<[sched.Folded]>;
7416  }// Predicates = [HasAVX, NoVLX]
7417}
7418
// VPERMILPS: variable-control opcode 0x0C, immediate opcode 0x04, in 128-bit
// and 256-bit (VEX_L) variants.
7419let ExeDomain = SSEPackedSingle in {
7420  defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
7421                               v4f32, v4i32, SchedWriteFShuffle.XMM,
7422                               SchedWriteFVarShuffle.XMM>;
7423  defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
7424                               v8f32, v8i32, SchedWriteFShuffle.YMM,
7425                               SchedWriteFVarShuffle.YMM>, VEX_L;
7426}
// VPERMILPD: variable-control opcode 0x0D, immediate opcode 0x05, in 128-bit
// and 256-bit (VEX_L) variants.
7427let ExeDomain = SSEPackedDouble in {
7428  defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
7429                               v2f64, v2i64, SchedWriteFShuffle.XMM,
7430                               SchedWriteFVarShuffle.XMM>;
7431  defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
7432                               v4f64, v4i64, SchedWriteFShuffle.YMM,
7433                               SchedWriteFVarShuffle.YMM>, VEX_L;
7434}
7435
7436//===----------------------------------------------------------------------===//
7437// VZERO - Zero YMM registers
7438// Note: These instructions do not affect YMM16-YMM31.
7439//
7440
// VZEROALL and VZEROUPPER share opcode 0x77 and differ only in VEX_L, which
// is set on VZEROALL alone. Both are modelled as defining YMM0-YMM15, are
// scheduled as WriteSystem, and require HasAVX.
7441let SchedRW = [WriteSystem] in {
7442let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
7443            YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
7444  // Zero All YMM registers
7445  def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
7446                  [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L,
7447                  Requires<[HasAVX]>, WIG;
7448
7449  // Zero Upper bits of YMM registers
7450  def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
7451                     [(int_x86_avx_vzeroupper)]>, PS, VEX,
7452                     Requires<[HasAVX]>, WIG;
7453} // Defs
7454} // SchedRW
7455
7456//===----------------------------------------------------------------------===//
7457// Half precision conversion instructions
7458//
7459
/// f16c_ph2ps - VCVTPH2PS (opcode 0x13) register and memory forms.
///   RC       - destination register class; the register source is always
///              VR128.
///   x86memop - memory operand used by the load form.
/// The rr form matches X86any_cvtph2ps. The rm form has an empty pattern
/// list and therefore declares mayLoad explicitly.
7460multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop,
7461                      X86FoldableSchedWrite sched> {
7462  def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
7463             "vcvtph2ps\t{$src, $dst|$dst, $src}",
7464             [(set RC:$dst, (X86any_cvtph2ps VR128:$src))]>,
7465             T8PD, VEX, Sched<[sched]>;
7466  let hasSideEffects = 0, mayLoad = 1 in
7467  def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
7468             "vcvtph2ps\t{$src, $dst|$dst, $src}",
7469             []>, T8PD, VEX, Sched<[sched.Folded]>;
7470}
7471
/// f16c_ps2ph - VCVTPS2PH (opcode 0x1D) register and store forms.
///   RC       - source register class; the register destination is always
///              VR128.
///   x86memop - memory operand used by the store form.
///   RR / MR  - scheduling classes of the register and store forms.
/// The rr form matches X86any_cvtps2ph with a target immediate. The mr form
/// has an empty pattern list and therefore declares mayStore explicitly.
7472multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
7473                      SchedWrite RR, SchedWrite MR> {
7474  def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
7475               (ins RC:$src1, i32u8imm:$src2),
7476               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7477               [(set VR128:$dst, (X86any_cvtps2ph RC:$src1, timm:$src2))]>,
7478               TAPD, VEX, Sched<[RR]>;
7479  let hasSideEffects = 0, mayStore = 1 in
7480  def mr : Ii8<0x1D, MRMDestMem, (outs),
7481               (ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
7482               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
7483               TAPD, VEX, Sched<[MR]>;
7484}
7485
// F16C instruction definitions and the load/store folding patterns for
// them. Guarded by HasF16C and NoVLX. The 128-bit forms use a 64-bit memory
// operand and the 256-bit (VEX_L) forms a 128-bit one.
7486let Predicates = [HasF16C, NoVLX] in {
7487  defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>, SIMD_EXC;
7488  defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L, SIMD_EXC;
7489  defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH,
7490                               WriteCvtPS2PHSt>, SIMD_EXC;
7491  defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY,
7492                               WriteCvtPS2PHYSt>, VEX_L, SIMD_EXC;
7493
7494  // Pattern match vcvtph2ps of a scalar i64 load.
7495  def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
7496            (VCVTPH2PSrm addr:$src)>;
7497  def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16
7498              (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
7499            (VCVTPH2PSrm addr:$src)>;
  // 256-bit conversion of a full v8i16 load.
7500  def : Pat<(v8f32 (X86any_cvtph2ps (loadv8i16 addr:$src))),
7501            (VCVTPH2PSYrm addr:$src)>;
7502
  // Fold a store of element 0 of the 128-bit conversion result (viewed as
  // f64 or as i64) into the store form.
7503  def : Pat<(store (f64 (extractelt
7504                         (bc_v2f64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))),
7505                         (iPTR 0))), addr:$dst),
7506            (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
7507  def : Pat<(store (i64 (extractelt
7508                         (bc_v2i64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))),
7509                         (iPTR 0))), addr:$dst),
7510            (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
  // Fold a store of the whole v8i16 result of the 256-bit conversion.
7511  def : Pat<(store (v8i16 (X86any_cvtps2ph VR256:$src1, timm:$src2)), addr:$dst),
7512            (VCVTPS2PHYmr addr:$dst, VR256:$src1, timm:$src2)>;
7513}
7514
7515//===----------------------------------------------------------------------===//
7516// AVX2 Instructions
7517//===----------------------------------------------------------------------===//
7518
7519/// AVX2_blend_rmi - AVX2 blend with 8-bit immediate
/// AVX2_blend_rmi - Blend with an 8-bit immediate, register (rri) and
/// memory (rmi) forms, plus a pattern for a load in the first operand.
///   OpNode       - blend node matched with (src1, src2, imm).
///   OpVT         - value type of the result.
///   commuteXForm - SDNodeXForm applied to the immediate when the operands
///                  are swapped to fold a load from the first source.
7520multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
7521                          ValueType OpVT, X86FoldableSchedWrite sched,
7522                          RegisterClass RC,
7523                          X86MemOperand x86memop, SDNodeXForm commuteXForm> {
7524  let isCommutable = 1 in
7525  def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
7526        (ins RC:$src1, RC:$src2, u8imm:$src3),
7527        !strconcat(OpcodeStr,
7528            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
7529        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
7530        Sched<[sched]>, VEX_4V;
7531  def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
7532        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
7533        !strconcat(OpcodeStr,
7534            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
7535        [(set RC:$dst,
7536          (OpVT (OpNode RC:$src1, (load addr:$src2), timm:$src3)))]>,
7537        Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V;
7538
7539  // Pattern to commute if load is in first source.
7540  def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, timm:$src3)),
7541            (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
7542                                            (commuteXForm timm:$src3))>;
7543}
7544
// VPBLENDD (opcode 0x02) in 128-bit and 256-bit forms, plus patterns that
// select it for 64-bit-element blends (v4i64, v2i64). Those patterns pass
// the immediate through a BlendScale* transform, and through a
// BlendScaleCommute* transform when the load is the first blend operand and
// the operands are swapped.
7545let Predicates = [HasAVX2] in {
7546defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
7547                               SchedWriteBlend.XMM, VR128, i128mem,
7548                               BlendCommuteImm4>;
7549defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
7550                                SchedWriteBlend.YMM, VR256, i256mem,
7551                                BlendCommuteImm8>, VEX_L;
7552
// v4i64 blends: register, load in second operand, load in first operand.
7553def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
7554          (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 timm:$src3))>;
7555def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
7556          (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
7557def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
7558          (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
7559
// v2i64 blends: register, load in second operand, load in first operand.
7560def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
7561          (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 timm:$src3))>;
7562def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
7563          (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 timm:$src3))>;
7564def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
7565          (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 timm:$src3))>;
7566}
7567
7568// For insertion into the zero index (low half) of a 256-bit vector, it is
7569// more efficient to generate a blend with immediate instead of an insert*128.
7570// NOTE: We're using FP instructions here, but execution domain fixing should
7571// take care of using integer instructions when profitable.
// Select insert_subvector at index 0 as VBLENDPSY for every 256-bit
// integer/f16 vector type. The 128-bit value is first widened with
// INSERT_SUBREG into an IMPLICIT_DEF v8i32.
//  - Register destination: the widened value is the second blend operand
//    and the immediate is 0xf.
//  - Loaded destination: the widened value becomes the first operand, the
//    load is folded as the second, and the immediate is 0xf0 (the 8-bit
//    complement of 0xf) to account for the swapped operands.
7572let Predicates = [HasAVX] in {
7573def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
7574          (VBLENDPSYrri VR256:$src1,
7575                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7576                                       VR128:$src2, sub_xmm), 0xf)>;
7577def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
7578          (VBLENDPSYrri VR256:$src1,
7579                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7580                                       VR128:$src2, sub_xmm), 0xf)>;
7581def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
7582          (VBLENDPSYrri VR256:$src1,
7583                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7584                                       VR128:$src2, sub_xmm), 0xf)>;
7585def : Pat<(insert_subvector (v16f16 VR256:$src1), (v8f16 VR128:$src2), (iPTR 0)),
7586          (VBLENDPSYrri VR256:$src1,
7587                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7588                                       VR128:$src2, sub_xmm), 0xf)>;
7589def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
7590          (VBLENDPSYrri VR256:$src1,
7591                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7592                                       VR128:$src2, sub_xmm), 0xf)>;
7593
7594def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1), (iPTR 0)),
7595          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7596                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7597def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0)),
7598          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7599                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7600def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)),
7601          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7602                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7603def : Pat<(insert_subvector (loadv16f16 addr:$src2), (v8f16 VR128:$src1), (iPTR 0)),
7604          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7605                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7606def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)),
7607          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7608                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7609}
7610
7611//===----------------------------------------------------------------------===//
7612// VPBROADCAST - Load from memory and broadcast to all elements of the
7613//               destination operand
7614//
/// avx2_broadcast - VPBROADCAST{B,W,D,Q} in 128-bit (rr, rm) and 256-bit
/// (Yrr, Yrm) forms.
///   x86memop   - scalar-sized memory operand of the load forms.
///   bcast_frag - PatFrag matched by the load forms.
///   OpVT128 / OpVT256 - result types of the 128-bit / 256-bit forms.
///   prd        - extra predicate combined with HasAVX2.
/// The register source is always VR128, including for the 256-bit form.
7615multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
7616                          X86MemOperand x86memop, PatFrag bcast_frag,
7617                          ValueType OpVT128, ValueType OpVT256, Predicate prd> {
7618  let Predicates = [HasAVX2, prd] in {
7619    def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
7620                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7621                  [(set VR128:$dst,
7622                   (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
7623                  Sched<[SchedWriteShuffle.XMM]>, VEX;
7624    def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
7625                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7626                  [(set VR128:$dst,
7627                   (OpVT128 (bcast_frag addr:$src)))]>,
7628                  Sched<[SchedWriteShuffle.XMM.Folded]>, VEX;
7629    def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
7630                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7631                   [(set VR256:$dst,
7632                    (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>,
7633                   Sched<[WriteShuffle256]>, VEX, VEX_L;
    // NOTE(review): the 256-bit load form is scheduled with the XMM folded
    // shuffle class rather than a 256-bit one - confirm this is intended.
7634    def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
7635                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7636                   [(set VR256:$dst,
7637                    (OpVT256 (bcast_frag addr:$src)))]>,
7638                   Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L;
7639
7640    // Provide aliases for broadcast from the same register class that
7641    // automatically does the extract.
7642    def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))),
7643              (!cast<Instruction>(NAME#"Yrr")
7644                  (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>;
7645  }
7646}
7647
// Byte/word broadcasts (0x78, 0x79) are guarded by NoVLX_Or_NoBWI;
// dword/qword broadcasts (0x58, 0x59) are guarded by NoVLX.
7648defm VPBROADCASTB  : avx2_broadcast<0x78, "vpbroadcastb", i8mem, X86VBroadcastld8,
7649                                    v16i8, v32i8, NoVLX_Or_NoBWI>;
7650defm VPBROADCASTW  : avx2_broadcast<0x79, "vpbroadcastw", i16mem, X86VBroadcastld16,
7651                                    v8i16, v16i16, NoVLX_Or_NoBWI>;
7652defm VPBROADCASTD  : avx2_broadcast<0x58, "vpbroadcastd", i32mem, X86VBroadcastld32,
7653                                    v4i32, v8i32, NoVLX>;
7654defm VPBROADCASTQ  : avx2_broadcast<0x59, "vpbroadcastq", i64mem, X86VBroadcastld64,
7655                                    v2i64, v4i64, NoVLX>;
7656
// Broadcast of a scalar FP register (FR32/FR64): the scalar is copied into
// VR128 and fed to the register form of VBROADCASTSS / VBROADCASTSD.
7657let Predicates = [HasAVX2, NoVLX] in {
7658  // Provide fallback in case the load node that is used in the patterns above
7659  // is used by additional users, which prevents the pattern selection.
7660    def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
7661              (VBROADCASTSSrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
7662    def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
7663              (VBROADCASTSSYrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
7664    def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
7665              (VBROADCASTSDYrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
7666}
7667
// Byte/word broadcast patterns selected onto VPBROADCASTB / VPBROADCASTW.
7668let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  // GR8 source: place the byte in the low part of an undefined i32, move
  // that to a vector register with VMOVDI2PDIrr, then broadcast.
7669  def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
7670        (VPBROADCASTBrr (VMOVDI2PDIrr
7671                         (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7672                                             GR8:$src, sub_8bit))))>;
7673  def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
7674        (VPBROADCASTBYrr (VMOVDI2PDIrr
7675                          (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7676                                              GR8:$src, sub_8bit))))>;
7677
  // GR16 source: same scheme using sub_16bit.
7678  def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
7679        (VPBROADCASTWrr (VMOVDI2PDIrr
7680                         (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7681                                             GR16:$src, sub_16bit))))>;
7682  def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
7683        (VPBROADCASTWYrr (VMOVDI2PDIrr
7684                          (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7685                                              GR16:$src, sub_16bit))))>;
7686
  // f16 vectors reuse the 16-bit integer broadcast: from memory, from the
  // low element of a v8f16 register, and from a scalar FR16.
7687  def : Pat<(v8f16 (X86VBroadcastld16 addr:$src)),
7688            (VPBROADCASTWrm addr:$src)>;
7689  def : Pat<(v16f16 (X86VBroadcastld16 addr:$src)),
7690            (VPBROADCASTWYrm addr:$src)>;
7691
7692  def : Pat<(v8f16 (X86VBroadcast (v8f16 VR128:$src))),
7693            (VPBROADCASTWrr VR128:$src)>;
7694  def : Pat<(v16f16 (X86VBroadcast (v8f16 VR128:$src))),
7695            (VPBROADCASTWYrr VR128:$src)>;
7696
7697  def : Pat<(v8f16 (X86VBroadcast (f16 FR16:$src))),
7698            (VPBROADCASTWrr (COPY_TO_REGCLASS FR16:$src, VR128))>;
7699  def : Pat<(v16f16 (X86VBroadcast (f16 FR16:$src))),
7700            (VPBROADCASTWYrr (COPY_TO_REGCLASS FR16:$src, VR128))>;
7701}
// Broadcast from a 32-bit or 64-bit GPR: move the GPR into a vector
// register (VMOVDI2PDIrr / VMOV64toPQIrr), then use VPBROADCASTD /
// VPBROADCASTQ.
7702let Predicates = [HasAVX2, NoVLX] in {
7703  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
7704            (VPBROADCASTDrr (VMOVDI2PDIrr GR32:$src))>;
7705  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
7706            (VPBROADCASTDYrr (VMOVDI2PDIrr GR32:$src))>;
7707  def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
7708            (VPBROADCASTQrr (VMOV64toPQIrr GR64:$src))>;
7709  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
7710            (VPBROADCASTQYrr (VMOV64toPQIrr GR64:$src))>;
7711}
7712
7713// AVX1 broadcast patterns
// On AVX1-only targets, integer broadcast-loads are selected onto the FP
// broadcast-load instructions VBROADCASTSS / VBROADCASTSD.
7714let Predicates = [HasAVX1Only] in {
7715def : Pat<(v8i32 (X86VBroadcastld32 addr:$src)),
7716          (VBROADCASTSSYrm addr:$src)>;
7717def : Pat<(v4i64 (X86VBroadcastld64 addr:$src)),
7718          (VBROADCASTSDYrm addr:$src)>;
7719def : Pat<(v4i32 (X86VBroadcastld32 addr:$src)),
7720          (VBROADCASTSSrm addr:$src)>;
7721}
7722
7723  // Provide fallback in case the load node that is used in the patterns above
7724  // is used by additional users, which prevents the pattern selection.
// v2f64 broadcasts are selected as VMOVDDUP: from a scalar f64 (copied into
// VR128), from memory, and from a v2f64 register.
7725let Predicates = [HasAVX, NoVLX] in {
7726  // 128bit broadcasts:
7727  def : Pat<(v2f64 (X86VBroadcast f64:$src)),
7728            (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
7729  def : Pat<(v2f64 (X86VBroadcastld64 addr:$src)),
7730            (VMOVDDUPrm addr:$src)>;
7731
7732  def : Pat<(v2f64 (X86VBroadcast v2f64:$src)),
7733            (VMOVDDUPrr VR128:$src)>;
7734}
7735
// Register-source broadcasts on AVX1-only targets. A 128-bit splat is built
// with VPERMILPSri (imm 0), VMOVDDUPrr or VPSHUFDri. For 256-bit results,
// that splat is placed in the low half with INSERT_SUBREG and inserted
// again with VINSERTF128rr using immediate 1.
7736let Predicates = [HasAVX1Only] in {
7737  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
7738            (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>;
7739  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
7740            (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
7741              (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm),
7742              (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>;
7743  def : Pat<(v8f32 (X86VBroadcast v4f32:$src)),
7744            (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
7745              (v4f32 (VPERMILPSri VR128:$src, 0)), sub_xmm),
7746              (v4f32 (VPERMILPSri VR128:$src, 0)), 1)>;
7747  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
7748            (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
7749              (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm),
7750              (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>;
7751  def : Pat<(v4f64 (X86VBroadcast v2f64:$src)),
7752            (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
7753              (v2f64 (VMOVDDUPrr VR128:$src)), sub_xmm),
7754              (v2f64 (VMOVDDUPrr VR128:$src)), 1)>;
7755
  // GPR sources: move into a vector register first, then splat with
  // VPSHUFDri (imm 0 for 32-bit elements, imm 0x44 for 64-bit elements).
7756  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
7757            (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)>;
7758  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
7759            (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7760              (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), sub_xmm),
7761              (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), 1)>;
7762  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
7763            (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
7764              (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), sub_xmm),
7765              (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), 1)>;
7766
7767  def : Pat<(v2i64 (X86VBroadcast i64:$src)),
7768            (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)>;
  // A v2i64 broadcast-load is selected onto VMOVDDUPrm.
7769  def : Pat<(v2i64 (X86VBroadcastld64 addr:$src)),
7770            (VMOVDDUPrm addr:$src)>;
7771}
7772
7773//===----------------------------------------------------------------------===//
7774// VPERM - Permute instructions
7775//
7776
/// avx2_perm - 256-bit two-operand permute matching X86VPermv, register
/// (Yrr) and memory (Yrm) forms.
///   OpVT  - value type of the result.
///   Sched - scheduling info (.Folded / .ReadAfterFold used by Yrm).
///   memOp - memory operand for $src2 in the Yrm form.
7777multiclass avx2_perm<bits<8> opc, string OpcodeStr,
7778                     ValueType OpVT, X86FoldableSchedWrite Sched,
7779                     X86MemOperand memOp> {
7780  let Predicates = [HasAVX2, NoVLX] in {
7781    def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
7782                     (ins VR256:$src1, VR256:$src2),
7783                     !strconcat(OpcodeStr,
7784                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7785                     [(set VR256:$dst,
7786                       (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
7787                     Sched<[Sched]>, VEX_4V, VEX_L;
7788    def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
7789                     (ins VR256:$src1, memOp:$src2),
7790                     !strconcat(OpcodeStr,
7791                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7792                     [(set VR256:$dst,
7793                       (OpVT (X86VPermv VR256:$src1,
7794                              (load addr:$src2))))]>,
7795                     Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX_4V, VEX_L;
7796  }
7797}
7798
// VPERMD (opcode 0x36, v8i32) and VPERMPS (opcode 0x16, v8f32, in the
// single-precision execution domain).
7799defm VPERMD : avx2_perm<0x36, "vpermd", v8i32, WriteVarShuffle256, i256mem>;
7800let ExeDomain = SSEPackedSingle in
7801defm VPERMPS : avx2_perm<0x16, "vpermps", v8f32, WriteFVarShuffle256, f256mem>;
7802
/// avx2_perm_imm - 256-bit permute with an 8-bit immediate matching
/// X86VPermi, register (Yri) and memory (Ymi) forms.
///   mem_frag - load fragment matched for the source in the Ymi form.
///   OpVT     - value type of the result.
///   memOp    - memory operand for $src1 in the Ymi form.
7803multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
7804                         ValueType OpVT, X86FoldableSchedWrite Sched,
7805                         X86MemOperand memOp> {
7806  let Predicates = [HasAVX2, NoVLX] in {
7807    def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
7808                       (ins VR256:$src1, u8imm:$src2),
7809                       !strconcat(OpcodeStr,
7810                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7811                       [(set VR256:$dst,
7812                         (OpVT (X86VPermi VR256:$src1, (i8 timm:$src2))))]>,
7813                       Sched<[Sched]>, VEX, VEX_L;
7814    def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
7815                       (ins memOp:$src1, u8imm:$src2),
7816                       !strconcat(OpcodeStr,
7817                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7818                       [(set VR256:$dst,
7819                         (OpVT (X86VPermi (mem_frag addr:$src1),
7820                                (i8 timm:$src2))))]>,
7821                       Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VEX_L;
7822  }
7823}
7824
// Immediate-controlled 256-bit permutes over v4i64 / v4f64.
defm VPERMQ  : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64, WriteShuffle256,
                             i256mem>, REX_W;

let ExeDomain = SSEPackedDouble in {
  defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
                               WriteFShuffle256, f256mem>, REX_W;
}
7830
7831//===----------------------------------------------------------------------===//
7832// VPERM2I128 - Permute Integer vector Values in 128-bit chunks
7833//
// Register form; only this definition is marked commutable.
let isCommutable = 1 in {
  def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg,
      (outs VR256:$dst),
      (ins VR256:$src1, VR256:$src2, u8imm:$src3),
      "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
      Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
}

// Memory form: $src2 is a 256-bit memory operand. Neither definition carries
// a selection pattern of its own.
def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem,
    (outs VR256:$dst),
    (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
    "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
    Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>,
    VEX_4V, VEX_L;
7843
// Selection patterns for VPERM2I128, one instantiation per 256-bit vector
// type. Each type is listed exactly once: a repeated instantiation only
// generates a second, identical set of patterns.
let Predicates = [HasAVX2] in {
  defm : vperm2x128_lowering<"VPERM2I128", v4i64,  loadv4i64>;
  defm : vperm2x128_lowering<"VPERM2I128", v8i32,  loadv8i32>;
  defm : vperm2x128_lowering<"VPERM2I128", v16i16, loadv16i16>;
  defm : vperm2x128_lowering<"VPERM2I128", v16f16, loadv16f16>;
  defm : vperm2x128_lowering<"VPERM2I128", v32i8,  loadv32i8>;
}
7852
7853//===----------------------------------------------------------------------===//
7854// VINSERTI128 - Insert packed integer values
7855//
// Both forms are pattern-less, so their side-effect / load properties are
// stated explicitly here.
let hasSideEffects = 0 in {
  // Insert a 128-bit register into a 256-bit register.
  def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg,
      (outs VR256:$dst),
      (ins VR256:$src1, VR128:$src2, u8imm:$src3),
      "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
      []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;

  // Same operation with the 128-bit source taken from memory.
  let mayLoad = 1 in {
    def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem,
        (outs VR256:$dst),
        (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
        "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
        []>, Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>,
        VEX_4V, VEX_L;
  }
}
7867
// Selection patterns for VINSERTI128 (with VPERM2I128 as the companion
// instruction name), one instantiation per 128-bit/256-bit type pair. Each
// pair is listed exactly once: a repeated instantiation only generates a
// second, identical set of patterns.
let Predicates = [HasAVX2, NoVLX] in {
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v2i64, v4i64,  loadv2i64,  loadv4i64>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v4i32, v8i32,  loadv4i32,  loadv8i32>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8i16, v16i16, loadv8i16,  loadv16i16>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8f16, v16f16, loadv8f16,  loadv16f16>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v16i8, v32i8,  loadv16i8,  loadv32i8>;
}
7876
7877//===----------------------------------------------------------------------===//
7878// VEXTRACTI128 - Extract packed integer values
7879//
// Extract into a 128-bit register.
def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg,
    (outs VR128:$dst),
    (ins VR256:$src1, u8imm:$src2),
    "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
    Sched<[WriteShuffle256]>, VEX, VEX_L;

// Extract straight to a 128-bit memory destination; pattern-less, so the
// store property is declared explicitly.
let hasSideEffects = 0, mayStore = 1 in {
  def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem,
      (outs),
      (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
      "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
      Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_L;
}
7889
// Selection patterns for VEXTRACTI128, one instantiation per 256-bit/128-bit
// type pair. Each pair is listed exactly once: a repeated instantiation only
// generates a second, identical set of patterns.
let Predicates = [HasAVX2, NoVLX] in {
  defm : vextract_lowering<"VEXTRACTI128", v4i64,  v2i64>;
  defm : vextract_lowering<"VEXTRACTI128", v8i32,  v4i32>;
  defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
  defm : vextract_lowering<"VEXTRACTI128", v16f16, v8f16>;
  defm : vextract_lowering<"VEXTRACTI128", v32i8,  v16i8>;
}
7898
7899//===----------------------------------------------------------------------===//
7900// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
7901//
/// avx2_pmovmask - masked integer load/store instructions.
/// Emits 128-bit (rm/mr) and 256-bit (Yrm/Ymr) records. Loads use opcode 0x8c
/// and select IntLd128/IntLd256; stores use opcode 0x8e and select
/// IntSt128/IntSt256. schedX / schedY supply the RM and MR scheduling classes
/// for the 128-bit and 256-bit records respectively.
multiclass avx2_pmovmask<string OpcodeStr,
                         Intrinsic IntLd128, Intrinsic IntLd256,
                         Intrinsic IntSt128, Intrinsic IntSt256,
                         X86SchedWriteMaskMove schedX,
                         X86SchedWriteMaskMove schedY> {
  // Masked loads: $src1 is the register operand, $src2 the memory operand.
  def rm  : AVX28I<0x8c, MRMSrcMem,
             (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
             [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>,
             VEX_4V, Sched<[schedX.RM]>;
  def Yrm : AVX28I<0x8c, MRMSrcMem,
             (outs VR256:$dst),
             (ins VR256:$src1, i256mem:$src2),
             OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
             VEX_4V, VEX_L, Sched<[schedY.RM]>;

  // Masked stores: $dst is the memory operand, no register result.
  def mr  : AVX28I<0x8e, MRMDestMem,
             (outs),
             (ins i128mem:$dst, VR128:$src1, VR128:$src2),
             OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
             [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>,
             VEX_4V, Sched<[schedX.MR]>;
  def Ymr : AVX28I<0x8e, MRMDestMem,
             (outs),
             (ins i256mem:$dst, VR256:$src1, VR256:$src2),
             OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
             VEX_4V, VEX_L, Sched<[schedY.MR]>;
}
7928
// 32-bit element masked moves.
defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
                                int_x86_avx2_maskload_d,
                                int_x86_avx2_maskload_d_256,
                                int_x86_avx2_maskstore_d,
                                int_x86_avx2_maskstore_d_256,
                                WriteVecMaskMove32,
                                WriteVecMaskMove32Y>;

// 64-bit element masked moves; same multiclass with REX_W added.
defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
                                int_x86_avx2_maskload_q,
                                int_x86_avx2_maskload_q_256,
                                int_x86_avx2_maskstore_q,
                                int_x86_avx2_maskstore_q_256,
                                WriteVecMaskMove64,
                                WriteVecMaskMove64Y>, REX_W;
7941
/// maskmov_lowering - map generic masked_store / masked_load nodes onto the
/// "mr" / "rm" records of the instruction family named by InstrStr.
/// VT is the data type, MaskVT the mask type; both live in register class RC.
multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
                            ValueType MaskVT> {
  // Masked store: the instruction takes the mask before the stored value.
  def : Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)),
            (!cast<Instruction>(!strconcat(InstrStr, "mr"))
               addr:$ptr, RC:$mask, RC:$src)>;

  // Masked load with an undef pass-through value.
  def : Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)),
            (!cast<Instruction>(!strconcat(InstrStr, "rm"))
               RC:$mask, addr:$ptr)>;

  // Masked load with an all-zeros pass-through value; same instruction.
  def : Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask),
                             (VT immAllZerosV))),
            (!cast<Instruction>(!strconcat(InstrStr, "rm"))
               RC:$mask, addr:$ptr)>;
}
// Floating-point data types use the VMASKMOVPS/PD family.
let Predicates = [HasAVX] in {
  defm : maskmov_lowering<"VMASKMOVPS",  VR128, v4f32, v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD",  VR128, v2f64, v2i64>;
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64>;
}

// Under HasAVX1Only, integer data types are routed through the ps/pd
// instructions as well.
let Predicates = [HasAVX1Only] in {
  // load/store i32/i64 not supported use ps/pd version
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64>;
  defm : maskmov_lowering<"VMASKMOVPS",  VR128, v4i32, v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD",  VR128, v2i64, v2i64>;
}

// Under HasAVX2, integer data types use the VPMASKMOVD/Q family.
let Predicates = [HasAVX2] in {
  defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32>;
  defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64>;
  defm : maskmov_lowering<"VPMASKMOVD",  VR128, v4i32, v4i32>;
  defm : maskmov_lowering<"VPMASKMOVQ",  VR128, v2i64, v2i64>;
}
7973
7974//===----------------------------------------------------------------------===//
7975// Variable Bit Shifts
7976//
/// avx2_var_shift - two-source vector shift instructions selected from
/// OpNode. Emits register and memory forms for the 128-bit type vt128
/// (rr/rm) and the 256-bit type vt256 (Yrr/Yrm). Memory forms load $src2 and
/// use the folded-load scheduling classes.
multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType vt128, ValueType vt256> {
  // 128-bit forms.
  def rr  : AVX28I<opc, MRMSrcReg,
             (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2),
             OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
             [(set VR128:$dst,
               (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
             VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>;
  def rm  : AVX28I<opc, MRMSrcMem,
             (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
             [(set VR128:$dst,
               (vt128 (OpNode VR128:$src1,
                       (vt128 (load addr:$src2)))))]>,
             VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded,
                            SchedWriteVarVecShift.XMM.ReadAfterFold]>;

  // 256-bit forms.
  def Yrr : AVX28I<opc, MRMSrcReg,
             (outs VR256:$dst),
             (ins VR256:$src1, VR256:$src2),
             OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
             [(set VR256:$dst,
               (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
             VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>;
  def Yrm : AVX28I<opc, MRMSrcMem,
             (outs VR256:$dst),
             (ins VR256:$src1, i256mem:$src2),
             OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
             [(set VR256:$dst,
               (vt256 (OpNode VR256:$src1,
                       (vt256 (load addr:$src2)))))]>,
             VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded,
                                   SchedWriteVarVecShift.YMM.ReadAfterFold]>;
}
8008
// D forms operate on v4i32/v8i32; Q forms on v2i64/v4i64 and add REX_W.
// Only a D form is instantiated for X86vsrav here.
let Predicates = [HasAVX2, NoVLX] in {
  defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", X86vshlv, v4i32, v8i32>;
  defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", X86vshlv, v2i64, v4i64>,
                 REX_W;
  defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", X86vsrlv, v4i32, v8i32>;
  defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", X86vsrlv, v2i64, v4i64>,
                 REX_W;
  defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", X86vsrav, v4i32, v8i32>;
}
8016
8017//===----------------------------------------------------------------------===//
8018// VGATHER - GATHER Operations
8019
8020// FIXME: Improve scheduling of gather instructions.
/// avx2_gather - pattern-less gather instructions.
/// Each record has two outputs ($dst and $mask_wb) and takes $src1, a memory
/// operand $src2 and $mask as inputs. rm always uses VR128; Yrm uses RC256.
/// memop128 / memop256 are the memory operand classes of rm / Yrm.
multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
                       X86MemOperand memop128, X86MemOperand memop256> {
  let mayLoad = 1, hasSideEffects = 0 in {
    def rm  : AVX28I<opc, MRMSrcMem4VOp3,
              (outs VR128:$dst, VR128:$mask_wb),
              (ins VR128:$src1, memop128:$src2, VR128:$mask),
              OpcodeStr # "\t{$mask, $src2, $dst|$dst, $src2, $mask}",
              []>, VEX, Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
    def Yrm : AVX28I<opc, MRMSrcMem4VOp3,
              (outs RC256:$dst, RC256:$mask_wb),
              (ins RC256:$src1, memop256:$src2, RC256:$mask),
              OpcodeStr # "\t{$mask, $src2, $dst|$dst, $src2, $mask}",
              []>, VEX, VEX_L,
              Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
  }
}
8036
let Predicates = [HasAVX2] in {
  // Both outputs are early-clobber, and each is tied to the input of the
  // same role ($src1 -> $dst, $mask -> $mask_wb).
  let mayLoad = 1, hasSideEffects = 0,
      Constraints =
        "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
      in {
    // Integer gathers.
    defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256,
                                  vx128mem, vx256mem>, REX_W;
    defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256,
                                  vx128mem, vy256mem>, REX_W;
    defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256,
                                  vx128mem, vy256mem>;
    defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128,
                                  vx64mem, vy128mem>;

    // Double-precision gathers.
    let ExeDomain = SSEPackedDouble in {
      defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256,
                                    vx128mem, vx256mem>, REX_W;
      defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256,
                                    vx128mem, vy256mem>, REX_W;
    }

    // Single-precision gathers.
    let ExeDomain = SSEPackedSingle in {
      defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256,
                                    vx128mem, vy256mem>;
      defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128,
                                    vx64mem, vy128mem>;
    }
  }
}
8065
8066//===----------------------------------------------------------------------===//
8067// GFNI instructions
8068//===----------------------------------------------------------------------===//
8069
/// GF2P8MULB_rm - register (rr) and memory (rm) forms selecting X86GF2P8mulb.
/// Is2Addr picks the two-operand assembly string (no $src1) instead of the
/// three-operand one. Only the rr form is marked commutable.
multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
                        RegisterClass RC, PatFrag MemOpFrag,
                        X86MemOperand X86MemOp, X86FoldableSchedWrite sched,
                        bit Is2Addr = 0> {
  // The asm string is supplied via AsmString, so the defs pass "" below.
  let ExeDomain = SSEPackedInt,
      AsmString = !if(Is2Addr,
        !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
        !strconcat(OpcodeStr,
                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}")) in {
    let isCommutable = 1 in {
      def rr : PDI<0xCF, MRMSrcReg,
                   (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
                   [(set RC:$dst,
                     (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
               Sched<[sched]>, T8PD;
    }

    def rm : PDI<0xCF, MRMSrcMem,
                 (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
                 [(set RC:$dst,
                   (OpVT (X86GF2P8mulb RC:$src1, (MemOpFrag addr:$src2))))]>,
             Sched<[sched.Folded, sched.ReadAfterFold]>, T8PD;
  }
}
8089
/// GF2P8AFFINE_rmi - register (rri) and memory (rmi) forms of an instruction
/// taking two vector sources plus an 8-bit immediate, selected from OpNode.
/// Is2Addr picks the assembly string without $src1.
multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
                           SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag,
                           X86MemOperand X86MemOp, X86FoldableSchedWrite sched,
                           bit Is2Addr = 0> {
  // The asm string is supplied via AsmString, so the defs pass "" below.
  let AsmString = !if(Is2Addr,
      !strconcat(OpStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
      !strconcat(OpStr,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")) in {
    def rri : Ii8<Op, MRMSrcReg,
                (outs RC:$dst),
                (ins RC:$src1, RC:$src2, u8imm:$src3), "",
                [(set RC:$dst,
                  (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))],
                SSEPackedInt>, Sched<[sched]>;

    def rmi : Ii8<Op, MRMSrcMem,
                (outs RC:$dst),
                (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
                [(set RC:$dst,
                  (OpVT (OpNode RC:$src1, (MemOpFrag addr:$src2),
                                timm:$src3)))],
                SSEPackedInt>,
                Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}
8109
/// GF2P8AFFINE_common - instantiate GF2P8AFFINE_rmi three times for one
/// opcode: a two-address form with "$src1 = $dst" under [HasGFNI, UseSSE2],
/// and 128-bit / 256-bit VEX_4V forms (named V#NAME and V#NAME#Y) under
/// [HasGFNI, HasAVX, NoVLX].
multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
  // The non-VEX form folds loads through memop rather than plain load, in
  // line with the SSE GF2P8MULB instantiation in this file. Plain load would
  // let an under-aligned 128-bit access be folded into the instruction.
  let Constraints = "$src1 = $dst",
      Predicates  = [HasGFNI, UseSSE2] in
  defm NAME         : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
                                      VR128, memop, i128mem, SchedWriteVecIMul.XMM, 1>;
  // The VEX forms keep plain load as their memory fragment.
  let Predicates  = [HasGFNI, HasAVX, NoVLX] in {
    defm V#NAME    : GF2P8AFFINE_rmi<Op, "v"#OpStr, v16i8, OpNode, VR128,
                                     load, i128mem, SchedWriteVecIMul.XMM>,
                                     VEX_4V, REX_W;
    defm V#NAME#Y : GF2P8AFFINE_rmi<Op, "v"#OpStr, v32i8, OpNode, VR256,
                                     load, i256mem, SchedWriteVecIMul.YMM>,
                                     VEX_4V, VEX_L, REX_W;
  }
}
8124
8125// GF2P8MULB
// Two-address form: destination tied to $src1, loads folded through memop.
let Constraints = "$src1 = $dst",
    Predicates  = [HasGFNI, UseSSE2] in {
  defm GF2P8MULB : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop,
                                i128mem, SchedWriteVecALU.XMM, 1>;
}

// VEX_4V forms, 128-bit and 256-bit.
let Predicates = [HasGFNI, HasAVX, NoVLX] in {
  defm VGF2P8MULB  : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load,
                                  i128mem, SchedWriteVecALU.XMM>,
                     VEX_4V;
  defm VGF2P8MULBY : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load,
                                  i256mem, SchedWriteVecALU.YMM>,
                     VEX_4V, VEX_L;
}

// GF2P8AFFINEINVQB, GF2P8AFFINEQB
let isCommutable = 0 in {
  defm GF2P8AFFINEINVQB : GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb",
                                             X86GF2P8affineinvqb>,
                          TAPD;
  defm GF2P8AFFINEQB    : GF2P8AFFINE_common<0xCE, "gf2p8affineqb",
                                             X86GF2P8affineqb>,
                          TAPD;
}
8143
8144// AVX-IFMA
/// avx_ifma_rm - multiply-add instructions selected from OpNode, with the
/// destination tied to $src1. Emits 128-bit (rr/rm) and 256-bit (Yrr/Yrm)
/// records; the memory forms load $src3. Register forms are marked
/// commutable.
let Predicates = [HasAVXIFMA, NoVLX_Or_NoIFMA], Constraints = "$src1 = $dst",
    checkVEXPredicate = 1 in
multiclass avx_ifma_rm<bits<8> opc, string OpcodeStr, SDNode OpNode> {
  // NOTE: The SDNode have the multiply operands first with the add last.
  // This enables commuted load patterns to be autogenerated by tablegen.
  let isCommutable = 1 in {
    def rr  : AVX8I<opc, MRMSrcReg, (outs VR128:$dst),
               (ins VR128:$src1, VR128:$src2, VR128:$src3),
               !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
               [(set VR128:$dst, (v2i64 (OpNode VR128:$src2,
                                         VR128:$src3, VR128:$src1)))]>,
               VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
  }
    // Memory form: scheduled with the folded-load write class and
    // ReadAfterFold reads for the two register sources, instead of the
    // register-form class.
    def rm  : AVX8I<opc, MRMSrcMem, (outs VR128:$dst),
               (ins VR128:$src1, VR128:$src2, i128mem:$src3),
               !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
               [(set VR128:$dst, (v2i64 (OpNode VR128:$src2,
                                        (loadv2i64 addr:$src3), VR128:$src1)))]>,
               VEX_4V, Sched<[SchedWriteVecIMul.XMM.Folded,
                              SchedWriteVecIMul.XMM.ReadAfterFold,
                              SchedWriteVecIMul.XMM.ReadAfterFold]>;
  let isCommutable = 1 in {
    def Yrr  : AVX8I<opc, MRMSrcReg, (outs VR256:$dst),
               (ins VR256:$src1, VR256:$src2, VR256:$src3),
               !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
               [(set VR256:$dst, (v4i64 (OpNode VR256:$src2,
                                         VR256:$src3, VR256:$src1)))]>,
               VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
  }
    // 256-bit memory form; same scheduling treatment as rm.
    def Yrm  : AVX8I<opc, MRMSrcMem, (outs VR256:$dst),
               (ins VR256:$src1, VR256:$src2, i256mem:$src3),
               !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
               [(set VR256:$dst, (v4i64 (OpNode VR256:$src2,
                                        (loadv4i64 addr:$src3), VR256:$src1)))]>,
               VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM.Folded,
                                     SchedWriteVecIMul.YMM.ReadAfterFold,
                                     SchedWriteVecIMul.YMM.ReadAfterFold]>;
}
8179
// The H and L variants differ only in opcode and selected node.
defm VPMADD52HUQ : avx_ifma_rm<0xb5, "vpmadd52huq", x86vpmadd52h>,
                   REX_W, ExplicitVEXPrefix;
defm VPMADD52LUQ : avx_ifma_rm<0xb4, "vpmadd52luq", x86vpmadd52l>,
                   REX_W, ExplicitVEXPrefix;
8182
8183// AVX-VNNI-INT8
/// avx_dotprod_rm - three-source instructions selected from OpNode, with the
/// destination tied to $src1. Emits a register form (rr) and a memory form
/// (rm) whose $src3 is loaded through MemOpFrag. IsCommutable is applied to
/// the rr form only.
let Constraints = "$src1 = $dst" in
multiclass avx_dotprod_rm<bits<8> Opc, string OpcodeStr, ValueType OpVT,
                          RegisterClass RC, PatFrag MemOpFrag,
                          X86MemOperand X86memop, SDNode OpNode,
                          X86FoldableSchedWrite Sched,
                          bit IsCommutable> {
  let isCommutable = IsCommutable in {
    def rr : I<Opc, MRMSrcReg,
               (outs RC:$dst),
               (ins RC:$src1, RC:$src2, RC:$src3),
               OpcodeStr # "\t{$src3, $src2, $dst|$dst, $src2, $src3}",
               [(set RC:$dst,
                 (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>,
             VEX_4V, Sched<[Sched]>;
  }

  def rm : I<Opc, MRMSrcMem,
             (outs RC:$dst),
             (ins RC:$src1, RC:$src2, X86memop:$src3),
             OpcodeStr # "\t{$src3, $src2, $dst|$dst, $src2, $src3}",
             [(set RC:$dst,
               (OpVT (OpNode RC:$src1, RC:$src2, (MemOpFrag addr:$src3))))]>,
           VEX_4V, Sched<[Sched.Folded, Sched.ReadAfterFold]>;
}
8203
// Each mnemonic gets a 128-bit and a 256-bit (Y suffix, VEX_L) instantiation.
// The SS and UU variants pass IsCommutable = 1; the SU variants pass 0.
let Predicates = [HasAVXVNNIINT8] in {
  // Opcode 0x50.
  defm VPDPBSSD   : avx_dotprod_rm<0x50, "vpdpbssd", v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbssd,
                                   SchedWriteVecIMul.XMM, 1>, T8XD;
  defm VPDPBSSDY  : avx_dotprod_rm<0x50, "vpdpbssd", v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbssd,
                                   SchedWriteVecIMul.YMM, 1>, VEX_L, T8XD;
  defm VPDPBUUD   : avx_dotprod_rm<0x50, "vpdpbuud", v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbuud,
                                   SchedWriteVecIMul.XMM, 1>, T8PS;
  defm VPDPBUUDY  : avx_dotprod_rm<0x50, "vpdpbuud", v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbuud,
                                   SchedWriteVecIMul.YMM, 1>, VEX_L, T8PS;

  // Opcode 0x51.
  defm VPDPBSSDS  : avx_dotprod_rm<0x51, "vpdpbssds", v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbssds,
                                   SchedWriteVecIMul.XMM, 1>, T8XD;
  defm VPDPBSSDSY : avx_dotprod_rm<0x51, "vpdpbssds", v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbssds,
                                   SchedWriteVecIMul.YMM, 1>, VEX_L, T8XD;
  defm VPDPBUUDS  : avx_dotprod_rm<0x51, "vpdpbuuds", v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbuuds,
                                   SchedWriteVecIMul.XMM, 1>, T8PS;
  defm VPDPBUUDSY : avx_dotprod_rm<0x51, "vpdpbuuds", v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbuuds,
                                   SchedWriteVecIMul.YMM, 1>, VEX_L, T8PS;

  // SU variants, not commutable.
  defm VPDPBSUD   : avx_dotprod_rm<0x50, "vpdpbsud", v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbsud,
                                   SchedWriteVecIMul.XMM, 0>, T8XS;
  defm VPDPBSUDY  : avx_dotprod_rm<0x50, "vpdpbsud", v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbsud,
                                   SchedWriteVecIMul.YMM, 0>, VEX_L, T8XS;
  defm VPDPBSUDS  : avx_dotprod_rm<0x51, "vpdpbsuds", v4i32, VR128, loadv4i32,
                                   i128mem, X86vpdpbsuds,
                                   SchedWriteVecIMul.XMM, 0>, T8XS;
  defm VPDPBSUDSY : avx_dotprod_rm<0x51, "vpdpbsuds", v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbsuds,
                                   SchedWriteVecIMul.YMM, 0>, VEX_L, T8XS;
}
8242
8243// AVX-NE-CONVERT
/// AVX_NE_CONVERT_BASE - memory-source-only conversions with a 128-bit (rm)
/// and a 256-bit (Yrm) result. The selected intrinsic is looked up by name:
/// "int_x86_" + OpcodeStr + "128" / "256".
multiclass AVX_NE_CONVERT_BASE<bits<8> Opcode, string OpcodeStr,
                  X86MemOperand MemOp128, X86MemOperand MemOp256> {
  def rm  : I<Opcode, MRMSrcMem,
              (outs VR128:$dst), (ins MemOp128:$src),
              OpcodeStr # "\t{$src, $dst|$dst, $src}",
              [(set VR128:$dst,
                (!cast<Intrinsic>(!strconcat("int_x86_", OpcodeStr, "128"))
                   addr:$src))]>,
              Sched<[WriteCvtPH2PS]>, VEX;

  def Yrm : I<Opcode, MRMSrcMem,
              (outs VR256:$dst), (ins MemOp256:$src),
              OpcodeStr # "\t{$src, $dst|$dst, $src}",
              [(set VR256:$dst,
                (!cast<Intrinsic>(!strconcat("int_x86_", OpcodeStr, "256"))
                   addr:$src))]>,
              Sched<[WriteCvtPH2PSY]>, VEX, VEX_L;
}
8257
/// VCVTNEPS2BF16_BASE - vcvtneps2bf16 with a 128-bit (rr/rm) or 256-bit
/// (Yrr/Yrm) source. Every form writes a VR128 destination. The memory forms
/// carry an optional {x} / {y} mnemonic suffix.
multiclass VCVTNEPS2BF16_BASE {
  // 128-bit source.
  def rr  : I<0x72, MRMSrcReg,
              (outs VR128:$dst), (ins VR128:$src),
              "vcvtneps2bf16\t{$src, $dst|$dst, $src}",
              [(set VR128:$dst, (int_x86_vcvtneps2bf16128 VR128:$src))]>,
              Sched<[WriteCvtPH2PS]>;
  def rm  : I<0x72, MRMSrcMem,
              (outs VR128:$dst), (ins f128mem:$src),
              "vcvtneps2bf16{x}\t{$src, $dst|$dst, $src}",
              [(set VR128:$dst,
                (int_x86_vcvtneps2bf16128 (loadv4f32 addr:$src)))]>,
              Sched<[WriteCvtPH2PS]>;

  // 256-bit source.
  def Yrr : I<0x72, MRMSrcReg,
              (outs VR128:$dst), (ins VR256:$src),
              "vcvtneps2bf16\t{$src, $dst|$dst, $src}",
              [(set VR128:$dst, (int_x86_vcvtneps2bf16256 VR256:$src))]>,
              Sched<[WriteCvtPH2PSY]>, VEX_L;
  def Yrm : I<0x72, MRMSrcMem,
              (outs VR128:$dst), (ins f256mem:$src),
              "vcvtneps2bf16{y}\t{$src, $dst|$dst, $src}",
              [(set VR128:$dst,
                (int_x86_vcvtneps2bf16256 (loadv8f32 addr:$src)))]>,
              Sched<[WriteCvtPH2PSY]>, VEX_L;
}
8276
let Predicates = [HasAVXNECONVERT] in {
  // Opcode 0xb1: both vector widths use a 16-bit memory operand.
  defm VBCSTNEBF162PS : AVX_NE_CONVERT_BASE<0xb1, "vbcstnebf162ps",
                                            f16mem, f16mem>, T8XS;
  defm VBCSTNESH2PS   : AVX_NE_CONVERT_BASE<0xb1, "vbcstnesh2ps",
                                            f16mem, f16mem>, T8PD;

  // Opcode 0xb0: memory operand width matches the vector width.
  defm VCVTNEEBF162PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneebf162ps",
                                            f128mem, f256mem>, T8XS;
  defm VCVTNEEPH2PS   : AVX_NE_CONVERT_BASE<0xb0, "vcvtneeph2ps",
                                            f128mem, f256mem>, T8PD;
  defm VCVTNEOBF162PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneobf162ps",
                                            f128mem, f256mem>, T8XD;
  defm VCVTNEOPH2PS   : AVX_NE_CONVERT_BASE<0xb0, "vcvtneoph2ps",
                                            f128mem, f256mem>, T8PS;

  let checkVEXPredicate = 1 in {
    defm VCVTNEPS2BF16 : VCVTNEPS2BF16_BASE, VEX, T8XS, ExplicitVEXPrefix;
  }
}
8293
// Explicitly suffixed spellings of the register forms, registered for the
// "att" assembler variant only.
def : InstAlias<"vcvtneps2bf16x\t{$src, $dst|$dst, $src}",
                (VCVTNEPS2BF16rr VR128:$dst, VR128:$src),
                0, "att">;
def : InstAlias<"vcvtneps2bf16y\t{$src, $dst|$dst, $src}",
                (VCVTNEPS2BF16Yrr VR128:$dst, VR256:$src),
                0, "att">;
8298
8299// FIXME: Is there a better scheduler class for SHA512 than WriteVecIMul?
// Register-only definitions; every destination is tied to $src1.
let Predicates = [HasSHA512], Constraints = "$src1 = $dst" in {
  // Second source is a 128-bit register.
  def VSHA512MSG1rr : I<0xcc, MRMSrcReg,
      (outs VR256:$dst),
      (ins VR256:$src1, VR128:$src2),
      "vsha512msg1\t{$src2, $dst|$dst, $src2}",
      [(set VR256:$dst,
        (int_x86_vsha512msg1 VR256:$src1, VR128:$src2))]>,
      VEX_L, VEX, T8XD, Sched<[WriteVecIMul]>;

  // Second source is a 256-bit register.
  def VSHA512MSG2rr : I<0xcd, MRMSrcReg,
      (outs VR256:$dst),
      (ins VR256:$src1, VR256:$src2),
      "vsha512msg2\t{$src2, $dst|$dst, $src2}",
      [(set VR256:$dst,
        (int_x86_vsha512msg2 VR256:$src1, VR256:$src2))]>,
      VEX_L, VEX, T8XD, Sched<[WriteVecIMul]>;

  // Three sources; the last one is a 128-bit register.
  def VSHA512RNDS2rr : I<0xcb, MRMSrcReg,
      (outs VR256:$dst),
      (ins VR256:$src1, VR256:$src2, VR128:$src3),
      "vsha512rnds2\t{$src3, $src2, $dst|$dst, $src2, $src3}",
      [(set VR256:$dst,
        (int_x86_vsha512rnds2 VR256:$src1, VR256:$src2, VR128:$src3))]>,
      VEX_L, VEX_4V, T8XD, Sched<[WriteVecIMul]>;
}
8320
8321// FIXME: Is there a better scheduler class for SM3 than WriteVecIMul?
let Predicates = [HasSM3], Constraints = "$src1 = $dst" in {
  /// SM3_Base - three-source 128-bit instruction at opcode 0xda, in register
  /// (rr) and memory (rm) forms. The intrinsic is looked up by name as
  /// "int_x86_" + OpStr.
  multiclass SM3_Base<string OpStr> {
    def rr : I<0xda, MRMSrcReg,
               (outs VR128:$dst),
               (ins VR128:$src1, VR128:$src2, VR128:$src3),
               OpStr # "\t{$src3, $src2, $dst|$dst, $src2, $src3}",
               [(set VR128:$dst,
                 (!cast<Intrinsic>(!strconcat("int_x86_", OpStr))
                    VR128:$src1, VR128:$src2, VR128:$src3))]>,
               Sched<[WriteVecIMul]>, VEX_4V;

    def rm : I<0xda, MRMSrcMem,
               (outs VR128:$dst),
               (ins VR128:$src1, VR128:$src2, i128mem:$src3),
               OpStr # "\t{$src3, $src2, $dst|$dst, $src2, $src3}",
               [(set VR128:$dst,
                 (!cast<Intrinsic>(!strconcat("int_x86_", OpStr))
                    VR128:$src1, VR128:$src2, (loadv4i32 addr:$src3)))]>,
               Sched<[WriteVecIMul]>, VEX_4V;
  }

  /// VSM3RNDS2_Base - vsm3rnds2 at opcode 0xde: three vector sources plus an
  /// immediate ($src4), in register (rr) and memory (rm) forms.
  multiclass VSM3RNDS2_Base {
    def rr : Ii8<0xde, MRMSrcReg,
               (outs VR128:$dst),
               (ins VR128:$src1, VR128:$src2, VR128:$src3, i32u8imm:$src4),
               "vsm3rnds2\t{$src4, $src3, $src2, $dst|$dst, $src2, $src3, $src4}",
               [(set VR128:$dst,
                 (int_x86_vsm3rnds2 VR128:$src1, VR128:$src2, VR128:$src3,
                                    timm:$src4))]>,
               Sched<[WriteVecIMul]>;

    def rm : Ii8<0xde, MRMSrcMem,
               (outs VR128:$dst),
               (ins VR128:$src1, VR128:$src2, i128mem:$src3, i32u8imm:$src4),
               "vsm3rnds2\t{$src4, $src3, $src2, $dst|$dst, $src2, $src3, $src4}",
               [(set VR128:$dst,
                 (int_x86_vsm3rnds2 VR128:$src1, VR128:$src2,
                                    (loadv4i32 addr:$src3), timm:$src4))]>,
               Sched<[WriteVecIMul]>;
  }
}
8357
// The two message instructions share SM3_Base and differ in prefix class.
defm VSM3MSG1  : SM3_Base<"vsm3msg1">, T8PS;
defm VSM3MSG2  : SM3_Base<"vsm3msg2">, T8PD;

// VSM3RNDS2_Base does not apply VEX_4V itself, so it is added here.
defm VSM3RNDS2 : VSM3RNDS2_Base, VEX_4V, TAPD;
8361
8362// FIXME: Is there a better scheduler class for SM4 than WriteVecIMul?
let Predicates = [HasSM4] in {
  /// SM4_Base - two-source instruction at opcode 0xda over register class RC,
  /// in register (rr) and memory (rm) forms. The intrinsic is looked up by
  /// name as "int_x86_" + OpStr + VL; LD is the load fragment used for the
  /// memory operand MemOp.
  multiclass SM4_Base<string OpStr, RegisterClass RC, string VL,
                      PatFrag LD, X86MemOperand MemOp> {
    def rr : I<0xda, MRMSrcReg,
               (outs RC:$dst),
               (ins RC:$src1, RC:$src2),
               OpStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               [(set RC:$dst,
                 (!cast<Intrinsic>(!strconcat("int_x86_", OpStr, VL))
                    RC:$src1, RC:$src2))]>,
               Sched<[WriteVecIMul]>;

    def rm : I<0xda, MRMSrcMem,
               (outs RC:$dst),
               (ins RC:$src1, MemOp:$src2),
               OpStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               [(set RC:$dst,
                 (!cast<Intrinsic>(!strconcat("int_x86_", OpStr, VL))
                    RC:$src1, (LD addr:$src2)))]>,
               Sched<[WriteVecIMul]>;
  }
}
8380
// 128-bit and 256-bit (Y suffix, VEX_L) instantiations of each mnemonic.
defm VSM4KEY4   : SM4_Base<"vsm4key4", VR128, "128", loadv4i32, i128mem>,
                  T8XS, VEX_4V;
defm VSM4KEY4Y  : SM4_Base<"vsm4key4", VR256, "256", loadv8i32, i256mem>,
                  T8XS, VEX_L, VEX_4V;
defm VSM4RNDS4  : SM4_Base<"vsm4rnds4", VR128, "128", loadv4i32, i128mem>,
                  T8XD, VEX_4V;
defm VSM4RNDS4Y : SM4_Base<"vsm4rnds4", VR256, "256", loadv8i32, i256mem>,
                  T8XD, VEX_L, VEX_4V;
8385
/// avx_vnni_int16 - three-source instructions with the destination tied to
/// $src1. Emits 128-bit (rr/rm) and 256-bit (Yrr/Yrm) records; the memory
/// forms load $src3. The intrinsic is looked up by name as
/// "int_x86_avx2_" + OpcodeStr + "_128" / "_256". IsCommutable is applied to
/// the register forms only.
let Predicates = [HasAVXVNNIINT16], Constraints = "$src1 = $dst" in
multiclass avx_vnni_int16<bits<8> opc, string OpcodeStr, bit IsCommutable> {
  let isCommutable = IsCommutable in
  def rr  : I<opc, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src1, VR128:$src2, VR128:$src3),
              !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
              [(set VR128:$dst,
                (v4i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_128")
                        VR128:$src1, VR128:$src2, VR128:$src3)))]>,
              VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;

  // Memory form: scheduled with the folded-load write class and
  // ReadAfterFold reads for the two register sources, instead of the
  // register-form class.
  def rm  : I<opc, MRMSrcMem, (outs VR128:$dst),
              (ins VR128:$src1, VR128:$src2, i128mem:$src3),
              !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
              [(set VR128:$dst,
                (v4i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_128")
                        VR128:$src1, VR128:$src2, (loadv4i32 addr:$src3))))]>,
              VEX_4V, Sched<[SchedWriteVecIMul.XMM.Folded,
                             SchedWriteVecIMul.XMM.ReadAfterFold,
                             SchedWriteVecIMul.XMM.ReadAfterFold]>;

  let isCommutable = IsCommutable in
  def Yrr  : I<opc, MRMSrcReg, (outs VR256:$dst),
               (ins VR256:$src1, VR256:$src2, VR256:$src3),
               !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
               [(set VR256:$dst,
                 (v8i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_256")
                         VR256:$src1, VR256:$src2, VR256:$src3)))]>,
               VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;

  // 256-bit memory form; same scheduling treatment as rm.
  def Yrm  : I<opc, MRMSrcMem, (outs VR256:$dst),
               (ins VR256:$src1, VR256:$src2, i256mem:$src3),
               !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
               [(set VR256:$dst,
                 (v8i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_256")
                         VR256:$src1, VR256:$src2, (loadv8i32 addr:$src3))))]>,
               VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM.Folded,
                                     SchedWriteVecIMul.YMM.ReadAfterFold,
                                     SchedWriteVecIMul.YMM.ReadAfterFold]>;
}
8422
// Opcode 0xd2 is used for the plain mnemonics and 0xd3 for the "s"-suffixed
// ones. Only the UU variants pass IsCommutable = 1.
defm VPDPWSUD  : avx_vnni_int16<0xd2, "vpdpwsud",  0>, T8XS;
defm VPDPWSUDS : avx_vnni_int16<0xd3, "vpdpwsuds", 0>, T8XS;
defm VPDPWUSD  : avx_vnni_int16<0xd2, "vpdpwusd",  0>, T8PD;
defm VPDPWUSDS : avx_vnni_int16<0xd3, "vpdpwusds", 0>, T8PD;
defm VPDPWUUD  : avx_vnni_int16<0xd2, "vpdpwuud",  1>, T8PS;
defm VPDPWUUDS : avx_vnni_int16<0xd3, "vpdpwuuds", 1>, T8PS;
8429