//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file describes the X86 SSE instruction set, defining the instructions
// and the instruction properties needed for code generation, machine code
// emission, and analysis.
//
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// SSE 1 & 2 Instruction Classes
//===----------------------------------------------------------------------===//

/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, X86MemOperand x86memop,
                           Domain d, X86FoldableSchedWrite sched,
                           bit Is2Addr = 1> {
let isCodeGenOnly = 1 in {
  let isCommutable = 1 in {
    def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], d>,
       Sched<[sched]>;
  }
  def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
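// As an illustrative, hypothetical instantiation (the real arithmetic defs
// appear further down in this file): a defm such as
//   defm ADDSS : sse12_fp_scalar<0x58, "addss", fadd, FR32, f32mem,
//                                SSEPackedSingle, SchedWriteFAddSizes.PS.Scl>;
// expands to ADDSSrr (register-register) and ADDSSrm (register-memory, with
// the folded load scheduled via sched.Folded).
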
/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
                               SDPatternOperator OpNode, RegisterClass RC,
                               ValueType VT, string asm, Operand memopr,
                               ComplexPattern mem_cpat, Domain d,
                               X86FoldableSchedWrite sched, bit Is2Addr = 1> {
let hasSideEffects = 0 in {
  def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], d>,
       Sched<[sched]>;
  let mayLoad = 1 in
  def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (VT (OpNode RC:$src1, mem_cpat:$src2)))], d>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
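// Note: the *_Int forms are instantiated with full-vector register classes
// (e.g. VR128) rather than FR32/FR64, so the pattern can express
// scalar-intrinsic semantics, where the upper elements of $src1 pass through
// to the result unchanged.
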
/// sse12_fp_packed - SSE 1 & 2 packed instructions class
multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, ValueType vt,
                           X86MemOperand x86memop, PatFrag mem_frag,
                           Domain d, X86FoldableSchedWrite sched,
                           bit Is2Addr = 1> {
  let isCommutable = 1 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>,
       Sched<[sched]>;
  let mayLoad = 1 in
    def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
          d>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}

/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
                                      string OpcodeStr, X86MemOperand x86memop,
                                      X86FoldableSchedWrite sched,
                                      list<dag> pat_rr, list<dag> pat_rm,
                                      bit Is2Addr = 1> {
  let isCommutable = 1, hasSideEffects = 0 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       pat_rr, d>,
       Sched<[sched]>;
  let hasSideEffects = 0, mayLoad = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       pat_rm, d>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}


// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
// This is expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
  def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
                   [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2, NoAVX512]>;
}
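
// For example, after register allocation ExpandPostRAPseudos rewrites an
// FsFLD0SS assigned to %xmm0 into "xorps %xmm0, %xmm0" (or "vxorps" on AVX
// targets), a dependency-breaking zero idiom.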

//===----------------------------------------------------------------------===//
// AVX & SSE - Zero/One Vectors
//===----------------------------------------------------------------------===//

// Alias instruction that maps zero vector to pxor / xorp* for sse.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
// swizzled by ExecutionDomainFix to pxor.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
               [(set VR128:$dst, (v4f32 immAllZerosV))]>;
}

let Predicates = [NoAVX512] in
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;


// The same as done above, but for AVX. The 256-bit AVX1 ISA doesn't support
// PI (packed integer) instructions, and doesn't need them because on Sandy
// Bridge the register is set to zero at the rename stage without using any
// execution unit, so SET0PSY and SET0PDY can be used for vector int
// instructions without penalty.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                 [(set VR256:$dst, (v8i32 immAllZerosV))]>;
}
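
// Zero idioms such as (v)xorps are recognized at register rename on most
// modern x86 cores, so these SET0 pseudos typically consume no execution-unit
// uops (hence SchedRW = [WriteZero]).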

// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-ones value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
  let Predicates = [HasAVX1Only, OptForMinSize] in {
  def AVX1_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
  }
  let Predicates = [HasAVX2] in
  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move FP Scalar Instructions
//
// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
// register copies because it's a partial register update; register-to-register
// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
// that the insert be implementable in terms of a copy, and, as just mentioned,
// we don't use movss/movsd for copies.
//===----------------------------------------------------------------------===//

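// Register-register movss merges rather than copies: "movss %xmm1, %xmm0"
// replaces only bits 31:0 of %xmm0 and leaves bits 127:32 unchanged, so it
// reads its destination register and carries a partial-register dependency.
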
multiclass sse12_move_rr<SDNode OpNode, ValueType vt,
                         X86MemOperand x86memop, string base_opc,
                         string asm_opr, Domain d, string Name> {
  let isCommutable = 1 in
  def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src1, VR128:$src2),
              !strconcat(base_opc, asm_opr),
              [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], d>,
              Sched<[SchedWriteFShuffle.XMM]>;

  // For the disassembler
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                  (ins VR128:$src1, VR128:$src2),
                  !strconcat(base_opc, asm_opr), []>,
                  Sched<[SchedWriteFShuffle.XMM]>, FoldGenData<Name#rr>;
}

multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
                      X86MemOperand x86memop, string OpcodeStr,
                      Domain d, string Name, Predicate pred> {
  // AVX
  let Predicates = [UseAVX, OptForSize] in
  defm V#NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d,
                              "V"#Name>,
                              VEX_4V, VEX_LIG, VEX_WIG;

  def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(store RC:$src, addr:$dst)], d>,
                     VEX, VEX_LIG, Sched<[WriteFStore]>, VEX_WIG;
  // SSE1 & 2
  let Constraints = "$src1 = $dst" in {
    let Predicates = [pred, NoSSE41_Or_OptForSize] in
    defm NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
                              "\t{$src2, $dst|$dst, $src2}", d, Name>;
  }

  def NAME#mr   : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(store RC:$src, addr:$dst)], d>,
                     Sched<[WriteFStore]>;

  def : InstAlias<"v"#OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  (!cast<Instruction>("V"#NAME#"rr_REV")
                   VR128:$dst, VR128:$src1, VR128:$src2), 0>;
  def : InstAlias<OpcodeStr#".s\t{$src2, $dst|$dst, $src2}",
                  (!cast<Instruction>(NAME#"rr_REV")
                   VR128:$dst, VR128:$src2), 0>;
}

// Loads from memory, automatically zeroing the upper bits.
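// (A memory-form movss writes bits 31:0 from memory and zeroes bits 127:32 of
// the destination register; movsd behaves analogously for the low 64 bits.)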
multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
                         PatFrag mem_pat, PatFrag vzloadfrag, string OpcodeStr,
                         Domain d> {
  def V#NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
                     VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
  def NAME#rm   : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
                     Sched<[WriteFLoad]>;

  // _alt version uses FR32/FR64 register class.
  let isCodeGenOnly = 1 in {
  def V#NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                         [(set RC:$dst, (mem_pat addr:$src))], d>,
                         VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
  def NAME#rm_alt   : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                         [(set RC:$dst, (mem_pat addr:$src))], d>,
                         Sched<[WriteFLoad]>;
  }
}

defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
                        SSEPackedSingle, "MOVSS", UseSSE1>, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
                        SSEPackedDouble, "MOVSD", UseSSE2>, XD;

let canFoldAsLoad = 1, isReMaterializable = 1 in {
  defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss",
                             SSEPackedSingle>, XS;
  defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, X86vzload64, "movsd",
                             SSEPackedDouble>, XD;
}

// Patterns
let Predicates = [UseAVX] in {
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (VMOVSSrm addr:$src)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (VMOVSDrm addr:$src)>;

  // Represent the same patterns as above, but in the form they appear for
  // 256-bit types
  def : Pat<(v8f32 (X86vzload32 addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
  def : Pat<(v4f64 (X86vzload64 addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
}

let Predicates = [UseAVX, OptForSize] in {
  // Move scalar to XMM zero-extended: zero a VR128, then MOVSS into the
  // lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VMOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VMOVSSrr (v4i32 (V_SET0)), VR128:$src)>;

  // Move low f32 and clear high bits.
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4f32 (VMOVSSrr (v4f32 (V_SET0)),
              (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))), sub_xmm)>;
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4i32 (VMOVSSrr (v4i32 (V_SET0)),
              (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
}
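
// A v4f32 X86vzmovl thus becomes a V_SET0 (an xorps zero idiom after
// expansion) plus a movss merging in the low element. These patterns are
// gated on OptForSize / NoSSE41_Or_OptForSize because, when optimizing for
// speed on SSE4.1-capable targets, a blend is expected to be selected instead.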

let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
// Move scalar to XMM zero-extended: zero a VR128, then MOVSS into the
// lower bits.
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
          (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
          (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
}

let Predicates = [UseSSE2] in
def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
          (MOVSDrm addr:$src)>;

let Predicates = [UseSSE1] in
def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
          (MOVSSrm addr:$src)>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
                            X86MemOperand x86memop, PatFrag ld_frag,
                            string asm, Domain d,
                            X86SchedWriteMoveLS sched> {
let hasSideEffects = 0, isMoveReg = 1 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
           Sched<[sched.RR]>;
let canFoldAsLoad = 1, isReMaterializable = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                   [(set RC:$dst, (ld_frag addr:$src))], d>,
           Sched<[sched.RM]>;
}

let Predicates = [HasAVX, NoVLX] in {
defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                                PS, VEX, VEX_WIG;
defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                                PD, VEX, VEX_WIG;
defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                                PS, VEX, VEX_WIG;
defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                                PD, VEX, VEX_WIG;

defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps",
                                 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
                                 PS, VEX, VEX_L, VEX_WIG;
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd",
                                 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
                                 PD, VEX, VEX_L, VEX_WIG;
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups",
                                 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
                                 PS, VEX, VEX_L, VEX_WIG;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
                                 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
                                 PD, VEX, VEX_L, VEX_WIG;
}

let Predicates = [UseSSE1] in {
defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
                               SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                               PS;
defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
                               SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                               PS;
}
let Predicates = [UseSSE2] in {
defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
                               SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                               PD;
defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
                               SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                               PD;
}

let Predicates = [HasAVX, NoVLX] in {
let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)]>,
                   VEX, VEX_WIG;
def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)]>,
                   VEX, VEX_WIG;
def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)]>,
                   VEX, VEX_WIG;
def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)]>,
                   VEX, VEX_WIG;
} // SchedRW

let SchedRW = [SchedWriteFMoveLS.YMM.MR] in {
def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v8f32 VR256:$src), addr:$dst)]>,
                   VEX, VEX_L, VEX_WIG;
def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f64 VR256:$src), addr:$dst)]>,
                   VEX, VEX_L, VEX_WIG;
def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v8f32 VR256:$src), addr:$dst)]>,
                   VEX, VEX_L, VEX_WIG;
def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v4f64 VR256:$src), addr:$dst)]>,
                   VEX, VEX_L, VEX_WIG;
} // SchedRW
} // Predicate

// For the disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    isMoveReg = 1 in {
let SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
  def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
                          (ins VR128:$src),
                          "movaps\t{$src, $dst|$dst, $src}", []>,
                          VEX, VEX_WIG, FoldGenData<"VMOVAPSrr">;
  def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movapd\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVAPDrr">;
  def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movups\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVUPSrr">;
  def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movupd\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVUPDrr">;
} // SchedRW

let SchedRW = [SchedWriteFMoveLS.YMM.RR] in {
  def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movaps\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPSYrr">;
  def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movapd\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPDYrr">;
  def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movups\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPSYrr">;
  def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movupd\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPDYrr">;
} // SchedRW
} // Predicate

// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                (VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                (VMOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                (VMOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                (VMOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                (VMOVAPSYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                (VMOVAPDYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                (VMOVUPSYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                (VMOVUPDYrr_REV VR256:$dst, VR256:$src), 0>;

let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)]>;
def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)]>;
} // SchedRW

// For the disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movaps\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVAPSrr">;
  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movapd\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVAPDrr">;
  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movups\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVUPSrr">;
  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movupd\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVUPDrr">;
}

// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"movaps.s\t{$src, $dst|$dst, $src}",
                (MOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movapd.s\t{$src, $dst|$dst, $src}",
                (MOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movups.s\t{$src, $dst|$dst, $src}",
                (MOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movupd.s\t{$src, $dst|$dst, $src}",
                (MOVUPDrr_REV VR128:$dst, VR128:$src), 0>;

let Predicates = [HasAVX, NoVLX] in {
  // 256-bit loads/stores need to use floating-point load/store instructions
  // in case we don't have AVX2. Execution domain fixing will convert them to
  // integer if AVX2 is available and changing the domain is beneficial.
  def : Pat<(alignedloadv4i64 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv8i32 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv16i16 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv32i8 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(loadv4i64 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv8i32 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv16i16 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv32i8 addr:$src),
            (VMOVUPSYrm addr:$src)>;

  def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v16i16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v32i8 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
}
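
// For example, an aligned v8i32 load selects VMOVAPSYrm here; when AVX2 is
// available, the execution-domain fixing pass may rewrite it to the integer
// form (VMOVDQAYrm) if the surrounding uses are integer-domain.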

// Use movaps / movups for SSE integer load / store (one byte shorter).
// The instructions selected below are then converted to MOVDQA/MOVDQU
// during the SSE domain pass.
let Predicates = [UseSSE1] in {
  def : Pat<(alignedloadv2i64 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv4i32 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv8i16 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv16i8 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv4i32 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv8i16 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv16i8 addr:$src),
            (MOVUPSrm addr:$src)>;

  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
}
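
// The size win: the movaps load is encoded as 0F 28 /r, whereas movdqa needs
// an operand-size prefix (66 0F 6F /r), making the FP form one byte shorter.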

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low packed FP Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_mov_hilo_packed_base<bits<8> opc, SDNode pdnode,
                                      string base_opc, string asm_opr> {
  // No patterns, as these need to be special-cased between high and low.
  let hasSideEffects = 0, mayLoad = 1 in
  def PSrm : PI<opc, MRMSrcMem,
                (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                !strconcat(base_opc, "s", asm_opr),
                [], SSEPackedSingle>, PS,
                Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;

  def PDrm : PI<opc, MRMSrcMem,
         (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
         !strconcat(base_opc, "d", asm_opr),
     [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
                              (scalar_to_vector (loadf64 addr:$src2)))))],
              SSEPackedDouble>, PD,
     Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
}

multiclass sse12_mov_hilo_packed<bits<8> opc, SDPatternOperator pdnode,
                                 string base_opc> {
  let Predicates = [UseAVX] in
    defm V#NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
                                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
                                    VEX_4V, VEX_WIG;

  let Constraints = "$src1 = $dst" in
    defm NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
                                    "\t{$src2, $dst|$dst, $src2}">;
}

defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;

let SchedRW = [WriteFStore] in {
let Predicates = [UseAVX] in {
let mayStore = 1, hasSideEffects = 0 in
def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movlps\t{$src, $dst|$dst, $src}",
                     []>,
                     VEX, VEX_WIG;
def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movlpd\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt (v2f64 VR128:$src),
                                   (iPTR 0))), addr:$dst)]>,
                     VEX, VEX_WIG;
} // UseAVX
let mayStore = 1, hasSideEffects = 0 in
def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlps\t{$src, $dst|$dst, $src}",
                   []>;
def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt (v2f64 VR128:$src),
                                 (iPTR 0))), addr:$dst)]>;
} // SchedRW

let Predicates = [UseSSE1] in {
  // This pattern helps select MOVLPS on SSE1-only targets. With SSE2 we'll
  // end up with a movsd or blend instead of a shufp.
  // No need for an aligned load here, since we're only loading 64 bits.
  // (The i8 -28 shuffle immediate is 0xE4: elements 0-1 come from the load
  // and elements 2-3 from $src1, i.e. MOVLPS semantics.)
  def : Pat<(X86Shufp (v4f32 (nonvolatile_load addr:$src2)), VR128:$src1,
                      (i8 -28)),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)), VR128:$src1, (i8 -28)),
            (MOVLPSrm VR128:$src1, addr:$src2)>;

  def : Pat<(v4f32 (X86vzload64 addr:$src)),
            (MOVLPSrm (v4f32 (V_SET0)), addr:$src)>;
  def : Pat<(X86vextractstore64 (v4f32 VR128:$src), addr:$dst),
            (MOVLPSmr addr:$dst, VR128:$src)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Hi packed FP Instructions
//===----------------------------------------------------------------------===//

defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">;

let SchedRW = [WriteFStore] in {
// v2f64 extract element 1 is always custom lowered to unpack high to low
// and extract element 0, so the non-store version isn't too horrible.
let Predicates = [UseAVX] in {
let mayStore = 1, hasSideEffects = 0 in
def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhps\t{$src, $dst|$dst, $src}",
                   []>, VEX, VEX_WIG;
def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt
                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                 (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
} // UseAVX
let mayStore = 1, hasSideEffects = 0 in
def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhps\t{$src, $dst|$dst, $src}",
                   []>;
def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt
                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                 (iPTR 0))), addr:$dst)]>;
} // SchedRW

let Predicates = [UseAVX] in {
  // Also handle an i64 load, because that may get selected as a faster way to
  // load the data.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                      (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;

  def : Pat<(store (f64 (extractelt
                          (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
                          (iPTR 0))), addr:$dst),
            (VMOVHPDmr addr:$dst, VR128:$src)>;

  // MOVLPD patterns
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;
}

let Predicates = [UseSSE1] in {
  // This pattern helps select MOVHPS on SSE1-only targets. With SSE2 we'll
  // end up with a movsd or blend instead of a shufp.
  // No need for an aligned load here, since we're only loading 64 bits.
  def : Pat<(X86Movlhps VR128:$src1, (v4f32 (nonvolatile_load addr:$src2))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Movlhps VR128:$src1, (v4f32 (X86vzload64 addr:$src2))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;

  def : Pat<(X86vextractstore64 (v4f32 (X86Movhlps VR128:$src, VR128:$src)),
                                addr:$dst),
            (MOVHPSmr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE2] in {
  // MOVHPD patterns

  // Also handle an i64 load, because that may get selected as a faster way to
  // load the data.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                      (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;

  def : Pat<(store (f64 (extractelt
                          (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
                          (iPTR 0))), addr:$dst),
            (MOVHPDmr addr:$dst, VR128:$src)>;

  // MOVLPD patterns
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
}

let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in {
  // Use MOVLPD to load into the low bits from a full vector unless we can use
  // BLENDPD.
  def : Pat<(X86Movsd VR128:$src1, (v2f64 (nonvolatile_load addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
//===----------------------------------------------------------------------===//

let Predicates = [UseAVX] in {
  def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
                      VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG;
  let isCommutable = 1 in
  def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
                      VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG,
                      NotMemoryFoldable;
}
let Constraints = "$src1 = $dst" in {
  def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
                      Sched<[SchedWriteFShuffle.XMM]>;
  let isCommutable = 1 in
  def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
                                       (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
                      Sched<[SchedWriteFShuffle.XMM]>, NotMemoryFoldable;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Conversion Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                     SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
                     string asm, string mem, X86FoldableSchedWrite sched,
                     SchedRead Int2Fpu = ReadDefault> {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
              !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
              [(set DstRC:$dst, (OpNode SrcRC:$src))]>,
              Sched<[sched, Int2Fpu]>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
              mem#"\t{$src, $dst|$dst, $src}",
              [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
              Sched<[sched.Folded]>;
}

multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
                       ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
                       string asm, Domain d, X86FoldableSchedWrite sched> {
let hasSideEffects = 0 in {
  def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
             [(set RC:$dst, (DstTy (sint_to_fp (SrcTy RC:$src))))], d>,
             Sched<[sched]>;
  let mayLoad = 1 in
  def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
             [(set RC:$dst, (DstTy (sint_to_fp
                                    (SrcTy (ld_frag addr:$src)))))], d>,
             Sched<[sched.Folded]>;
}
}

multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          X86MemOperand x86memop, string asm, string mem,
                          X86FoldableSchedWrite sched> {
let hasSideEffects = 0, Predicates = [UseAVX] in {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
              Sched<[sched, ReadDefault, ReadInt2Fpu]>;
  let mayLoad = 1 in
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
              (ins DstRC:$src1, x86memop:$src),
              asm#"{"#mem#"}\t{$src, $src1, $dst|$dst, $src1, $src}", []>,
           Sched<[sched.Folded, sched.ReadAfterFold]>;
} // hasSideEffects = 0
}

let isCodeGenOnly = 1, Predicates = [UseAVX] in {
defm VCVTTSS2SI   : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
                                "cvttss2si", "cvttss2si",
                                WriteCvtSS2I>,
                                XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
                                "cvttss2si", "cvttss2si",
                                WriteCvtSS2I>,
                                XS, VEX, VEX_W, VEX_LIG;
defm VCVTTSD2SI   : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
                                "cvttsd2si", "cvttsd2si",
                                WriteCvtSD2I>,
                                XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
                                "cvttsd2si", "cvttsd2si",
                                WriteCvtSD2I>,
                                XD, VEX, VEX_W, VEX_LIG;
}

// The assembler can recognize rr 64-bit instructions by seeing an rxx
// register, but the same isn't true when only using memory operands, so we
// provide other assembly "l" and "q" forms to address this explicitly where
// appropriate.
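// For example, in AT&T syntax "cvtsi2ssl (%rax), %xmm0" converts a 32-bit
// memory operand while "cvtsi2ssq (%rax), %xmm0" converts a 64-bit one; a
// bare "cvtsi2ss" would leave the memory operand's size ambiguous.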
875let isCodeGenOnly = 1 in {
876defm VCVTSI2SS   : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss", "l",
877                                  WriteCvtI2SS>, XS, VEX_4V, VEX_LIG;
878defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss", "q",
879                                  WriteCvtI2SS>, XS, VEX_4V, VEX_W, VEX_LIG;
880defm VCVTSI2SD   : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd", "l",
881                                  WriteCvtI2SD>, XD, VEX_4V, VEX_LIG;
882defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd", "q",
883                                  WriteCvtI2SD>, XD, VEX_4V, VEX_W, VEX_LIG;
884} // isCodeGenOnly = 1
885
886let Predicates = [UseAVX] in {
887  def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
888            (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
889  def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
890            (VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
891  def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
892            (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
893  def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
894            (VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
895
896  def : Pat<(f32 (sint_to_fp GR32:$src)),
897            (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
898  def : Pat<(f32 (sint_to_fp GR64:$src)),
899            (VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
900  def : Pat<(f64 (sint_to_fp GR32:$src)),
901            (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
902  def : Pat<(f64 (sint_to_fp GR64:$src)),
903            (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
904}
905
906let isCodeGenOnly = 1 in {
907defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
908                      "cvttss2si", "cvttss2si",
909                      WriteCvtSS2I>, XS;
910defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
911                      "cvttss2si", "cvttss2si",
912                      WriteCvtSS2I>, XS, REX_W;
913defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
914                      "cvttsd2si", "cvttsd2si",
915                      WriteCvtSD2I>, XD;
916defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
917                      "cvttsd2si", "cvttsd2si",
918                      WriteCvtSD2I>, XD, REX_W;
919defm CVTSI2SS  : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
920                      "cvtsi2ss", "cvtsi2ss{l}",
921                      WriteCvtI2SS, ReadInt2Fpu>, XS;
922defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
923                      "cvtsi2ss", "cvtsi2ss{q}",
924                      WriteCvtI2SS, ReadInt2Fpu>, XS, REX_W;
925defm CVTSI2SD  : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
926                      "cvtsi2sd", "cvtsi2sd{l}",
927                      WriteCvtI2SD, ReadInt2Fpu>, XD;
928defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
929                      "cvtsi2sd", "cvtsi2sd{q}",
930                      WriteCvtI2SD, ReadInt2Fpu>, XD, REX_W;
931} // isCodeGenOnly = 1
932
933// Conversion Instructions Intrinsics - Match intrinsics which expect MM
934// and/or XMM operand(s).
935
936multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
937                          ValueType DstVT, ValueType SrcVT, SDNode OpNode,
938                          Operand memop, ComplexPattern mem_cpat, string asm,
939                          X86FoldableSchedWrite sched> {
940  def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
941                  !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
942                  [(set DstRC:$dst, (DstVT (OpNode (SrcVT SrcRC:$src))))]>,
943               Sched<[sched]>;
944  def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
945                  !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
946                  [(set DstRC:$dst, (DstVT (OpNode (SrcVT mem_cpat:$src))))]>,
947               Sched<[sched.Folded]>;
948}
949
950multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
951                    RegisterClass DstRC, X86MemOperand x86memop,
952                    string asm, string mem, X86FoldableSchedWrite sched,
953                    bit Is2Addr = 1> {
954let hasSideEffects = 0 in {
955  def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
956                  !if(Is2Addr,
957                      !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
958                      !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
959                  []>, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
960  let mayLoad = 1 in
961  def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst),
962                  (ins DstRC:$src1, x86memop:$src2),
963                  !if(Is2Addr,
964                      asm#"{"#mem#"}\t{$src2, $dst|$dst, $src2}",
965                      asm#"{"#mem#"}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
966                  []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
967}
968}
969
970let Predicates = [UseAVX] in {
971defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64,
972                  X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
973                  WriteCvtSD2I>, XD, VEX, VEX_LIG;
974defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64,
975                    X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
976                    WriteCvtSD2I>, XD, VEX, VEX_W, VEX_LIG;
977}
978defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, X86cvts2si,
979                 sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD;
980defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si,
981                   sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD, REX_W;
982
983
984let Predicates = [UseAVX] in {
985defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
986          i32mem, "cvtsi2ss", "l", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_LIG;
987defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
988          i64mem, "cvtsi2ss", "q", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_LIG, VEX_W;
989defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
990          i32mem, "cvtsi2sd", "l", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_LIG;
991defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
992          i64mem, "cvtsi2sd", "q", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_LIG, VEX_W;
993}
994let Constraints = "$src1 = $dst" in {
995  defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
996                        i32mem, "cvtsi2ss", "l", WriteCvtI2SS>, XS;
997  defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
998                        i64mem, "cvtsi2ss", "q", WriteCvtI2SS>, XS, REX_W;
999  defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1000                        i32mem, "cvtsi2sd", "l", WriteCvtI2SD>, XD;
1001  defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1002                        i64mem, "cvtsi2sd", "q", WriteCvtI2SD>, XD, REX_W;
1003}
1004
1005def : InstAlias<"vcvtsi2ss{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1006               (VCVTSI2SSrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
1007def : InstAlias<"vcvtsi2ss{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1008               (VCVTSI642SSrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
1009def : InstAlias<"vcvtsi2sd{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1010               (VCVTSI2SDrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
1011def : InstAlias<"vcvtsi2sd{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1012               (VCVTSI642SDrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
1013
1014def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
1015              (VCVTSI2SSrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
1016def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
1017              (VCVTSI2SDrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
1018
1019def : InstAlias<"cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
1020                (CVTSI2SSrr_Int VR128:$dst, GR32:$src), 0, "att">;
1021def : InstAlias<"cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
1022                (CVTSI642SSrr_Int VR128:$dst, GR64:$src), 0, "att">;
1023def : InstAlias<"cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
1024                (CVTSI2SDrr_Int VR128:$dst, GR32:$src), 0, "att">;
1025def : InstAlias<"cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
1026                (CVTSI642SDrr_Int VR128:$dst, GR64:$src), 0, "att">;
1027
1028def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
1029                (CVTSI2SSrm_Int VR128:$dst, i32mem:$src), 0, "att">;
1030def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
1031                (CVTSI2SDrm_Int VR128:$dst, i32mem:$src), 0, "att">;
1032
1033/// SSE 1 Only
1034
1035// Aliases for intrinsics
1036let Predicates = [UseAVX] in {
1037defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
1038                                ssmem, sse_load_f32, "cvttss2si",
1039                                WriteCvtSS2I>, XS, VEX, VEX_LIG;
1040defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
1041                               X86cvtts2Int, ssmem, sse_load_f32,
1042                               "cvttss2si", WriteCvtSS2I>,
1043                               XS, VEX, VEX_LIG, VEX_W;
1044defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
1045                                sdmem, sse_load_f64, "cvttsd2si",
1046                                WriteCvtSS2I>, XD, VEX, VEX_LIG;
1047defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
1048                              X86cvtts2Int, sdmem, sse_load_f64,
1049                              "cvttsd2si", WriteCvtSS2I>,
1050                              XD, VEX, VEX_LIG, VEX_W;
1051}
1052defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
1053                                    ssmem, sse_load_f32, "cvttss2si",
1054                                    WriteCvtSS2I>, XS;
1055defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
1056                                   X86cvtts2Int, ssmem, sse_load_f32,
1057                                   "cvttss2si", WriteCvtSS2I>, XS, REX_W;
1058defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
1059                                    sdmem, sse_load_f64, "cvttsd2si",
1060                                    WriteCvtSD2I>, XD;
1061defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
1062                                  X86cvtts2Int, sdmem, sse_load_f64,
1063                                  "cvttsd2si", WriteCvtSD2I>, XD, REX_W;

def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;

def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;
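// These AT&T aliases let explicitly suffixed spellings, e.g.
// "cvttss2sil %xmm0, %eax" or "cvttsd2siq (%rax), %rcx", select the
// XMM-operand (_Int) forms; the {l}/{q} suffix must match the width of
// the destination GPR.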

let Predicates = [UseAVX] in {
defm VCVTSS2SI   : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  WriteCvtSS2I>, XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  WriteCvtSS2I>, XS, VEX, VEX_W, VEX_LIG;
}
defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
                               ssmem, sse_load_f32, "cvtss2si",
                               WriteCvtSS2I>, XS;
defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
                                 ssmem, sse_load_f32, "cvtss2si",
                                 WriteCvtSS2I>, XS, REX_W;

defm VCVTDQ2PS   : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load,
                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                               SSEPackedSingle, WriteCvtI2PS>,
                               PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
defm VCVTDQ2PSY  : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load,
                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                               SSEPackedSingle, WriteCvtI2PSY>,
                               PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;

defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop,
                            "cvtdq2ps\t{$src, $dst|$dst, $src}",
                            SSEPackedSingle, WriteCvtI2PS>,
                            PS, Requires<[UseSSE2]>;

// AVX aliases
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;

// SSE aliases
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;

/// SSE 2 Only

// Convert scalar double to scalar single
let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX] in {
def VCVTSD2SSrr  : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
                        (ins FR32:$src1, FR64:$src2),
                        "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                        VEX_4V, VEX_LIG, VEX_WIG,
                        Sched<[WriteCvtSD2SS]>;
let mayLoad = 1 in
def VCVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst),
                     (ins FR32:$src1, f64mem:$src2),
                     "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                     XD, VEX_4V, VEX_LIG, VEX_WIG,
                     Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
}

def : Pat<(f32 (fpround FR64:$src)),
            (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
          Requires<[UseAVX]>;

let isCodeGenOnly = 1 in {
def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (fpround FR64:$src))]>,
                      Sched<[WriteCvtSD2SS]>;
def CVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
                    "cvtsd2ss\t{$src, $dst|$dst, $src}",
                    [(set FR32:$dst, (fpround (loadf64 addr:$src)))]>,
                    XD, Requires<[UseSSE2, OptForSize]>,
                    Sched<[WriteCvtSD2SS.Folded]>;
}

def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
                       XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
                       Sched<[WriteCvtSD2SS]>;
def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (v4f32 (X86frounds VR128:$src1, sse_load_f64:$src2)))]>,
                       XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
                       Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
let Constraints = "$src1 = $dst" in {
def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
                       XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;
def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (v4f32 (X86frounds VR128:$src1, sse_load_f64:$src2)))]>,
                       XD, Requires<[UseSSE2]>,
                       Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
}

// Convert scalar single to scalar double
// SSE2 instructions with XS prefix
let isCodeGenOnly = 1, hasSideEffects = 0 in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
                    (ins FR64:$src1, FR32:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                    XS, VEX_4V, VEX_LIG, VEX_WIG,
                    Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>;
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
                    (ins FR64:$src1, f32mem:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                    XS, VEX_4V, VEX_LIG, VEX_WIG,
                    Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>,
                    Requires<[UseAVX, OptForSize]>;
} // isCodeGenOnly = 1, hasSideEffects = 0

def : Pat<(f64 (fpextend FR32:$src)),
    (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
def : Pat<(fpextend (loadf32 addr:$src)),
    (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;

let isCodeGenOnly = 1 in {
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (fpextend FR32:$src))]>,
                   XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (fpextend (loadf32 addr:$src)))]>,
                   XS, Requires<[UseSSE2, OptForSize]>,
                   Sched<[WriteCvtSS2SD.Folded]>;
} // isCodeGenOnly = 1

let hasSideEffects = 0 in {
def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    []>, XS, VEX_4V, VEX_LIG, VEX_WIG,
                    Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;
let mayLoad = 1 in
def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    []>, XS, VEX_4V, VEX_LIG, VEX_WIG, Requires<[HasAVX]>,
                    Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                    []>, XS, Requires<[UseSSE2]>,
                    Sched<[WriteCvtSS2SD]>;
let mayLoad = 1 in
def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                    []>, XS, Requires<[UseSSE2]>,
                    Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
}
} // hasSideEffects = 0

// Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and
// (v)cvtss2sd intrinsic sequences from clang, which would otherwise
// produce unnecessary vmovs{s,d} instructions.
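// For example, a sequence like _mm_cvtsd_ss(a, b) is emitted as a
// cvtsd2ss of b's low element followed by a movss into a; the patterns
// below match the whole (X86Movss dst, (scalar_to_vector (fpround ...)))
// subtree so that a single (V)CVTSD2SSrr_Int is selected instead.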
let Predicates = [UseAVX] in {
def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector
                     (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
          (VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector
                     (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
          (VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
          (VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))),
          (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
          (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))),
          (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
          (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))),
          (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
          (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))),
          (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseAVX]

let Predicates = [UseSSE2] in {
def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector
                     (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
          (CVTSD2SSrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector
                     (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
          (CVTSS2SDrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
          (CVTSI642SDrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))),
          (CVTSI642SDrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
          (CVTSI2SDrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))),
          (CVTSI2SDrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseSSE2]

let Predicates = [UseSSE1] in {
def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
          (CVTSI642SSrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))),
          (CVTSI642SSrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
          (CVTSI2SSrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))),
          (CVTSI2SSrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseSSE1]

let Predicates = [HasAVX, NoVLX] in {
// Convert packed single/double fp to doubleword
def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
                       VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>,
                       VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
}
def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
                     Sched<[WriteCvtPS2I]>;
def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>,
                     Sched<[WriteCvtPS2ILd]>;


// Convert Packed Double FP to Packed DW Integers
let Predicates = [HasAVX, NoVLX] in {
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
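// For example, "vcvtpd2dq %ymm0, %xmm0" is unambiguous, but with a memory
// source "vcvtpd2dq (%rax), %xmm0" could be either the 128-bit or the
// 256-bit form, so the memory forms carry explicit {x}/{y} suffixes.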
def VCVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
                       VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;

// XMM only
def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                      "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
                      Sched<[WriteCvtPD2ILd]>, VEX_WIG;

// YMM only
def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
                       VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
                       VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
}

def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
                (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;

def CVTPD2DQrm  : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>,
                      Sched<[WriteCvtPD2ILd]>;
def CVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
                      Sched<[WriteCvtPD2I]>;

// Convert with truncation packed single/double fp to doubleword
// SSE2 packed instructions with XS prefix
let Predicates = [HasAVX, NoVLX] in {
def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "cvttps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86cvttp2si (v4f32 VR128:$src))))]>,
                         VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvttps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86cvttp2si (loadv4f32 addr:$src))))]>,
                         VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst,
                            (v8i32 (X86cvttp2si (v8f32 VR256:$src))))]>,
                          VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst,
                            (v8i32 (X86cvttp2si (loadv8f32 addr:$src))))]>,
                          VEX, VEX_L,
                          Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
}

def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvttp2si (v4f32 VR128:$src))))]>,
                       Sched<[WriteCvtPS2I]>;
def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvttp2si (memopv4f32 addr:$src))))]>,
                       Sched<[WriteCvtPS2ILd]>;

// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
let Predicates = [HasAVX, NoVLX] in {
// XMM only
def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvttpd2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (X86cvttp2si (v2f64 VR128:$src))))]>,
                        VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        "cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (X86cvttp2si (loadv2f64 addr:$src))))]>,
                        VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG;

// YMM only
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                         "cvttpd2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86cvttp2si (v4f64 VR256:$src))))]>,
                         VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86cvttp2si (loadv4f64 addr:$src))))]>,
                         VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
} // Predicates = [HasAVX, NoVLX]

def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
            (VCVTTPD2DQYrr VR256:$src)>;
  def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))),
            (VCVTTPD2DQYrm addr:$src)>;
}

def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86cvttp2si (v2f64 VR128:$src))))]>,
                      Sched<[WriteCvtPD2I]>;
def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86cvttp2si (memopv2f64 addr:$src))))]>,
                      Sched<[WriteCvtPD2ILd]>;

// Convert packed single to packed double
// SSE2 instructions without OpSize prefix
let Predicates = [HasAVX, NoVLX] in {
def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))]>,
                    PS, VEX, Sched<[WriteCvtPS2PD]>, VEX_WIG;
def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
                    PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, VEX_WIG;
def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst, (v4f64 (fpextend (v4f32 VR128:$src))))]>,
                     PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, VEX_WIG;
def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>,
                     PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, VEX_WIG;
}

let Predicates = [UseSSE2] in {
def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "cvtps2pd\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))]>,
                   PS, Sched<[WriteCvtPS2PD]>;
def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                   "cvtps2pd\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
                   PS, Sched<[WriteCvtPS2PD.Folded]>;
}

// Convert Packed DW Integers to Packed Double FP
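// Note that the 128-bit rm forms load only 64 bits (two i32 elements) and
// widen them to two f64 results, which is why they take an i64mem operand
// wrapped in scalar_to_vector below rather than a full 128-bit load.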
let Predicates = [HasAVX, NoVLX] in {
let hasSideEffects = 0, mayLoad = 1 in
def VCVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v2f64 (X86VSintToFP
                                  (bc_v4i32
                                   (v2i64 (scalar_to_vector
                                           (loadi64 addr:$src)))))))]>,
                        VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG;
def VCVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>,
                        VEX, Sched<[WriteCvtI2PD]>, VEX_WIG;
def VCVTDQ2PDYrm  : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                         [(set VR256:$dst,
                           (v4f64 (sint_to_fp (loadv4i32 addr:$src))))]>,
                         VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>,
                         VEX_WIG;
def VCVTDQ2PDYrr  : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                         [(set VR256:$dst,
                           (v4f64 (sint_to_fp (v4i32 VR128:$src))))]>,
                         VEX, VEX_L, Sched<[WriteCvtI2PDY]>, VEX_WIG;
}

let hasSideEffects = 0, mayLoad = 1 in
def CVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v2f64 (X86VSintToFP
                                 (bc_v4i32
                                  (v2i64 (scalar_to_vector
                                          (loadi64 addr:$src)))))))]>,
                       Sched<[WriteCvtI2PDLd]>;
def CVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>,
                       Sched<[WriteCvtI2PD]>;

// AVX register conversion intrinsics
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
            (VCVTDQ2PDrm addr:$src)>;
} // Predicates = [HasAVX, NoVLX]

// SSE2 register conversion intrinsics
let Predicates = [UseSSE2] in {
  def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
            (CVTDQ2PDrm addr:$src)>;
} // Predicates = [UseSSE2]

// Convert packed double to packed single
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
let Predicates = [HasAVX, NoVLX] in {
// XMM only
def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtpd2ps\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))]>,
                       VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG;
def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (X86vfpround (loadv2f64 addr:$src)))]>,
                       VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG;

def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                        "cvtpd2ps\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (X86vfpround VR256:$src))]>,
                        VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (X86vfpround (loadv4f64 addr:$src)))]>,
                        VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG;
} // Predicates = [HasAVX, NoVLX]

def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSrr VR128:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0, "att">;

def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))]>,
                     Sched<[WriteCvtPD2PS]>;
def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (X86vfpround (memopv2f64 addr:$src)))]>,
                     Sched<[WriteCvtPD2PS.Folded]>;

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4f32 (fpround (v4f64 VR256:$src))),
            (VCVTPD2PSYrr VR256:$src)>;
  def : Pat<(v4f32 (fpround (loadv4f64 addr:$src))),
            (VCVTPD2PSYrm addr:$src)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Compare Instructions
//===----------------------------------------------------------------------===//

// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
                            SDNode OpNode, ValueType VT,
                            PatFrag ld_frag, string asm,
                            X86FoldableSchedWrite sched> {
  let isCommutable = 1 in
  def rr : SIi8<0xC2, MRMSrcReg,
                (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
                [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))]>,
                Sched<[sched]>;
  def rm : SIi8<0xC2, MRMSrcMem,
                (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
                [(set RC:$dst, (OpNode (VT RC:$src1),
                                         (ld_frag addr:$src2), imm:$cc))]>,
                Sched<[sched.Folded, sched.ReadAfterFold]>;
}
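
// The $cc immediate selects the comparison predicate. For the SSE forms
// it is 0-7: eq (0), lt (1), le (2), unord (3), neq (4), nlt (5),
// nle (6) and ord (7); e.g. "cmpss $3, %xmm1, %xmm0" performs an
// unordered compare. AVX extends the encoding to 32 predicates.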

let isCodeGenOnly = 1 in {
  let ExeDomain = SSEPackedSingle in
  defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmps, f32, loadf32,
                   "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                   SchedWriteFCmpSizes.PS.Scl>, XS, VEX_4V, VEX_LIG, VEX_WIG;
  let ExeDomain = SSEPackedDouble in
  defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmps, f64, loadf64,
                   "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                   SchedWriteFCmpSizes.PD.Scl>,
                   XD, VEX_4V, VEX_LIG, VEX_WIG;

  let Constraints = "$src1 = $dst" in {
    let ExeDomain = SSEPackedSingle in
    defm CMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmps, f32, loadf32,
                    "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                    SchedWriteFCmpSizes.PS.Scl>, XS;
    let ExeDomain = SSEPackedDouble in
    defm CMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmps, f64, loadf64,
                    "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                    SchedWriteFCmpSizes.PD.Scl>, XD;
  }
}

multiclass sse12_cmp_scalar_int<Operand memop,
                         Intrinsic Int, string asm, X86FoldableSchedWrite sched,
                         ComplexPattern mem_cpat> {
  def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src, u8imm:$cc), asm,
                        [(set VR128:$dst, (Int VR128:$src1,
                                               VR128:$src, imm:$cc))]>,
           Sched<[sched]>;
let mayLoad = 1 in
  def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
                      (ins VR128:$src1, memop:$src, u8imm:$cc), asm,
                        [(set VR128:$dst, (Int VR128:$src1,
                                               mem_cpat:$src, imm:$cc))]>,
           Sched<[sched.Folded, sched.ReadAfterFold]>;
}

// Aliases to match intrinsics which expect XMM operand(s).
let ExeDomain = SSEPackedSingle in
defm VCMPSS  : sse12_cmp_scalar_int<ssmem, int_x86_sse_cmp_ss,
                     "cmpss\t{$cc, $src, $src1, $dst|$dst, $src1, $src, $cc}",
                     SchedWriteFCmpSizes.PS.Scl, sse_load_f32>,
                     XS, VEX_4V, VEX_LIG, VEX_WIG;
let ExeDomain = SSEPackedDouble in
defm VCMPSD  : sse12_cmp_scalar_int<sdmem, int_x86_sse2_cmp_sd,
                     "cmpsd\t{$cc, $src, $src1, $dst|$dst, $src1, $src, $cc}",
                     SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
                     XD, VEX_4V, VEX_LIG, VEX_WIG;
let Constraints = "$src1 = $dst" in {
  let ExeDomain = SSEPackedSingle in
  defm CMPSS  : sse12_cmp_scalar_int<ssmem, int_x86_sse_cmp_ss,
                       "cmpss\t{$cc, $src, $dst|$dst, $src, $cc}",
                       SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
  let ExeDomain = SSEPackedDouble in
  defm CMPSD  : sse12_cmp_scalar_int<sdmem, int_x86_sse2_cmp_sd,
                       "cmpsd\t{$cc, $src, $dst|$dst, $src, $cc}",
                       SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
}


// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
                         ValueType vt, X86MemOperand x86memop,
                         PatFrag ld_frag, string OpcodeStr,
                         X86FoldableSchedWrite sched> {
let hasSideEffects = 0 in {
  def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                     [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
          Sched<[sched]>;
let mayLoad = 1 in
  def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                     [(set EFLAGS, (OpNode (vt RC:$src1),
                                           (ld_frag addr:$src2)))]>,
          Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}

// sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
                             ValueType vt, Operand memop,
                             ComplexPattern mem_cpat, string OpcodeStr,
                             X86FoldableSchedWrite sched> {
  def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                     [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
          Sched<[sched]>;
let mayLoad = 1 in
  def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                     [(set EFLAGS, (OpNode (vt RC:$src1),
                                           mem_cpat:$src2))]>,
          Sched<[sched.Folded, sched.ReadAfterFold]>;
}

let Defs = [EFLAGS] in {
  defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
                               "ucomiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG;
  defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
                               "ucomisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG;
  let Pattern = []<dag> in {
    defm VCOMISS  : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
                                "comiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG;
    defm VCOMISD  : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
                                "comisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG;
  }

  let isCodeGenOnly = 1 in {
    defm VUCOMISS  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
                      sse_load_f32, "ucomiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG;
    defm VUCOMISD  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
                      sse_load_f64, "ucomisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG;

    defm VCOMISS  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
                       sse_load_f32, "comiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG;
    defm VCOMISD  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
                       sse_load_f64, "comisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG;
  }
  defm UCOMISS  : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
                                  "ucomiss", WriteFCom>, PS;
  defm UCOMISD  : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
                                  "ucomisd", WriteFCom>, PD;

  let Pattern = []<dag> in {
    defm COMISS  : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
                                    "comiss", WriteFCom>, PS;
    defm COMISD  : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
                                    "comisd", WriteFCom>, PD;
  }

  let isCodeGenOnly = 1 in {
    defm UCOMISS  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
                            sse_load_f32, "ucomiss", WriteFCom>, PS;
    defm UCOMISD  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
                            sse_load_f64, "ucomisd", WriteFCom>, PD;

    defm COMISS  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
                                sse_load_f32, "comiss", WriteFCom>, PS;
    defm COMISD  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
                                    sse_load_f64, "comisd", WriteFCom>, PD;
  }
} // Defs = [EFLAGS]

// sse12_cmp_packed - sse 1 & 2 compare packed instructions
multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
                            ValueType VT, string asm,
                            X86FoldableSchedWrite sched,
                            Domain d, PatFrag ld_frag> {
  let isCommutable = 1 in
  def rri : PIi8<0xC2, MRMSrcReg,
             (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
             [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, imm:$cc)))], d>,
            Sched<[sched]>;
  def rmi : PIi8<0xC2, MRMSrcMem,
             (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
             [(set RC:$dst,
               (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), imm:$cc)))], d>,
            Sched<[sched.Folded, sched.ReadAfterFold]>;
}

defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG;
defm VCMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG;
defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, v8f32,
               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, v4f64,
               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in {
  defm CMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
                 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS;
  defm CMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
                 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD;
}

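// Comparison predicates that are symmetric in their operands: eq (0),
// unord (3), neq (4) and ord (7). For these the two sources of a compare
// can be swapped, which allows a load in the first operand to be folded
// by commuting it into the second (see the patterns below).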
def CommutableCMPCC : PatLeaf<(imm), [{
  uint64_t Imm = N->getZExtValue() & 0x7;
  return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07);
}]>;

// Patterns to select compares with loads in the first operand.
let Predicates = [HasAVX] in {
  def : Pat<(v4f64 (X86cmpp (loadv4f64 addr:$src2), VR256:$src1,
                            CommutableCMPCC:$cc)),
            (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;

  def : Pat<(v8f32 (X86cmpp (loadv8f32 addr:$src2), VR256:$src1,
                            CommutableCMPCC:$cc)),
            (VCMPPSYrmi VR256:$src1, addr:$src2, imm:$cc)>;

  def : Pat<(v2f64 (X86cmpp (loadv2f64 addr:$src2), VR128:$src1,
                            CommutableCMPCC:$cc)),
            (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;

  def : Pat<(v4f32 (X86cmpp (loadv4f32 addr:$src2), VR128:$src1,
                            CommutableCMPCC:$cc)),
            (VCMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>;

  def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
                          CommutableCMPCC:$cc)),
            (VCMPSDrm FR64:$src1, addr:$src2, imm:$cc)>;

  def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
                          CommutableCMPCC:$cc)),
            (VCMPSSrm FR32:$src1, addr:$src2, imm:$cc)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v2f64 (X86cmpp (memopv2f64 addr:$src2), VR128:$src1,
                            CommutableCMPCC:$cc)),
            (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;

  def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
                          CommutableCMPCC:$cc)),
            (CMPSDrm FR64:$src1, addr:$src2, imm:$cc)>;
}

let Predicates = [UseSSE1] in {
  def : Pat<(v4f32 (X86cmpp (memopv4f32 addr:$src2), VR128:$src1,
                            CommutableCMPCC:$cc)),
            (CMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>;

  def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
                          CommutableCMPCC:$cc)),
            (CMPSSrm FR32:$src1, addr:$src2, imm:$cc)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Shuffle Instructions
//===----------------------------------------------------------------------===//

/// sse12_shuffle - sse 1 & 2 fp shuffle instructions
multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
                         ValueType vt, string asm, PatFrag mem_frag,
                         X86FoldableSchedWrite sched, Domain d,
                         bit IsCommutable = 0> {
  def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
                   (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
                   [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
                                       (i8 imm:$src3))))], d>,
            Sched<[sched.Folded, sched.ReadAfterFold]>;
  let isCommutable = IsCommutable in
  def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
                 (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
                 [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
                                     (i8 imm:$src3))))], d>,
            Sched<[sched]>;
}
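
// shufps builds the result from two elements of $src1 (selected by the
// low two 2-bit immediate fields) in the low half and two elements of
// $src2 (the high two fields) in the high half; shufpd uses one selector
// bit per lane.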

let Predicates = [HasAVX, NoVLX] in {
  defm VSHUFPS  : sse12_shuffle<VR128, f128mem, v4f32,
           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>,
           PS, VEX_4V, VEX_WIG;
  defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>,
           PS, VEX_4V, VEX_L, VEX_WIG;
  defm VSHUFPD  : sse12_shuffle<VR128, f128mem, v2f64,
           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>,
           PD, VEX_4V, VEX_WIG;
  defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>,
           PD, VEX_4V, VEX_L, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
  defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
                    "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                    memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
  defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
                    "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                    memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Unpack FP Instructions
//===----------------------------------------------------------------------===//

/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
                                   PatFrag mem_frag, RegisterClass RC,
                                   X86MemOperand x86memop, string asm,
                                   X86FoldableSchedWrite sched, Domain d,
                                   bit IsCommutable = 0> {
    let isCommutable = IsCommutable in
    def rr : PI<opc, MRMSrcReg,
                (outs RC:$dst), (ins RC:$src1, RC:$src2),
                asm, [(set RC:$dst,
                           (vt (OpNode RC:$src1, RC:$src2)))], d>,
                Sched<[sched]>;
    def rm : PI<opc, MRMSrcMem,
                (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                asm, [(set RC:$dst,
                           (vt (OpNode RC:$src1,
                                       (mem_frag addr:$src2))))], d>,
             Sched<[sched.Folded, sched.ReadAfterFold]>;
}
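
// unpckl{ps,pd} interleaves the low halves of the two sources (for v4f32:
// { a0, b0, a1, b1 }) and unpckh{ps,pd} the high halves ({ a2, b2, a3, b3 });
// the memory operand always supplies the second source.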

let Predicates = [HasAVX, NoVLX] in {
defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load,
      VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load,
      VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG;
defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load,
      VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load,
      VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;

defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load,
      VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load,
      VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load,
      VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load,
      VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
}// Predicates = [HasAVX, NoVLX]

let Constraints = "$src1 = $dst" in {
  defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memop,
        VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
                       SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
  defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memop,
        VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
                       SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
  defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop,
        VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
                       SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
  defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop,
        VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
                       SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
} // Constraints = "$src1 = $dst"

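// AVX1 provides no 256-bit integer unpacks, so on AVX1-only targets the
// FP-domain unpacks are reused for v8i32/v4i64 shuffles.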
let Predicates = [HasAVX1Only] in {
  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (loadv8i32 addr:$src2))),
            (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
            (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (loadv8i32 addr:$src2))),
            (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
            (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
            (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
            (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
            (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
            (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
}

let Predicates = [UseSSE2] in {
  // Use MOVHPD if the load isn't aligned enough for UNPCKLPD.
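  // The UNPCKLPD memory form requires a 16-byte aligned load (memop),
  // while MOVHPD performs a 64-bit load with no alignment requirement and
  // likewise places the loaded double in the high half of the result.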
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                              (v2f64 (nonvolatile_load addr:$src2)))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Extract Floating-Point Sign mask
//===----------------------------------------------------------------------===//

/// sse12_extr_sign_mask - sse 1 & 2 sign mask extraction
multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
                                string asm, Domain d> {
  def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], d>,
              Sched<[WriteFMOVMSK]>;
}
2116
2117let Predicates = [HasAVX] in {
2118  defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
2119                                        SSEPackedSingle>, PS, VEX, VEX_WIG;
2120  defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
2121                                        SSEPackedDouble>, PD, VEX, VEX_WIG;
2122  defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
2123                                         SSEPackedSingle>, PS, VEX, VEX_L, VEX_WIG;
2124  defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
2125                                         SSEPackedDouble>, PD, VEX, VEX_L, VEX_WIG;
2126
  // Also support integer VTs to avoid an int->fp bitcast in the DAG.
  def : Pat<(X86movmsk (v4i32 VR128:$src)),
            (VMOVMSKPSrr VR128:$src)>;
  def : Pat<(X86movmsk (v2i64 VR128:$src)),
            (VMOVMSKPDrr VR128:$src)>;
  def : Pat<(X86movmsk (v8i32 VR256:$src)),
            (VMOVMSKPSYrr VR256:$src)>;
  def : Pat<(X86movmsk (v4i64 VR256:$src)),
            (VMOVMSKPDYrr VR256:$src)>;
}

defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
                                     SSEPackedSingle>, PS;
defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
                                     SSEPackedDouble>, PD;

let Predicates = [UseSSE2] in {
  // Also support integer VTs to avoid an int->fp bitcast in the DAG.
  def : Pat<(X86movmsk (v4i32 VR128:$src)),
            (MOVMSKPSrr VR128:$src)>;
  def : Pat<(X86movmsk (v2i64 VR128:$src)),
            (MOVMSKPDrr VR128:$src)>;
}
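
// Usage sketch (illustrative only): X86movmsk is the node produced by the
// movemask intrinsics, e.g.:
//   int sign_bits(__m128 v) { return _mm_movemask_ps(v); } // movmskps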

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Logical Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in { // SSE integer instructions

/// PDI_binop_rm - Simple SSE2 binary operator.
multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                        ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                        X86MemOperand x86memop, X86FoldableSchedWrite sched,
                        bit IsCommutable, bit Is2Addr> {
  let isCommutable = IsCommutable in
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
       Sched<[sched]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}
} // ExeDomain = SSEPackedInt

multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
                         ValueType OpVT128, ValueType OpVT256,
                         X86SchedWriteWidths sched, bit IsCommutable,
                         Predicate prd> {
let Predicates = [HasAVX, prd] in
  defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
                             VR128, load, i128mem, sched.XMM,
                             IsCommutable, 0>, VEX_4V, VEX_WIG;

let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
                           memop, i128mem, sched.XMM, IsCommutable, 1>;

let Predicates = [HasAVX2, prd] in
  defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
                               OpVT256, VR256, load, i256mem, sched.YMM,
                               IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG;
}

// These are defined here to satisfy pattern-ordering requirements with the
// FP versions below.

defm PAND  : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
                           SchedWriteVecLogic, 1, NoVLX>;
defm POR   : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
                           SchedWriteVecLogic, 1, NoVLX>;
defm PXOR  : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
                           SchedWriteVecLogic, 1, NoVLX>;
defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
                           SchedWriteVecLogic, 0, NoVLX>;
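
// Illustrative C-level sketch of these operators; note that
// _mm_andnot_si128(m, b) computes (~m & b), matching X86andnp:
//   __m128i blend_select(__m128i m, __m128i a, __m128i b) {
//     return _mm_or_si128(_mm_and_si128(m, a), _mm_andnot_si128(m, b));
//   }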

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Logical Instructions
//===----------------------------------------------------------------------===//

/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
///
/// There are no patterns here because isel prefers integer versions for SSE2
/// and later. There are SSE1 v4f32 patterns later.
multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
                                   SDNode OpNode, X86SchedWriteWidths sched> {
  let Predicates = [HasAVX, NoVLX] in {
  defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
        !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM,
        [], [], 0>, PS, VEX_4V, VEX_L, VEX_WIG;

  defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
        !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM,
        [], [], 0>, PD, VEX_4V, VEX_L, VEX_WIG;

  defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
       !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
       [], [], 0>, PS, VEX_4V, VEX_WIG;

  defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
       !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
       [], [], 0>, PD, VEX_4V, VEX_WIG;
  }

  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
         !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
         [], []>, PS;

    defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
         !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
         [], []>, PD;
  }
}

defm AND  : sse12_fp_packed_logical<0x54, "and", and, SchedWriteFLogic>;
defm OR   : sse12_fp_packed_logical<0x56, "or", or, SchedWriteFLogic>;
defm XOR  : sse12_fp_packed_logical<0x57, "xor", xor, SchedWriteFLogic>;
let isCommutable = 0 in
  defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp, SchedWriteFLogic>;
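
// A common source of these FP logic ops is sign-bit manipulation, e.g.
// (illustrative sketch) fabs via andnot with a -0.0 mask:
//   __m128 abs_ps(__m128 x) {
//     return _mm_andnot_ps(_mm_set1_ps(-0.0f), x); // clear the sign bits
//   }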

let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
            (VPANDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
            (VPANDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
            (VPANDYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
            (VPORYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
            (VPORYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
            (VPORYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
            (VPXORYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
            (VPXORYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
            (VPXORYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
            (VPANDNYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
            (VPANDNYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
            (VPANDNYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
            (VPANDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
            (VPANDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
            (VPANDYrm VR256:$src1, addr:$src2)>;

  def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
            (VPORYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
            (VPORYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
            (VPORYrm VR256:$src1, addr:$src2)>;

  def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
            (VPXORYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
            (VPXORYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
            (VPXORYrm VR256:$src1, addr:$src2)>;

  def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
            (VPANDNYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
            (VPANDNYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
            (VPANDNYrm VR256:$src1, addr:$src2)>;
}

// If only AVX1 is supported, we need to handle integer operations with
// floating point instructions since the integer versions aren't available.
let Predicates = [HasAVX1Only] in {
  def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
            (VANDPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
            (VANDPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
            (VANDPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)),
            (VANDPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
            (VORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
            (VORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
            (VORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)),
            (VORPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
            (VXORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
            (VXORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
            (VXORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)),
            (VXORPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)),
            (VANDNPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
            (VANDPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
            (VANDPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
            (VANDPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)),
            (VANDPSYrm VR256:$src1, addr:$src2)>;

  def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
            (VORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
            (VORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
            (VORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)),
            (VORPSYrm VR256:$src1, addr:$src2)>;

  def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
            (VXORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
            (VXORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
            (VXORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)),
            (VXORPSYrm VR256:$src1, addr:$src2)>;

  def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
}
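
// Sketch (illustrative, using GNU vector extensions): with AVX1 but not
// AVX2, a 256-bit integer AND has no VPAND form, so it is matched to the
// single-precision FP instruction instead:
//   typedef long long v4di __attribute__((vector_size(32)));
//   v4di and256(v4di a, v4di b) { return a & b; } // -> vandps under AVX1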

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
            (VPANDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
            (VPANDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
            (VPANDrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
            (VPORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
            (VPORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
            (VPORrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
            (VPXORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
            (VPXORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
            (VPXORrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
            (VPANDNrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
            (VPANDNrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
            (VPANDNrr VR128:$src1, VR128:$src2)>;

  def : Pat<(and VR128:$src1, (loadv16i8 addr:$src2)),
            (VPANDrm VR128:$src1, addr:$src2)>;
  def : Pat<(and VR128:$src1, (loadv8i16 addr:$src2)),
            (VPANDrm VR128:$src1, addr:$src2)>;
  def : Pat<(and VR128:$src1, (loadv4i32 addr:$src2)),
            (VPANDrm VR128:$src1, addr:$src2)>;

  def : Pat<(or VR128:$src1, (loadv16i8 addr:$src2)),
            (VPORrm VR128:$src1, addr:$src2)>;
  def : Pat<(or VR128:$src1, (loadv8i16 addr:$src2)),
            (VPORrm VR128:$src1, addr:$src2)>;
  def : Pat<(or VR128:$src1, (loadv4i32 addr:$src2)),
            (VPORrm VR128:$src1, addr:$src2)>;

  def : Pat<(xor VR128:$src1, (loadv16i8 addr:$src2)),
            (VPXORrm VR128:$src1, addr:$src2)>;
  def : Pat<(xor VR128:$src1, (loadv8i16 addr:$src2)),
            (VPXORrm VR128:$src1, addr:$src2)>;
  def : Pat<(xor VR128:$src1, (loadv4i32 addr:$src2)),
            (VPXORrm VR128:$src1, addr:$src2)>;

  def : Pat<(X86andnp VR128:$src1, (loadv16i8 addr:$src2)),
            (VPANDNrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR128:$src1, (loadv8i16 addr:$src2)),
            (VPANDNrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR128:$src1, (loadv4i32 addr:$src2)),
            (VPANDNrm VR128:$src1, addr:$src2)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
            (PANDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
            (PANDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
            (PANDrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
            (PORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
            (PORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
            (PORrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
            (PXORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
            (PXORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
            (PXORrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
            (PANDNrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
            (PANDNrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
            (PANDNrr VR128:$src1, VR128:$src2)>;

  def : Pat<(and VR128:$src1, (memopv16i8 addr:$src2)),
            (PANDrm VR128:$src1, addr:$src2)>;
  def : Pat<(and VR128:$src1, (memopv8i16 addr:$src2)),
            (PANDrm VR128:$src1, addr:$src2)>;
  def : Pat<(and VR128:$src1, (memopv4i32 addr:$src2)),
            (PANDrm VR128:$src1, addr:$src2)>;

  def : Pat<(or VR128:$src1, (memopv16i8 addr:$src2)),
            (PORrm VR128:$src1, addr:$src2)>;
  def : Pat<(or VR128:$src1, (memopv8i16 addr:$src2)),
            (PORrm VR128:$src1, addr:$src2)>;
  def : Pat<(or VR128:$src1, (memopv4i32 addr:$src2)),
            (PORrm VR128:$src1, addr:$src2)>;

  def : Pat<(xor VR128:$src1, (memopv16i8 addr:$src2)),
            (PXORrm VR128:$src1, addr:$src2)>;
  def : Pat<(xor VR128:$src1, (memopv8i16 addr:$src2)),
            (PXORrm VR128:$src1, addr:$src2)>;
  def : Pat<(xor VR128:$src1, (memopv4i32 addr:$src2)),
            (PXORrm VR128:$src1, addr:$src2)>;

  def : Pat<(X86andnp VR128:$src1, (memopv16i8 addr:$src2)),
            (PANDNrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR128:$src1, (memopv8i16 addr:$src2)),
            (PANDNrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR128:$src1, (memopv4i32 addr:$src2)),
            (PANDNrm VR128:$src1, addr:$src2)>;
}
// Patterns for packed operations when we don't have an integer type available.
def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
          (ANDPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)),
          (ORPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)),
          (XORPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)),
          (ANDNPSrr VR128:$src1, VR128:$src2)>;

def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)),
          (ANDPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)),
          (ORPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)),
          (XORPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
          (ANDNPSrm VR128:$src1, addr:$src2)>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Arithmetic Instructions
//===----------------------------------------------------------------------===//

/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
/// vector forms.
///
/// In addition, we also have a special variant of the scalar form here to
/// represent the associated intrinsic operation.  This form is unlike the
/// plain scalar form, in that it takes an entire vector (instead of a scalar)
/// and leaves the top elements unmodified (therefore these cannot be commuted).
///
/// These three forms can each be reg+reg or reg+mem.
///

/// FIXME: once all 256-bit intrinsics are matched, clean up and refactor the
/// classes below.
multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
                                  SDNode OpNode, X86SchedWriteSizes sched> {
  let Predicates = [HasAVX, NoVLX] in {
  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
                               VR128, v4f32, f128mem, loadv4f32,
                               SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX_4V, VEX_WIG;
  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
                               VR128, v2f64, f128mem, loadv2f64,
                               SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX_4V, VEX_WIG;

  defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
                        OpNode, VR256, v8f32, f256mem, loadv8f32,
                        SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX_4V, VEX_L, VEX_WIG;
  defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
                        OpNode, VR256, v4f64, f256mem, loadv4f64,
                        SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX_4V, VEX_L, VEX_WIG;
  }

  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
                              v4f32, f128mem, memopv4f32, SSEPackedSingle,
                              sched.PS.XMM>, PS;
    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
                              v2f64, f128mem, memopv2f64, SSEPackedDouble,
                              sched.PD.XMM>, PD;
  }
}

multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                                  X86SchedWriteSizes sched> {
  defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
                         OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>,
                         XS, VEX_4V, VEX_LIG, VEX_WIG;
  defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
                         OpNode, FR64, f64mem, SSEPackedDouble, sched.PD.Scl, 0>,
                         XD, VEX_4V, VEX_LIG, VEX_WIG;

  let Constraints = "$src1 = $dst" in {
    defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
                              OpNode, FR32, f32mem, SSEPackedSingle,
                              sched.PS.Scl>, XS;
    defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
                              OpNode, FR64, f64mem, SSEPackedDouble,
                              sched.PD.Scl>, XD;
  }
}

multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
                                      SDPatternOperator OpNode,
                                      X86SchedWriteSizes sched> {
  defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
                   !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
                   SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG;
  defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
                   !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
                   SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG;

  let Constraints = "$src1 = $dst" in {
    defm SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
                   !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
                   SSEPackedSingle, sched.PS.Scl>, XS;
    defm SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
                   !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
                   SSEPackedDouble, sched.PD.Scl>, XD;
  }
}

// Binary Arithmetic instructions
defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SchedWriteFAddSizes>,
           basic_sse12_fp_binop_s<0x58, "add", fadd, SchedWriteFAddSizes>,
           basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SchedWriteFAddSizes>;
defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SchedWriteFMulSizes>,
           basic_sse12_fp_binop_s<0x59, "mul", fmul, SchedWriteFMulSizes>,
           basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SchedWriteFMulSizes>;
let isCommutable = 0 in {
  defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SchedWriteFAddSizes>,
             basic_sse12_fp_binop_s<0x5C, "sub", fsub, SchedWriteFAddSizes>,
             basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SchedWriteFAddSizes>;
  defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SchedWriteFDivSizes>,
             basic_sse12_fp_binop_s<0x5E, "div", fdiv, SchedWriteFDivSizes>,
             basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>;
  defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>;
  defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>;
}
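
// Illustrative sketch of the scalar vs. packed forms defined above:
//   float  add_s(float a, float b)   { return a + b; }            // addss
//   __m128 add_p(__m128 a, __m128 b) { return _mm_add_ps(a, b); } // addps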

let isCodeGenOnly = 1 in {
  defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>;
  defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SchedWriteFCmpSizes>;
}

// Patterns used to select SSE scalar fp arithmetic instructions from
// either:
//
// (1) a scalar fp operation followed by a blend
//
// The effect is that the backend no longer emits unnecessary vector
// insert instructions immediately after SSE scalar fp instructions
// like addss or mulss.
//
// For example, given the following code:
//   __m128 foo(__m128 A, __m128 B) {
//     A[0] += B[0];
//     return A;
//   }
//
// Previously we generated:
//   addss %xmm0, %xmm1
//   movss %xmm1, %xmm0
//
// We now generate:
//   addss %xmm1, %xmm0
//
// (2) a vector packed single/double fp operation followed by a vector insert
//
// The effect is that the backend converts the packed fp instruction
// followed by a vector insert into a single SSE scalar fp instruction.
//
// For example, given the following code:
//   __m128 foo(__m128 A, __m128 B) {
//     __m128 C = A + B;
//     return (__m128) {C[0], A[1], A[2], A[3]};
//   }
//
// Previously we generated:
//   addps %xmm0, %xmm1
//   movss %xmm1, %xmm0
//
// We now generate:
//   addss %xmm1, %xmm0

// TODO: Some canonicalization in lowering would simplify the number of
// patterns we have to try to match.
multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
                                    ValueType VT, ValueType EltTy,
                                    RegisterClass RC, PatFrag ld_frag,
                                    Predicate BasePredicate> {
  let Predicates = [BasePredicate] in {
    // extracted scalar math op with insert via movss/movsd
    def : Pat<(VT (Move (VT VR128:$dst),
                        (VT (scalar_to_vector
                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
                                 RC:$src))))),
              (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst,
               (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
    def : Pat<(VT (Move (VT VR128:$dst),
                        (VT (scalar_to_vector
                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
                                 (ld_frag addr:$src)))))),
              (!cast<Instruction>(OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
  }

  // Repeat for AVX versions of the instructions.
  let Predicates = [UseAVX] in {
    // extracted scalar math op with insert via movss/movsd
    def : Pat<(VT (Move (VT VR128:$dst),
                        (VT (scalar_to_vector
                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
                                 RC:$src))))),
              (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst,
               (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
    def : Pat<(VT (Move (VT VR128:$dst),
                        (VT (scalar_to_vector
                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
                                 (ld_frag addr:$src)))))),
              (!cast<Instruction>("V"#OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
  }
}

defm : scalar_math_patterns<fadd, "ADDSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
defm : scalar_math_patterns<fsub, "SUBSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
defm : scalar_math_patterns<fmul, "MULSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
defm : scalar_math_patterns<fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;

defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;

/// Unop Arithmetic
/// In addition, we also have a special variant of the scalar form here to
/// represent the associated intrinsic operation.  This form is unlike the
/// plain scalar form, in that it takes an entire vector (instead of a
/// scalar) and leaves the top elements undefined.
///
/// And, we have a special variant form for a full-vector intrinsic form.

/// sse_fp_unop_s - SSE1 unops in scalar form
/// For the non-AVX defs, we need $src1 to be tied to $dst because
/// the HW instructions are 2 operand / destructive.
multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
                          ValueType ScalarVT, X86MemOperand x86memop,
                          Operand intmemop, SDNode OpNode, Domain d,
                          X86FoldableSchedWrite sched, Predicate target> {
  let isCodeGenOnly = 1, hasSideEffects = 0 in {
  def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
              !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
            [(set RC:$dst, (OpNode RC:$src1))], d>, Sched<[sched]>,
            Requires<[target]>;
  let mayLoad = 1 in
  def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1),
            !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
            [(set RC:$dst, (OpNode (load addr:$src1)))], d>,
            Sched<[sched.Folded]>,
            Requires<[target, OptForSize]>;
  }

  let hasSideEffects = 0, Constraints = "$src1 = $dst", ExeDomain = d in {
  def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
                Sched<[sched]>;
  let mayLoad = 1 in
  def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2),
                !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
                Sched<[sched.Folded, sched.ReadAfterFold]>;
  }

}

multiclass sse_fp_unop_s_intr<RegisterClass RC, ValueType vt,
                              ComplexPattern int_cpat, Intrinsic Intr,
                              Predicate target, string Suffix> {
  let Predicates = [target] in {
  // These are unary operations, but they are modeled as having 2 source operands
  // because the high elements of the destination are unchanged in SSE.
  def : Pat<(Intr VR128:$src),
            (!cast<Instruction>(NAME#r_Int) VR128:$src, VR128:$src)>;
  }
  // We don't want to fold scalar loads into these instructions unless
  // optimizing for size. This is because the folded instruction will have a
  // partial register update, while the unfolded sequence will not, e.g.
  // movss mem, %xmm0
  // rcpss %xmm0, %xmm0
  // which has a clobber before the rcp, vs.
  // rcpss mem, %xmm0
  let Predicates = [target, OptForSize] in {
    def : Pat<(Intr int_cpat:$src2),
               (!cast<Instruction>(NAME#m_Int)
                      (vt (IMPLICIT_DEF)), addr:$src2)>;
  }
}

multiclass avx_fp_unop_s_intr<RegisterClass RC, ValueType vt, ComplexPattern int_cpat,
                              Intrinsic Intr, Predicate target> {
  let Predicates = [target] in {
   def : Pat<(Intr VR128:$src),
             (!cast<Instruction>(NAME#r_Int) VR128:$src,
                                 VR128:$src)>;
  }
  let Predicates = [target, OptForSize] in {
    def : Pat<(Intr int_cpat:$src2),
              (!cast<Instruction>(NAME#m_Int)
                    (vt (IMPLICIT_DEF)), addr:$src2)>;
  }
}

multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
                          ValueType ScalarVT, X86MemOperand x86memop,
                          Operand intmemop, SDNode OpNode, Domain d,
                          X86FoldableSchedWrite sched, Predicate target> {
  let isCodeGenOnly = 1, hasSideEffects = 0 in {
  def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [], d>, Sched<[sched]>;
  let mayLoad = 1 in
  def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [], d>, Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
  let hasSideEffects = 0, ExeDomain = d in {
  def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
                (ins VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             []>, Sched<[sched]>;
  let mayLoad = 1 in
  def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
                (ins VR128:$src1, intmemop:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
  }

  // We don't want to fold scalar loads into these instructions unless
  // optimizing for size. This is because the folded instruction will have a
  // partial register update, while the unfolded sequence will not, e.g.
  // vmovss mem, %xmm0
  // vrcpss %xmm0, %xmm0, %xmm0
  // which has a clobber before the rcp, vs.
  // vrcpss mem, %xmm0, %xmm0
  // TODO: In theory, we could fold the load, and avoid the stall caused by
  // the partial register store, either in BreakFalseDeps or with smarter RA.
  let Predicates = [target] in {
   def : Pat<(OpNode RC:$src),  (!cast<Instruction>(NAME#r)
                                (ScalarVT (IMPLICIT_DEF)), RC:$src)>;
  }
  let Predicates = [target, OptForSize] in {
    def : Pat<(ScalarVT (OpNode (load addr:$src))),
              (!cast<Instruction>(NAME#m) (ScalarVT (IMPLICIT_DEF)),
            addr:$src)>;
  }
}

/// sse1_fp_unop_p - SSE1 unops in packed form.
multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          X86SchedWriteWidths sched, list<Predicate> prds> {
let Predicates = prds in {
  def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       !strconcat("v", OpcodeStr,
                                  "ps\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
                       VEX, Sched<[sched.XMM]>, VEX_WIG;
  def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       !strconcat("v", OpcodeStr,
                                  "ps\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>,
                       VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
  def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        !strconcat("v", OpcodeStr,
                                   "ps\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>,
                        VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
  def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        !strconcat("v", OpcodeStr,
                                   "ps\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))]>,
                        VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
}

  def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
                Sched<[sched.XMM]>;
  def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>,
                Sched<[sched.XMM.Folded]>;
}

/// sse2_fp_unop_p - SSE2 unops in vector forms.
multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
                          SDNode OpNode, X86SchedWriteWidths sched> {
let Predicates = [HasAVX, NoVLX] in {
  def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       !strconcat("v", OpcodeStr,
                                  "pd\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
                       VEX, Sched<[sched.XMM]>, VEX_WIG;
  def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       !strconcat("v", OpcodeStr,
                                  "pd\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>,
                       VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
  def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        !strconcat("v", OpcodeStr,
                                   "pd\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>,
                        VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
  def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        !strconcat("v", OpcodeStr,
                                   "pd\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))]>,
                        VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
}

  def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
                Sched<[sched.XMM]>;
  def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>,
                Sched<[sched.XMM.Folded]>;
}

multiclass sse1_fp_unop_s_intr<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          X86SchedWriteWidths sched, Predicate AVXTarget> {
  defm SS        :  sse_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
                      !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss),
                      UseSSE1, "SS">, XS;
  defm V#NAME#SS  : avx_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
                      !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss),
                      AVXTarget>,
                      XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
}

multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          X86SchedWriteWidths sched, Predicate AVXTarget> {
  defm SS        :  sse_fp_unop_s<opc, OpcodeStr##ss, FR32, f32, f32mem,
                      ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS;
  defm V#NAME#SS  : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, f32,
                      f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>,
                       XS, VEX_4V, VEX_LIG, VEX_WIG;
}

multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          X86SchedWriteWidths sched, Predicate AVXTarget> {
  defm SD         : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, f64, f64mem,
                         sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD;
  defm V#NAME#SD  : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, f64,
                         f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>,
                         XD, VEX_4V, VEX_LIG, VEX_WIG;
}

// Square root.
defm SQRT  : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SchedWriteFSqrt, UseAVX>,
             sse1_fp_unop_p<0x51, "sqrt", fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>,
             sse2_fp_unop_s<0x51, "sqrt", fsqrt, SchedWriteFSqrt64, UseAVX>,
             sse2_fp_unop_p<0x51, "sqrt", fsqrt, SchedWriteFSqrt64>;

// Reciprocal approximations. Note that these typically require refinement
// in order to obtain suitable precision.
defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
             sse1_fp_unop_s_intr<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>;
defm RCP   : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
             sse1_fp_unop_s_intr<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
             sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>;

// There are no f64 versions of the reciprocal approximation instructions.
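
// Sketch of one Newton-Raphson refinement step for the rcpps estimate
// (illustrative only; x' = x * (2 - d*x)):
//   __m128 rcp_refined(__m128 d) {
//     __m128 x = _mm_rcp_ps(d); // ~12-bit estimate
//     return _mm_mul_ps(x, _mm_sub_ps(_mm_set1_ps(2.0f), _mm_mul_ps(d, x)));
//   }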

multiclass scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix, SDNode Move,
                                      ValueType VT, Predicate BasePredicate> {
  let Predicates = [BasePredicate] in {
    def : Pat<(VT (Move VT:$dst, (scalar_to_vector
                                  (OpNode (extractelt VT:$src, 0))))),
              (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
  }

  // Repeat for AVX versions of the instructions.
  let Predicates = [UseAVX] in {
    def : Pat<(VT (Move VT:$dst, (scalar_to_vector
                                  (OpNode (extractelt VT:$src, 0))))),
              (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
  }
}

defm : scalar_unary_math_patterns<fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>;
defm : scalar_unary_math_patterns<fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>;

multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix,
                                           SDNode Move, ValueType VT,
                                           Predicate BasePredicate> {
  let Predicates = [BasePredicate] in {
    def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
              (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
  }

  // Repeat for AVX versions of the instructions.
  let Predicates = [HasAVX] in {
    def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
              (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
  }
}

defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss,
                                       v4f32, UseSSE1>;
defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss,
                                       v4f32, UseSSE1>;
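
// Illustrative use of the intrinsic forms matched above; the upper lanes
// of the source pass through unchanged:
//   __m128 low_rcp(__m128 x) { return _mm_rcp_ss(x); } // rcpss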


//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Non-temporal stores
//===----------------------------------------------------------------------===//

let AddedComplexity = 400 in { // Prefer non-temporal versions
let Predicates = [HasAVX, NoVLX] in {
let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
                     (ins f128mem:$dst, VR128:$src),
                     "movntps\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v4f32 VR128:$src),
                                               addr:$dst)]>, VEX, VEX_WIG;
def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
                     (ins f128mem:$dst, VR128:$src),
                     "movntpd\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v2f64 VR128:$src),
                                               addr:$dst)]>, VEX, VEX_WIG;
} // SchedRW

let SchedRW = [SchedWriteFMoveLSNT.YMM.MR] in {
def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
                     (ins f256mem:$dst, VR256:$src),
                     "movntps\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v8f32 VR256:$src),
                                               addr:$dst)]>, VEX, VEX_L, VEX_WIG;
def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
                     (ins f256mem:$dst, VR256:$src),
                     "movntpd\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v4f64 VR256:$src),
                                               addr:$dst)]>, VEX, VEX_L, VEX_WIG;
} // SchedRW

let ExeDomain = SSEPackedInt in {
def VMOVNTDQmr    : VPDI<0xE7, MRMDestMem, (outs),
                         (ins i128mem:$dst, VR128:$src),
                         "movntdq\t{$src, $dst|$dst, $src}",
                         [(alignednontemporalstore (v2i64 VR128:$src),
                                                   addr:$dst)]>, VEX, VEX_WIG,
                         Sched<[SchedWriteVecMoveLSNT.XMM.MR]>;
def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
                    (ins i256mem:$dst, VR256:$src),
                    "movntdq\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v4i64 VR256:$src),
                                              addr:$dst)]>, VEX, VEX_L, VEX_WIG,
                    Sched<[SchedWriteVecMoveLSNT.YMM.MR]>;
} // ExeDomain
} // Predicates

let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntps\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntpd\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>;
} // SchedRW

let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLSNT.XMM.MR] in
def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntdq\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)]>;

let SchedRW = [WriteStoreNT] in {
// There is no AVX form for instructions below this point
def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
                 "movnti{l}\t{$src, $dst|$dst, $src}",
                 [(nontemporalstore (i32 GR32:$src), addr:$dst)]>,
               PS, Requires<[HasSSE2]>;
def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
                     "movnti{q}\t{$src, $dst|$dst, $src}",
                     [(nontemporalstore (i64 GR64:$src), addr:$dst)]>,
                  PS, Requires<[HasSSE2]>;
} // SchedRW = [WriteStoreNT]
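
// Illustrative sketch of a GPR non-temporal store:
//   void nt_store32(int *p, int v) { _mm_stream_si32(p, v); } // movnti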

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;

  def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
            (MOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
            (MOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
            (MOVNTDQmr addr:$dst, VR128:$src)>;
}
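
// Usage sketch (illustrative): non-temporal stores bypass the cache; the
// destination must be 16B-aligned for the vector forms:
//   void stream4(float *p, __m128 v) { _mm_stream_ps(p, v); } // movntps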

} // AddedComplexity

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Prefetch and memory fence
//===----------------------------------------------------------------------===//

// Prefetch intrinsic.
let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in {
def PREFETCHT0   : I<0x18, MRM1m, (outs), (ins i8mem:$src),
    "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>, TB;
def PREFETCHT1   : I<0x18, MRM2m, (outs), (ins i8mem:$src),
    "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>, TB;
def PREFETCHT2   : I<0x18, MRM3m, (outs), (ins i8mem:$src),
    "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>, TB;
def PREFETCHNTA  : I<0x18, MRM0m, (outs), (ins i8mem:$src),
    "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>, TB;
}
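
// Illustrative intrinsic usage for the hints above:
//   _mm_prefetch((const char *)p, _MM_HINT_T0);  // prefetcht0
//   _mm_prefetch((const char *)p, _MM_HINT_NTA); // prefetchnta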

// FIXME: How should flush instruction be modeled?
let SchedRW = [WriteLoad] in {
// Flush cache
def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
               "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
               PS, Requires<[HasSSE2]>;
}

let SchedRW = [WriteNop] in {
// Pause. This "instruction" is encoded as "rep; nop", so even though it
// was introduced with SSE2, it's backward compatible.
def PAUSE : I<0x90, RawFrm, (outs), (ins),
              "pause", [(int_x86_sse2_pause)]>, OBXS;
}
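
// Typical spin-wait usage (sketch; 'flag' is a hypothetical atomic):
//   while (!atomic_load_explicit(&flag, memory_order_acquire))
//     _mm_pause(); // eases power use and SMT contention while spinning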

let SchedRW = [WriteFence] in {
// Load, store, and memory fence
// TODO: As with mfence, we may want to ease the availability of sfence/lfence
// to include any 64-bit target.
def SFENCE : I<0xAE, MRM_F8, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
               PS, Requires<[HasSSE1]>;
def LFENCE : I<0xAE, MRM_E8, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>,
               PS, Requires<[HasSSE2]>;
def MFENCE : I<0xAE, MRM_F0, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>,
               PS, Requires<[HasMFence]>;
} // SchedRW

def : Pat<(X86MFence), (MFENCE)>;
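
// Illustrative pairing with the non-temporal stores defined earlier:
//   _mm_stream_ps(p, v); // movntps
//   _mm_sfence();        // order the NT store before later stores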

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Load/Store XCSR register
//===----------------------------------------------------------------------===//

let mayLoad=1, hasSideEffects=1 in
def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
               "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
               VEX, Sched<[WriteLDMXCSR]>, VEX_WIG;
let mayStore=1, hasSideEffects=1 in
def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
               "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
               VEX, Sched<[WriteSTMXCSR]>, VEX_WIG;

let mayLoad=1, hasSideEffects=1 in
def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
              "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
              TB, Sched<[WriteLDMXCSR]>;
let mayStore=1, hasSideEffects=1 in
def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
              "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
              TB, Sched<[WriteSTMXCSR]>;
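
// Sketch of MXCSR access through the intrinsics (bit positions per the
// Intel SDM; illustrative only):
//   unsigned csr = _mm_getcsr(); // stmxcsr
//   _mm_setcsr(csr | 0x8040);    // ldmxcsr: set FTZ (bit 15) and DAZ (bit 6)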

//===---------------------------------------------------------------------===//
// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in { // SSE integer instructions

let hasSideEffects = 0 in {
def VMOVDQArr  : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
def VMOVDQUrr  : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "movdqu\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                      "movdqu\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
}

// For Disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def VMOVDQArr_REV  : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                          "movdqa\t{$src, $dst|$dst, $src}", []>,
                          Sched<[SchedWriteVecMoveLS.XMM.RR]>,
                          VEX, VEX_WIG, FoldGenData<"VMOVDQArr">;
def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                          "movdqa\t{$src, $dst|$dst, $src}", []>,
                          Sched<[SchedWriteVecMoveLS.YMM.RR]>,
                          VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQAYrr">;
def VMOVDQUrr_REV  : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                          "movdqu\t{$src, $dst|$dst, $src}", []>,
                          Sched<[SchedWriteVecMoveLS.XMM.RR]>,
                          VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">;
def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                          "movdqu\t{$src, $dst|$dst, $src}", []>,
                          Sched<[SchedWriteVecMoveLS.YMM.RR]>,
                          VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQUYrr">;
}

let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
    hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
def VMOVDQArm  : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                      "movdqa\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (alignedloadv2i64 addr:$src))]>,
                      Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.YMM.RM]>,
                      VEX, VEX_L, VEX_WIG;
def VMOVDQUrm  : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "vmovdqu\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (loadv2i64 addr:$src))]>,
                   Sched<[SchedWriteVecMoveLS.XMM.RM]>,
                   XS, VEX, VEX_WIG;
def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                   "vmovdqu\t{$src, $dst|$dst, $src}", []>,
                   Sched<[SchedWriteVecMoveLS.YMM.RM]>,
                   XS, VEX, VEX_L, VEX_WIG;
}

let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
def VMOVDQAmr  : VPDI<0x7F, MRMDestMem, (outs),
                      (ins i128mem:$dst, VR128:$src),
                      "movdqa\t{$src, $dst|$dst, $src}",
                      [(alignedstore (v2i64 VR128:$src), addr:$dst)]>,
                      Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_WIG;
def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
                      (ins i256mem:$dst, VR256:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", []>,
                     Sched<[SchedWriteVecMoveLS.YMM.MR]>, VEX, VEX_L, VEX_WIG;
def VMOVDQUmr  : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                   "vmovdqu\t{$src, $dst|$dst, $src}",
                   [(store (v2i64 VR128:$src), addr:$dst)]>,
                   Sched<[SchedWriteVecMoveLS.XMM.MR]>, XS, VEX, VEX_WIG;
def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
                   "vmovdqu\t{$src, $dst|$dst, $src}",[]>,
                   Sched<[SchedWriteVecMoveLS.YMM.MR]>, XS, VEX, VEX_L, VEX_WIG;
}
3253
3254let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in {
3255let hasSideEffects = 0 in {
3256def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3257                   "movdqa\t{$src, $dst|$dst, $src}", []>;
3258
3259def MOVDQUrr :   I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3260                   "movdqu\t{$src, $dst|$dst, $src}", []>,
3261                   XS, Requires<[UseSSE2]>;
3262}
3263
3264// For Disassembler
3265let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
3266def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3267                       "movdqa\t{$src, $dst|$dst, $src}", []>,
3268                       FoldGenData<"MOVDQArr">;
3269
3270def MOVDQUrr_REV :   I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3271                       "movdqu\t{$src, $dst|$dst, $src}", []>,
3272                       XS, Requires<[UseSSE2]>, FoldGenData<"MOVDQUrr">;
3273}
3274} // SchedRW
3275
3276let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
3277    hasSideEffects = 0, SchedRW = [SchedWriteVecMoveLS.XMM.RM] in {
3278def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3279                   "movdqa\t{$src, $dst|$dst, $src}",
3280                   [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>;
3281def MOVDQUrm :   I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3282                   "movdqu\t{$src, $dst|$dst, $src}",
3283                   [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>,
3284                 XS, Requires<[UseSSE2]>;
3285}
3286
3287let mayStore = 1, hasSideEffects = 0,
3288    SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
3289def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3290                   "movdqa\t{$src, $dst|$dst, $src}",
3291                   [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>;
3292def MOVDQUmr :   I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3293                   "movdqu\t{$src, $dst|$dst, $src}",
3294                   [/*(store (v2i64 VR128:$src), addr:$dst)*/]>,
3295                 XS, Requires<[UseSSE2]>;
3296}
3297
3298} // ExeDomain = SSEPackedInt
3299
// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
                (VMOVDQArr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
                (VMOVDQAYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
                (VMOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
                (VMOVDQUYrr_REV VR256:$dst, VR256:$src), 0>;

// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"movdqa.s\t{$src, $dst|$dst, $src}",
                (MOVDQArr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}",
                (MOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
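
// For illustration: "movdqa %xmm0, %xmm1" normally assembles to the 0x6F
// (load-form) encoding; writing "movdqa.s %xmm0, %xmm1" instead selects the
// operand-swapped 0x7F (store-form) encoding via the _REV definitions above,
// matching the GAS ".s" suffix convention.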

let Predicates = [HasAVX, NoVLX] in {
  // Additional patterns for other integer sizes.
  def : Pat<(alignedloadv4i32 addr:$src),
            (VMOVDQArm addr:$src)>;
  def : Pat<(alignedloadv8i16 addr:$src),
            (VMOVDQArm addr:$src)>;
  def : Pat<(alignedloadv16i8 addr:$src),
            (VMOVDQArm addr:$src)>;
  def : Pat<(loadv4i32 addr:$src),
            (VMOVDQUrm addr:$src)>;
  def : Pat<(loadv8i16 addr:$src),
            (VMOVDQUrm addr:$src)>;
  def : Pat<(loadv16i8 addr:$src),
            (VMOVDQUrm addr:$src)>;

  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (VMOVDQAmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (VMOVDQAmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (VMOVDQAmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (VMOVDQUmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (VMOVDQUmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (VMOVDQUmr addr:$dst, VR128:$src)>;
}

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Arithmetic Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in { // SSE integer instructions

/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType DstVT, ValueType SrcVT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
       Sched<[sched]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
                                     (memop_frag addr:$src2))))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}
} // ExeDomain = SSEPackedInt
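
// As a rough sketch, an instantiation such as the PMADDWD one below,
//   defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16,
//                                VR128, memop, i128mem, SchedWriteVecIMul.XMM>;
// expands to PMADDWDrr (register-register) and PMADDWDrm (register-memory),
// the latter scheduled on sched.Folded / sched.ReadAfterFold.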

defm PADDB   : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDW   : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDD   : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
                             SchedWriteVecALU, 1, NoVLX>;
defm PADDQ   : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
                             SchedWriteVecALU, 1, NoVLX>;
defm PADDSB  : PDI_binop_all<0xEC, "paddsb", saddsat, v16i8, v32i8,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDSW  : PDI_binop_all<0xED, "paddsw", saddsat, v8i16, v16i16,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDUSB : PDI_binop_all<0xDC, "paddusb", uaddsat, v16i8, v32i8,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDUSW : PDI_binop_all<0xDD, "paddusw", uaddsat, v8i16, v16i16,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMULLW  : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
                             SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
                             SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
defm PMULHW  : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
                             SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
defm PSUBB   : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBW   : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBD   : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
                             SchedWriteVecALU, 0, NoVLX>;
defm PSUBQ   : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
                             SchedWriteVecALU, 0, NoVLX>;
defm PSUBSB  : PDI_binop_all<0xE8, "psubsb", ssubsat, v16i8, v32i8,
                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBSW  : PDI_binop_all<0xE9, "psubsw", ssubsat, v8i16, v16i16,
                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", usubsat, v16i8, v32i8,
                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", usubsat, v8i16, v16i16,
                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PMINUB  : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMINSW  : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMAXUB  : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMAXSW  : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PAVGB   : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PAVGW   : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64,
                             SchedWriteVecIMul, 1, NoVLX>;

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
                              load, i128mem, SchedWriteVecIMul.XMM, 0>,
                              VEX_4V, VEX_WIG;

let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
                               VR256, load, i256mem, SchedWriteVecIMul.YMM,
                               0>, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
                             memop, i128mem, SchedWriteVecIMul.XMM>;

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
                             load, i128mem, SchedWritePSADBW.XMM, 0>,
                             VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
                             load, i256mem, SchedWritePSADBW.YMM, 0>,
                             VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
                            memop, i128mem, SchedWritePSADBW.XMM>;
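
// For reference: pmaddwd multiplies adjacent signed 16-bit pairs and adds
// each pair of products into a 32-bit lane, and psadbw sums absolute byte
// differences into one 16-bit result per 64-bit half; hence the widened
// destination types (v4i32 and v2i64) in the instantiations above.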

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Logical Instructions
//===---------------------------------------------------------------------===//

multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
                         string OpcodeStr, SDNode OpNode,
                         SDNode OpNode2, RegisterClass RC,
                         X86FoldableSchedWrite sched,
                         X86FoldableSchedWrite schedImm,
                         ValueType DstVT, ValueType SrcVT,
                         PatFrag ld_frag, bit Is2Addr = 1> {
  // src2 is always 128-bit
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, VR128:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))]>,
       Sched<[sched]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, i128mem:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode RC:$src1,
                       (SrcVT (ld_frag addr:$src2)))))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
  def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
       (ins RC:$src1, u8imm:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))]>,
       Sched<[schedImm]>;
}
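
// A sketch of the three forms each shift gets, using PSLLW as the example:
// "rr" shifts by a count held in an XMM register, "rm" by a count loaded from
// memory (per the note above, the count operand stays 128-bit even for the
// 256-bit forms), and "ri" by an 8-bit immediate encoded via opc2/ImmForm.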

multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
                             string OpcodeStr, SDNode OpNode,
                             SDNode OpNode2, ValueType DstVT128,
                             ValueType DstVT256, ValueType SrcVT,
                             X86SchedWriteWidths sched,
                             X86SchedWriteWidths schedImm, Predicate prd> {
let Predicates = [HasAVX, prd] in
  defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
                              OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM,
                              DstVT128, SrcVT, load, 0>, VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, prd] in
  defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
                                OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM,
                                DstVT256, SrcVT, load, 0>, VEX_4V, VEX_L,
                                VEX_WIG;
let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
                            VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT,
                            memop>;
}

multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
                        SDNode OpNode, RegisterClass RC, ValueType VT,
                        X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (VT (OpNode RC:$src1, (i8 imm:$src2))))]>,
       Sched<[sched]>;
}

multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr,
                            SDNode OpNode, X86SchedWriteWidths sched> {
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
  defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
                             VR128, v16i8, sched.XMM, 0>, VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
  defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
                               VR256, v32i8, sched.YMM, 0>,
                               VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8,
                           sched.XMM>;
}

let ExeDomain = SSEPackedInt in {
  defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
                                 v8i16, v16i16, v8i16, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
  defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
                                 v4i32, v8i32, v4i32, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX>;
  defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
                                 v2i64, v4i64, v2i64, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX>;

  defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
                                 v8i16, v16i16, v8i16, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
  defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
                                 v4i32, v8i32, v4i32, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX>;
  defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
                                 v2i64, v4i64, v2i64, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX>;

  defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
                                 v8i16, v16i16, v8i16, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
  defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
                                 v4i32, v8i32, v4i32, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX>;

  defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq,
                                 SchedWriteShuffle>;
  defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq,
                                 SchedWriteShuffle>;
} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Comparison Instructions
//===---------------------------------------------------------------------===//

defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
                             SchedWriteVecALU, 1, TruePredicate>;
defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
                             SchedWriteVecALU, 1, TruePredicate>;
defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
                             SchedWriteVecALU, 1, TruePredicate>;
defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
                             SchedWriteVecALU, 0, TruePredicate>;
defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
                             SchedWriteVecALU, 0, TruePredicate>;
defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
                             SchedWriteVecALU, 0, TruePredicate>;
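
// For reference: each compare lane yields all-ones where the condition holds
// and all-zeros otherwise, and pcmpgt* is a signed compare. For illustration
// (AT&T syntax):
//   pcmpgtb %xmm1, %xmm0   # xmm0.b[i] = (xmm0.b[i] > xmm1.b[i]) ? 0xFF : 0x00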

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Shuffle Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {
multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
                         SDNode OpNode, X86SchedWriteWidths sched,
                         Predicate prd> {
let Predicates = [HasAVX, prd] in {
  def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, u8imm:$src2),
                      !strconcat("v", OpcodeStr,
                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR128:$dst,
                        (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))]>,
                      VEX, Sched<[sched.XMM]>, VEX_WIG;
  def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
                      (ins i128mem:$src1, u8imm:$src2),
                      !strconcat("v", OpcodeStr,
                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR128:$dst,
                       (vt128 (OpNode (load addr:$src1),
                        (i8 imm:$src2))))]>, VEX,
                  Sched<[sched.XMM.Folded]>, VEX_WIG;
}

let Predicates = [HasAVX2, prd] in {
  def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
                       (ins VR256:$src1, u8imm:$src2),
                       !strconcat("v", OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
                         (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))]>,
                       VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
  def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
                       (ins i256mem:$src1, u8imm:$src2),
                       !strconcat("v", OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR256:$dst,
                        (vt256 (OpNode (load addr:$src1),
                         (i8 imm:$src2))))]>, VEX, VEX_L,
                   Sched<[sched.YMM.Folded]>, VEX_WIG;
}

let Predicates = [UseSSE2] in {
  def ri : Ii8<0x70, MRMSrcReg,
               (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
               !strconcat(OpcodeStr,
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set VR128:$dst,
                 (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))]>,
               Sched<[sched.XMM]>;
  def mi : Ii8<0x70, MRMSrcMem,
               (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
               !strconcat(OpcodeStr,
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set VR128:$dst,
                 (vt128 (OpNode (memop addr:$src1),
                        (i8 imm:$src2))))]>,
               Sched<[sched.XMM.Folded]>;
}
}
} // ExeDomain = SSEPackedInt

defm PSHUFD  : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd,
                             SchedWriteShuffle, NoVLX>, PD;
defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw,
                             SchedWriteShuffle, NoVLX_Or_NoBWI>, XS;
defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw,
                             SchedWriteShuffle, NoVLX_Or_NoBWI>, XD;
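
// For illustration: the immediate picks source elements two bits at a time,
// so (AT&T syntax)
//   pshufd $0x1B, %xmm0, %xmm1   # 0x1B = 0b00011011: reverse the four dwords
// while pshufhw/pshuflw apply the same selector to only the high or low four
// words and copy the other half unchanged.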

//===---------------------------------------------------------------------===//
// Packed Integer Pack Instructions (SSE & AVX)
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {
multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                     ValueType ArgVT, SDNode OpNode, RegisterClass RC,
                     X86MemOperand x86memop, X86FoldableSchedWrite sched,
                     PatFrag ld_frag, bit Is2Addr = 1> {
  def rr : PDI<opc, MRMSrcReg,
               (outs RC:$dst), (ins RC:$src1, RC:$src2),
               !if(Is2Addr,
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst,
                     (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
               Sched<[sched]>;
  def rm : PDI<opc, MRMSrcMem,
               (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
               !if(Is2Addr,
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst,
                     (OutVT (OpNode (ArgVT RC:$src1),
                                    (ld_frag addr:$src2))))]>,
               Sched<[sched.Folded, sched.ReadAfterFold]>;
}

multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                     ValueType ArgVT, SDNode OpNode, RegisterClass RC,
                     X86MemOperand x86memop, X86FoldableSchedWrite sched,
                     PatFrag ld_frag, bit Is2Addr = 1> {
  def rr : SS48I<opc, MRMSrcReg,
                 (outs RC:$dst), (ins RC:$src1, RC:$src2),
                 !if(Is2Addr,
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     !strconcat(OpcodeStr,
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set RC:$dst,
                       (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
                 Sched<[sched]>;
  def rm : SS48I<opc, MRMSrcMem,
                 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                 !if(Is2Addr,
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     !strconcat(OpcodeStr,
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set RC:$dst,
                       (OutVT (OpNode (ArgVT RC:$src1),
                                      (ld_frag addr:$src2))))]>,
                 Sched<[sched.Folded, sched.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128,
                             i128mem, SchedWriteShuffle.XMM, load, 0>,
                             VEX_4V, VEX_WIG;
  defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128,
                             i128mem, SchedWriteShuffle.XMM, load, 0>,
                             VEX_4V, VEX_WIG;

  defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128,
                             i128mem, SchedWriteShuffle.XMM, load, 0>,
                             VEX_4V, VEX_WIG;
  defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128,
                             i128mem, SchedWriteShuffle.XMM, load, 0>,
                             VEX_4V;
}

let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256,
                              i256mem, SchedWriteShuffle.YMM, load, 0>,
                              VEX_4V, VEX_L, VEX_WIG;
  defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256,
                              i256mem, SchedWriteShuffle.YMM, load, 0>,
                              VEX_4V, VEX_L, VEX_WIG;

  defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256,
                              i256mem, SchedWriteShuffle.YMM, load, 0>,
                              VEX_4V, VEX_L, VEX_WIG;
  defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256,
                              i256mem, SchedWriteShuffle.YMM, load, 0>,
                              VEX_4V, VEX_L;
}

let Constraints = "$src1 = $dst" in {
  defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128,
                            i128mem, SchedWriteShuffle.XMM, memop>;
  defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128,
                            i128mem, SchedWriteShuffle.XMM, memop>;

  defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128,
                            i128mem, SchedWriteShuffle.XMM, memop>;

  defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128,
                            i128mem, SchedWriteShuffle.XMM, memop>;
}
} // ExeDomain = SSEPackedInt
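
// For reference: the pack instructions narrow each source element with
// saturation. packsswb, for example, turns the sixteen signed words of the
// two sources into sixteen signed bytes clamped to [-128, 127], while
// packuswb clamps the same inputs to the unsigned range [0, 255].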

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Unpack Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {
multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
                       SDNode OpNode, RegisterClass RC, X86MemOperand x86memop,
                       X86FoldableSchedWrite sched, PatFrag ld_frag,
                       bit Is2Addr = 1> {
  def rr : PDI<opc, MRMSrcReg,
      (outs RC:$dst), (ins RC:$src1, RC:$src2),
      !if(Is2Addr,
          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
      Sched<[sched]>;
  def rm : PDI<opc, MRMSrcMem,
      (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
      !if(Is2Addr,
          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
      Sched<[sched.Folded, sched.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPUNPCKLBW  : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128,
                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPUNPCKLWD  : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128,
                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPUNPCKHBW  : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128,
                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPUNPCKHWD  : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128,
                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                 VEX_4V, VEX_WIG;
}

let Predicates = [HasAVX, NoVLX] in {
  defm VPUNPCKLDQ  : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128,
                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128,
                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPUNPCKHDQ  : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128,
                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128,
                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
                                 VEX_4V, VEX_WIG;
}

let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPUNPCKLBWY  : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256,
                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPUNPCKLWDY  : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256,
                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPUNPCKHBWY  : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256,
                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPUNPCKHWDY  : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256,
                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
}

let Predicates = [HasAVX2, NoVLX] in {
  defm VPUNPCKLDQY  : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256,
                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256,
                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPUNPCKHDQY  : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256,
                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256,
                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
}

let Constraints = "$src1 = $dst" in {
  defm PUNPCKLBW  : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128,
                                i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKLWD  : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128,
                                i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKLDQ  : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128,
                                i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128,
                                i128mem, SchedWriteShuffle.XMM, memop>;

  defm PUNPCKHBW  : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128,
                                i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKHWD  : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128,
                                i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKHDQ  : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128,
                                i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128,
                                i128mem, SchedWriteShuffle.XMM, memop>;
}
} // ExeDomain = SSEPackedInt
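
// For illustration: with first source a (the destination for the SSE forms)
// and second source b, punpcklbw interleaves the low eight bytes of each
// (dst = a0 b0 a1 b1 ... a7 b7); the punpckh* variants do the same with the
// high halves.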

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Extract and Insert
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {
multiclass sse2_pinsrw<bit Is2Addr = 1> {
  def rr : Ii8<0xC4, MRMSrcReg,
       (outs VR128:$dst), (ins VR128:$src1,
        GR32orGR64:$src2, u8imm:$src3),
       !if(Is2Addr,
           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
       [(set VR128:$dst,
         (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
       Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  def rm : Ii8<0xC4, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1,
                       i16mem:$src2, u8imm:$src3),
       !if(Is2Addr,
           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
       [(set VR128:$dst,
         (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
                    imm:$src3))]>,
       Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}

// Extract
let Predicates = [HasAVX, NoBWI] in
def VPEXTRWrr : Ii8<0xC5, MRMSrcReg,
                    (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
                    "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
                                            imm:$src2))]>,
                PD, VEX, VEX_WIG, Sched<[WriteVecExtract]>;
def PEXTRWrr : PDIi8<0xC5, MRMSrcReg,
                    (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
                    "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
                                            imm:$src2))]>,
               Sched<[WriteVecExtract]>;

// Insert
let Predicates = [HasAVX, NoBWI] in
defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V, VEX_WIG;

let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
defm PINSRW : sse2_pinsrw, PD;
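
// For illustration (AT&T syntax), the immediate selects the word lane:
//   pextrw $3, %xmm0, %eax   # zero-extend word 3 of xmm0 into eax
//   pinsrw $3, %eax, %xmm0   # replace word 3 of xmm0 with the low word of eax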

} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// SSE2 - Packed Mask Creation
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {

def VPMOVMSKBrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
           (ins VR128:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}",
           [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
           Sched<[WriteVecMOVMSK]>, VEX, VEX_WIG;

let Predicates = [HasAVX2] in {
def VPMOVMSKBYrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
           (ins VR256:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}",
           [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>,
           Sched<[WriteVecMOVMSKY]>, VEX, VEX_L, VEX_WIG;
}

def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
           "pmovmskb\t{$src, $dst|$dst, $src}",
           [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
           Sched<[WriteVecMOVMSK]>;
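
// For illustration: pmovmskb gathers the sign bit of every byte into a GPR,
// zeroing the remaining bits (AT&T syntax):
//   pmovmskb %xmm0, %eax   # eax[15:0] = sign bits of the 16 bytes of xmm0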

} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// SSE2 - Conditional Store
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in
def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
           (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
           VEX, VEX_WIG;
let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
           (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>,
           VEX, VEX_WIG;

let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in
def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>;
let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
           "maskmovdqu\t{$mask, $src|$src, $mask}",
           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;
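
// For reference: maskmovdqu stores only the bytes of $src whose corresponding
// $mask byte has its high bit set, to the destination address held implicitly
// in EDI/RDI (hence the Uses lists above), with a non-temporal hint.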

} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// SSE2 - Move Doubleword/Quadword
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//
// Move Int Doubleword to Packed Double Int
//
let ExeDomain = SSEPackedInt in {
def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (scalar_to_vector GR32:$src)))]>,
                          VEX, Sched<[WriteVecMoveFromGpr]>;
def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
                        VEX, Sched<[WriteVecLoad]>;
def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst,
                            (v2i64 (scalar_to_vector GR64:$src)))]>,
                          VEX, Sched<[WriteVecMoveFromGpr]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                          "movq\t{$src, $dst|$dst, $src}", []>,
                          VEX, Sched<[WriteVecLoad]>;
let isCodeGenOnly = 1 in
def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(set FR64:$dst, (bitconvert GR64:$src))]>,
                         VEX, Sched<[WriteVecMoveFromGpr]>;

def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector GR32:$src)))]>,
                      Sched<[WriteVecMoveFromGpr]>;
def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
                      Sched<[WriteVecLoad]>;
def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v2i64 (scalar_to_vector GR64:$src)))]>,
                        Sched<[WriteVecMoveFromGpr]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                        "movq\t{$src, $dst|$dst, $src}", []>,
                        Sched<[WriteVecLoad]>;
let isCodeGenOnly = 1 in
def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                       "movq\t{$src, $dst|$dst, $src}",
                       [(set FR64:$dst, (bitconvert GR64:$src))]>,
                       Sched<[WriteVecMoveFromGpr]>;
} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// Move Int Doubleword to Single Scalar
//
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
  def VMOVDI2SSrr  : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set FR32:$dst, (bitconvert GR32:$src))]>,
                        VEX, Sched<[WriteVecMoveFromGpr]>;

  def MOVDI2SSrr  : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set FR32:$dst, (bitconvert GR32:$src))]>,
                        Sched<[WriteVecMoveFromGpr]>;

} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1

//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int to Packed Double Int
//
let ExeDomain = SSEPackedInt in {
def VMOVPDI2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                         "movd\t{$src, $dst|$dst, $src}",
                         [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
                                          (iPTR 0)))]>, VEX,
                         Sched<[WriteVecMoveToGpr]>;
def VMOVPDI2DImr  : VS2I<0x7E, MRMDestMem, (outs),
                         (ins i32mem:$dst, VR128:$src),
                         "movd\t{$src, $dst|$dst, $src}",
                         [(store (i32 (extractelt (v4i32 VR128:$src),
                                       (iPTR 0))), addr:$dst)]>,
                         VEX, Sched<[WriteVecStore]>;
def MOVPDI2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
                                        (iPTR 0)))]>,
                   Sched<[WriteVecMoveToGpr]>;
def MOVPDI2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(store (i32 (extractelt (v4i32 VR128:$src),
                                     (iPTR 0))), addr:$dst)]>,
                       Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int first element to Doubleword Int
//
let ExeDomain = SSEPackedInt in {
let SchedRW = [WriteVecMoveToGpr] in {
def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
                                                        (iPTR 0)))]>,
                      VEX;

def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
                                                         (iPTR 0)))]>;
} //SchedRW

let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs),
                          (ins i64mem:$dst, VR128:$src),
                          "movq\t{$src, $dst|$dst, $src}", []>,
                          VEX, Sched<[WriteVecStore]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}", []>,
                        Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// Bitcast FR64 <-> GR64
//
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
  def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
                           "movq\t{$src, $dst|$dst, $src}",
                           [(set GR64:$dst, (bitconvert FR64:$src))]>,
                           VEX, Sched<[WriteVecMoveToGpr]>;

  def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(set GR64:$dst, (bitconvert FR64:$src))]>,
                         Sched<[WriteVecMoveToGpr]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1

//===---------------------------------------------------------------------===//
// Move Scalar Single to Double Int
//
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
  def VMOVSS2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set GR32:$dst, (bitconvert FR32:$src))]>,
                        VEX, Sched<[WriteVecMoveToGpr]>;
  def MOVSS2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set GR32:$dst, (bitconvert FR32:$src))]>,
                        Sched<[WriteVecMoveToGpr]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1

let Predicates = [UseAVX] in {
  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
            (VMOVDI2PDIrr GR32:$src)>;

  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
            (VMOV64toPQIrr GR64:$src)>;

  // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
  // These instructions also write zeros in the high part of a 256-bit register.
  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
            (VMOVDI2PDIrm addr:$src)>;
  def : Pat<(v4i32 (X86vzload32 addr:$src)),
            (VMOVDI2PDIrm addr:$src)>;
  def : Pat<(v8i32 (X86vzload32 addr:$src)),
            (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
            (MOVDI2PDIrr GR32:$src)>;

  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
            (MOV64toPQIrr GR64:$src)>;
  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
            (MOVDI2PDIrm addr:$src)>;
  def : Pat<(v4i32 (X86vzload32 addr:$src)),
            (MOVDI2PDIrm addr:$src)>;
}

// Before the MC layer of LLVM existed, clang emitted "movd" assembly instead
// of "movq" due to a MacOS parsing limitation. In order to keep parsing that
// old assembly, we add these aliases.
def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
                (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
                (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;

//===---------------------------------------------------------------------===//
// SSE2 - Move Quadword
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//
// Move Quadword Int to Packed Quadword Int
//

let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in {
def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                    "vmovq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst,
                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
                    VEX, Requires<[UseAVX]>, VEX_WIG;
def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                    "movq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst,
                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
                    XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
} // ExeDomain, SchedRW

//===---------------------------------------------------------------------===//
// Move Packed Quadword Int to Quadword Int
//
let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in {
def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(store (i64 (extractelt (v2i64 VR128:$src),
                                      (iPTR 0))), addr:$dst)]>,
                        VEX, VEX_WIG;
def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(store (i64 (extractelt (v2i64 VR128:$src),
                                    (iPTR 0))), addr:$dst)]>;
} // ExeDomain, SchedRW

// For disassembler only
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [SchedWriteVecLogic.XMM] in {
def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                     "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_WIG;
def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}", []>;
}

def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
                (VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
                (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;

let Predicates = [UseAVX] in {
  def : Pat<(v2i64 (X86vzload64 addr:$src)),
            (VMOVQI2PQIrm addr:$src)>;
  def : Pat<(v4i64 (X86vzload64 addr:$src)),
            (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;

  def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
            (VMOVPQI2QImr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v2i64 (X86vzload64 addr:$src)), (MOVQI2PQIrm addr:$src)>;

  def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
            (MOVPQI2QImr addr:$dst, VR128:$src)>;
}

//===---------------------------------------------------------------------===//
// Move from XMM to XMM and clear the upper 64 bits. Note: there is a bug in
// the IA-32 documentation; movq xmm1, xmm2 does clear the high bits.
//
4243let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
4244def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4245                        "vmovq\t{$src, $dst|$dst, $src}",
4246                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
4247                         XS, VEX, Requires<[UseAVX]>, VEX_WIG;
4248def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4249                        "movq\t{$src, $dst|$dst, $src}",
4250                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
4251                        XS, Requires<[UseSSE2]>;
4252} // ExeDomain, SchedRW
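
// Illustrative only (not part of the build): _mm_move_epi64 from <emmintrin.h>
// is the usual C-level way to request this zeroing copy; helper name is ours.
//
//   __m128i clear_upper64(__m128i v) { return _mm_move_epi64(v); } // MOVZPQILo2PQIrr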
4253
4254let Predicates = [UseAVX] in {
4255  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
4256            (VMOVZPQILo2PQIrr VR128:$src)>;
4257}
4258let Predicates = [UseSSE2] in {
4259  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
4260            (MOVZPQILo2PQIrr VR128:$src)>;
4261}
4262
4263let Predicates = [UseAVX] in {
4264  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
4265            (SUBREG_TO_REG (i32 0),
4266             (v2f64 (VMOVZPQILo2PQIrr
4267                     (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
4268             sub_xmm)>;
4269  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
4270            (SUBREG_TO_REG (i32 0),
4271             (v2i64 (VMOVZPQILo2PQIrr
4272                     (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
4273             sub_xmm)>;
4274}
4275
4276//===---------------------------------------------------------------------===//
4277// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
4278//===---------------------------------------------------------------------===//
4279
4280multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
4281                              ValueType vt, RegisterClass RC, PatFrag mem_frag,
4282                              X86MemOperand x86memop, X86FoldableSchedWrite sched> {
4283def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
4284                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4285                      [(set RC:$dst, (vt (OpNode RC:$src)))]>,
4286                      Sched<[sched]>;
4287def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
4288                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4289                      [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>,
4290                      Sched<[sched.Folded]>;
4291}
4292
4293let Predicates = [HasAVX, NoVLX] in {
4294  defm VMOVSHDUP  : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
4295                                       v4f32, VR128, loadv4f32, f128mem,
4296                                       SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
4297  defm VMOVSLDUP  : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
4298                                       v4f32, VR128, loadv4f32, f128mem,
4299                                       SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
4300  defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
4301                                       v8f32, VR256, loadv8f32, f256mem,
4302                                       SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
4303  defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
4304                                       v8f32, VR256, loadv8f32, f256mem,
4305                                       SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
4306}
4307defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
4308                                   memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
4309defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
4310                                   memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
4311
4312let Predicates = [HasAVX, NoVLX] in {
4313  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
4314            (VMOVSHDUPrr VR128:$src)>;
4315  def : Pat<(v4i32 (X86Movshdup (load addr:$src))),
4316            (VMOVSHDUPrm addr:$src)>;
4317  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
4318            (VMOVSLDUPrr VR128:$src)>;
4319  def : Pat<(v4i32 (X86Movsldup (load addr:$src))),
4320            (VMOVSLDUPrm addr:$src)>;
4321  def : Pat<(v8i32 (X86Movshdup VR256:$src)),
4322            (VMOVSHDUPYrr VR256:$src)>;
4323  def : Pat<(v8i32 (X86Movshdup (load addr:$src))),
4324            (VMOVSHDUPYrm addr:$src)>;
4325  def : Pat<(v8i32 (X86Movsldup VR256:$src)),
4326            (VMOVSLDUPYrr VR256:$src)>;
4327  def : Pat<(v8i32 (X86Movsldup (load addr:$src))),
4328            (VMOVSLDUPYrm addr:$src)>;
4329}
4330
4331let Predicates = [UseSSE3] in {
4332  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
4333            (MOVSHDUPrr VR128:$src)>;
4334  def : Pat<(v4i32 (X86Movshdup (memop addr:$src))),
4335            (MOVSHDUPrm addr:$src)>;
4336  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
4337            (MOVSLDUPrr VR128:$src)>;
4338  def : Pat<(v4i32 (X86Movsldup (memop addr:$src))),
4339            (MOVSLDUPrm addr:$src)>;
4340}
4341
4342//===---------------------------------------------------------------------===//
4343// SSE3 - Replicate Double FP - MOVDDUP
4344//===---------------------------------------------------------------------===//
4345
4346multiclass sse3_replicate_dfp<string OpcodeStr, X86SchedWriteWidths sched> {
4347def rr  : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4348                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4349                    [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))]>,
4350                    Sched<[sched.XMM]>;
4351def rm  : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
4352                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4353                    [(set VR128:$dst,
4354                      (v2f64 (X86Movddup
4355                              (scalar_to_vector (loadf64 addr:$src)))))]>,
4356                    Sched<[sched.XMM.Folded]>;
4357}
4358
4359// FIXME: Merge with above classes when there are patterns for the ymm version
4360multiclass sse3_replicate_dfp_y<string OpcodeStr, X86SchedWriteWidths sched> {
4361def rr  : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
4362                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4363                    [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
4364                    Sched<[sched.YMM]>;
4365def rm  : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
4366                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4367                    [(set VR256:$dst,
4368                      (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
4369                    Sched<[sched.YMM.Folded]>;
4370}
4371
4372let Predicates = [HasAVX, NoVLX] in {
4373  defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup", SchedWriteFShuffle>,
4374                                      VEX, VEX_WIG;
4375  defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup", SchedWriteFShuffle>,
4376                                        VEX, VEX_L, VEX_WIG;
4377}
4378
4379defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;
4380
4381
4382let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(X86Movddup (v2f64 (nonvolatile_load addr:$src))),
            (VMOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
            (VMOVDDUPrm addr:$src)>;
4387}
4388
4389let Predicates = [UseSSE3] in {
  // No need for aligned memory, as this loads only 64 bits.
4391  def : Pat<(X86Movddup (v2f64 (nonvolatile_load addr:$src))),
4392            (MOVDDUPrm addr:$src)>;
4393  def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
4394            (MOVDDUPrm addr:$src)>;
4395}
4396
4397//===---------------------------------------------------------------------===//
4398// SSE3 - Move Unaligned Integer
4399//===---------------------------------------------------------------------===//
4400
4401let Predicates = [HasAVX] in {
4402  def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
4403                      "vlddqu\t{$src, $dst|$dst, $src}",
4404                      [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
4405                      Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
4406  def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
4407                       "vlddqu\t{$src, $dst|$dst, $src}",
4408                       [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
4409                       Sched<[SchedWriteVecMoveLS.YMM.RM]>, VEX, VEX_L, VEX_WIG;
4410} // Predicates
4411
4412def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
4413                   "lddqu\t{$src, $dst|$dst, $src}",
4414                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
4415                   Sched<[SchedWriteVecMoveLS.XMM.RM]>;
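
// Illustrative only (not part of the build): int_x86_sse3_ldu_dq is exposed in
// C as _mm_lddqu_si128, which tolerates unaligned addresses; helper name is
// ours.
//
//   #include <pmmintrin.h>
//   __m128i load_unaligned(const __m128i *p) { return _mm_lddqu_si128(p); } // LDDQUrm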
4416
4417//===---------------------------------------------------------------------===//
4418// SSE3 - Arithmetic
4419//===---------------------------------------------------------------------===//
4420
4421multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC,
4422                       X86MemOperand x86memop, X86FoldableSchedWrite sched,
4423                       PatFrag ld_frag, bit Is2Addr = 1> {
4424  def rr : I<0xD0, MRMSrcReg,
4425       (outs RC:$dst), (ins RC:$src1, RC:$src2),
4426       !if(Is2Addr,
4427           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4428           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4429       [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))]>,
4430       Sched<[sched]>;
4431  def rm : I<0xD0, MRMSrcMem,
4432       (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4433       !if(Is2Addr,
4434           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4435           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4436       [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>,
4437       Sched<[sched.Folded, sched.ReadAfterFold]>;
4438}
4439
4440let Predicates = [HasAVX] in {
4441  let ExeDomain = SSEPackedSingle in {
4442    defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem,
4443                                 SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>,
4444                                 XD, VEX_4V, VEX_WIG;
4445    defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem,
4446                                  SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>,
4447                                  XD, VEX_4V, VEX_L, VEX_WIG;
4448  }
4449  let ExeDomain = SSEPackedDouble in {
4450    defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem,
4451                                 SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>,
4452                                 PD, VEX_4V, VEX_WIG;
4453    defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem,
4454                                  SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>,
4455                                  PD, VEX_4V, VEX_L, VEX_WIG;
4456  }
4457}
4458let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
4459  let ExeDomain = SSEPackedSingle in
4460  defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem,
4461                              SchedWriteFAddSizes.PS.XMM, memopv4f32>, XD;
4462  let ExeDomain = SSEPackedDouble in
4463  defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem,
4464                              SchedWriteFAddSizes.PD.XMM, memopv2f64>, PD;
4465}
4466
4467//===---------------------------------------------------------------------===//
// SSE3 - Horizontal Add/Sub Instructions
4469//===---------------------------------------------------------------------===//
4470
4471// Horizontal ops
4472multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
4473                   X86MemOperand x86memop, SDNode OpNode,
4474                   X86FoldableSchedWrite sched, PatFrag ld_frag,
4475                   bit Is2Addr = 1> {
4476  def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
4477       !if(Is2Addr,
4478         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4479         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4480      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
4481      Sched<[sched]>;
4482
4483  def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4484       !if(Is2Addr,
4485         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4486         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4487      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
4488      Sched<[sched.Folded, sched.ReadAfterFold]>;
4489}
4490multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
4491                  X86MemOperand x86memop, SDNode OpNode,
4492                  X86FoldableSchedWrite sched, PatFrag ld_frag,
4493                  bit Is2Addr = 1> {
4494  def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
4495       !if(Is2Addr,
4496         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4497         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4498      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
4499        Sched<[sched]>;
4500
4501  def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4502       !if(Is2Addr,
4503         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4504         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4505      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
4506        Sched<[sched.Folded, sched.ReadAfterFold]>;
4507}
4508
4509let Predicates = [HasAVX] in {
4510  let ExeDomain = SSEPackedSingle in {
4511    defm VHADDPS  : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
4512                            X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
4513    defm VHSUBPS  : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
4514                            X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
4515    defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
4516                            X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
4517    defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
4518                            X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
4519  }
4520  let ExeDomain = SSEPackedDouble in {
4521    defm VHADDPD  : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem,
4522                           X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
4523    defm VHSUBPD  : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem,
4524                           X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
4525    defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem,
4526                           X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
4527    defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem,
4528                           X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
4529  }
4530}
4531
4532let Constraints = "$src1 = $dst" in {
4533  let ExeDomain = SSEPackedSingle in {
4534    defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
4535                          WriteFHAdd, memopv4f32>;
4536    defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
4537                          WriteFHAdd, memopv4f32>;
4538  }
4539  let ExeDomain = SSEPackedDouble in {
4540    defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
4541                         WriteFHAdd, memopv2f64>;
4542    defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
4543                         WriteFHAdd, memopv2f64>;
4544  }
4545}
4546
4547//===---------------------------------------------------------------------===//
4548// SSSE3 - Packed Absolute Instructions
4549//===---------------------------------------------------------------------===//
4550
/// SS3I_unop_rm - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
4552multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
4553                        SDNode OpNode, X86SchedWriteWidths sched, PatFrag ld_frag> {
4554  def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
4555                 (ins VR128:$src),
4556                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4557                 [(set VR128:$dst, (vt (OpNode VR128:$src)))]>,
4558                 Sched<[sched.XMM]>;
4559
4560  def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
4561                 (ins i128mem:$src),
4562                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4563                 [(set VR128:$dst,
4564                   (vt (OpNode (ld_frag addr:$src))))]>,
4565                 Sched<[sched.XMM.Folded]>;
4566}
4567
/// SS3I_unop_rm_y - Simple 256-bit SSSE3 unary op whose type can be v*{i8,i16,i32}.
4569multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
4570                          SDNode OpNode, X86SchedWriteWidths sched> {
4571  def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
4572                  (ins VR256:$src),
4573                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4574                  [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
4575                  Sched<[sched.YMM]>;
4576
4577  def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
4578                  (ins i256mem:$src),
4579                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4580                  [(set VR256:$dst,
4581                    (vt (OpNode (load addr:$src))))]>,
4582                  Sched<[sched.YMM.Folded]>;
4583}
4584
4585let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
4586  defm VPABSB  : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU,
4587                              load>, VEX, VEX_WIG;
4588  defm VPABSW  : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU,
4589                              load>, VEX, VEX_WIG;
4590}
4591let Predicates = [HasAVX, NoVLX] in {
4592  defm VPABSD  : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU,
4593                              load>, VEX, VEX_WIG;
4594}
4595let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4596  defm VPABSB  : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>,
4597                                VEX, VEX_L, VEX_WIG;
4598  defm VPABSW  : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SchedWriteVecALU>,
4599                                VEX, VEX_L, VEX_WIG;
4600}
4601let Predicates = [HasAVX2, NoVLX] in {
4602  defm VPABSD  : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SchedWriteVecALU>,
4603                                VEX, VEX_L, VEX_WIG;
4604}
4605
4606defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU,
4607                          memop>;
4608defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU,
4609                          memop>;
4610defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU,
4611                          memop>;
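
// Illustrative only (not part of the build): the abs node here is what the
// SSSE3 _mm_abs_epi{8,16,32} intrinsics select; helper name is ours.
//
//   #include <tmmintrin.h>
//   __m128i abs_bytes(__m128i v) { return _mm_abs_epi8(v); } // PABSBrr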
4612
4613//===---------------------------------------------------------------------===//
4614// SSSE3 - Packed Binary Operator Instructions
4615//===---------------------------------------------------------------------===//
4616
4617/// SS3I_binop_rm - Simple SSSE3 bin op
4618multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
4619                         ValueType DstVT, ValueType OpVT, RegisterClass RC,
4620                         PatFrag memop_frag, X86MemOperand x86memop,
4621                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
4622  let isCommutable = 1 in
4623  def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
4624       (ins RC:$src1, RC:$src2),
4625       !if(Is2Addr,
4626         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4627         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4628       [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))]>,
4629       Sched<[sched]>;
4630  def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
4631       (ins RC:$src1, x86memop:$src2),
4632       !if(Is2Addr,
4633         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4634         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4635       [(set RC:$dst,
4636         (DstVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))]>,
4637       Sched<[sched.Folded, sched.ReadAfterFold]>;
4638}
4639
4640/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
4641multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
4642                             Intrinsic IntId128, X86FoldableSchedWrite sched,
4643                             PatFrag ld_frag, bit Is2Addr = 1> {
4644  let isCommutable = 1 in
4645  def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
4646       (ins VR128:$src1, VR128:$src2),
4647       !if(Is2Addr,
4648         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4649         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4650       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
4651       Sched<[sched]>;
4652  def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
4653       (ins VR128:$src1, i128mem:$src2),
4654       !if(Is2Addr,
4655         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4656         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4657       [(set VR128:$dst,
4658         (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
4659       Sched<[sched.Folded, sched.ReadAfterFold]>;
4660}
4661
4662multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
4663                               Intrinsic IntId256,
4664                               X86FoldableSchedWrite sched> {
4665  let isCommutable = 1 in
4666  def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
4667       (ins VR256:$src1, VR256:$src2),
4668       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4669       [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
4670       Sched<[sched]>;
4671  def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
4672       (ins VR256:$src1, i256mem:$src2),
4673       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4674       [(set VR256:$dst,
4675         (IntId256 VR256:$src1, (load addr:$src2)))]>,
4676       Sched<[sched.Folded, sched.ReadAfterFold]>;
4677}
4678
4679let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
4680let isCommutable = 0 in {
4681  defm VPSHUFB    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
4682                                  VR128, load, i128mem,
4683                                  SchedWriteVarShuffle.XMM, 0>, VEX_4V, VEX_WIG;
4684  defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
4685                                  v16i8, VR128, load, i128mem,
4686                                  SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
4687}
4688defm VPMULHRSW    : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
4689                                  VR128, load, i128mem,
4690                                  SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
4691}
4692
4693let ImmT = NoImm, Predicates = [HasAVX] in {
4694let isCommutable = 0 in {
4695  defm VPHADDW    : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
4696                                  load, i128mem,
4697                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4698  defm VPHADDD    : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
4699                                  load, i128mem,
4700                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4701  defm VPHSUBW    : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
4702                                  load, i128mem,
4703                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4704  defm VPHSUBD    : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
4705                                  load, i128mem,
4706                                  SchedWritePHAdd.XMM, 0>, VEX_4V;
4707  defm VPSIGNB    : SS3I_binop_rm_int<0x08, "vpsignb",
4708                                      int_x86_ssse3_psign_b_128,
4709                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
4710  defm VPSIGNW    : SS3I_binop_rm_int<0x09, "vpsignw",
4711                                      int_x86_ssse3_psign_w_128,
4712                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
4713  defm VPSIGND    : SS3I_binop_rm_int<0x0A, "vpsignd",
4714                                      int_x86_ssse3_psign_d_128,
4715                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
4716  defm VPHADDSW   : SS3I_binop_rm_int<0x03, "vphaddsw",
4717                                      int_x86_ssse3_phadd_sw_128,
4718                                      SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
4719  defm VPHSUBSW   : SS3I_binop_rm_int<0x07, "vphsubsw",
4720                                      int_x86_ssse3_phsub_sw_128,
4721                                      SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
4722}
4723}
4724
4725let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4726let isCommutable = 0 in {
4727  defm VPSHUFBY   : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
4728                                  VR256, load, i256mem,
4729                                  SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4730  defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
4731                                   v32i8, VR256, load, i256mem,
4732                                   SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4733}
4734defm VPMULHRSWY   : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
4735                                  VR256, load, i256mem,
4736                                  SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4737}
4738
4739let ImmT = NoImm, Predicates = [HasAVX2] in {
4740let isCommutable = 0 in {
4741  defm VPHADDWY   : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
4742                                  VR256, load, i256mem,
4743                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4744  defm VPHADDDY   : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
4745                                  load, i256mem,
4746                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4747  defm VPHSUBWY   : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
4748                                  VR256, load, i256mem,
4749                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4750  defm VPHSUBDY   : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
4751                                  load, i256mem,
4752                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L;
4753  defm VPSIGNB   : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
4754                                       SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
4755  defm VPSIGNW   : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
4756                                       SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
4757  defm VPSIGND   : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
4758                                       SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
4759  defm VPHADDSW  : SS3I_binop_rm_int_y<0x03, "vphaddsw",
4760                                       int_x86_avx2_phadd_sw,
4761                                       SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
4762  defm VPHSUBSW  : SS3I_binop_rm_int_y<0x07, "vphsubsw",
4763                                       int_x86_avx2_phsub_sw,
4764                                       SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
4765}
4766}
4767
4768// None of these have i8 immediate fields.
4769let ImmT = NoImm, Constraints = "$src1 = $dst" in {
4770let isCommutable = 0 in {
4771  defm PHADDW    : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
4772                                 memop, i128mem, SchedWritePHAdd.XMM>;
4773  defm PHADDD    : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
4774                                 memop, i128mem, SchedWritePHAdd.XMM>;
4775  defm PHSUBW    : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
4776                                 memop, i128mem, SchedWritePHAdd.XMM>;
4777  defm PHSUBD    : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
4778                                 memop, i128mem, SchedWritePHAdd.XMM>;
4779  defm PSIGNB    : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
4780                                     SchedWriteVecALU.XMM, memop>;
4781  defm PSIGNW    : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
4782                                     SchedWriteVecALU.XMM, memop>;
4783  defm PSIGND    : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
4784                                     SchedWriteVecALU.XMM, memop>;
4785  defm PSHUFB    : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
4786                                 memop, i128mem, SchedWriteVarShuffle.XMM>;
4787  defm PHADDSW   : SS3I_binop_rm_int<0x03, "phaddsw",
4788                                     int_x86_ssse3_phadd_sw_128,
4789                                     SchedWritePHAdd.XMM, memop>;
4790  defm PHSUBSW   : SS3I_binop_rm_int<0x07, "phsubsw",
4791                                     int_x86_ssse3_phsub_sw_128,
4792                                     SchedWritePHAdd.XMM, memop>;
4793  defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
4794                                 v16i8, VR128, memop, i128mem,
4795                                 SchedWriteVecIMul.XMM>;
4796}
4797defm PMULHRSW    : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
4798                                 VR128, memop, i128mem, SchedWriteVecIMul.XMM>;
4799}
4800
4801//===---------------------------------------------------------------------===//
4802// SSSE3 - Packed Align Instruction Patterns
4803//===---------------------------------------------------------------------===//
4804
4805multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
4806                         PatFrag memop_frag, X86MemOperand x86memop,
4807                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
4808  let hasSideEffects = 0 in {
4809  def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst),
4810      (ins RC:$src1, RC:$src2, u8imm:$src3),
4811      !if(Is2Addr,
4812        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
4813        !strconcat(asm,
4814                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
4815      [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 imm:$src3))))]>,
4816      Sched<[sched]>;
4817  let mayLoad = 1 in
4818  def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst),
4819      (ins RC:$src1, x86memop:$src2, u8imm:$src3),
4820      !if(Is2Addr,
4821        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
4822        !strconcat(asm,
4823                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
4824      [(set RC:$dst, (VT (X86PAlignr RC:$src1,
4825                                     (memop_frag addr:$src2),
4826                                     (i8 imm:$src3))))]>,
4827      Sched<[sched.Folded, sched.ReadAfterFold]>;
4828  }
4829}
4830
4831let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
4832  defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem,
4833                                SchedWriteShuffle.XMM, 0>, VEX_4V, VEX_WIG;
4834let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
4835  defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem,
4836                                 SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4837let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
4838  defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem,
4839                               SchedWriteShuffle.XMM>;
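
// Illustrative only (not part of the build): X86PAlignr is reached from
// _mm_alignr_epi8, whose byte-shift count becomes the immediate matched above;
// helper name is ours.
//
//   #include <tmmintrin.h>
//   __m128i align4(__m128i a, __m128i b) { return _mm_alignr_epi8(a, b, 4); } // PALIGNRrri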
4840
4841//===---------------------------------------------------------------------===//
// SSE3 - Thread synchronization
4843//===---------------------------------------------------------------------===//
4844
4845let SchedRW = [WriteSystem] in {
4846let Uses = [EAX, ECX, EDX] in
4847def MONITOR32rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
4848                     TB, Requires<[HasSSE3, Not64BitMode]>;
4849let Uses = [RAX, ECX, EDX] in
4850def MONITOR64rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
4851                     TB, Requires<[HasSSE3, In64BitMode]>;
4852
4853let Uses = [ECX, EAX] in
4854def MWAITrr   : I<0x01, MRM_C9, (outs), (ins), "mwait",
4855                  [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
4856} // SchedRW
4857
4858def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
4859def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;
4860
4861def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITOR32rrr)>,
4862      Requires<[Not64BitMode]>;
4863def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITOR64rrr)>,
4864      Requires<[In64BitMode]>;
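
// Illustrative only (not part of the build): these definitions are reached
// from the _mm_monitor/_mm_mwait intrinsics, with the implicit register usage
// fixed as above; helper name is ours.
//
//   #include <pmmintrin.h>
//   void wait_on(const void *p) {
//     _mm_monitor(p, 0, 0); // MONITOR: address in [ER]AX, extensions in ECX, hints in EDX
//     _mm_mwait(0, 0);      // MWAIT: extensions in ECX, hints in EAX
//   }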
4865
4866//===----------------------------------------------------------------------===//
4867// SSE4.1 - Packed Move with Sign/Zero Extend
4868// NOTE: Any Extend is promoted to Zero Extend in X86ISelDAGToDAG.cpp
4869//===----------------------------------------------------------------------===//
4870
4871multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
4872                            RegisterClass OutRC, RegisterClass InRC,
4873                            X86FoldableSchedWrite sched> {
4874  def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
4875                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
4876                 Sched<[sched]>;
4877
4878  def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
4879                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
4880                 Sched<[sched.Folded]>;
4881}
4882
4883multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
4884                              X86MemOperand MemOp, X86MemOperand MemYOp,
4885                              Predicate prd> {
4886  defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128,
4887                               SchedWriteShuffle.XMM>;
4888  let Predicates = [HasAVX, prd] in
4889    defm V#NAME   : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
4890                                     VR128, VR128, SchedWriteShuffle.XMM>,
4891                                     VEX, VEX_WIG;
4892  let Predicates = [HasAVX2, prd] in
4893    defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
4894                                     VR256, VR128, WriteShuffle256>,
4895                                     VEX, VEX_L, VEX_WIG;
4896}
4897
4898multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
4899                          X86MemOperand MemYOp, Predicate prd> {
4900  defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
4901                                        MemOp, MemYOp, prd>;
4902  defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
4903                                        !strconcat("pmovzx", OpcodeStr),
4904                                        MemOp, MemYOp, prd>;
4905}
4906
4907defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
4908defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>;
4909defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>;
4910
4911defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>;
4912defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;
4913
4914defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;
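
// Illustrative only (not part of the build): these defm lines expand to the
// whole pmovsx/pmovzx family; e.g. _mm_cvtepi8_epi16 selects PMOVSXBWrr and
// _mm_cvtepu8_epi32 selects PMOVZXBDrr. Helper names are ours.
//
//   #include <smmintrin.h>
//   __m128i sx_bw(__m128i v) { return _mm_cvtepi8_epi16(v); } // PMOVSXBWrr
//   __m128i zx_bd(__m128i v) { return _mm_cvtepu8_epi32(v); } // PMOVZXBDrr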
4915
4916// AVX2 Patterns
4917multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy,
4918                                     SDNode ExtOp, SDNode InVecOp> {
4919  // Register-Register patterns
4920  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4921  def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
4922            (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
4923  }
4924  let Predicates = [HasAVX2, NoVLX] in {
4925  def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))),
4926            (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
4927  def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))),
4928            (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;
4929
4930  def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
4931            (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
4932  def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))),
4933            (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;
4934
4935  def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
4936            (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
4937  }
4938
4939  // Simple Register-Memory patterns
4940  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4941  def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
4942            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
4943
4944  def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
4945            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
4946  }
4947
4948  let Predicates = [HasAVX2, NoVLX] in {
4949  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
4950            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
4951  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
4952            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
4953
4954  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
4955            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
4956  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
4957            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
4958
4959  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
4960            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
4961  }
4962
4963  // AVX2 Register-Memory patterns
4964  let Predicates = [HasAVX2, NoVLX] in {
4965  def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
4966            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
4967
4968  def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
4969            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
4970  def : Pat<(v8i32 (InVecOp (v16i8 (X86vzload64 addr:$src)))),
4971            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
4972
4973  def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
4974            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
4975
4976  def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
4977            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
4978  def : Pat<(v4i64 (InVecOp (v16i8 (X86vzload64 addr:$src)))),
4979            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
4980
4981  def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
4982            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
4983  def : Pat<(v4i64 (InVecOp (v8i16 (X86vzload64 addr:$src)))),
4984            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
4985  }
4986}
4987
4988defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>;
4989defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>;
4990
4991// SSE4.1/AVX patterns.
4992multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
4993                                SDNode ExtOp> {
4994  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
4995  def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
4996            (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
4997  }
4998  let Predicates = [HasAVX, NoVLX] in {
4999  def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
5000            (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
5001  def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
5002            (!cast<I>(OpcPrefix#BQrr) VR128:$src)>;
5003
5004  def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
5005            (!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
5006  def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
5007            (!cast<I>(OpcPrefix#WQrr) VR128:$src)>;
5008
5009  def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
5010            (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;
5011  }
5012  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5013  def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5014            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5015  }
5016  let Predicates = [HasAVX, NoVLX] in {
5017  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5018            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5019  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5020            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5021
5022  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5023            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5024  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5025            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5026
5027  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
5028            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5029  }
5030  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5031  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5032            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5033  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5034            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5035  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
5036            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5037  def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))),
5038            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5039  }
5040  let Predicates = [HasAVX, NoVLX] in {
5041  def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5042            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5043  def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
5044            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5045  def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))),
5046            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5047
5048  def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
5049            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5050  def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))),
5051            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5052
5053  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5054            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5055  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5056            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5057  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
5058            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5059  def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))),
5060            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5061
5062  def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5063            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5064  def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))),
5065            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5066  def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))),
5067            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5068
5069  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5070            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5071  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5072            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5073  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
5074            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5075  def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))),
5076            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5077  }
5078}
5079
5080defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec>;
5081defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec>;
5082
5083let Predicates = [UseSSE41] in {
5084  defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec>;
5085  defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec>;
5086}
5087
5088//===----------------------------------------------------------------------===//
5089// SSE4.1 - Extract Instructions
5090//===----------------------------------------------------------------------===//
5091
/// SS41I_extract8 - SSE 4.1 extract 8 bits to a 32-bit reg or 8-bit mem
5093multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
5094  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5095                 (ins VR128:$src1, u8imm:$src2),
5096                 !strconcat(OpcodeStr,
5097                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5098                 [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
5099                                         imm:$src2))]>,
5100                  Sched<[WriteVecExtract]>;
5101  let hasSideEffects = 0, mayStore = 1 in
5102  def mr : SS4AIi8<opc, MRMDestMem, (outs),
5103                 (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
5104                 !strconcat(OpcodeStr,
5105                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5106                 [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))),
5107                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5108}
5109
5110let Predicates = [HasAVX, NoBWI] in
5111  defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX, VEX_WIG;
5112
5113defm PEXTRB      : SS41I_extract8<0x14, "pextrb">;
5114
5115
5116/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
5117multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
5118  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
5119  def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5120                   (ins VR128:$src1, u8imm:$src2),
5121                   !strconcat(OpcodeStr,
5122                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
5123                   Sched<[WriteVecExtract]>, FoldGenData<NAME#rr>;
5124
5125  let hasSideEffects = 0, mayStore = 1 in
5126  def mr : SS4AIi8<opc, MRMDestMem, (outs),
5127                 (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
5128                 !strconcat(OpcodeStr,
5129                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5130                 [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), imm:$src2))),
5131                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5132}
5133
5134let Predicates = [HasAVX, NoBWI] in
5135  defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX, VEX_WIG;
5136
5137defm PEXTRW      : SS41I_extract16<0x15, "pextrw">;
5138
5139
5140/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
5141multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
5142  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
5143                 (ins VR128:$src1, u8imm:$src2),
5144                 !strconcat(OpcodeStr,
5145                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5146                 [(set GR32:$dst,
5147                  (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
5148                  Sched<[WriteVecExtract]>;
5149  def mr : SS4AIi8<opc, MRMDestMem, (outs),
5150                 (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
5151                 !strconcat(OpcodeStr,
5152                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5153                 [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
5154                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5155}
5156
5157let Predicates = [HasAVX, NoDQI] in
5158  defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;
5159
5160defm PEXTRD      : SS41I_extract32<0x16, "pextrd">;
5161
/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
5163multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
5164  def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
5165                 (ins VR128:$src1, u8imm:$src2),
5166                 !strconcat(OpcodeStr,
5167                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5168                 [(set GR64:$dst,
5169                  (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
5170                  Sched<[WriteVecExtract]>;
5171  def mr : SS4AIi8<opc, MRMDestMem, (outs),
5172                 (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
5173                 !strconcat(OpcodeStr,
5174                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5175                 [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
5176                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5177}
5178
5179let Predicates = [HasAVX, NoDQI] in
5180  defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;
5181
5182defm PEXTRQ      : SS41I_extract64<0x16, "pextrq">, REX_W;
5183
/// SS41I_extractf32 - SSE 4.1 extract a 32-bit fp value to an int reg or
/// memory destination
5186multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
5187  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5188                   (ins VR128:$src1, u8imm:$src2),
5189                   !strconcat(OpcodeStr,
5190                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5191                   [(set GR32orGR64:$dst,
5192                      (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
5193                   Sched<[WriteVecExtract]>;
5194  def mr : SS4AIi8<opc, MRMDestMem, (outs),
5195                   (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
5196                   !strconcat(OpcodeStr,
5197                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5198                   [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
5199                            addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5200}
5201
5202let ExeDomain = SSEPackedSingle in {
5203  let Predicates = [UseAVX] in
5204    defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, VEX_WIG;
5205  defm EXTRACTPS   : SS41I_extractf32<0x17, "extractps">;
5206}
5207
5208//===----------------------------------------------------------------------===//
5209// SSE4.1 - Insert Instructions
5210//===----------------------------------------------------------------------===//
5211
5212multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
5213  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5214      (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
5215      !if(Is2Addr,
5216        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5217        !strconcat(asm,
5218                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5219      [(set VR128:$dst,
5220        (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
5221      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
5222  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5223      (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
5224      !if(Is2Addr,
5225        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5226        !strconcat(asm,
5227                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5228      [(set VR128:$dst,
5229        (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), imm:$src3))]>,
5230                   Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
5231}
5232
5233let Predicates = [HasAVX, NoBWI] in
5234  defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V, VEX_WIG;
5235let Constraints = "$src1 = $dst" in
5236  defm PINSRB  : SS41I_insert8<0x20, "pinsrb">;
5237
5238multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
5239  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5240      (ins VR128:$src1, GR32:$src2, u8imm:$src3),
5241      !if(Is2Addr,
5242        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5243        !strconcat(asm,
5244                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5245      [(set VR128:$dst,
5246        (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
5247      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
5248  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5249      (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
5250      !if(Is2Addr,
5251        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5252        !strconcat(asm,
5253                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5254      [(set VR128:$dst,
5255        (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), imm:$src3)))]>,
5256                   Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
5257}
5258
5259let Predicates = [HasAVX, NoDQI] in
5260  defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
5261let Constraints = "$src1 = $dst" in
5262  defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
5263
5264multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
5265  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5266      (ins VR128:$src1, GR64:$src2, u8imm:$src3),
5267      !if(Is2Addr,
5268        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5269        !strconcat(asm,
5270                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5271      [(set VR128:$dst,
5272        (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
5273      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
5274  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5275      (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
5276      !if(Is2Addr,
5277        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5278        !strconcat(asm,
5279                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5280      [(set VR128:$dst,
5281        (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), imm:$src3)))]>,
5282                   Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
5283}
5284
5285let Predicates = [HasAVX, NoDQI] in
5286  defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
5287let Constraints = "$src1 = $dst" in
5288  defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
5289
// insertps has a few different modes. The first two below are optimized
// inserts that won't zero arbitrary elements in the destination vector. The
// next one matches the intrinsic and may zero arbitrary elements in the
// target vector.
multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, VR128:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86insertps VR128:$src1, VR128:$src2, imm:$src3))]>,
      Sched<[SchedWriteFShuffle.XMM]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, f32mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86insertps VR128:$src1,
                   (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
                    imm:$src3))]>,
      Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
}

let ExeDomain = SSEPackedSingle in {
  let Predicates = [UseAVX] in
    defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>,
                     VEX_4V, VEX_WIG;
  let Constraints = "$src1 = $dst" in
    defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>;
}

let Predicates = [UseAVX] in {
  // If we're inserting an element from a vbroadcast of a load, fold the
  // load into the X86insertps instruction.
  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
                (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)),
            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
                (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)),
            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
}

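// Illustration (hypothetical C, in a block comment so TableGen ignores it):
// the insertps immediate encodes bits[7:6] = source lane, bits[5:4] = dest
// lane, and bits[3:0] = zero mask, which is what the patterns above match.
/*
#include <smmintrin.h>
#include <stdio.h>

int main(void) {
  __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
  __m128 b = _mm_setr_ps(10.f, 20.f, 30.f, 40.f);
  // Take b[2], insert it at a[1], and zero lane 0: imm = 0x91.
  __m128 r = _mm_insert_ps(a, b, (2 << 6) | (1 << 4) | 0x1);
  float out[4];
  _mm_storeu_ps(out, r);
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // 0 30 3 4
  return 0;
}
*/
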
//===----------------------------------------------------------------------===//
// SSE4.1 - Round Instructions
//===----------------------------------------------------------------------===//

multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
                           X86MemOperand x86memop, RegisterClass RC,
                           ValueType VT, PatFrag mem_frag, SDNode OpNode,
                           X86FoldableSchedWrite sched> {
  // Vector intrinsic operation, reg
  def r : SS4AIi8<opc, MRMSrcReg,
                  (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
                  !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set RC:$dst, (VT (OpNode RC:$src1, imm:$src2)))]>,
                  Sched<[sched]>;

  // Vector intrinsic operation, mem
  def m : SS4AIi8<opc, MRMSrcMem,
                  (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
                  !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set RC:$dst,
                        (VT (OpNode (mem_frag addr:$src1), imm:$src2)))]>,
                  Sched<[sched.Folded]>;
}

multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
                          string OpcodeStr, X86FoldableSchedWrite sched> {
let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in {
  def SSr : SS4AIi8<opcss, MRMSrcReg,
        (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
            "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SSm : SS4AIi8<opcss, MRMSrcMem,
        (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
             "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle, hasSideEffects = 0

let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in {
  def SDr : SS4AIi8<opcsd, MRMSrcReg,
        (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
              "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SDm : SS4AIi8<opcsd, MRMSrcMem,
        (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
             "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedDouble, hasSideEffects = 0
}

multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
                           string OpcodeStr, X86FoldableSchedWrite sched> {
let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in {
  def SSr : SS4AIi8<opcss, MRMSrcReg,
                    (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SSm : SS4AIi8<opcss, MRMSrcMem,
                    (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle, hasSideEffects = 0

let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in {
  def SDr : SS4AIi8<opcsd, MRMSrcReg,
                    (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SDm : SS4AIi8<opcsd, MRMSrcMem,
                    (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedDouble, hasSideEffects = 0
}

multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
                            string OpcodeStr, X86FoldableSchedWrite sched,
                            ValueType VT32, ValueType VT64,
                            SDNode OpNode, bit Is2Addr = 1> {
let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 in {
  def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>,
        Sched<[sched]>;

  def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
             (OpNode VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle, isCodeGenOnly = 1

let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in {
  def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>,
        Sched<[sched]>;

  def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
              (OpNode VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedDouble, isCodeGenOnly = 1
}

// FP round - roundss, roundps, roundsd, roundpd
let Predicates = [HasAVX, NoVLX] in {
  let ExeDomain = SSEPackedSingle in {
    // Intrinsic form
    defm VROUNDPS  : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32,
                                     loadv4f32, X86VRndScale, SchedWriteFRnd.XMM>,
                                   VEX, VEX_WIG;
    defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32,
                                     loadv8f32, X86VRndScale, SchedWriteFRnd.YMM>,
                                   VEX, VEX_L, VEX_WIG;
  }

  let ExeDomain = SSEPackedDouble in {
    defm VROUNDPD  : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64,
                                     loadv2f64, X86VRndScale, SchedWriteFRnd.XMM>,
                                   VEX, VEX_WIG;
    defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64,
                                     loadv4f64, X86VRndScale, SchedWriteFRnd.YMM>,
                                   VEX, VEX_L, VEX_WIG;
  }
}
let Predicates = [UseAVX] in {
  defm VROUND  : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
                                  v4f32, v2f64, X86RndScales, 0>,
                                  VEX_4V, VEX_LIG, VEX_WIG;
  defm VROUND  : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>,
                                VEX_4V, VEX_LIG, VEX_WIG;
}

let Predicates = [UseAVX] in {
  def : Pat<(X86VRndScale FR32:$src1, imm:$src2),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, imm:$src2)>;
  def : Pat<(X86VRndScale FR64:$src1, imm:$src2),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, imm:$src2)>;
}

let Predicates = [UseAVX, OptForSize] in {
  def : Pat<(X86VRndScale (loadf32 addr:$src1), imm:$src2),
            (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, imm:$src2)>;
  def : Pat<(X86VRndScale (loadf64 addr:$src1), imm:$src2),
            (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, imm:$src2)>;
}

let ExeDomain = SSEPackedSingle in
defm ROUNDPS  : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32,
                                memopv4f32, X86VRndScale, SchedWriteFRnd.XMM>;
let ExeDomain = SSEPackedDouble in
defm ROUNDPD  : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64,
                                memopv2f64, X86VRndScale, SchedWriteFRnd.XMM>;

defm ROUND  : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>;

let Constraints = "$src1 = $dst" in
defm ROUND  : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
                               v4f32, v2f64, X86RndScales>;

let Predicates = [UseSSE41] in {
  def : Pat<(X86VRndScale FR32:$src1, imm:$src2),
            (ROUNDSSr FR32:$src1, imm:$src2)>;
  def : Pat<(X86VRndScale FR64:$src1, imm:$src2),
            (ROUNDSDr FR64:$src1, imm:$src2)>;
}

let Predicates = [UseSSE41, OptForSize] in {
  def : Pat<(X86VRndScale (loadf32 addr:$src1), imm:$src2),
            (ROUNDSSm addr:$src1, imm:$src2)>;
  def : Pat<(X86VRndScale (loadf64 addr:$src1), imm:$src2),
            (ROUNDSDm addr:$src1, imm:$src2)>;
}

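// Illustration (hypothetical C, in a block comment so TableGen ignores it):
// the round instructions take an immediate rounding control, exposed via the
// SSE4.1 _mm_round_* intrinsics; nearest-even and floor are shown here.
/*
#include <smmintrin.h>
#include <stdio.h>

int main(void) {
  __m128 v = _mm_setr_ps(0.5f, 1.5f, 2.5f, -1.5f);
  __m128 ne = _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
  __m128 fl = _mm_round_ps(v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
  float n[4], f[4];
  _mm_storeu_ps(n, ne);
  _mm_storeu_ps(f, fl);
  // Round-to-nearest-even: 0.5->0, 1.5->2, 2.5->2, -1.5->-2.
  printf("%g %g %g %g | %g %g %g %g\n",
         n[0], n[1], n[2], n[3], f[0], f[1], f[2], f[3]);
  return 0;
}
*/
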
//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Bit Test
//===----------------------------------------------------------------------===//

// The ptest instruction: X86ISelLowering lowers to this node, primarily from
// the Intel intrinsic that corresponds to it.
let Defs = [EFLAGS], Predicates = [HasAVX] in {
def VPTESTrr  : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
                Sched<[SchedWriteVecTest.XMM]>, VEX, VEX_WIG;
def VPTESTrm  : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS, (X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
                Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>,
                VEX, VEX_WIG;

def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
                Sched<[SchedWriteVecTest.YMM]>, VEX, VEX_L, VEX_WIG;
def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS, (X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
                Sched<[SchedWriteVecTest.YMM.Folded, SchedWriteVecTest.YMM.ReadAfterFold]>,
                VEX, VEX_L, VEX_WIG;
}

let Defs = [EFLAGS] in {
def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
              "ptest\t{$src2, $src1|$src1, $src2}",
              [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
              Sched<[SchedWriteVecTest.XMM]>;
def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
              "ptest\t{$src2, $src1|$src1, $src2}",
              [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
              Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>;
}

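// Illustration (hypothetical C, in a block comment so TableGen ignores it):
// ptest sets ZF to (src1 & src2) == 0 and CF to (~src1 & src2) == 0; the
// _mm_testz_si128/_mm_testc_si128 intrinsics read those flags back.
/*
#include <smmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i mask = _mm_set_epi32(0, 0, 0, 0x70);
  __m128i v1   = _mm_set_epi32(1, 2, 3, 0x0F);  // no bits under the mask
  __m128i v2   = _mm_set_epi32(1, 2, 3, 0x1F);  // bit 4 falls under the mask
  printf("%d %d\n", _mm_testz_si128(v1, mask),  // 1
                    _mm_testz_si128(v2, mask)); // 0
  return 0;
}
*/
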
// The bit test instructions below are AVX only
multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, PatFrag mem_frag, ValueType vt,
                       X86FoldableSchedWrite sched> {
  def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
            [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
            Sched<[sched]>, VEX;
  def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
            [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
            Sched<[sched.Folded, sched.ReadAfterFold]>, VEX;
}

let Defs = [EFLAGS], Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32,
                            SchedWriteFTest.XMM>;
defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32,
                            SchedWriteFTest.YMM>, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64,
                            SchedWriteFTest.XMM>;
defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64,
                            SchedWriteFTest.YMM>, VEX_L;
}
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Misc Instructions
//===----------------------------------------------------------------------===//

let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
  def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
                     Sched<[WritePOPCNT]>, OpSize16, XS;
  def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
                      (implicit EFLAGS)]>,
                      Sched<[WritePOPCNT.Folded]>, OpSize16, XS;

  def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>,
                     Sched<[WritePOPCNT]>, OpSize32, XS;

  def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
                      (implicit EFLAGS)]>,
                      Sched<[WritePOPCNT.Folded]>, OpSize32, XS;

  def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>,
                      Sched<[WritePOPCNT]>, XS;
  def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
                       (implicit EFLAGS)]>,
                       Sched<[WritePOPCNT.Folded]>, XS;
}

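// Illustration (hypothetical C, in a block comment so TableGen ignores it):
// ctpop maps to these popcnt instructions; with -mpopcnt (or -msse4.2) the
// intrinsics below compile to popcntl/popcntq.
/*
#include <nmmintrin.h>
#include <stdio.h>

int main(void) {
  unsigned x = 0xF0F0u;                    // 8 set bits
  unsigned long long y = 0xFFULL << 32;    // 8 set bits (64-bit targets only)
  printf("%u %u\n", (unsigned)_mm_popcnt_u32(x),
                    (unsigned)_mm_popcnt_u64(y)); // prints "8 8"
  return 0;
}
*/
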
// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
                                 SDNode OpNode, PatFrag ld_frag,
                                 X86FoldableSchedWrite Sched> {
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>,
                 Sched<[Sched]>;
  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                  (ins i128mem:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR128:$dst,
                    (v8i16 (OpNode (ld_frag addr:$src))))]>,
                 Sched<[Sched.Folded]>;
}

// PHMIN has the same profile as PSAD, thus we use the same scheduling
// model, although the naming is misleading.
let Predicates = [HasAVX] in
defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw",
                                         X86phminpos, load,
                                         WritePHMINPOS>, VEX, VEX_WIG;
defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
                                         X86phminpos, memop,
                                         WritePHMINPOS>;

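// Illustration (hypothetical C, in a block comment so TableGen ignores it):
// phminposuw returns the minimum unsigned 16-bit element in lane 0 and its
// index in lane 1, zeroing the remaining lanes.
/*
#include <smmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i v = _mm_setr_epi16(9, 4, 7, 4, 8, 6, 5, 3);
  __m128i r = _mm_minpos_epu16(v);
  printf("min=%d at %d\n", _mm_extract_epi16(r, 0),
         _mm_extract_epi16(r, 1));          // min=3 at 7
  return 0;
}
*/
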
/// SS48I_binop_rm - Simple SSE41 binary operator.
multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, X86FoldableSchedWrite sched,
                          bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
       Sched<[sched]>;
  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm VPMINSD   : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX_4V, VEX_WIG;
  defm VPMINUD   : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX_4V, VEX_WIG;
  defm VPMAXSD   : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX_4V, VEX_WIG;
  defm VPMAXUD   : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX_4V, VEX_WIG;
  defm VPMULDQ   : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128,
                                  load, i128mem, SchedWriteVecIMul.XMM, 0>,
                                  VEX_4V, VEX_WIG;
}
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPMINSB   : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX_4V, VEX_WIG;
  defm VPMINUW   : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX_4V, VEX_WIG;
  defm VPMAXSB   : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX_4V, VEX_WIG;
  defm VPMAXUW   : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX_4V, VEX_WIG;
}

let Predicates = [HasAVX2, NoVLX] in {
  defm VPMINSDY  : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPMINUDY  : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPMAXSDY  : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPMAXUDY  : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPMULDQY  : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256,
                                  load, i256mem, SchedWriteVecIMul.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPMINSBY  : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPMINUWY  : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPMAXSBY  : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
  defm VPMAXUWY  : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
}

let Constraints = "$src1 = $dst" in {
  defm PMINSB   : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMINSD   : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMINUD   : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMINUW   : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXSB   : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXSD   : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXUD   : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXUW   : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMULDQ   : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128,
                                 memop, i128mem, SchedWriteVecIMul.XMM, 1>;
}

let Predicates = [HasAVX, NoVLX] in
  defm VPMULLD  : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
                                 load, i128mem, SchedWritePMULLD.XMM, 0>,
                                 VEX_4V, VEX_WIG;
let Predicates = [HasAVX] in
  defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX_4V, VEX_WIG;

let Predicates = [HasAVX2, NoVLX] in
  defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
                                  load, i256mem, SchedWritePMULLD.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
let Predicates = [HasAVX2] in
  defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;

let Constraints = "$src1 = $dst" in {
  defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
                                memop, i128mem, SchedWritePMULLD.XMM, 1>;
  defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
                                memop, i128mem, SchedWriteVecALU.XMM, 1>;
}

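// Illustration (hypothetical C, in a block comment so TableGen ignores it):
// the SSE4.1 min/max and multiply forms defined above, via their intrinsics.
/*
#include <smmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i a = _mm_setr_epi32(-1, 7, -3, 5);
  __m128i b = _mm_setr_epi32( 2, 6,  4, 5);
  __m128i mn = _mm_min_epi32(a, b);        // pminsd (signed): lane 0 is -1
  __m128i mx = _mm_max_epu32(a, b);        // pmaxud (unsigned): -1 reads as ~0
  __m128i pr = _mm_mullo_epi32(a, b);      // pmulld: low 32 bits of products
  printf("%d %u %d\n", _mm_extract_epi32(mn, 0),
         (unsigned)_mm_extract_epi32(mx, 0),
         _mm_extract_epi32(pr, 1));        // prints "-1 4294967295 42"
  return 0;
}
*/
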
/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
                 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
                 X86MemOperand x86memop, bit Is2Addr,
                 X86FoldableSchedWrite sched> {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>,
        Sched<[sched]>;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
          (IntId RC:$src1, (memop_frag addr:$src2), imm:$src3))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
}

/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                           X86MemOperand x86memop, bit Is2Addr,
                           X86FoldableSchedWrite sched> {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
        Sched<[sched]>;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
          (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), imm:$src3)))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
}

def BlendCommuteImm2 : SDNodeXForm<imm, [{
  uint8_t Imm = N->getZExtValue() & 0x03;
  return getI8Imm(Imm ^ 0x03, SDLoc(N));
}]>;

def BlendCommuteImm4 : SDNodeXForm<imm, [{
  uint8_t Imm = N->getZExtValue() & 0x0f;
  return getI8Imm(Imm ^ 0x0f, SDLoc(N));
}]>;

def BlendCommuteImm8 : SDNodeXForm<imm, [{
  uint8_t Imm = N->getZExtValue() & 0xff;
  return getI8Imm(Imm ^ 0xff, SDLoc(N));
}]>;

// Turn a 4-bit blendi immediate to 8-bit for use with pblendw.
def BlendScaleImm4 : SDNodeXForm<imm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 4; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0x3 << (i * 2);
  }
  return getI8Imm(NewImm, SDLoc(N));
}]>;

// Turn a 2-bit blendi immediate to 8-bit for use with pblendw.
def BlendScaleImm2 : SDNodeXForm<imm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 2; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0xf << (i * 4);
  }
  return getI8Imm(NewImm, SDLoc(N));
}]>;

// Turn a 2-bit blendi immediate to 4-bit for use with pblendd.
def BlendScaleImm2to4 : SDNodeXForm<imm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 2; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0x3 << (i * 2);
  }
  return getI8Imm(NewImm, SDLoc(N));
}]>;

// Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it.
def BlendScaleCommuteImm4 : SDNodeXForm<imm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 4; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0x3 << (i * 2);
  }
  return getI8Imm(NewImm ^ 0xff, SDLoc(N));
}]>;

// Turn a 2-bit blendi immediate to 8-bit for use with pblendw and invert it.
def BlendScaleCommuteImm2 : SDNodeXForm<imm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 2; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0xf << (i * 4);
  }
  return getI8Imm(NewImm ^ 0xff, SDLoc(N));
}]>;

// Turn a 2-bit blendi immediate to 4-bit for use with pblendd and invert it.
def BlendScaleCommuteImm2to4 : SDNodeXForm<imm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 2; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0x3 << (i * 2);
  }
  return getI8Imm(NewImm ^ 0xf, SDLoc(N));
}]>;

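// Illustration (hypothetical C, in a block comment so TableGen ignores it):
// standalone restatements of the immediate transforms above. Commuting a
// blend flips every select bit; scaling widens each select bit to cover the
// narrower lanes that make up one wide lane.
/*
#include <assert.h>
#include <stdint.h>

// BlendCommuteImmN: flip the low `lanes` select bits.
static uint8_t blend_commute(uint8_t imm, unsigned lanes) {
  uint8_t mask = (uint8_t)((1u << lanes) - 1);
  return (uint8_t)((imm & mask) ^ mask);
}

// BlendScaleImm4: one 32-bit lane spans two 16-bit pblendw lanes,
// so bit i becomes the pair 0x3 << (2 * i).
static uint8_t blend_scale_4_to_8(uint8_t imm) {
  uint8_t out = 0;
  for (unsigned i = 0; i != 4; ++i)
    if (imm & (1u << i))
      out |= (uint8_t)(0x3u << (i * 2));
  return out;
}

int main(void) {
  assert(blend_commute(0x5, 4) == 0xA);              // BlendCommuteImm4
  assert(blend_scale_4_to_8(0x5) == 0x33);           // BlendScaleImm4
  assert((blend_scale_4_to_8(0x5) ^ 0xff) == 0xCC);  // BlendScaleCommuteImm4
  return 0;
}
*/
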
let Predicates = [HasAVX] in {
  let isCommutable = 0 in {
    defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
                                        VR128, load, i128mem, 0,
                                        SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG;
  }

  let ExeDomain = SSEPackedSingle in
  defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
                                   VR128, load, f128mem, 0,
                                   SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG;
  let ExeDomain = SSEPackedDouble in
  defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
                                   VR128, load, f128mem, 0,
                                   SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG;
  let ExeDomain = SSEPackedSingle in
  defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
                                    VR256, load, i256mem, 0,
                                    SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG;
}

let Predicates = [HasAVX2] in {
  let isCommutable = 0 in {
  defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
                                  VR256, load, i256mem, 0,
                                  SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, VEX_WIG;
  }
}

let Constraints = "$src1 = $dst" in {
  let isCommutable = 0 in {
  defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
                                     VR128, memop, i128mem, 1,
                                     SchedWriteMPSAD.XMM>;
  }

  let ExeDomain = SSEPackedSingle in
  defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
                                  VR128, memop, f128mem, 1,
                                  SchedWriteDPPS.XMM>;
  let ExeDomain = SSEPackedDouble in
  defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
                                  VR128, memop, f128mem, 1,
                                  SchedWriteDPPD.XMM>;
}

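// Illustration (hypothetical C, in a block comment so TableGen ignores it):
// the dpps immediate uses bits[7:4] to pick which products enter the sum and
// bits[3:0] to pick which result lanes receive it.
/*
#include <smmintrin.h>
#include <stdio.h>

int main(void) {
  __m128 a = _mm_setr_ps(1, 2, 3, 4);
  __m128 b = _mm_setr_ps(5, 6, 7, 8);
  // All four products summed, result written to lane 0 only:
  // 1*5 + 2*6 + 3*7 + 4*8 = 70.
  __m128 dp = _mm_dp_ps(a, b, 0xF1);
  printf("%g\n", _mm_cvtss_f32(dp));       // 70
  return 0;
}
*/
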
/// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate
multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                           X86MemOperand x86memop, bit Is2Addr, Domain d,
                           X86FoldableSchedWrite sched, SDNodeXForm commuteXForm> {
let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
        Sched<[sched]>;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
          (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), imm:$src3)))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
}

  // Pattern to commute if the load is in the first source.
  def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, imm:$src3)),
            (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
                                            (commuteXForm imm:$src3))>;
}

let Predicates = [HasAVX] in {
  defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
                                  VR128, load, f128mem, 0, SSEPackedSingle,
                                  SchedWriteFBlend.XMM, BlendCommuteImm4>,
                                  VEX_4V, VEX_WIG;
  defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
                                   VR256, load, f256mem, 0, SSEPackedSingle,
                                   SchedWriteFBlend.YMM, BlendCommuteImm8>,
                                   VEX_4V, VEX_L, VEX_WIG;
  defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
                                  VR128, load, f128mem, 0, SSEPackedDouble,
                                  SchedWriteFBlend.XMM, BlendCommuteImm2>,
                                  VEX_4V, VEX_WIG;
  defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
                                   VR256, load, f256mem, 0, SSEPackedDouble,
                                   SchedWriteFBlend.YMM, BlendCommuteImm4>,
                                   VEX_4V, VEX_L, VEX_WIG;
  defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
                                  VR128, load, i128mem, 0, SSEPackedInt,
                                  SchedWriteBlend.XMM, BlendCommuteImm8>,
                                  VEX_4V, VEX_WIG;
}

let Predicates = [HasAVX2] in {
  defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
                                   VR256, load, i256mem, 0, SSEPackedInt,
                                   SchedWriteBlend.YMM, BlendCommuteImm8>,
                                   VEX_4V, VEX_L, VEX_WIG;
}

// Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw.
// ExecutionDomainFixPass will clean up the domains later on.
let Predicates = [HasAVX1Only] in {
def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), imm:$src3),
          (VBLENDPDYrri VR256:$src1, VR256:$src2, imm:$src3)>;
def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), imm:$src3),
          (VBLENDPDYrmi VR256:$src1, addr:$src2, imm:$src3)>;
def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, imm:$src3),
          (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 imm:$src3))>;

// Use pblendw for 128-bit integer blends to keep them in the integer domain
// and prevent them from becoming movsd via commuting under optsize.
def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
          (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 imm:$src3))>;
def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), imm:$src3),
          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 imm:$src3))>;
def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, imm:$src3),
          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 imm:$src3))>;

def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), imm:$src3),
          (VBLENDPSYrri VR256:$src1, VR256:$src2, imm:$src3)>;
def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), imm:$src3),
          (VBLENDPSYrmi VR256:$src1, addr:$src2, imm:$src3)>;
def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, imm:$src3),
          (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 imm:$src3))>;

// Use pblendw for 128-bit integer blends to keep them in the integer domain
// and prevent them from becoming movss via commuting under optsize.
def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3),
          (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 imm:$src3))>;
def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), imm:$src3),
          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>;
def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, imm:$src3),
          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>;
}

defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
                               VR128, memop, f128mem, 1, SSEPackedSingle,
                               SchedWriteFBlend.XMM, BlendCommuteImm4>;
defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64,
                               VR128, memop, f128mem, 1, SSEPackedDouble,
                               SchedWriteFBlend.XMM, BlendCommuteImm2>;
defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
                               VR128, memop, i128mem, 1, SSEPackedInt,
                               SchedWriteBlend.XMM, BlendCommuteImm8>;

let Predicates = [UseSSE41] in {
// Use pblendw for 128-bit integer blends to keep them in the integer domain
// and prevent them from becoming movss via commuting under optsize.
def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
          (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 imm:$src3))>;
def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), imm:$src3),
          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 imm:$src3))>;
def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, imm:$src3),
          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 imm:$src3))>;

def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3),
          (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 imm:$src3))>;
def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), imm:$src3),
          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>;
def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, imm:$src3),
          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>;
}

// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
let Predicates = [HasAVX] in {
def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)),
          (VBLENDPDYrri VR256:$src1,
                        (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0x3)>;
def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;

def : Pat<(insert_subvector (loadv4f64 addr:$src2), (v2f64 VR128:$src1), (iPTR 0)),
          (VBLENDPDYrmi (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xc)>;
def : Pat<(insert_subvector (loadv8f32 addr:$src2), (v4f32 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
}

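// Illustration (hypothetical C, in a block comment so TableGen ignores it):
// inserting into the low half of a YMM value expressed as a blend, the shape
// the patterns above produce instead of vinsertf128 with index 0.
/*
#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m256d acc = _mm256_set1_pd(9.0);
  __m128d lo  = _mm_setr_pd(1.0, 2.0);
  // Widen the xmm value (upper half undefined, but unread here) and blend
  // the low two lanes in: imm 0x3 selects lanes 0 and 1 from operand b.
  __m256d r = _mm256_blend_pd(acc, _mm256_castpd128_pd256(lo), 0x3);
  double out[4];
  _mm256_storeu_pd(out, r);
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // 1 2 9 9
  return 0;
}
*/
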
/// SS41I_quaternary_avx - AVX SSE 4.1 with 4 operands
multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC,
                                X86MemOperand x86memop, ValueType VT,
                                PatFrag mem_frag, SDNode OpNode,
                                X86FoldableSchedWrite sched> {
  def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2, RC:$src3),
                  !strconcat(OpcodeStr,
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                  [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))],
                  SSEPackedInt>, TAPD, VEX_4V,
                Sched<[sched]>;

  def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, x86memop:$src2, RC:$src3),
                  !strconcat(OpcodeStr,
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                  [(set RC:$dst,
                        (OpNode RC:$src3, (mem_frag addr:$src2),
                                RC:$src1))], SSEPackedInt>, TAPD, VEX_4V,
                Sched<[sched.Folded, sched.ReadAfterFold,
                       // x86memop:$src2
                       ReadDefault, ReadDefault, ReadDefault, ReadDefault,
                       ReadDefault,
                       // RC:$src3
                       sched.ReadAfterFold]>;
}

let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedDouble in {
defm VBLENDVPD  : SS41I_quaternary_avx<0x4B, "vblendvpd", VR128, f128mem,
                                       v2f64, loadv2f64, X86Blendv,
                                       SchedWriteFVarBlend.XMM>;
defm VBLENDVPDY : SS41I_quaternary_avx<0x4B, "vblendvpd", VR256, f256mem,
                                       v4f64, loadv4f64, X86Blendv,
                                       SchedWriteFVarBlend.YMM>, VEX_L;
} // ExeDomain = SSEPackedDouble
let ExeDomain = SSEPackedSingle in {
defm VBLENDVPS  : SS41I_quaternary_avx<0x4A, "vblendvps", VR128, f128mem,
                                       v4f32, loadv4f32, X86Blendv,
                                       SchedWriteFVarBlend.XMM>;
defm VBLENDVPSY : SS41I_quaternary_avx<0x4A, "vblendvps", VR256, f256mem,
                                       v8f32, loadv8f32, X86Blendv,
                                       SchedWriteFVarBlend.YMM>, VEX_L;
} // ExeDomain = SSEPackedSingle
defm VPBLENDVB  : SS41I_quaternary_avx<0x4C, "vpblendvb", VR128, i128mem,
                                       v16i8, loadv16i8, X86Blendv,
                                       SchedWriteVarBlend.XMM>;
}

let Predicates = [HasAVX2] in {
defm VPBLENDVBY : SS41I_quaternary_avx<0x4C, "vpblendvb", VR256, i256mem,
                                       v32i8, loadv32i8, X86Blendv,
                                       SchedWriteVarBlend.YMM>, VEX_L;
}

let Predicates = [HasAVX] in {
  def : Pat<(v4i32 (X86Blendv (v4i32 VR128:$mask), (v4i32 VR128:$src1),
                              (v4i32 VR128:$src2))),
            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v2i64 (X86Blendv (v2i64 VR128:$mask), (v2i64 VR128:$src1),
                              (v2i64 VR128:$src2))),
            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v8i32 (X86Blendv (v8i32 VR256:$mask), (v8i32 VR256:$src1),
                              (v8i32 VR256:$src2))),
            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v4i64 (X86Blendv (v4i64 VR256:$mask), (v4i64 VR256:$src1),
                              (v4i64 VR256:$src2))),
            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
}

// Prefer a movss or movsd over a blendps when optimizing for size. These were
// changed to use blends because blends have better throughput on Sandy Bridge
// and Haswell, but movs[s/d] are 1-2 bytes shorter instructions.
let Predicates = [HasAVX, OptForSpeed] in {
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;

  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;

  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;

  // Move low f32 and clear high bits.
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
                          (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
                          (i8 1))), sub_xmm)>;
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
                          (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
                          (i8 3))), sub_xmm)>;
}

// Prefer a movss or movsd over a blendps when optimizing for size. These were
// changed to use blends because blends have better throughput on Sandy Bridge
// and Haswell, but movs[s/d] are 1-2 bytes shorter instructions.
let Predicates = [UseSSE41, OptForSpeed] in {
  // With SSE41 we can use blends for these patterns.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;

  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))),
            (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
            (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;

  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))),
            (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
            (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
}

/// SS41I_ternary - SSE 4.1 ternary operator
let Uses = [XMM0], Constraints = "$src1 = $dst" in {
  multiclass SS41I_ternary<bits<8> opc, string OpcodeStr, ValueType VT,
                           PatFrag mem_frag, X86MemOperand x86memop,
                           SDNode OpNode, X86FoldableSchedWrite sched> {
    def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2),
                    !strconcat(OpcodeStr,
                     "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
                    [(set VR128:$dst,
                      (VT (OpNode XMM0, VR128:$src2, VR128:$src1)))]>,
                    Sched<[sched]>;

    def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, x86memop:$src2),
                    !strconcat(OpcodeStr,
                     "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
                    [(set VR128:$dst,
                      (OpNode XMM0, (mem_frag addr:$src2), VR128:$src1))]>,
                    Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}

let ExeDomain = SSEPackedDouble in
defm BLENDVPD : SS41I_ternary<0x15, "blendvpd", v2f64, memopv2f64, f128mem,
                              X86Blendv, SchedWriteFVarBlend.XMM>;
let ExeDomain = SSEPackedSingle in
defm BLENDVPS : SS41I_ternary<0x14, "blendvps", v4f32, memopv4f32, f128mem,
                              X86Blendv, SchedWriteFVarBlend.XMM>;
defm PBLENDVB : SS41I_ternary<0x10, "pblendvb", v16i8, memopv16i8, i128mem,
                              X86Blendv, SchedWriteVarBlend.XMM>;

// Aliases with the implicit xmm0 argument
def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
                (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>;
def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
                (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>;
def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
                (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>;
def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
                (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>;
def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
                (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>;
def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
                (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>;

let Predicates = [UseSSE41] in {
  def : Pat<(v4i32 (X86Blendv (v4i32 XMM0), (v4i32 VR128:$src1),
                              (v4i32 VR128:$src2))),
            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v2i64 (X86Blendv (v2i64 XMM0), (v2i64 VR128:$src1),
                              (v2i64 VR128:$src2))),
            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
}

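// Illustration (hypothetical C, in a block comment so TableGen ignores it):
// variable blends select from the second source wherever the mask lane's
// sign bit is set; the non-VEX forms bind the mask to xmm0, matching the
// Uses = [XMM0] constraint above.
/*
#include <smmintrin.h>
#include <stdio.h>

int main(void) {
  __m128 a = _mm_setr_ps(1, 2, 3, 4);
  __m128 b = _mm_setr_ps(-1, -2, -3, -4);
  __m128 m = _mm_setr_ps(-0.0f, 0.0f, -0.0f, 0.0f); // sign bits 1,0,1,0
  __m128 r = _mm_blendv_ps(a, b, m);
  float out[4];
  _mm_storeu_ps(out, r);
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // -1 2 -3 4
  return 0;
}
*/
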
let AddedComplexity = 400 in { // Prefer non-temporal versions

let Predicates = [HasAVX, NoVLX] in
def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                        "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
                        Sched<[SchedWriteVecMoveLSNT.XMM.RM]>, VEX, VEX_WIG;
let Predicates = [HasAVX2, NoVLX] in
def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                         "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
                         Sched<[SchedWriteVecMoveLSNT.YMM.RM]>, VEX, VEX_L, VEX_WIG;
def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                       "movntdqa\t{$src, $dst|$dst, $src}", []>,
                       Sched<[SchedWriteVecMoveLSNT.XMM.RM]>;

let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(v8f32 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
  def : Pat<(v4f64 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
  def : Pat<(v4i64 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
  def : Pat<(v8i32 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
  def : Pat<(v16i16 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
  def : Pat<(v32i8 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
}

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4f32 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
  def : Pat<(v2f64 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
  def : Pat<(v2i64 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
  def : Pat<(v4i32 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
}

let Predicates = [UseSSE41] in {
  def : Pat<(v4f32 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
  def : Pat<(v2f64 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
  def : Pat<(v2i64 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
  def : Pat<(v4i32 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
}

} // AddedComplexity

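// Illustration (hypothetical C, in a block comment so TableGen ignores it):
// movntdqa is a 16-byte-aligned load with a non-temporal hint; it only
// behaves specially on write-combining memory, degrading to a plain aligned
// load on ordinary write-back memory.
/*
#include <smmintrin.h>
#include <stdio.h>

int main(void) {
  _Alignas(16) int buf[4] = {1, 2, 3, 4};
  __m128i v = _mm_stream_load_si128((__m128i *)buf);
  printf("%d\n", _mm_extract_epi32(v, 3));  // 4
  return 0;
}
*/
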
//===----------------------------------------------------------------------===//
// SSE4.2 - Compare Instructions
//===----------------------------------------------------------------------===//

/// SS42I_binop_rm - Simple SSE 4.2 binary operator
multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, X86FoldableSchedWrite sched,
                          bit Is2Addr = 1> {
  def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
       Sched<[sched]>;
  def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}

let Predicates = [HasAVX] in
  defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX_4V, VEX_WIG;

let Predicates = [HasAVX2] in
  defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;

let Constraints = "$src1 = $dst" in
  defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
                                memop, i128mem, SchedWriteVecALU.XMM>;

6434//===----------------------------------------------------------------------===//
6435// SSE4.2 - String/text Processing Instructions
6436//===----------------------------------------------------------------------===//
6437
6438multiclass pcmpistrm_SS42AI<string asm> {
6439  def rr : SS42AI<0x62, MRMSrcReg, (outs),
6440    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6441    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6442    []>, Sched<[WritePCmpIStrM]>;
6443  let mayLoad = 1 in
6444  def rm :SS42AI<0x62, MRMSrcMem, (outs),
6445    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6446    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6447    []>, Sched<[WritePCmpIStrM.Folded, WritePCmpIStrM.ReadAfterFold]>;
6448}
6449
6450let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
6451  let Predicates = [HasAVX] in
6452  defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
6453  defm PCMPISTRM  : pcmpistrm_SS42AI<"pcmpistrm"> ;
6454}

multiclass SS42AI_pcmpestrm<string asm> {
  def rr : SS42AI<0x60, MRMSrcReg, (outs),
    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, Sched<[WritePCmpEStrM]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x60, MRMSrcMem, (outs),
    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, Sched<[WritePCmpEStrM.Folded, WritePCmpEStrM.ReadAfterFold]>;
}

let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
  defm PCMPESTRM  : SS42AI_pcmpestrm<"pcmpestrm">;
}

multiclass SS42AI_pcmpistri<string asm> {
  def rr : SS42AI<0x63, MRMSrcReg, (outs),
    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
    []>, Sched<[WritePCmpIStrI]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x63, MRMSrcMem, (outs),
    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
    []>, Sched<[WritePCmpIStrI.Folded, WritePCmpIStrI.ReadAfterFold]>;
}

let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
  defm PCMPISTRI  : SS42AI_pcmpistri<"pcmpistri">;
}

multiclass SS42AI_pcmpestri<string asm> {
  def rr : SS42AI<0x61, MRMSrcReg, (outs),
    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, Sched<[WritePCmpEStrI]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x61, MRMSrcMem, (outs),
    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
    []>, Sched<[WritePCmpEStrI.Folded, WritePCmpEStrI.ReadAfterFold]>;
}

let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
  defm PCMPESTRI  : SS42AI_pcmpestri<"pcmpestri">;
}

//===----------------------------------------------------------------------===//
// SSE4.2 - CRC Instructions
//===----------------------------------------------------------------------===//

// No CRC instructions have AVX equivalents

// crc intrinsic instructions
// This set of instructions is rm only; the only difference is the size
// of r and m.
class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
                   RegisterClass RCIn, SDPatternOperator Int> :
  SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
         [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))]>,
         Sched<[WriteCRC32]>;

class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
                   X86MemOperand x86memop, SDPatternOperator Int> :
  SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
         [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))]>,
         Sched<[WriteCRC32.Folded, WriteCRC32.ReadAfterFold]>;
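
// For example, the CRC32r32m8/CRC32r32r8 pair below combines a GR32
// accumulator with an i8 memory or GR8 register source; both forms are
// selected from the same int_x86_sse42_crc32_32_8 intrinsic.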

let Constraints = "$src1 = $dst" in {
  def CRC32r32m8  : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
                                 int_x86_sse42_crc32_32_8>;
  def CRC32r32r8  : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
                                 int_x86_sse42_crc32_32_8>;
  def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
                                 int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
                                 int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
                                 int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
                                 int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
                                 int_x86_sse42_crc32_64_64>, REX_W;
  def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
                                 int_x86_sse42_crc32_64_64>, REX_W;
  let hasSideEffects = 0 in {
    let mayLoad = 1 in
    def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
                                   null_frag>, REX_W;
    def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
                                   null_frag>, REX_W;
  }
}

//===----------------------------------------------------------------------===//
// SHA-NI Instructions
//===----------------------------------------------------------------------===//

// FIXME: Is there a better scheduler class for SHA than WriteVecIMul?
multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
                      X86FoldableSchedWrite sched, bit UsesXMM0 = 0> {
  def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2),
             !if(UsesXMM0,
                 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
             [!if(UsesXMM0,
                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>,
             T8, Sched<[sched]>;

  def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !if(UsesXMM0,
                 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
             [!if(UsesXMM0,
                  (set VR128:$dst, (IntId VR128:$src1,
                    (memop addr:$src2), XMM0)),
                  (set VR128:$dst, (IntId VR128:$src1,
                    (memop addr:$src2))))]>, T8,
             Sched<[sched.Folded, sched.ReadAfterFold]>;
}

let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
  def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
                         (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         [(set VR128:$dst,
                           (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
                            (i8 imm:$src3)))]>, TA,
                         Sched<[SchedWriteVecIMul.XMM]>;
  def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
                         (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         [(set VR128:$dst,
                           (int_x86_sha1rnds4 VR128:$src1,
                            (memop addr:$src2),
                            (i8 imm:$src3)))]>, TA,
                         Sched<[SchedWriteVecIMul.XMM.Folded,
                                SchedWriteVecIMul.XMM.ReadAfterFold]>;

  defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte,
                              SchedWriteVecIMul.XMM>;
  defm SHA1MSG1  : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1,
                              SchedWriteVecIMul.XMM>;
  defm SHA1MSG2  : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2,
                              SchedWriteVecIMul.XMM>;

  let Uses = [XMM0] in
  defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2,
                                SchedWriteVecIMul.XMM, 1>;

  defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1,
                               SchedWriteVecIMul.XMM>;
  defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2,
                               SchedWriteVecIMul.XMM>;
}

// Aliases with explicit %xmm0
def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
                (SHA256RNDS2rr VR128:$dst, VR128:$src2), 0>;
def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
                (SHA256RNDS2rm VR128:$dst, i128mem:$src2), 0>;
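
// The aliases let the assembler accept the two-operand spelling without the
// fixed %xmm0 operand; XMM0 remains an implicit use of the instructions.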

//===----------------------------------------------------------------------===//
// AES-NI Instructions
//===----------------------------------------------------------------------===//

multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
                             Intrinsic IntId, PatFrag ld_frag,
                             bit Is2Addr = 0, RegisterClass RC = VR128,
                             X86MemOperand MemOp = i128mem> {
  let AsmString = OpcodeStr##
                  !if(Is2Addr, "\t{$src2, $dst|$dst, $src2}",
                               "\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
    def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst),
                   (ins RC:$src1, RC:$src2), "",
                   [(set RC:$dst, (IntId RC:$src1, RC:$src2))]>,
                   Sched<[WriteAESDecEnc]>;
    def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst),
                   (ins RC:$src1, MemOp:$src2), "",
                   [(set RC:$dst, (IntId RC:$src1, (ld_frag addr:$src2)))]>,
                   Sched<[WriteAESDecEnc.Folded, WriteAESDecEnc.ReadAfterFold]>;
  }
}

// Perform One Round of an AES Encryption/Decryption Flow
let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in {
  defm VAESENC          : AESI_binop_rm_int<0xDC, "vaesenc",
                         int_x86_aesni_aesenc, load>, VEX_4V, VEX_WIG;
  defm VAESENCLAST      : AESI_binop_rm_int<0xDD, "vaesenclast",
                         int_x86_aesni_aesenclast, load>, VEX_4V, VEX_WIG;
  defm VAESDEC          : AESI_binop_rm_int<0xDE, "vaesdec",
                         int_x86_aesni_aesdec, load>, VEX_4V, VEX_WIG;
  defm VAESDECLAST      : AESI_binop_rm_int<0xDF, "vaesdeclast",
                         int_x86_aesni_aesdeclast, load>, VEX_4V, VEX_WIG;
}

let Predicates = [NoVLX, HasVAES] in {
  defm VAESENCY         : AESI_binop_rm_int<0xDC, "vaesenc",
                         int_x86_aesni_aesenc_256, load, 0, VR256,
                         i256mem>, VEX_4V, VEX_L, VEX_WIG;
  defm VAESENCLASTY     : AESI_binop_rm_int<0xDD, "vaesenclast",
                         int_x86_aesni_aesenclast_256, load, 0, VR256,
                         i256mem>, VEX_4V, VEX_L, VEX_WIG;
  defm VAESDECY         : AESI_binop_rm_int<0xDE, "vaesdec",
                         int_x86_aesni_aesdec_256, load, 0, VR256,
                         i256mem>, VEX_4V, VEX_L, VEX_WIG;
  defm VAESDECLASTY     : AESI_binop_rm_int<0xDF, "vaesdeclast",
                         int_x86_aesni_aesdeclast_256, load, 0, VR256,
                         i256mem>, VEX_4V, VEX_L, VEX_WIG;
}

let Constraints = "$src1 = $dst" in {
  defm AESENC          : AESI_binop_rm_int<0xDC, "aesenc",
                         int_x86_aesni_aesenc, memop, 1>;
  defm AESENCLAST      : AESI_binop_rm_int<0xDD, "aesenclast",
                         int_x86_aesni_aesenclast, memop, 1>;
  defm AESDEC          : AESI_binop_rm_int<0xDE, "aesdec",
                         int_x86_aesni_aesdec, memop, 1>;
  defm AESDECLAST      : AESI_binop_rm_int<0xDF, "aesdeclast",
                         int_x86_aesni_aesdeclast, memop, 1>;
}

// Perform the AES InvMixColumn Transformation
let Predicates = [HasAVX, HasAES] in {
  def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1),
      "vaesimc\t{$src1, $dst|$dst, $src1}",
      [(set VR128:$dst,
        (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
      VEX, VEX_WIG;
  def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
      (ins i128mem:$src1),
      "vaesimc\t{$src1, $dst|$dst, $src1}",
      [(set VR128:$dst, (int_x86_aesni_aesimc (load addr:$src1)))]>,
      Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG;
}
def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
  (ins VR128:$src1),
  "aesimc\t{$src1, $dst|$dst, $src1}",
  [(set VR128:$dst,
    (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>;
def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
  (ins i128mem:$src1),
  "aesimc\t{$src1, $dst|$dst, $src1}",
  [(set VR128:$dst, (int_x86_aesni_aesimc (memop addr:$src1)))]>,
  Sched<[WriteAESIMC.Folded]>;

// AES Round Key Generation Assist
let Predicates = [HasAVX, HasAES] in {
  def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, u8imm:$src2),
      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set VR128:$dst,
        (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
      Sched<[WriteAESKeyGen]>, VEX, VEX_WIG;
  def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
      (ins i128mem:$src1, u8imm:$src2),
      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set VR128:$dst,
        (int_x86_aesni_aeskeygenassist (load addr:$src1), imm:$src2))]>,
      Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG;
}
def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
  (ins VR128:$src1, u8imm:$src2),
  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  [(set VR128:$dst,
    (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
  Sched<[WriteAESKeyGen]>;
def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
  (ins i128mem:$src1, u8imm:$src2),
  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
  [(set VR128:$dst,
    (int_x86_aesni_aeskeygenassist (memop addr:$src1), imm:$src2))]>,
  Sched<[WriteAESKeyGen.Folded]>;

//===----------------------------------------------------------------------===//
// PCLMUL Instructions
//===----------------------------------------------------------------------===//

// Immediate transform to help with commuting.
def PCLMULCommuteImm : SDNodeXForm<imm, [{
  uint8_t Imm = N->getZExtValue();
  return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N));
}]>;
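
// For example, imm 0x10 selects the low qword of the first source and the
// high qword of the second; with the sources swapped, the same product is
// requested by imm 0x01.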

// SSE carry-less Multiplication instructions
let Predicates = [NoAVX, HasPCLMUL] in {
  let Constraints = "$src1 = $dst" in {
    let isCommutable = 1 in
    def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src1, VR128:$src2, u8imm:$src3),
              "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
              [(set VR128:$dst,
                (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>,
                Sched<[WriteCLMul]>;

    def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
              (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
              "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
              [(set VR128:$dst,
                 (int_x86_pclmulqdq VR128:$src1, (memop addr:$src2),
                  imm:$src3))]>,
              Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
  } // Constraints = "$src1 = $dst"

  def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1,
                                (i8 imm:$src3)),
            (PCLMULQDQrm VR128:$src1, addr:$src2,
                          (PCLMULCommuteImm imm:$src3))>;
} // Predicates = [NoAVX, HasPCLMUL]

// SSE aliases
foreach HI = ["hq","lq"] in
foreach LO = ["hq","lq"] in {
  def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
                  (PCLMULQDQrr VR128:$dst, VR128:$src,
                   !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
  def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
                  (PCLMULQDQrm VR128:$dst, i128mem:$src,
                   !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
}
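
// Each "hq" in the alias mnemonic sets the corresponding selector bit, so
// these map to immediates 0x00 (lqlq), 0x01 (hqlq), 0x10 (lqhq) and
// 0x11 (hqhq).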

// AVX carry-less Multiplication instructions
multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
                      PatFrag LdFrag, Intrinsic IntId> {
  let isCommutable = 1 in
  def rr : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst),
            (ins RC:$src1, RC:$src2, u8imm:$src3),
            "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
            [(set RC:$dst,
              (IntId RC:$src1, RC:$src2, imm:$src3))]>,
            Sched<[WriteCLMul]>;

  def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst),
            (ins RC:$src1, MemOp:$src2, u8imm:$src3),
            "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
            [(set RC:$dst,
               (IntId RC:$src1, (LdFrag addr:$src2), imm:$src3))]>,
            Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;

  // We can commute a load in the first operand by swapping the sources and
  // rotating the immediate.
  def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 imm:$src3)),
            (!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2,
                                           (PCLMULCommuteImm imm:$src3))>;
}

let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in
defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, load,
                             int_x86_pclmulqdq>, VEX_4V, VEX_WIG;

let Predicates = [NoVLX, HasVPCLMULQDQ] in
defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load,
                              int_x86_pclmulqdq_256>, VEX_4V, VEX_L, VEX_WIG;

multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC,
                                   X86MemOperand MemOp, string Hi, string Lo> {
  def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  (!cast<Instruction>(InstStr # "rr") RC:$dst, RC:$src1, RC:$src2,
                        !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
  def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  (!cast<Instruction>(InstStr # "rm") RC:$dst, RC:$src1, MemOp:$src2,
                        !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
}

multiclass vpclmulqdq_aliases<string InstStr, RegisterClass RC,
                              X86MemOperand MemOp> {
  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "hq">;
  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "lq">;
  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "hq">;
  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "lq">;
}

// AVX aliases
defm : vpclmulqdq_aliases<"VPCLMULQDQ", VR128, i128mem>;
defm : vpclmulqdq_aliases<"VPCLMULQDQY", VR256, i256mem>;

//===----------------------------------------------------------------------===//
// SSE4A Instructions
//===----------------------------------------------------------------------===//

let Predicates = [HasSSE4A] in {

let ExeDomain = SSEPackedInt in {
let Constraints = "$src = $dst" in {
def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
                 (ins VR128:$src, u8imm:$len, u8imm:$idx),
                 "extrq\t{$idx, $len, $src|$src, $len, $idx}",
                 [(set VR128:$dst, (X86extrqi VR128:$src, imm:$len,
                                    imm:$idx))]>,
                 PD, Sched<[SchedWriteVecALU.XMM]>;
def EXTRQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src, VR128:$mask),
              "extrq\t{$mask, $src|$src, $mask}",
              [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
                                 VR128:$mask))]>,
              PD, Sched<[SchedWriteVecALU.XMM]>;

def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
                   "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
                   [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
                                      imm:$len, imm:$idx))]>,
                   XD, Sched<[SchedWriteVecALU.XMM]>;
def INSERTQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src, VR128:$mask),
                 "insertq\t{$mask, $src|$src, $mask}",
                 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
                                    VR128:$mask))]>,
                 XD, Sched<[SchedWriteVecALU.XMM]>;
}
} // ExeDomain = SSEPackedInt
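
// In the forms above, extrq extracts (and insertq inserts) a $len-bit field
// at bit position $idx within the low quadword; the register forms take the
// field length and bit index packed in $mask instead of as immediates.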

// Non-temporal (unaligned) scalar stores.
let AddedComplexity = 400 in { // Prefer non-temporal versions
let hasSideEffects = 0, mayStore = 1, SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in {
def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
                "movntss\t{$src, $dst|$dst, $src}", []>, XS;

def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                "movntsd\t{$src, $dst|$dst, $src}", []>, XD;
} // SchedRW

def : Pat<(nontemporalstore FR32:$src, addr:$dst),
          (MOVNTSS addr:$dst, (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;

def : Pat<(nontemporalstore FR64:$src, addr:$dst),
          (MOVNTSD addr:$dst, (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;

} // AddedComplexity
} // HasSSE4A

//===----------------------------------------------------------------------===//
// AVX Instructions
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// VBROADCAST - Load from memory and broadcast to all elements of the
//              destination operand
//
class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
                           X86MemOperand x86memop, ValueType VT,
                           PatFrag ld_frag, SchedWrite Sched> :
  AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
        !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
        [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>,
        Sched<[Sched]>, VEX;

// AVX2 adds register forms
class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
                        ValueType ResVT, ValueType OpVT, SchedWrite Sched> :
  AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
         [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
         Sched<[Sched]>, VEX;

let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in {
  def VBROADCASTSSrm  : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
                                         f32mem, v4f32, loadf32,
                                         SchedWriteFShuffle.XMM.Folded>;
  def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
                                         f32mem, v8f32, loadf32,
                                         SchedWriteFShuffle.XMM.Folded>, VEX_L;
}
let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in
def VBROADCASTSDYrm  : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
                                        v4f64, loadf64,
                                        SchedWriteFShuffle.XMM.Folded>, VEX_L;

let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in {
  def VBROADCASTSSrr  : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
                                          v4f32, v4f32, SchedWriteFShuffle.XMM>;
  def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
                                          v8f32, v4f32, WriteFShuffle256>, VEX_L;
}
let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in
def VBROADCASTSDYrr  : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
                                         v4f64, v2f64, WriteFShuffle256>, VEX_L;

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4f32 (X86VBroadcast (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
            (VBROADCASTSSrm addr:$src)>;
  def : Pat<(v8f32 (X86VBroadcast (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
            (VBROADCASTSSYrm addr:$src)>;
  def : Pat<(v4f64 (X86VBroadcast (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
            (VBROADCASTSDYrm addr:$src)>;
}

//===----------------------------------------------------------------------===//
// VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both
//                  halves of a 256-bit vector.
//
let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in
def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
                           (ins i128mem:$src),
                           "vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
                           Sched<[WriteShuffleLd]>, VEX, VEX_L;

let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX],
    ExeDomain = SSEPackedSingle in
def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
                           (ins f128mem:$src),
                           "vbroadcastf128\t{$src, $dst|$dst, $src}", []>,
                           Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L;

let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
          (VBROADCASTF128 addr:$src)>;
def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))),
          (VBROADCASTF128 addr:$src)>;
}

// NOTE: We're using FP instructions here, but execution domain fixing can
// convert to integer when profitable.
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
          (VBROADCASTF128 addr:$src)>;
def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))),
          (VBROADCASTF128 addr:$src)>;
def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))),
          (VBROADCASTF128 addr:$src)>;
def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
          (VBROADCASTF128 addr:$src)>;
}

//===----------------------------------------------------------------------===//
// VINSERTF128 - Insert packed floating-point values
//
let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR128:$src2, u8imm:$src3),
          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteFShuffle256]>, VEX_4V, VEX_L;
let mayLoad = 1 in
def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f128mem:$src2, u8imm:$src3),
          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
}

// To create a 256-bit all-ones value, we should produce VCMPTRUEPS
// with a YMM register containing zero.
// FIXME: Avoid producing vxorps to clear the fake inputs.
let Predicates = [HasAVX1Only] in {
def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>;
}
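
// Comparison predicate 0xf is "true, unordered, non-signaling", so comparing
// the zeroed inputs yields all-ones in every element.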

multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To,
                            PatFrag memop_frag> {
  def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2),
                                   (iPTR imm)),
            (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2,
                                       (INSERT_get_vinsert128_imm VR256:$ins))>;
  def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
                                    (From (memop_frag addr:$src2)),
                                    (iPTR imm)),
            (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
                                       (INSERT_get_vinsert128_imm VR256:$ins))>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm : vinsert_lowering<"VINSERTF128", v4f32, v8f32, loadv4f32>;
  defm : vinsert_lowering<"VINSERTF128", v2f64, v4f64, loadv2f64>;
}

let Predicates = [HasAVX1Only] in {
  defm : vinsert_lowering<"VINSERTF128", v2i64, v4i64,  loadv2i64>;
  defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32,  loadv4i32>;
  defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv8i16>;
  defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8,  loadv16i8>;
}

//===----------------------------------------------------------------------===//
// VEXTRACTF128 - Extract packed floating-point values
//
let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
          (ins VR256:$src1, u8imm:$src2),
          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          []>, Sched<[WriteFShuffle256]>, VEX, VEX_L;
let mayStore = 1 in
def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
          (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          []>, Sched<[WriteFStoreX]>, VEX, VEX_L;
}

multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
  def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
            (To (!cast<Instruction>(InstrStr#rr)
                                    (From VR256:$src1),
                                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
  def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1),
                                                 (iPTR imm))), addr:$dst),
            (!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1,
             (EXTRACT_get_vextract128_imm VR128:$ext))>;
}

// AVX1 patterns
let Predicates = [HasAVX, NoVLX] in {
  defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>;
  defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>;
}

let Predicates = [HasAVX1Only] in {
  defm : vextract_lowering<"VEXTRACTF128", v4i64,  v2i64>;
  defm : vextract_lowering<"VEXTRACTF128", v8i32,  v4i32>;
  defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
  defm : vextract_lowering<"VEXTRACTF128", v32i8,  v16i8>;
}

//===----------------------------------------------------------------------===//
// VMASKMOV - Conditional SIMD Packed Loads and Stores
//
multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
                          Intrinsic IntLd, Intrinsic IntLd256,
                          Intrinsic IntSt, Intrinsic IntSt256> {
  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, f128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
             VEX_4V, Sched<[WriteFMaskedLoad]>;
  def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, f256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
             VEX_4V, VEX_L, Sched<[WriteFMaskedLoadY]>;
  def mr  : AVX8I<opc_mr, MRMDestMem, (outs),
             (ins f128mem:$dst, VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>,
             VEX_4V, Sched<[WriteFMaskedStore]>;
  def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
             (ins f256mem:$dst, VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
             VEX_4V, VEX_L, Sched<[WriteFMaskedStoreY]>;
}
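
// These are per-element conditional accesses: only elements whose mask
// element has its sign bit set are loaded or stored, masked-off load
// elements are zeroed, and (per the AVX spec) faults on masked-off
// elements are suppressed.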

let ExeDomain = SSEPackedSingle in
defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
                                 int_x86_avx_maskload_ps,
                                 int_x86_avx_maskload_ps_256,
                                 int_x86_avx_maskstore_ps,
                                 int_x86_avx_maskstore_ps_256>;
let ExeDomain = SSEPackedDouble in
defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
                                 int_x86_avx_maskload_pd,
                                 int_x86_avx_maskload_pd_256,
                                 int_x86_avx_maskstore_pd,
                                 int_x86_avx_maskstore_pd_256>;

//===----------------------------------------------------------------------===//
// VPERMIL - Permute Single and Double Floating-Point Values
//

multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
                      RegisterClass RC, X86MemOperand x86memop_f,
                      X86MemOperand x86memop_i,
                      ValueType f_vt, ValueType i_vt,
                      X86FoldableSchedWrite sched,
                      X86FoldableSchedWrite varsched> {
  let Predicates = [HasAVX, NoVLX] in {
    def rr  : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
               (ins RC:$src1, RC:$src2),
               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V,
               Sched<[varsched]>;
    def rm  : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
               (ins RC:$src1, x86memop_i:$src2),
               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
                              (i_vt (load addr:$src2)))))]>, VEX_4V,
               Sched<[varsched.Folded, sched.ReadAfterFold]>;

    def ri  : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
             (ins RC:$src1, u8imm:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX,
             Sched<[sched]>;
    def mi  : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
             (ins x86memop_f:$src1, u8imm:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst,
               (f_vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX,
             Sched<[sched.Folded]>;
  }// Predicates = [HasAVX, NoVLX]
}

let ExeDomain = SSEPackedSingle in {
  defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
                               v4f32, v4i32, SchedWriteFShuffle.XMM,
                               SchedWriteFVarShuffle.XMM>;
  defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
                               v8f32, v8i32, SchedWriteFShuffle.YMM,
                               SchedWriteFVarShuffle.YMM>, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
  defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
                               v2f64, v2i64, SchedWriteFShuffle.XMM,
                               SchedWriteFVarShuffle.XMM>;
  defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
                               v4f64, v4i64, SchedWriteFShuffle.YMM,
                               SchedWriteFVarShuffle.YMM>, VEX_L;
}

//===----------------------------------------------------------------------===//
// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
//

let ExeDomain = SSEPackedSingle in {
let isCommutable = 1 in
def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR256:$src2, u8imm:$src3),
          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
                              (i8 imm:$src3))))]>, VEX_4V, VEX_L,
          Sched<[WriteFShuffle256]>;
def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4f64 addr:$src2),
                             (i8 imm:$src3)))]>, VEX_4V, VEX_L,
          Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
}

// Immediate transform to help with commuting.
def Perm2XCommuteImm : SDNodeXForm<imm, [{
  return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N));
}]>;
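
// imm[1:0] selects the source half for the result's low 128 bits and
// imm[5:4] for the high 128 bits; bits 1 and 5 choose between $src1 and
// $src2, so XORing with 0x22 compensates for swapping the two sources.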

let Predicates = [HasAVX] in {
// Pattern with load in other operand.
def : Pat<(v4f64 (X86VPerm2x128 (loadv4f64 addr:$src2),
                                VR256:$src1, (i8 imm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;
}

let Predicates = [HasAVX1Only] in {
def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
                  (loadv4i64 addr:$src2), (i8 imm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
// Pattern with load in other operand.
def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
                                VR256:$src1, (i8 imm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;
}

//===----------------------------------------------------------------------===//
// VZERO - Zero YMM registers
// Note: These instructions do not affect YMM16-YMM31.
//

let SchedRW = [WriteSystem] in {
let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
            YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
  // Zero All YMM registers
  def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
                  [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L,
                  Requires<[HasAVX]>, VEX_WIG;

  // Zero Upper bits of YMM registers
  def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
                     [(int_x86_avx_vzeroupper)]>, PS, VEX,
                     Requires<[HasAVX]>, VEX_WIG;
} // Defs
} // SchedRW

//===----------------------------------------------------------------------===//
// Half precision conversion instructions
//

multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop,
                      X86FoldableSchedWrite sched> {
  def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
             "vcvtph2ps\t{$src, $dst|$dst, $src}",
             [(set RC:$dst, (X86cvtph2ps VR128:$src))]>,
             T8PD, VEX, Sched<[sched]>;
  let hasSideEffects = 0, mayLoad = 1 in
  def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
             "vcvtph2ps\t{$src, $dst|$dst, $src}",
             [(set RC:$dst, (X86cvtph2ps (loadv8i16 addr:$src)))]>,
             T8PD, VEX, Sched<[sched.Folded]>;
}

multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
                      SchedWrite RR, SchedWrite MR> {
  def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
               (ins RC:$src1, i32u8imm:$src2),
               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               [(set VR128:$dst, (X86cvtps2ph RC:$src1, imm:$src2))]>,
               TAPD, VEX, Sched<[RR]>;
  let hasSideEffects = 0, mayStore = 1 in
  def mr : Ii8<0x1D, MRMDestMem, (outs),
               (ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
               TAPD, VEX, Sched<[MR]>;
}

let Predicates = [HasF16C, NoVLX] in {
  defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>;
  defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L;
  defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH,
                               WriteCvtPS2PHSt>;
  defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY,
                               WriteCvtPS2PHYSt>, VEX_L;

  // Pattern match vcvtph2ps of a scalar i64 load.
  def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
            (VCVTPH2PSrm addr:$src)>;
  def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16
              (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (VCVTPH2PSrm addr:$src)>;

  def : Pat<(store (f64 (extractelt
                         (bc_v2f64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))),
                         (iPTR 0))), addr:$dst),
            (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
  def : Pat<(store (i64 (extractelt
                         (bc_v2i64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))),
                         (iPTR 0))), addr:$dst),
            (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
  def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, i32:$src2)), addr:$dst),
            (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>;
}

// Patterns for matching conversions from float to half-float and vice versa.
let Predicates = [HasF16C, NoVLX] in {
  // Use MXCSR.RC for rounding instead of explicitly specifying the default
  // rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the
  // configurations we support (the default). However, falling back to MXCSR is
  // more consistent with other instructions, which are always controlled by it.
  // The MXCSR-based mode is encoded as immediate 0b100 (bit 2 set).
  def : Pat<(fp_to_f16 FR32:$src),
            (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (v8i16 (VCVTPS2PHrr
              (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4))), sub_16bit))>;

  def : Pat<(f16_to_fp GR16:$src),
            (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr
              (v4i32 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)))), FR32)) >;

  def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))),
            (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr
             (v8i16 (VCVTPS2PHrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4)))), FR32)) >;
}

//===----------------------------------------------------------------------===//
// AVX2 Instructions
//===----------------------------------------------------------------------===//

/// AVX2_blend_rmi - AVX2 blend with 8-bit immediate
multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, X86FoldableSchedWrite sched,
                          RegisterClass RC,
                          X86MemOperand x86memop, SDNodeXForm commuteXForm> {
  let isCommutable = 1 in
  def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
        Sched<[sched]>, VEX_4V;
  def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        [(set RC:$dst,
          (OpVT (OpNode RC:$src1, (load addr:$src2), imm:$src3)))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V;

  // Pattern to commute if load is in first source.
  def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, imm:$src3)),
            (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
                                            (commuteXForm imm:$src3))>;
}
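
// Commuting a blend flips which source each mask bit selects, so the
// commuteXForm (e.g. BlendCommuteImm4, defined earlier in this file)
// simply inverts the immediate's mask bits.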

let Predicates = [HasAVX2] in {
defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
                               SchedWriteBlend.XMM, VR128, i128mem,
                               BlendCommuteImm4>;
defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
                                SchedWriteBlend.YMM, VR256, i256mem,
                                BlendCommuteImm8>, VEX_L;

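// X86Blendi masks here are per-qword, while vpblendd selects dwords; the
// BlendScale transforms (defined earlier in this file) widen each qword mask
// bit into two dword bits, and the BlendScaleCommute variants additionally
// invert the scaled mask when the sources are commuted.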
def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), imm:$src3),
          (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 imm:$src3))>;
def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), imm:$src3),
          (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>;
def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, imm:$src3),
          (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>;

def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
          (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 imm:$src3))>;
def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), imm:$src3),
          (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 imm:$src3))>;
def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, imm:$src3),
          (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 imm:$src3))>;
}

// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
// NOTE: We're using FP instructions here, but execution domain fixing should
// take care of using integer instructions when profitable.
let Predicates = [HasAVX] in {
def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;

def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
}

//===----------------------------------------------------------------------===//
// VPBROADCAST - Load from memory and broadcast to all elements of the
//               destination operand
//
multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
                          X86MemOperand x86memop, PatFrag ld_frag,
                          ValueType OpVT128, ValueType OpVT256, Predicate prd> {
  let Predicates = [HasAVX2, prd] in {
    def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR128:$dst,
                   (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
                  Sched<[SchedWriteShuffle.XMM]>, VEX;
    def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR128:$dst,
                   (OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>,
                  Sched<[SchedWriteShuffle.XMM.Folded]>, VEX;
    def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set VR256:$dst,
                    (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>,
                   Sched<[WriteShuffle256]>, VEX, VEX_L;
    def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set VR256:$dst,
                    (OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>,
                   Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L;

    // Provide aliases for broadcast from the same register class that
    // automatically does the extract.
    def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))),
              (!cast<Instruction>(NAME#"Yrr")
                  (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>;
  }
}

defm VPBROADCASTB  : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8,
                                    v16i8, v32i8, NoVLX_Or_NoBWI>;
defm VPBROADCASTW  : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16,
                                    v8i16, v16i16, NoVLX_Or_NoBWI>;
defm VPBROADCASTD  : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32,
                                    v4i32, v8i32, NoVLX>;
defm VPBROADCASTQ  : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
                                    v2i64, v4i64, NoVLX>;

let Predicates = [HasAVX2, NoVLX] in {
  // 32-bit targets will fail to load an i64 directly but can use ZEXT_LOAD.
  def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
            (VPBROADCASTQrm addr:$src)>;
  def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
            (VPBROADCASTQYrm addr:$src)>;

  def : Pat<(v4i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
            (VPBROADCASTDrm addr:$src)>;
  def : Pat<(v8i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
            (VPBROADCASTDYrm addr:$src)>;
  def : Pat<(v2i64 (X86VBroadcast (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
            (VPBROADCASTQrm addr:$src)>;
  def : Pat<(v4i64 (X86VBroadcast (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
            (VPBROADCASTQYrm addr:$src)>;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  // loadi16 is tricky to fold, because !isTypeDesirableForOp (justifiably)
  // rejects i16. This means we'll encounter truncated i32 loads; match that here.
7470  def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
7471            (VPBROADCASTWrm addr:$src)>;
7472  def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
7473            (VPBROADCASTWYrm addr:$src)>;
7474  def : Pat<(v8i16 (X86VBroadcast
7475              (i16 (trunc (i32 (extloadi16 addr:$src)))))),
7476            (VPBROADCASTWrm addr:$src)>;
7477  def : Pat<(v8i16 (X86VBroadcast
7478              (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
7479            (VPBROADCASTWrm addr:$src)>;
7480  def : Pat<(v16i16 (X86VBroadcast
7481              (i16 (trunc (i32 (extloadi16 addr:$src)))))),
7482            (VPBROADCASTWYrm addr:$src)>;
7483  def : Pat<(v16i16 (X86VBroadcast
7484              (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
7485            (VPBROADCASTWYrm addr:$src)>;
7486}

let Predicates = [HasAVX2, NoVLX] in {
  // Provide aliases for broadcast from the same register class; the alias
  // automatically does the low-lane extract.
  def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))),
            (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src),
                                                    sub_xmm)))>;
  def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))),
            (VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src),
                                                    sub_xmm)))>;
}
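
// NOTE (illustrative; not from the original tree): as with the integer
// aliases above, only the low 128 bits of the source are read, so e.g. a
// v8f32 splat of element 0 of %ymm0 is expected to select to roughly
// "vbroadcastss %xmm0, %ymm0".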

let Predicates = [HasAVX2, NoVLX] in {
  // Provide a fallback in case the load node used in the patterns above has
  // additional users, which prevents the pattern from being selected.
  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
            (VBROADCASTSSrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
            (VBROADCASTSSYrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
            (VBROADCASTSDYrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
}
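
// NOTE (illustrative assumption): if the loaded value has other users, the
// load cannot be folded into the broadcast, so the scalar ends up in an
// FR32/FR64 register. The COPY_TO_REGCLASS to VR128 is free (FR32/FR64 live
// in the same physical XMM registers), after which the register form is
// used, roughly "vbroadcastss %xmm0, %ymm0".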

let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
        (VPBROADCASTBrr (v16i8 (COPY_TO_REGCLASS
                         (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                             GR8:$src, sub_8bit)),
                         VR128)))>;
  def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
        (VPBROADCASTBYrr (v16i8 (COPY_TO_REGCLASS
                          (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                              GR8:$src, sub_8bit)),
                          VR128)))>;

  def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
        (VPBROADCASTWrr (v8i16 (COPY_TO_REGCLASS
                         (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                             GR16:$src, sub_16bit)),
                         VR128)))>;
  def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
        (VPBROADCASTWYrr (v8i16 (COPY_TO_REGCLASS
                          (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                              GR16:$src, sub_16bit)),
                          VR128)))>;
}
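
// NOTE (illustrative; not from the original tree): there is no direct
// GR8/GR16-to-XMM move, so the value is first widened to a GR32 by
// inserting it into an IMPLICIT_DEF (no instruction is emitted for this),
// then transferred and splatted, roughly
//   vmovd %eax, %xmm0 ; vpbroadcastb %xmm0, %ymm0
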
let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
            (VPBROADCASTDrr (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>;
  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
            (VPBROADCASTDYrr (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>;
  def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
            (VPBROADCASTQrr (v2i64 (COPY_TO_REGCLASS GR64:$src, VR128)))>;
  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
            (VPBROADCASTQYrr (v2i64 (COPY_TO_REGCLASS GR64:$src, VR128)))>;
}

// AVX1 broadcast patterns
let Predicates = [HasAVX1Only] in {
def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
          (VBROADCASTSSYrm addr:$src)>;
def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
          (VBROADCASTSDYrm addr:$src)>;
def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
          (VBROADCASTSSrm addr:$src)>;
}
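
// NOTE (illustrative assumption): AVX1 has no integer broadcast
// instructions (vpbroadcastd/q are AVX2), so integer broadcast loads reuse
// the floating-point forms; e.g. (v8i32 (X86VBroadcast (loadi32 addr))) is
// expected to select to roughly "vbroadcastss (%mem), %ymm". For loads the
// FP/integer domain crossing is generally harmless.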

// Provide a fallback in case the load node used in the patterns above has
// additional users, which prevents the pattern from being selected.
let Predicates = [HasAVX, NoVLX] in {
  // 128-bit broadcasts:
  def : Pat<(v2f64 (X86VBroadcast f64:$src)),
            (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
  def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
            (VMOVDDUPrm addr:$src)>;

  def : Pat<(v2f64 (X86VBroadcast v2f64:$src)),
            (VMOVDDUPrr VR128:$src)>;
  def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
            (VMOVDDUPrm addr:$src)>;
  def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))),
            (VMOVDDUPrm addr:$src)>;
}
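
// NOTE (illustrative; not from the original tree): a v2f64 splat needs no
// dedicated broadcast instruction; vmovddup duplicates the low double
// ({a,b} -> {a,a}), and its memory form reads only 8 bytes, so both the
// register and the load cases map onto VMOVDDUP.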

let Predicates = [HasAVX1Only] in {
  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
            (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>;
  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
              (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm),
              (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>;
  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
              (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm),
              (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>;

  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
            (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)>;
  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
              (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)), sub_xmm),
              (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)), 1)>;
  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
              (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)), sub_xmm),
              (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)), 1)>;

  def : Pat<(v2i64 (X86VBroadcast i64:$src)),
            (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)>;
  def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
            (VMOVDDUPrm addr:$src)>;
}
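
// NOTE (illustrative; not from the original tree): without AVX2, a splat is
// built by shuffling the low element across one 128-bit lane and then
// mirroring that lane with "vinsertf128 $1". The VPSHUFD immediate 0x44
// encodes the dword selection {0,1,0,1}, i.e. it duplicates the low qword,
// which is how the v2i64/v4i64 splats are formed.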

//===----------------------------------------------------------------------===//
// VPERM - Permute instructions
//

multiclass avx2_perm<bits<8> opc, string OpcodeStr,
                     ValueType OpVT, X86FoldableSchedWrite Sched,
                     X86MemOperand memOp> {
  let Predicates = [HasAVX2, NoVLX] in {
    def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
                     (ins VR256:$src1, VR256:$src2),
                     !strconcat(OpcodeStr,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR256:$dst,
                       (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
                     Sched<[Sched]>, VEX_4V, VEX_L;
    def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
                     (ins VR256:$src1, memOp:$src2),
                     !strconcat(OpcodeStr,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR256:$dst,
                       (OpVT (X86VPermv VR256:$src1,
                              (load addr:$src2))))]>,
                     Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX_4V, VEX_L;
  }
}

defm VPERMD : avx2_perm<0x36, "vpermd", v8i32, WriteVarShuffle256, i256mem>;
let ExeDomain = SSEPackedSingle in
defm VPERMPS : avx2_perm<0x16, "vpermps", v8f32, WriteFVarShuffle256, f256mem>;
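
// NOTE (illustrative; not from the original tree): in the X86VPermv node,
// $src1 holds the per-element indices and $src2 the data, and the permute is
// fully cross-lane; e.g. an index vector of {7,6,5,4,3,2,1,0} reverses all
// eight dwords of the source. Only the data operand can come from memory.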

multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                         ValueType OpVT, X86FoldableSchedWrite Sched,
                         X86MemOperand memOp> {
  let Predicates = [HasAVX2, NoVLX] in {
    def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
                       (ins VR256:$src1, u8imm:$src2),
                       !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
                         (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
                       Sched<[Sched]>, VEX, VEX_L;
    def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
                       (ins memOp:$src1, u8imm:$src2),
                       !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
                         (OpVT (X86VPermi (mem_frag addr:$src1),
                                (i8 imm:$src2))))]>,
                       Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VEX_L;
  }
}

defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
                            WriteShuffle256, i256mem>, VEX_W;
let ExeDomain = SSEPackedDouble in
defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
                             WriteFShuffle256, f256mem>, VEX_W;
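
// NOTE (illustrative; not from the original tree): vpermq and vpermpd
// select each destination qword with a 2-bit field of the immediate (bits
// [1:0] pick element 0, [3:2] element 1, and so on); e.g. an immediate of
// 0x1B (0b00011011) reverses the four qwords.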

//===----------------------------------------------------------------------===//
// VPERM2I128 - Permute Integer Values in 128-bit chunks
//
let isCommutable = 1 in
def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR256:$src2, u8imm:$src3),
          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
                            (i8 imm:$src3))))]>, Sched<[WriteShuffle256]>,
          VEX_4V, VEX_L;
def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
                             (i8 imm:$src3)))]>,
          Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;

let Predicates = [HasAVX2] in
def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
                                VR256:$src1, (i8 imm:$imm))),
          (VPERM2I128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;
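// NOTE (illustrative assumption): the memory operand must be $src2, so when
// the load appears as the first operand the pattern above commutes the
// sources and rewrites the immediate with Perm2XCommuteImm (an XOR with
// 0x22 in the current sources), swapping the src1/src2 selector encodings
// so the shuffle result is unchanged.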


//===----------------------------------------------------------------------===//
// VINSERTI128 - Insert packed integer values
//
let hasSideEffects = 0 in {
def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR128:$src2, u8imm:$src3),
          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
let mayLoad = 1 in
def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
}

let Predicates = [HasAVX2, NoVLX] in {
  defm : vinsert_lowering<"VINSERTI128", v2i64, v4i64,  loadv2i64>;
  defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32,  loadv4i32>;
  defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv8i16>;
  defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8,  loadv16i8>;
}
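
// NOTE (illustrative assumption; vinsert_lowering is defined earlier in
// this file): these expand insert_subvector-style nodes, so inserting a
// 128-bit vector into the upper half of a 256-bit vector becomes
// "vinserti128 $1", with the rm form folding a 128-bit load of the inserted
// half.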

//===----------------------------------------------------------------------===//
// VEXTRACTI128 - Extract packed integer values
//
def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
          (ins VR256:$src1, u8imm:$src2),
          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
          Sched<[WriteShuffle256]>, VEX, VEX_L;
let hasSideEffects = 0, mayStore = 1 in
def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
          (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
          Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_L;

let Predicates = [HasAVX2, NoVLX] in {
  defm : vextract_lowering<"VEXTRACTI128", v4i64,  v2i64>;
  defm : vextract_lowering<"VEXTRACTI128", v8i32,  v4i32>;
  defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
  defm : vextract_lowering<"VEXTRACTI128", v32i8,  v16i8>;
}
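
// NOTE (illustrative assumption; vextract_lowering is defined earlier in
// this file): extracting the upper 128-bit half of a 256-bit vector becomes
// "vextracti128 $1, %ymm, %xmm", and the mr form stores the extracted half
// straight to memory.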

//===----------------------------------------------------------------------===//
// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
//
multiclass avx2_pmovmask<string OpcodeStr,
                         Intrinsic IntLd128, Intrinsic IntLd256,
                         Intrinsic IntSt128, Intrinsic IntSt256> {
  def rm  : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>,
             VEX_4V, Sched<[WriteVecMaskedLoad]>;
  def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, i256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
             VEX_4V, VEX_L, Sched<[WriteVecMaskedLoadY]>;
  def mr  : AVX28I<0x8e, MRMDestMem, (outs),
             (ins i128mem:$dst, VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>,
             VEX_4V, Sched<[WriteVecMaskedStore]>;
  def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
             (ins i256mem:$dst, VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
             VEX_4V, VEX_L, Sched<[WriteVecMaskedStoreY]>;
}

defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
                                int_x86_avx2_maskload_d,
                                int_x86_avx2_maskload_d_256,
                                int_x86_avx2_maskstore_d,
                                int_x86_avx2_maskstore_d_256>;
defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
                                int_x86_avx2_maskload_q,
                                int_x86_avx2_maskload_q_256,
                                int_x86_avx2_maskstore_q,
                                int_x86_avx2_maskstore_q_256>, VEX_W;
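
// NOTE (illustrative; not from the original tree): an element participates
// in a vpmaskmovd/vpmaskmovq load or store only when the sign bit of the
// corresponding mask element is set; masked-off load lanes are zeroed and
// masked-off store lanes leave memory untouched, so disabled lanes never
// fault.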

multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
                          ValueType MaskVT, string BlendStr, ValueType ZeroVT> {
    // masked store
    def: Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)),
             (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
    // masked load
    def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)),
             (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
    def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask),
                              (VT immAllZerosV))),
             (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
}
let Predicates = [HasAVX] in {
  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32, "VBLENDVPS", v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64, "VBLENDVPD", v4i32>;
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32, "VBLENDVPSY", v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64, "VBLENDVPDY", v8i32>;
}
let Predicates = [HasAVX1Only] in {
  // Integer masked load/store of i32/i64 is not supported; use the ps/pd
  // versions instead.
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>;
  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
}
let Predicates = [HasAVX2] in {
  defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
  defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>;
  defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
  defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
}
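
// NOTE (illustrative; not from the original tree): both the undef- and
// zero-passthru masked_load forms map onto the same instruction because
// vmaskmov/vpmaskmov already zero the disabled lanes, so a zero passthru
// value needs no extra blend.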

//===----------------------------------------------------------------------===//
// SubVector Broadcasts
// Provide a fallback in case the load node used in the patterns above has
// additional users, which prevents the pattern from being selected.

let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v2f64 VR128:$src), 1)>;
def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v4f32 VR128:$src), 1)>;
}

// NOTE: We're using FP instructions here, but execution domain fixing can
// convert to integer when profitable.
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v2i64 VR128:$src), 1)>;
def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v4i32 VR128:$src), 1)>;
def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v8i16 VR128:$src), 1)>;
def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v16i8 VR128:$src), 1)>;
}
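
// NOTE (illustrative; not from the original tree): an X86SubVBroadcast from
// a register duplicates a 128-bit vector into both halves of a 256-bit
// vector: the INSERT_SUBREG places the xmm in the low lane for free, and a
// single "vinsertf128 $1" copies it into the high lane. Domain fixing may
// later rewrite this to vinserti128 for integer types when profitable.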

//===----------------------------------------------------------------------===//
// Variable Bit Shifts
//
multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType vt128, ValueType vt256> {
  def rr  : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst,
               (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
             VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>;
  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst,
               (vt128 (OpNode VR128:$src1,
                       (vt128 (load addr:$src2)))))]>,
             VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded,
                            SchedWriteVarVecShift.XMM.ReadAfterFold]>;
  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
             (ins VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst,
               (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
             VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>;
  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, i256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst,
               (vt256 (OpNode VR256:$src1,
                       (vt256 (load addr:$src2)))))]>,
             VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded,
                                   SchedWriteVarVecShift.YMM.ReadAfterFold]>;
}

let Predicates = [HasAVX2, NoVLX] in {
  defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", X86vshlv, v4i32, v8i32>;
  defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", X86vshlv, v2i64, v4i64>, VEX_W;
  defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", X86vsrlv, v4i32, v8i32>;
  defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", X86vsrlv, v2i64, v4i64>, VEX_W;
  defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", X86vsrav, v4i32, v8i32>;
}
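
// NOTE (illustrative; not from the original tree): each element of $src1 is
// shifted by the count in the corresponding element of $src2. For
// vpsllv/vpsrlv an out-of-range count (>= 32 for dwords, >= 64 for qwords)
// produces zero, while vpsravd fills with sign bits, behaving like an
// arithmetic shift with a saturated count.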

//===----------------------------------------------------------------------===//
// VGATHER - GATHER Operations

// FIXME: Improve scheduling of gather instructions.
multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx,
                       ValueType VTy, PatFrag GatherNode128,
                       PatFrag GatherNode256, RegisterClass RC256,
                       X86MemOperand memop128, X86MemOperand memop256,
                       ValueType MTx = VTx, ValueType MTy = VTy> {
  def rm  : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
            (ins VR128:$src1, memop128:$src2, VR128:$mask),
            !strconcat(OpcodeStr,
              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
            [(set (VTx VR128:$dst), (MTx VR128:$mask_wb),
                  (GatherNode128 VR128:$src1, VR128:$mask,
                                vectoraddr:$src2))]>,
            VEX, Sched<[WriteLoad]>;
  def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
            (ins RC256:$src1, memop256:$src2, RC256:$mask),
            !strconcat(OpcodeStr,
              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
            [(set (VTy RC256:$dst), (MTy RC256:$mask_wb),
                  (GatherNode256 RC256:$src1, RC256:$mask,
                                vectoraddr:$src2))]>,
            VEX, VEX_L, Sched<[WriteLoad]>;
}

let Predicates = [HasAVX2] in {
  let mayLoad = 1, hasSideEffects = 0, Constraints
    = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
    in {
    defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64, mgatherv4i32,
                        mgatherv4i32, VR256, vx128mem, vx256mem>, VEX_W;
    defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64, mgatherv2i64,
                        mgatherv4i64, VR256, vx128mem, vy256mem>, VEX_W;
    defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32, mgatherv4i32,
                        mgatherv8i32, VR256, vx128mem, vy256mem>;
    defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32, mgatherv2i64,
                        mgatherv4i64, VR128, vx64mem, vy128mem>;

    let ExeDomain = SSEPackedDouble in {
      defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64, mgatherv4i32,
                          mgatherv4i32, VR256, vx128mem, vx256mem,
                          v2i64, v4i64>, VEX_W;
      defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64, mgatherv2i64,
                          mgatherv4i64, VR256, vx128mem, vy256mem,
                          v2i64, v4i64>, VEX_W;
    }

    let ExeDomain = SSEPackedSingle in {
      defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32, mgatherv4i32,
                          mgatherv8i32, VR256, vx128mem, vy256mem,
                          v4i32, v8i32>;
      defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32, mgatherv2i64,
                          mgatherv4i64, VR128, vx64mem, vy128mem,
                          v4i32, v4i32>;
    }
  }
}
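
// NOTE (illustrative; not from the original tree): a gather loads each
// element whose mask sign bit is set from base + index * scale and clears
// mask elements as they complete, which is why $mask_wb is modeled as a
// second output tied to $mask. Both outputs are earlyclobber because the
// destination and mask are read and written across the memory accesses.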

//===----------------------------------------------------------------------===//
// GFNI instructions
//===----------------------------------------------------------------------===//

multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
                        RegisterClass RC, PatFrag MemOpFrag,
                        X86MemOperand X86MemOp, bit Is2Addr = 0> {
  let ExeDomain = SSEPackedInt,
      AsmString = !if(Is2Addr,
        OpcodeStr##"\t{$src2, $dst|$dst, $src2}",
        OpcodeStr##"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
    let isCommutable = 1 in
    def rr : PDI<0xCF, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
                 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
             Sched<[SchedWriteVecALU.XMM]>, T8PD;

    def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
                 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1,
                                 (MemOpFrag addr:$src2))))]>,
             Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>, T8PD;
  }
}
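
// NOTE (illustrative; not from the original tree): gf2p8mulb multiplies
// corresponding bytes as elements of GF(2^8), reducing modulo the AES
// polynomial x^8 + x^4 + x^3 + x + 1; the operation is commutative, hence
// the isCommutable flag on the rr form above.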

multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
                           SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag,
                           X86MemOperand X86MemOp, bit Is2Addr = 0> {
  let AsmString = !if(Is2Addr,
      OpStr##"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
      OpStr##"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in {
  def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst),
              (ins RC:$src1, RC:$src2, u8imm:$src3), "",
              [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))],
              SSEPackedInt>, Sched<[SchedWriteVecALU.XMM]>;
  def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst),
              (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
              [(set RC:$dst, (OpVT (OpNode RC:$src1,
                                    (MemOpFrag addr:$src2),
                              imm:$src3)))], SSEPackedInt>,
              Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>;
  }
}
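
// NOTE (illustrative; not from the original tree): the affine instructions
// compute, per byte, an 8x8 bit-matrix product over GF(2) (the matrix taken
// from one source's qword, the byte from the other) and XOR the imm8
// constant into each result byte; the "inv" variant first inverts the input
// byte in GF(2^8). Being non-commutative, they are wrapped in
// "let isCommutable = 0" below.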

multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
  let Constraints = "$src1 = $dst",
      Predicates  = [HasGFNI, UseSSE2] in
  defm NAME         : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
                                      VR128, load, i128mem, 1>;
  let Predicates  = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
    defm V##NAME    : GF2P8AFFINE_rmi<Op, "v"##OpStr, v16i8, OpNode, VR128,
                                      load, i128mem>, VEX_4V, VEX_W;
    defm V##NAME##Y : GF2P8AFFINE_rmi<Op, "v"##OpStr, v32i8, OpNode, VR256,
                                      load, i256mem>, VEX_4V, VEX_L, VEX_W;
  }
}

// GF2P8MULB
let Constraints = "$src1 = $dst",
    Predicates  = [HasGFNI, UseSSE2] in
defm GF2P8MULB      : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop,
                                    i128mem, 1>;
let Predicates  = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
  defm VGF2P8MULB   : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load,
                                   i128mem>, VEX_4V;
  defm VGF2P8MULBY  : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load,
                                   i256mem>, VEX_4V, VEX_L;
}
// GF2P8AFFINEINVQB, GF2P8AFFINEQB
let isCommutable = 0 in {
  defm GF2P8AFFINEINVQB : GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb",
                                             X86GF2P8affineinvqb>, TAPD;
  defm GF2P8AFFINEQB    : GF2P8AFFINE_common<0xCE, "gf2p8affineqb",
                                             X86GF2P8affineqb>, TAPD;
}
