xref: /freebsd/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSSE.td (revision b4af4f93c682e445bf159f0d1ec90b636296c946)
//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file describes the X86 SSE instruction set, defining the instructions,
// and properties of the instructions which are needed for code generation,
// machine code emission, and analysis.
//
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// SSE 1 & 2 Instructions Classes
//===----------------------------------------------------------------------===//

/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class.
///
/// Emits two codegen-only records per instantiation:
///   rr - register/register form (MRMSrcReg), marked commutable.
///   rm - register/memory form (MRMSrcMem); $src2 is matched as a plain
///        `load` and scheduled with sched.Folded / sched.ReadAfterFold.
/// Is2Addr only selects the assembly string: the two-operand
/// "$src2, $dst" spelling when set (the default), otherwise the three-operand
/// spelling that also prints $src1.
multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, X86MemOperand x86memop,
                           Domain d, X86FoldableSchedWrite sched,
                           bit Is2Addr = 1> {
let isCodeGenOnly = 1 in {
  let isCommutable = 1 in {
    def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], d>,
       Sched<[sched]>;
  }
  def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}

/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class.
///
/// Emits rr_Int (register/register) and rm_Int (register/memory) records, both
/// with hasSideEffects = 0. The result of OpNode is typed as VT. The memory
/// form is flagged mayLoad and matches its operand through the ComplexPattern
/// mem_cpat rather than a plain load. As in sse12_fp_scalar, Is2Addr chooses
/// between the two-operand and three-operand assembly strings built from
/// `asm`. NOTE(review): OpcodeStr is accepted but not referenced in this body.
multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
                               SDPatternOperator OpNode, RegisterClass RC,
                               ValueType VT, string asm, Operand memopr,
                               ComplexPattern mem_cpat, Domain d,
                               X86FoldableSchedWrite sched, bit Is2Addr = 1> {
let hasSideEffects = 0 in {
  def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], d>,
       Sched<[sched]>;
  let mayLoad = 1 in
  def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (VT (OpNode RC:$src1, mem_cpat:$src2)))], d>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}

/// sse12_fp_packed - SSE 1 & 2 packed instructions class.
///
/// Emits rr (register/register, commutable) and rm (register/memory, mayLoad)
/// records built on PI. The rr pattern types the OpNode result as `vt`; the rm
/// pattern loads $src2 through the caller-supplied mem_frag. Is2Addr chooses
/// between the two-operand and three-operand assembly strings.
multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, ValueType vt,
                           X86MemOperand x86memop, PatFrag mem_frag,
                           Domain d, X86FoldableSchedWrite sched,
                           bit Is2Addr = 1> {
  let isCommutable = 1 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>,
       Sched<[sched]>;
  let mayLoad = 1 in
    def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
          d>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}

/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class.
///
/// Same rr/rm shape as sse12_fp_packed, but the selection patterns are passed
/// in by the caller (pat_rr for the register form, pat_rm for the memory form)
/// instead of being derived from an SDNode. Both records set
/// hasSideEffects = 0; rr is commutable and rm is flagged mayLoad.
multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
                                      string OpcodeStr, X86MemOperand x86memop,
                                      X86FoldableSchedWrite sched,
                                      list<dag> pat_rr, list<dag> pat_rm,
                                      bit Is2Addr = 1> {
  let isCommutable = 1, hasSideEffects = 0 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       pat_rr, d>,
       Sched<[sched]>;
  let hasSideEffects = 0, mayLoad = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       pat_rm, d>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}


// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
// This is expanded by ExpandPostRAPseudos.
// One operand-less pseudo is defined per register class (FR32, FR64, VR128),
// each selected for the matching floating-point zero immediate and each
// restricted to NoAVX512.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
  def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
                   [(set FR64:$dst, fp64imm0)]>, Requires<[HasSSE2, NoAVX512]>;
  def FsFLD0F128 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                     [(set VR128:$dst, fp128imm0)]>, Requires<[HasSSE1, NoAVX512]>;
}

//===----------------------------------------------------------------------===//
// AVX & SSE - Zero/One Vectors
//===----------------------------------------------------------------------===//

// Alias instruction that maps zero vector to pxor / xorp* for sse.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
// swizzled by ExecutionDomainFix to pxor.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
// The pseudo's own pattern covers only the v4f32 all-zeros vector.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
               [(set VR128:$dst, (v4f32 immAllZerosV))]>;
}

// Select V_SET0 for all-zeros vectors of the remaining 128-bit element types
// (the v4f32 case is the pattern attached to V_SET0 itself).
let Predicates = [NoAVX512] in {
def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
}


// The same as done above but for AVX.  The 256-bit AVX1 ISA doesn't support PI,
// and doesn't need it because on sandy bridge the register is set to zero
// at the rename stage without using any execution unit, so SET0PSY
// and SET0PDY can be used for vector int instructions without penalty.
// The pseudo's own pattern covers only the v8i32 all-zeros vector.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                 [(set VR256:$dst, (v8i32 immAllZerosV))]>;
}

// Select AVX_SET0 for all-zeros vectors of the remaining 256-bit element types
// (the v8i32 case is the pattern attached to AVX_SET0 itself).
let Predicates = [NoAVX512] in {
def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
def : Pat<(v8f32 immAllZerosV), (AVX_SET0)>;
def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
}

// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-ones value if folding it would be beneficial.
// V_SETALLONES produces a 128-bit all-ones vector with no extra predicate.
// The two 256-bit pseudos share the same v8i32 pattern and are separated by
// predicate: AVX1_SETALLONES needs HasAVX1Only and OptForMinSize,
// AVX2_SETALLONES needs HasAVX2.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
  let Predicates = [HasAVX1Only, OptForMinSize] in {
  def AVX1_SETALLONES: I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
  }
  let Predicates = [HasAVX2] in
  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move FP Scalar Instructions
//
// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
// register copies because it's a partial register update; Register-to-register
// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
// that the insert be implementable in terms of a copy, and as just mentioned,
// we don't use movss/movsd for copies.
//===----------------------------------------------------------------------===//

/// sse12_move_rr - register/register scalar move pair.
///
///   rr     - opcode 0x10, MRMSrcReg, commutable; selected for
///            (OpNode $src1, $src2) typed as `vt`.
///   rr_REV - opcode 0x11, MRMDestReg, pattern-less and codegen-only; it
///            exists for the disassembler and is tied back to the rr record
///            through FoldGenData<Name#rr>.
/// Both use the assembly string base_opc concatenated with asm_opr.
/// NOTE(review): x86memop is accepted but not referenced in this body.
multiclass sse12_move_rr<SDNode OpNode, ValueType vt,
                         X86MemOperand x86memop, string base_opc,
                         string asm_opr, Domain d, string Name> {
  let isCommutable = 1 in
  def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src1, VR128:$src2),
              !strconcat(base_opc, asm_opr),
              [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], d>,
              Sched<[SchedWriteFShuffle.XMM]>;

  // For the disassembler
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                  (ins VR128:$src1, VR128:$src2),
                  !strconcat(base_opc, asm_opr), []>,
                  Sched<[SchedWriteFShuffle.XMM]>, FoldGenData<Name#rr>;
}

/// sse12_move - scalar move family for one element type.
///
/// For a given NAME this produces:
///   V#NAME rr / rr_REV - VEX three-operand register forms, selected only
///                        under [UseAVX, OptForSize].
///   V#NAME#mr          - VEX store of RC to memory (opcode 0x11).
///   NAME rr / rr_REV   - legacy two-operand register forms with $src1 tied
///                        to $dst, selected under [pred, NoSSE41_Or_OptForSize].
///   NAME#mr            - legacy store of RC to memory (opcode 0x11).
/// plus ".s"-suffixed assembler aliases that map onto the rr_REV encodings.
multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
                      X86MemOperand x86memop, string OpcodeStr,
                      Domain d, string Name, Predicate pred> {
  // AVX
  let Predicates = [UseAVX, OptForSize] in
  defm V#NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d,
                              "V"#Name>,
                              VEX_4V, VEX_LIG, VEX_WIG;

  def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(store RC:$src, addr:$dst)], d>,
                     VEX, VEX_LIG, Sched<[WriteFStore]>, VEX_WIG;
  // SSE1 & 2
  let Constraints = "$src1 = $dst" in {
    let Predicates = [pred, NoSSE41_Or_OptForSize] in
    defm NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
                              "\t{$src2, $dst|$dst, $src2}", d, Name>;
  }

  def NAME#mr   : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(store RC:$src, addr:$dst)], d>,
                     Sched<[WriteFStore]>;

  // ".s" spellings assemble to the reversed (rr_REV) encodings.
  def : InstAlias<"v"#OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  (!cast<Instruction>("V"#NAME#"rr_REV")
                   VR128:$dst, VR128:$src1, VR128:$src2), 0>;
  def : InstAlias<OpcodeStr#".s\t{$src2, $dst|$dst, $src2}",
                  (!cast<Instruction>(NAME#"rr_REV")
                   VR128:$dst, VR128:$src2), 0>;
}

// Loading from memory automatically zeroing upper bits.
//
// Four opcode-0x10 load records are produced per instantiation:
//   V#NAME#rm / NAME#rm         - result in VR128, selected for vzloadfrag
//                                 typed as `vt`.
//   V#NAME#rm_alt / NAME#rm_alt - codegen-only, result in RC, selected for
//                                 the plain load fragment mem_pat.
// The V-prefixed records carry the VEX, VEX_LIG and VEX_WIG attributes.
multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
                         PatFrag mem_pat, PatFrag vzloadfrag, string OpcodeStr,
                         Domain d> {
  def V#NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
                     VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
  def NAME#rm   : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
                     Sched<[WriteFLoad]>;

  // _alt version uses FR32/FR64 register class.
  let isCodeGenOnly = 1 in {
  def V#NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                         [(set RC:$dst, (mem_pat addr:$src))], d>,
                         VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
  def NAME#rm_alt   : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                         [(set RC:$dst, (mem_pat addr:$src))], d>,
                         Sched<[WriteFLoad]>;
  }
}

// Register/register and store forms of movss (XS prefix, FR32/v4f32) and
// movsd (XD prefix, FR64/v2f64), together with their VEX variants.
defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
                        SSEPackedSingle, "MOVSS", UseSSE1>, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
                        SSEPackedDouble, "MOVSD", UseSSE2>, XD;

// Load forms of movss / movsd. All generated load records are marked
// foldable-as-load and rematerializable.
let canFoldAsLoad = 1, isReMaterializable = 1 in {
  defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss",
                             SSEPackedSingle>, XS;
  defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, X86vzload64, "movsd",
                             SSEPackedDouble>, XD;
}

// Patterns
let Predicates = [UseAVX] in {
  // A scalar load placed into a 128-bit vector selects the VEX scalar load.
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (VMOVSSrm addr:$src)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (VMOVSDrm addr:$src)>;

  // Represent the same patterns above but in the form they appear for
  // 256-bit types
  def : Pat<(v8f32 (X86vzload32 addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
  def : Pat<(v4f64 (X86vzload64 addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
}

let Predicates = [UseAVX, OptForSize] in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVSS to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VMOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VMOVSSrr (v4i32 (V_SET0)), VR128:$src)>;

  // Move low f32 and clear high bits.
  // The 256-bit source is narrowed to its sub_xmm half, merged into a zeroed
  // VR128 with VMOVSSrr, and the result is widened back with SUBREG_TO_REG.
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4f32 (VMOVSSrr (v4f32 (V_SET0)),
              (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))), sub_xmm)>;
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4i32 (VMOVSSrr (v4i32 (V_SET0)),
              (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
}

let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
// Move scalar to XMM zero-extended, zeroing a VR128 then do a
// MOVSS to the lower bits.
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
          (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
          (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
}

// Non-VEX counterparts of the scalar_to_vector load patterns: a scalar load
// placed into a 128-bit vector selects MOVSDrm (SSE2) or MOVSSrm (SSE1).
let Predicates = [UseSSE2] in
def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
          (MOVSDrm addr:$src)>;

let Predicates = [UseSSE1] in
def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
          (MOVSSrm addr:$src)>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
//===----------------------------------------------------------------------===//

/// sse12_mov_packed - packed move, register and load forms.
///
///   rr - pattern-less register move (hasSideEffects = 0, isMoveReg = 1),
///        scheduled as sched.RR.
///   rm - load selected for (ld_frag addr:$src); foldable as a load and
///        rematerializable, scheduled as sched.RM.
multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
                            X86MemOperand x86memop, PatFrag ld_frag,
                            string asm, Domain d,
                            X86SchedWriteMoveLS sched> {
let hasSideEffects = 0, isMoveReg = 1 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
           Sched<[sched.RR]>;
let canFoldAsLoad = 1, isReMaterializable = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                   [(set RC:$dst, (ld_frag addr:$src))], d>,
           Sched<[sched.RM]>;
}

// VEX-encoded packed moves (rr and rm forms). Opcode 0x28 is used for the
// aligned variants and 0x10 for the unaligned ones; the aligned variants load
// through the alignedload* fragments. The 128-bit records operate on VR128 and
// the Y-suffixed records on VR256 with VEX_L.
let Predicates = [HasAVX, NoVLX] in {
defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                                PS, VEX, VEX_WIG;
defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                                PD, VEX, VEX_WIG;
defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                                PS, VEX, VEX_WIG;
defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                                PD, VEX, VEX_WIG;

defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps",
                                 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
                                 PS, VEX, VEX_L, VEX_WIG;
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd",
                                 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
                                 PD, VEX, VEX_L, VEX_WIG;
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups",
                                 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
                                 PS, VEX, VEX_L, VEX_WIG;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
                                 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
                                 PD, VEX, VEX_L, VEX_WIG;
}

// Legacy-encoded packed moves (rr and rm forms) on VR128. The single-precision
// variants require UseSSE1 and the double-precision variants UseSSE2.
let Predicates = [UseSSE1] in {
defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
                               SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                               PS;
defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
                               SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                               PS;
}
let Predicates = [UseSSE2] in {
defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
                               SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                               PD;
defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
                               SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                               PD;
}

// VEX-encoded packed stores. Opcode 0x29 (aligned) is selected for
// alignedstore and opcode 0x11 (unaligned) for a plain store. The first group
// stores VR128, the second VR256 with VEX_L.
let Predicates = [HasAVX, NoVLX] in {
let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)]>,
                   VEX, VEX_WIG;
def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)]>,
                   VEX, VEX_WIG;
def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)]>,
                   VEX, VEX_WIG;
def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)]>,
                   VEX, VEX_WIG;
} // SchedRW

let SchedRW = [SchedWriteFMoveLS.YMM.MR] in {
def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v8f32 VR256:$src), addr:$dst)]>,
                   VEX, VEX_L, VEX_WIG;
def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f64 VR256:$src), addr:$dst)]>,
                   VEX, VEX_L, VEX_WIG;
def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v8f32 VR256:$src), addr:$dst)]>,
                   VEX, VEX_L, VEX_WIG;
def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v4f64 VR256:$src), addr:$dst)]>,
                   VEX, VEX_L, VEX_WIG;
} // SchedRW
} // Predicate

// For disassembler
// Pattern-less MRMDestReg (store-opcode 0x29 / 0x11) encodings of the VEX
// register moves. Each record names its MRMSrcReg counterpart through
// FoldGenData.
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    isMoveReg = 1 in {
let SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
  def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
                          (ins VR128:$src),
                          "movaps\t{$src, $dst|$dst, $src}", []>,
                          VEX, VEX_WIG, FoldGenData<"VMOVAPSrr">;
  def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movapd\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVAPDrr">;
  def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movups\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVUPSrr">;
  def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movupd\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVUPDrr">;
} // SchedRW

let SchedRW = [SchedWriteFMoveLS.YMM.RR] in {
  def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movaps\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPSYrr">;
  def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movapd\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPDYrr">;
  def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movups\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPSYrr">;
  def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movupd\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPDYrr">;
} // SchedRW
} // isCodeGenOnly, ForceDisassemble, hasSideEffects, isMoveReg

// Reversed version with ".s" suffix for GAS compatibility.
// Each mnemonic is aliased twice: once for VR128 operands and once for VR256
// operands (mapping to the Y-suffixed record).
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                (VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                (VMOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                (VMOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                (VMOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                (VMOVAPSYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                (VMOVAPDYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                (VMOVUPSYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                (VMOVUPDYrr_REV VR256:$dst, VR256:$src), 0>;

// Legacy-encoded packed stores of VR128. Opcode 0x29 (aligned) is selected for
// alignedstore and opcode 0x11 (unaligned) for a plain store.
let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)]>;
def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)]>;
} // SchedRW

// For disassembler
// Pattern-less MRMDestReg (store-opcode 0x29 / 0x11) encodings of the legacy
// register moves. Each record names its MRMSrcReg counterpart through
// FoldGenData.
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movaps\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVAPSrr">;
  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movapd\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVAPDrr">;
  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movups\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVUPSrr">;
  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movupd\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVUPDrr">;
}

// Reversed version with ".s" suffix for GAS compatibility.
// Each ".s" mnemonic maps to the corresponding rr_REV record on VR128.
def : InstAlias<"movaps.s\t{$src, $dst|$dst, $src}",
                (MOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movapd.s\t{$src, $dst|$dst, $src}",
                (MOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movups.s\t{$src, $dst|$dst, $src}",
                (MOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movupd.s\t{$src, $dst|$dst, $src}",
                (MOVUPDrr_REV VR128:$dst, VR128:$src), 0>;

let Predicates = [HasAVX, NoVLX] in {
  // 256-bit load/store need to use floating point load/store in case we don't
  // have AVX2. Execution domain fixing will convert to integer if AVX2 is
  // available and changing the domain is beneficial.

  // Integer vector loads: aligned -> VMOVAPSYrm, unaligned -> VMOVUPSYrm.
  def : Pat<(alignedloadv4i64 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv8i32 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv16i16 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv32i8 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(loadv4i64 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv8i32 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv16i16 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv32i8 addr:$src),
            (VMOVUPSYrm addr:$src)>;

  // Integer vector stores: aligned -> VMOVAPSYmr, unaligned -> VMOVUPSYmr.
  def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v16i16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v32i8 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
}

// Use movaps / movups for SSE integer load / store (one byte shorter).
// The instructions selected below are then converted to MOVDQA/MOVDQU
// during the SSE domain pass.
let Predicates = [UseSSE1] in {
  // Integer vector loads: aligned -> MOVAPSrm, unaligned -> MOVUPSrm.
  def : Pat<(alignedloadv2i64 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv4i32 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv8i16 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv16i8 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv4i32 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv8i16 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv16i8 addr:$src),
            (MOVUPSrm addr:$src)>;

  // Integer vector stores: aligned -> MOVAPSmr, unaligned -> MOVUPSmr.
  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low packed FP Instructions
//===----------------------------------------------------------------------===//

/// sse12_mov_hilo_packed_base - load forms that merge a 64-bit memory operand
/// into a VR128 source.
///
///   PSrm - mnemonic base_opc + "s"; pattern-less, so it is explicitly flagged
///          hasSideEffects = 0 and mayLoad = 1.
///   PDrm - mnemonic base_opc + "d"; selected for
///          (pdnode $src1, (scalar_to_vector (loadf64 $src2))) typed as v2f64.
multiclass sse12_mov_hilo_packed_base<bits<8>opc,  SDNode pdnode,
                                      string base_opc, string asm_opr> {
  // No pattern as they need to be special cased between high and low.
  let hasSideEffects = 0, mayLoad = 1 in
  def PSrm : PI<opc, MRMSrcMem,
                (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                !strconcat(base_opc, "s", asm_opr),
                [], SSEPackedSingle>, PS,
                Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;

  def PDrm : PI<opc, MRMSrcMem,
         (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
         !strconcat(base_opc, "d", asm_opr),
     [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
                              (scalar_to_vector (loadf64 addr:$src2)))))],
              SSEPackedDouble>, PD,
     Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
}

/// sse12_mov_hilo_packed - Instantiates sse12_mov_hilo_packed_base twice:
///   V#NAME - under UseAVX, with VEX_4V/VEX_WIG and a three-operand asm string.
///   NAME   - with $src1 tied to $dst and a two-operand asm string.
multiclass sse12_mov_hilo_packed<bits<8> opc, SDPatternOperator pdnode,
                                 string base_opc> {
  let Predicates = [UseAVX] in
  defm V#NAME : sse12_mov_hilo_packed_base<
                    opc, pdnode, base_opc,
                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
                VEX_4V, VEX_WIG;

  let Constraints = "$src1 = $dst" in
  defm NAME : sse12_mov_hilo_packed_base<
                  opc, pdnode, base_opc,
                  "\t{$src2, $dst|$dst, $src2}">;
}
649
// Load forms: opcode 0x12, PD pattern built on X86Movsd, mnemonic stem "movlp".
defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;

// Store forms (opcode 0x13); all of them use the WriteFStore scheduling class.
let SchedRW = [WriteFStore] in {
  let Predicates = [UseAVX] in {
    // The PS store has no pattern, so mayStore/hasSideEffects are explicit.
    let hasSideEffects = 0, mayStore = 1 in
    def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs),
                         (ins f64mem:$dst, VR128:$src),
                         "movlps\t{$src, $dst|$dst, $src}", []>,
                    VEX, VEX_WIG;
    // Matches a store of element 0 of a v2f64 register as an f64.
    def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs),
                         (ins f64mem:$dst, VR128:$src),
                         "movlpd\t{$src, $dst|$dst, $src}",
                         [(store (f64 (extractelt (v2f64 VR128:$src),
                                                  (iPTR 0))),
                                 addr:$dst)]>,
                    VEX, VEX_WIG;
  } // UseAVX

  // Non-VEX equivalents of the two stores above.
  let hasSideEffects = 0, mayStore = 1 in
  def MOVLPSmr : PSI<0x13, MRMDestMem, (outs),
                     (ins f64mem:$dst, VR128:$src),
                     "movlps\t{$src, $dst|$dst, $src}", []>;
  def MOVLPDmr : PDI<0x13, MRMDestMem, (outs),
                     (ins f64mem:$dst, VR128:$src),
                     "movlpd\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt (v2f64 VR128:$src),
                                              (iPTR 0))),
                             addr:$dst)]>;
} // SchedRW
674
let Predicates = [UseSSE1] in {
  // These two patterns help select MOVLPS on SSE1-only targets. With SSE2 a
  // movsd or blend is produced instead of shufp. An aligned load is not
  // needed because only 64 bits are loaded.
  def : Pat<(X86Shufp (v4f32 (simple_load addr:$src2)),
                      VR128:$src1, (i8 -28)),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)),
                      VR128:$src1, (i8 -28)),
            (MOVLPSrm VR128:$src1, addr:$src2)>;

  // A bare X86vzload64 of v4f32 is selected as MOVLPS into a V_SET0 register.
  def : Pat<(v4f32 (X86vzload64 addr:$src)),
            (MOVLPSrm (v4f32 (V_SET0)), addr:$src)>;

  // X86vextractstore64 of a v4f32 register is selected as MOVLPSmr.
  def : Pat<(X86vextractstore64 (v4f32 VR128:$src), addr:$dst),
            (MOVLPSmr addr:$dst, VR128:$src)>;
}
690
691//===----------------------------------------------------------------------===//
692// SSE 1 & 2 - Move Hi packed FP Instructions
693//===----------------------------------------------------------------------===//
694
// Load forms: opcode 0x16, PD pattern built on X86Unpckl, stem "movhp".
defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">;

// Store forms (opcode 0x17); all of them use the WriteFStore scheduling class.
let SchedRW = [WriteFStore] in {
  // v2f64 extract element 1 is always custom lowered to unpack high to low
  // and extract element 0, so the non-store version isn't too horrible.
  let Predicates = [UseAVX] in {
    // The PS store has no pattern, so mayStore/hasSideEffects are explicit.
    let hasSideEffects = 0, mayStore = 1 in
    def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs),
                         (ins f64mem:$dst, VR128:$src),
                         "movhps\t{$src, $dst|$dst, $src}", []>,
                    VEX, VEX_WIG;
    def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs),
                         (ins f64mem:$dst, VR128:$src),
                         "movhpd\t{$src, $dst|$dst, $src}",
                         [(store (f64 (extractelt
                                        (v2f64 (X86Unpckh VR128:$src,
                                                          VR128:$src)),
                                        (iPTR 0))),
                                 addr:$dst)]>,
                    VEX, VEX_WIG;
  } // UseAVX

  // Non-VEX equivalents of the two stores above.
  let hasSideEffects = 0, mayStore = 1 in
  def MOVHPSmr : PSI<0x17, MRMDestMem, (outs),
                     (ins f64mem:$dst, VR128:$src),
                     "movhps\t{$src, $dst|$dst, $src}", []>;
  def MOVHPDmr : PDI<0x17, MRMDestMem, (outs),
                     (ins f64mem:$dst, VR128:$src),
                     "movhpd\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt
                                    (v2f64 (X86Unpckh VR128:$src,
                                                      VR128:$src)),
                                    (iPTR 0))),
                             addr:$dst)]>;
} // SchedRW
721
let Predicates = [UseAVX] in {
  // VMOVHPD load patterns. The i64-load form is matched too, because it may
  // get selected as a faster way to load the data.
  def : Pat<(v2f64 (X86Unpckl
                     VR128:$src1,
                     (bc_v2f64
                       (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;

  // A store of element 0 of (X86VPermilpi $src, 1) is selected as VMOVHPDmr.
  def : Pat<(store (f64 (extractelt
                          (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
                          (iPTR 0))),
                   addr:$dst),
            (VMOVHPDmr addr:$dst, VR128:$src)>;

  // VMOVLPD load pattern.
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;
}
740
let Predicates = [UseSSE1] in {
  // These two patterns help select MOVHPS on SSE1-only targets. With SSE2 a
  // movsd or blend is produced instead of shufp. An aligned load is not
  // needed because only 64 bits are loaded.
  def : Pat<(X86Movlhps VR128:$src1,
                        (v4f32 (simple_load addr:$src2))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Movlhps VR128:$src1,
                        (v4f32 (X86vzload64 addr:$src2))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;

  // X86vextractstore64 of (X86Movhlps $src, $src) is selected as MOVHPSmr.
  def : Pat<(X86vextractstore64
              (v4f32 (X86Movhlps VR128:$src, VR128:$src)), addr:$dst),
            (MOVHPSmr addr:$dst, VR128:$src)>;
}
754
let Predicates = [UseSSE2] in {
  // MOVHPD load patterns. The i64-load form is matched too, because it may
  // get selected as a faster way to load the data.
  def : Pat<(v2f64 (X86Unpckl
                     VR128:$src1,
                     (bc_v2f64
                       (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;

  // A store of element 0 of (X86Shufp $src, $src, 1) is selected as MOVHPDmr.
  def : Pat<(store (f64 (extractelt
                          (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
                          (iPTR 0))),
                   addr:$dst),
            (MOVHPDmr addr:$dst, VR128:$src)>;

  // MOVLPD load pattern.
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
}
775
// Use MOVLPD to load into the low bits from a full vector unless BLENDPD can
// be used instead.
def : Pat<(X86Movsd VR128:$src1, (v2f64 (simple_load addr:$src2))),
          (MOVLPDrm VR128:$src1, addr:$src2)>,
      Requires<[UseSSE2, NoSSE41_Or_OptForSize]>;
782
783//===----------------------------------------------------------------------===//
784// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
785//===----------------------------------------------------------------------===//
786
// VEX-encoded, three-operand register forms of MOVLHPS / MOVHLPS.
let Predicates = [UseAVX] in {
  def VMOVLHPSrr : VPSI<0x16, MRMSrcReg,
                        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                        "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                              (v4f32 (X86Movlhps VR128:$src1,
                                                 VR128:$src2)))]>,
                   VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG;

  // Only the MOVHLPS form is marked commutable; it is also NotMemoryFoldable.
  let isCommutable = 1 in
  def VMOVHLPSrr : VPSI<0x12, MRMSrcReg,
                        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                        "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                              (v4f32 (X86Movhlps VR128:$src1,
                                                 VR128:$src2)))]>,
                   VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG,
                   NotMemoryFoldable;
}
// Two-operand register forms of MOVLHPS / MOVHLPS; $src1 is tied to $dst.
let Constraints = "$src1 = $dst" in {
  def MOVLHPSrr : PSI<0x16, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                            (v4f32 (X86Movlhps VR128:$src1,
                                               VR128:$src2)))]>,
                  Sched<[SchedWriteFShuffle.XMM]>;

  // Only the MOVHLPS form is marked commutable; it is also NotMemoryFoldable.
  let isCommutable = 1 in
  def MOVHLPSrr : PSI<0x12, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                            (v4f32 (X86Movhlps VR128:$src1,
                                               VR128:$src2)))]>,
                  Sched<[SchedWriteFShuffle.XMM]>, NotMemoryFoldable;
}
818
819//===----------------------------------------------------------------------===//
820// SSE 1 & 2 - Conversion Instructions
821//===----------------------------------------------------------------------===//
822
/// sse12_cvt_s - Scalar conversion with a register (rr) and a memory (rm) form.
///   SrcRC / DstRC - source and destination register classes.
///   OpNode        - conversion node applied to the source in both patterns.
///   x86memop      - memory operand type of the rm form.
///   ld_frag       - load fragment wrapped around addr:$src in the rm pattern.
///   asm / mem     - mnemonic used by the rr form / by the rm form.
///   sched         - scheduling class; the rm form uses sched.Folded.
///   d             - execution domain applied to both forms.
///   Int2Fpu       - extra SchedRead listed on the rr form (ReadDefault unless
///                   overridden).
multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                       SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
                       string asm, string mem, X86FoldableSchedWrite sched,
                       Domain d,
                       SchedRead Int2Fpu = ReadDefault> {
  let ExeDomain = d in {
    def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
                asm # "\t{$src, $dst|$dst, $src}",
                [(set DstRC:$dst, (OpNode SrcRC:$src))]>,
             Sched<[sched, Int2Fpu]>;
    def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
                mem # "\t{$src, $dst|$dst, $src}",
                [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
             Sched<[sched.Folded]>;
  }
}
839
/// sse12_cvt_p - Packed signed-integer to FP conversion (any_sint_to_fp from
/// SrcTy to DstTy) with a register (rr) and a memory (rm) form.
///   RC       - register class of both source and destination.
///   x86memop - memory operand type of the rm form.
///   ld_frag  - load fragment used in the rm pattern.
///   asm      - complete assembly string (mnemonic and operands).
///   d        - execution domain passed to both definitions.
///   sched    - scheduling class; the rm form uses sched.Folded.
multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
                       ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
                       string asm, Domain d, X86FoldableSchedWrite sched> {
  // Both forms read MXCSR and may raise an FP exception.
  let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in {
    def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
               [(set RC:$dst,
                     (DstTy (any_sint_to_fp (SrcTy RC:$src))))],
               d>,
             Sched<[sched]>;

    let mayLoad = 1 in
    def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
               [(set RC:$dst,
                     (DstTy (any_sint_to_fp (SrcTy (ld_frag addr:$src)))))],
               d>,
             Sched<[sched.Folded]>;
  }
}
854
/// sse12_vcvt_avx - Pattern-less three-operand AVX scalar conversion with a
/// register (rr) and a memory (rm) form. $src1 has the destination register
/// class; $src is the value being converted.
///   asm   - mnemonic.
///   mem   - suffix emitted inside "{...}" after the mnemonic of the rm form.
///   sched - scheduling class; rm uses sched.Folded / sched.ReadAfterFold.
///   d     - execution domain applied to both forms.
multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          X86MemOperand x86memop, string asm, string mem,
                          X86FoldableSchedWrite sched, Domain d> {
  let hasSideEffects = 0, Predicates = [UseAVX], ExeDomain = d in {
    def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst),
                (ins DstRC:$src1, SrcRC:$src),
                asm # "\t{$src, $src1, $dst|$dst, $src1, $src}", []>,
             Sched<[sched, ReadDefault, ReadInt2Fpu]>;

    let mayLoad = 1 in
    def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
                (ins DstRC:$src1, x86memop:$src),
                !strconcat(asm, "{", mem,
                           "}\t{$src, $src1, $dst|$dst, $src1, $src}"),
                []>,
             Sched<[sched.Folded, sched.ReadAfterFold]>;
  } // hasSideEffects = 0
}
869
// Codegen-only VEX forms of the truncating FP -> signed integer conversions
// (FR32/FR64 source, GR32/GR64 destination). All read MXCSR and may raise an
// FP exception.
let isCodeGenOnly = 1, Predicates = [UseAVX], Uses = [MXCSR],
    mayRaiseFPException = 1 in {
  defm VCVTTSS2SI   : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint,
                                  f32mem, loadf32,
                                  "cvttss2si", "cvttss2si",
                                  WriteCvtSS2I, SSEPackedSingle>,
                      XS, VEX, VEX_LIG;
  defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint,
                                  f32mem, loadf32,
                                  "cvttss2si", "cvttss2si",
                                  WriteCvtSS2I, SSEPackedSingle>,
                      XS, VEX, VEX_W, VEX_LIG;
  defm VCVTTSD2SI   : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint,
                                  f64mem, loadf64,
                                  "cvttsd2si", "cvttsd2si",
                                  WriteCvtSD2I, SSEPackedDouble>,
                      XD, VEX, VEX_LIG;
  defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint,
                                  f64mem, loadf64,
                                  "cvttsd2si", "cvttsd2si",
                                  WriteCvtSD2I, SSEPackedDouble>,
                      XD, VEX, VEX_W, VEX_LIG;
}
888
// The assembler can recognize rr 64-bit instructions by seeing a rxx
// register, but the same isn't true when only memory operands are used, so
// provide additional "l" and "q" assembly forms to address this explicitly
// where appropriate.
// Codegen-only VEX forms of the integer -> scalar FP conversions. The memory
// forms carry an "l" or "q" suffix alternative in their assembly string.
let isCodeGenOnly = 1 in {
  defm VCVTSI2SS   : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem,
                                    "cvtsi2ss", "l",
                                    WriteCvtI2SS, SSEPackedSingle>,
                     XS, VEX_4V, VEX_LIG, SIMD_EXC;
  defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem,
                                    "cvtsi2ss", "q",
                                    WriteCvtI2SS, SSEPackedSingle>,
                     XS, VEX_4V, VEX_W, VEX_LIG, SIMD_EXC;
  // The GR32 -> FR64 form is the only one without SIMD_EXC.
  // NOTE(review): presumably because an i32 -> f64 conversion is always
  // exact - confirm.
  defm VCVTSI2SD   : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem,
                                    "cvtsi2sd", "l",
                                    WriteCvtI2SD, SSEPackedDouble>,
                     XD, VEX_4V, VEX_LIG;
  defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem,
                                    "cvtsi2sd", "q",
                                    WriteCvtI2SD, SSEPackedDouble>,
                     XD, VEX_4V, VEX_W, VEX_LIG, SIMD_EXC;
} // isCodeGenOnly = 1
907
// Select any_sint_to_fp to the pattern-less VCVTSI2* instructions above. The
// extra first source operand is given an IMPLICIT_DEF of the result type.
let Predicates = [UseAVX] in {
  // Memory source.
  def : Pat<(f32 (any_sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SSrm   (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f32 (any_sint_to_fp (loadi64 addr:$src))),
            (VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (any_sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SDrm   (f64 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (any_sint_to_fp (loadi64 addr:$src))),
            (VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;

  // Register source.
  def : Pat<(f32 (any_sint_to_fp GR32:$src)),
            (VCVTSI2SSrr   (f32 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f32 (any_sint_to_fp GR64:$src)),
            (VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
  def : Pat<(f64 (any_sint_to_fp GR32:$src)),
            (VCVTSI2SDrr   (f64 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f64 (any_sint_to_fp GR64:$src)),
            (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
}
927
// Codegen-only non-VEX scalar conversions between FR32/FR64 and GR32/GR64.
let isCodeGenOnly = 1 in {
  // Truncating FP -> signed integer.
  defm CVTTSS2SI   : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint,
                                 f32mem, loadf32,
                                 "cvttss2si", "cvttss2si",
                                 WriteCvtSS2I, SSEPackedSingle>,
                     XS, SIMD_EXC;
  defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint,
                                 f32mem, loadf32,
                                 "cvttss2si", "cvttss2si",
                                 WriteCvtSS2I, SSEPackedSingle>,
                     XS, REX_W, SIMD_EXC;
  defm CVTTSD2SI   : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint,
                                 f64mem, loadf64,
                                 "cvttsd2si", "cvttsd2si",
                                 WriteCvtSD2I, SSEPackedDouble>,
                     XD, SIMD_EXC;
  defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint,
                                 f64mem, loadf64,
                                 "cvttsd2si", "cvttsd2si",
                                 WriteCvtSD2I, SSEPackedDouble>,
                     XD, REX_W, SIMD_EXC;

  // Signed integer -> FP. The memory forms use an "{l}" / "{q}" mnemonic
  // suffix and every form lists ReadInt2Fpu as its extra SchedRead.
  defm CVTSI2SS    : sse12_cvt_s<0x2A, GR32, FR32, any_sint_to_fp,
                                 i32mem, loadi32,
                                 "cvtsi2ss", "cvtsi2ss{l}",
                                 WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>,
                     XS, SIMD_EXC;
  defm CVTSI642SS  : sse12_cvt_s<0x2A, GR64, FR32, any_sint_to_fp,
                                 i64mem, loadi64,
                                 "cvtsi2ss", "cvtsi2ss{q}",
                                 WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>,
                     XS, REX_W, SIMD_EXC;
  // The GR32 -> FR64 form is the only one without SIMD_EXC.
  defm CVTSI2SD    : sse12_cvt_s<0x2A, GR32, FR64, any_sint_to_fp,
                                 i32mem, loadi32,
                                 "cvtsi2sd", "cvtsi2sd{l}",
                                 WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>,
                     XD;
  defm CVTSI642SD  : sse12_cvt_s<0x2A, GR64, FR64, any_sint_to_fp,
                                 i64mem, loadi64,
                                 "cvtsi2sd", "cvtsi2sd{q}",
                                 WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>,
                     XD, REX_W, SIMD_EXC;
} // isCodeGenOnly = 1
954
955// Conversion Instructions Intrinsics - Match intrinsics which expect MM
956// and/or XMM operand(s).
957
/// sse12_cvt_sint - Conversion with a register (rr_Int) and a memory (rm_Int)
/// form, each matching OpNode applied to a SrcVT operand and producing DstVT.
///   SrcRC / DstRC - source and destination register classes.
///   memop         - memory operand type of the rm_Int form.
///   mem_cpat      - complex pattern matching the memory source of rm_Int.
///   asm           - mnemonic.
///   sched         - scheduling class; rm_Int uses sched.Folded.
///   d             - execution domain applied to both forms.
multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          ValueType DstVT, ValueType SrcVT, SDNode OpNode,
                          Operand memop, ComplexPattern mem_cpat, string asm,
                          X86FoldableSchedWrite sched, Domain d> {
  let ExeDomain = d in {
    def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
                    asm # "\t{$src, $dst|$dst, $src}",
                    [(set DstRC:$dst,
                          (DstVT (OpNode (SrcVT SrcRC:$src))))]>,
                 Sched<[sched]>;
    def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
                    asm # "\t{$src, $dst|$dst, $src}",
                    [(set DstRC:$dst,
                          (DstVT (OpNode (SrcVT mem_cpat:$src))))]>,
                 Sched<[sched.Folded]>;
  }
}
973
/// sse12_cvt_sint_3addr - Pattern-less conversion taking $src1 (destination
/// register class) and $src2 (the value to convert), with a register (rr_Int)
/// and a memory (rm_Int) form.
///   asm     - mnemonic.
///   mem     - suffix emitted inside "{...}" after the mnemonic of rm_Int.
///   sched   - scheduling class; rm_Int uses sched.Folded/sched.ReadAfterFold.
///   d       - execution domain applied to both forms.
///   Is2Addr - when set, the asm string lists only $src2 and $dst; otherwise
///             it lists $src2, $src1 and $dst.
multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
                    RegisterClass DstRC, X86MemOperand x86memop,
                    string asm, string mem, X86FoldableSchedWrite sched,
                    Domain d, bit Is2Addr = 1> {
  let hasSideEffects = 0, ExeDomain = d in {
    def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst),
                    (ins DstRC:$src1, SrcRC:$src2),
                    !strconcat(asm,
                               !if(Is2Addr,
                                   "\t{$src2, $dst|$dst, $src2}",
                                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                    []>,
                 Sched<[sched, ReadDefault, ReadInt2Fpu]>;

    let mayLoad = 1 in
    def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst),
                    (ins DstRC:$src1, x86memop:$src2),
                    !strconcat(asm, "{", mem, "}",
                               !if(Is2Addr,
                                   "\t{$src2, $dst|$dst, $src2}",
                                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                    []>,
                 Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}
993
// cvtsd2si (X86cvts2si on a v2f64 source), VEX and non-VEX forms with GR32
// and GR64 destinations. All read MXCSR and may raise an FP exception.
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  let Predicates = [UseAVX] in {
    defm VCVTSD2SI   : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64,
                                      X86cvts2si, sdmem, sse_load_f64,
                                      "cvtsd2si", WriteCvtSD2I,
                                      SSEPackedDouble>,
                       XD, VEX, VEX_LIG;
    defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64,
                                      X86cvts2si, sdmem, sse_load_f64,
                                      "cvtsd2si", WriteCvtSD2I,
                                      SSEPackedDouble>,
                       XD, VEX, VEX_W, VEX_LIG;
  }
  defm CVTSD2SI   : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64,
                                   X86cvts2si, sdmem, sse_load_f64,
                                   "cvtsd2si", WriteCvtSD2I,
                                   SSEPackedDouble>,
                    XD;
  defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64,
                                   X86cvts2si, sdmem, sse_load_f64,
                                   "cvtsd2si", WriteCvtSD2I,
                                   SSEPackedDouble>,
                    XD, REX_W;
}
1010
// VEX cvtsi2ss / cvtsi2sd on VR128. Is2Addr is passed as 0, so the asm
// string lists all three operands.
let Predicates = [UseAVX] in {
  defm VCVTSI2SS   : sse12_cvt_sint_3addr<0x2A, GR32, VR128, i32mem,
                                          "cvtsi2ss", "l", WriteCvtI2SS,
                                          SSEPackedSingle, 0>,
                     XS, VEX_4V, VEX_LIG, SIMD_EXC;
  defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128, i64mem,
                                          "cvtsi2ss", "q", WriteCvtI2SS,
                                          SSEPackedSingle, 0>,
                     XS, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC;
  // The GR32 -> double form is the only one without SIMD_EXC.
  defm VCVTSI2SD   : sse12_cvt_sint_3addr<0x2A, GR32, VR128, i32mem,
                                          "cvtsi2sd", "l", WriteCvtI2SD,
                                          SSEPackedDouble, 0>,
                     XD, VEX_4V, VEX_LIG;
  defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128, i64mem,
                                          "cvtsi2sd", "q", WriteCvtI2SD,
                                          SSEPackedDouble, 0>,
                     XD, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC;
}
// Non-VEX cvtsi2ss / cvtsi2sd on VR128. $src1 is tied to $dst and Is2Addr
// keeps its default, so the asm string lists two operands.
let Constraints = "$src1 = $dst" in {
  defm CVTSI2SS   : sse12_cvt_sint_3addr<0x2A, GR32, VR128, i32mem,
                                         "cvtsi2ss", "l", WriteCvtI2SS,
                                         SSEPackedSingle>,
                    XS, SIMD_EXC;
  defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128, i64mem,
                                         "cvtsi2ss", "q", WriteCvtI2SS,
                                         SSEPackedSingle>,
                    XS, REX_W, SIMD_EXC;
  // The GR32 -> double form is the only one without SIMD_EXC.
  defm CVTSI2SD   : sse12_cvt_sint_3addr<0x2A, GR32, VR128, i32mem,
                                         "cvtsi2sd", "l", WriteCvtI2SD,
                                         SSEPackedDouble>,
                    XD;
  defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128, i64mem,
                                         "cvtsi2sd", "q", WriteCvtI2SD,
                                         SSEPackedDouble>,
                    XD, REX_W, SIMD_EXC;
}
1039
// "att"-variant aliases for the VEX cvtsi2ss / cvtsi2sd register forms,
// with an "{l}" or "{q}" mnemonic suffix.
def : InstAlias<"vcvtsi2ss{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VCVTSI2SSrr_Int VR128:$dst, VR128:$src1, GR32:$src2),
                0, "att">;
def : InstAlias<"vcvtsi2ss{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VCVTSI642SSrr_Int VR128:$dst, VR128:$src1, GR64:$src2),
                0, "att">;
def : InstAlias<"vcvtsi2sd{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VCVTSI2SDrr_Int VR128:$dst, VR128:$src1, GR32:$src2),
                0, "att">;
def : InstAlias<"vcvtsi2sd{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VCVTSI642SDrr_Int VR128:$dst, VR128:$src1, GR64:$src2),
                0, "att">;

// Suffix-less memory forms map to the i32mem instructions.
def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
                (VCVTSI2SSrm_Int VR128:$dst, VR128:$src1, i32mem:$src),
                0, "att">;
def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
                (VCVTSI2SDrm_Int VR128:$dst, VR128:$src1, i32mem:$src),
                0, "att">;
1053
// "att"-variant aliases for the non-VEX cvtsi2ss / cvtsi2sd register forms,
// with an "{l}" or "{q}" mnemonic suffix.
def : InstAlias<"cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
                (CVTSI2SSrr_Int VR128:$dst, GR32:$src),
                0, "att">;
def : InstAlias<"cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
                (CVTSI642SSrr_Int VR128:$dst, GR64:$src),
                0, "att">;
def : InstAlias<"cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
                (CVTSI2SDrr_Int VR128:$dst, GR32:$src),
                0, "att">;
def : InstAlias<"cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
                (CVTSI642SDrr_Int VR128:$dst, GR64:$src),
                0, "att">;

// Suffix-less memory forms map to the i32mem instructions.
def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
                (CVTSI2SSrm_Int VR128:$dst, i32mem:$src),
                0, "att">;
def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
                (CVTSI2SDrm_Int VR128:$dst, i32mem:$src),
                0, "att">;
1067
1068/// SSE 1 Only
1069
1070// Aliases for intrinsics
// VEX truncating conversions matched through X86cvtts2Int on VR128 sources.
// All read MXCSR and may raise an FP exception.
let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
                                ssmem, sse_load_f32, "cvttss2si",
                                WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
                               X86cvtts2Int, ssmem, sse_load_f32,
                               "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
                               XS, VEX, VEX_LIG, VEX_W;
// The double-precision forms use WriteCvtSD2I, matching the non-VEX
// CVTTSD2SI definitions and the codegen-only VCVTTSD2SI definitions in this
// file (they previously used the single-precision class WriteCvtSS2I).
defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
                                sdmem, sse_load_f64, "cvttsd2si",
                                WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
                              X86cvtts2Int, sdmem, sse_load_f64,
                              "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>,
                              XD, VEX, VEX_LIG, VEX_W;
}
// Non-VEX truncating conversions matched through X86cvtts2Int on VR128
// sources. All read MXCSR and may raise an FP exception.
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  defm CVTTSS2SI   : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32,
                                    X86cvtts2Int, ssmem, sse_load_f32,
                                    "cvttss2si", WriteCvtSS2I,
                                    SSEPackedSingle>,
                     XS;
  defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
                                    X86cvtts2Int, ssmem, sse_load_f32,
                                    "cvttss2si", WriteCvtSS2I,
                                    SSEPackedSingle>,
                     XS, REX_W;
  defm CVTTSD2SI   : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64,
                                    X86cvtts2Int, sdmem, sse_load_f64,
                                    "cvttsd2si", WriteCvtSD2I,
                                    SSEPackedDouble>,
                     XD;
  defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
                                    X86cvtts2Int, sdmem, sse_load_f64,
                                    "cvttsd2si", WriteCvtSD2I,
                                    SSEPackedDouble>,
                     XD, REX_W;
}
1103
// "att"-variant aliases adding an "{l}" suffix to the GR32 forms of the VEX
// truncating conversions.
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrr_Int GR32:$dst, VR128:$src),
                0, "att">;
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrm_Int GR32:$dst, f32mem:$src),
                0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrr_Int GR32:$dst, VR128:$src),
                0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrm_Int GR32:$dst, f64mem:$src),
                0, "att">;
// Same, with a "{q}" suffix for the GR64 forms.
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rr_Int GR64:$dst, VR128:$src),
                0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rm_Int GR64:$dst, f32mem:$src),
                0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rr_Int GR64:$dst, VR128:$src),
                0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rm_Int GR64:$dst, f64mem:$src),
                0, "att">;
1120
// "att"-variant aliases adding an "{l}" suffix to the GR32 forms of the
// non-VEX truncating conversions.
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrr_Int GR32:$dst, VR128:$src),
                0, "att">;
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrm_Int GR32:$dst, f32mem:$src),
                0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrr_Int GR32:$dst, VR128:$src),
                0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrm_Int GR32:$dst, f64mem:$src),
                0, "att">;
// Same, with a "{q}" suffix for the GR64 forms.
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rr_Int GR64:$dst, VR128:$src),
                0, "att">;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rm_Int GR64:$dst, f32mem:$src),
                0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rr_Int GR64:$dst, VR128:$src),
                0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rm_Int GR64:$dst, f64mem:$src),
                0, "att">;
1137
// VEX cvtss2si (X86cvts2si on a v4f32 source) with GR32 and GR64
// destinations. Both read MXCSR and may raise an FP exception.
let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
  defm VCVTSS2SI   : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32,
                                    X86cvts2si, ssmem, sse_load_f32,
                                    "cvtss2si", WriteCvtSS2I,
                                    SSEPackedSingle>,
                     XS, VEX, VEX_LIG;
  defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32,
                                    X86cvts2si, ssmem, sse_load_f32,
                                    "cvtss2si", WriteCvtSS2I,
                                    SSEPackedSingle>,
                     XS, VEX, VEX_W, VEX_LIG;
}
// Non-VEX cvtss2si plus the packed int -> float conversions (cvtdq2ps).
// Everything in this group reads MXCSR and may raise an FP exception.
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  defm CVTSS2SI   : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32,
                                   X86cvts2si, ssmem, sse_load_f32,
                                   "cvtss2si", WriteCvtSS2I,
                                   SSEPackedSingle>,
                    XS;
  defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32,
                                   X86cvts2si, ssmem, sse_load_f32,
                                   "cvtss2si", WriteCvtSS2I,
                                   SSEPackedSingle>,
                    XS, REX_W;

  // VEX 128-bit and 256-bit forms, restricted to HasAVX with NoVLX.
  defm VCVTDQ2PS  : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load,
                                "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                                SSEPackedSingle, WriteCvtI2PS>,
                    PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
  defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load,
                                "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                                SSEPackedSingle, WriteCvtI2PSY>,
                    PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;

  // Non-VEX form; it uses memop rather than load as its load fragment.
  defm CVTDQ2PS   : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop,
                                "cvtdq2ps\t{$src, $dst|$dst, $src}",
                                SSEPackedSingle, WriteCvtI2PS>,
                    PS, Requires<[UseSSE2]>;
}
1168
1169// AVX aliases
// "att"-variant aliases adding an "{l}" suffix to the GR32 forms of the VEX
// cvtss2si / cvtsd2si instructions.
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrr_Int GR32:$dst, VR128:$src),
                0, "att">;
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src),
                0, "att">;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrr_Int GR32:$dst, VR128:$src),
                0, "att">;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src),
                0, "att">;
// Same, with a "{q}" suffix for the GR64 forms.
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src),
                0, "att">;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src),
                0, "att">;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src),
                0, "att">;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src),
                0, "att">;
1186
1187// SSE aliases
// "att"-variant aliases adding an "{l}" suffix to the GR32 forms of the
// non-VEX cvtss2si / cvtsd2si instructions.
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrr_Int GR32:$dst, VR128:$src),
                0, "att">;
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrm_Int GR32:$dst, ssmem:$src),
                0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrr_Int GR32:$dst, VR128:$src),
                0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrm_Int GR32:$dst, sdmem:$src),
                0, "att">;
// Same, with a "{q}" suffix for the GR64 forms.
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rr_Int GR64:$dst, VR128:$src),
                0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src),
                0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rr_Int GR64:$dst, VR128:$src),
                0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src),
                0, "att">;
1204
1205/// SSE 2 Only
1206
1207// Convert scalar double to scalar single
// AVX (UseAVX) FR32/FR64 forms. Both are codegen-only, take an extra FR32
// $src1 operand next to the FR64 source, and carry an empty pattern list; the
// memory form is therefore flagged mayLoad by hand. Instruction selection for
// a plain f32 any_fpround goes through the standalone Pat after the let
// block, which passes IMPLICIT_DEF for $src1. No pattern in this block
// selects the rm form.
// NOTE(review): VCVTSD2SSrr spells its mnemonic "cvtsd2ss" while the rm form
// spells "vcvtsd2ss"; presumably the VSDI class prepends the "v" itself --
// confirm against the VSDI definition.
1208let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX] in {
1209def VCVTSD2SSrr  : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
1210                        (ins FR32:$src1, FR64:$src2),
1211                        "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1212                        VEX_4V, VEX_LIG, VEX_WIG,
1213                        Sched<[WriteCvtSD2SS]>, SIMD_EXC;
1214let mayLoad = 1 in
1215def VCVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst),
1216                     (ins FR32:$src1, f64mem:$src2),
1217                     "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1218                     XD, VEX_4V, VEX_LIG, VEX_WIG,
1219                     Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC;
1220}
1221
// Select the register form for f64 -> f32 rounding under AVX.
1222def : Pat<(f32 (any_fpround FR64:$src)),
1223            (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
1224          Requires<[UseAVX]>;
1225
// SSE2 FR64 -> FR32 conversion, codegen-only. Unlike the AVX forms these are
// two-operand and carry their any_fpround selection pattern directly. The
// memory form folds a loadf64 and is additionally gated on OptForSize.
1226let isCodeGenOnly = 1 in {
1227def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
1228                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
1229                      [(set FR32:$dst, (any_fpround FR64:$src))]>,
1230                      Sched<[WriteCvtSD2SS]>, SIMD_EXC;
1231def CVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
1232                    "cvtsd2ss\t{$src, $dst|$dst, $src}",
1233                    [(set FR32:$dst, (any_fpround (loadf64 addr:$src)))]>,
1234                    XD, Requires<[UseSSE2, OptForSize]>,
1235                    Sched<[WriteCvtSD2SS.Folded]>, SIMD_EXC;
1236}
1237
// VR128 ("_Int") forms of cvtsd2ss. All four match the X86frounds node with
// $src1 as its first operand and the v2f64 source (register, or sse_load_f64
// for the sdmem forms) as its second. Every def in this block reads MXCSR and
// may raise an FP exception via the enclosing let.
1238let Uses = [MXCSR], mayRaiseFPException = 1 in {
// AVX: three-operand assembly, gated on UseAVX.
1239def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
1240                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1241                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1242                       [(set VR128:$dst,
1243                         (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
1244                       XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
1245                       Sched<[WriteCvtSD2SS]>;
1246def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
1247                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
1248                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1249                       [(set VR128:$dst,
1250                         (v4f32 (X86frounds VR128:$src1, sse_load_f64:$src2)))]>,
1251                       XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
1252                       Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
// SSE2: $src1 is tied to $dst, so only $src2 and $dst appear in the asm
// string.
1253let Constraints = "$src1 = $dst" in {
1254def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
1255                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1256                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
1257                       [(set VR128:$dst,
1258                         (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
1259                       XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;
1260def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
1261                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
1262                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
1263                       [(set VR128:$dst,
1264                         (v4f32 (X86frounds VR128:$src1,sse_load_f64:$src2)))]>,
1265                       XD, Requires<[UseSSE2]>,
1266                       Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
1267}
1268}
1269
1270// Convert scalar single to scalar double
1271// SSE2 instructions with XS prefix
// AVX FR32 -> FR64 forms, codegen-only with empty pattern lists. Both take an
// extra FR64 $src1 operand; the memory form is flagged mayLoad by hand and is
// additionally gated on OptForSize. Selection is done by the two Pats after
// the let block, which pass IMPLICIT_DEF for $src1.
1272let isCodeGenOnly = 1, hasSideEffects = 0 in {
1273def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
1274                    (ins FR64:$src1, FR32:$src2),
1275                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1276                    XS, VEX_4V, VEX_LIG, VEX_WIG,
1277                    Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>, SIMD_EXC;
1278let mayLoad = 1 in
1279def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
1280                    (ins FR64:$src1, f32mem:$src2),
1281                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1282                    XS, VEX_4V, VEX_LIG, VEX_WIG,
1283                    Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>,
1284                    Requires<[UseAVX, OptForSize]>, SIMD_EXC;
1285} // isCodeGenOnly = 1, hasSideEffects = 0
1286
// f32 -> f64 extension under AVX: register source always, folded loadf32
// only when OptForSize also holds (matching the predicate on VCVTSS2SDrm).
1287def : Pat<(f64 (any_fpextend FR32:$src)),
1288    (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
1289def : Pat<(any_fpextend (loadf32 addr:$src)),
1290    (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;
1291
// SSE2 FR32 -> FR64 conversion, codegen-only. These two-operand forms carry
// their any_fpextend selection pattern directly. The memory form folds a
// loadf32 and is additionally gated on OptForSize.
1292let isCodeGenOnly = 1 in {
1293def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
1294                   "cvtss2sd\t{$src, $dst|$dst, $src}",
1295                   [(set FR64:$dst, (any_fpextend FR32:$src))]>,
1296                   XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>, SIMD_EXC;
1297def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
1298                   "cvtss2sd\t{$src, $dst|$dst, $src}",
1299                   [(set FR64:$dst, (any_fpextend (loadf32 addr:$src)))]>,
1300                   XS, Requires<[UseSSE2, OptForSize]>,
1301                   Sched<[WriteCvtSS2SD.Folded]>, SIMD_EXC;
1302} // isCodeGenOnly = 1
1303
// VR128 ("_Int") forms of cvtss2sd. All four have empty pattern lists, so
// hasSideEffects is cleared by the enclosing let and the memory forms are
// flagged mayLoad by hand. The rr forms are selected by the X86Movsd +
// any_fpextend patterns later in this file. Every def reads MXCSR and may
// raise an FP exception.
// NOTE(review): the AVX defs here require HasAVX whereas the sibling
// VCVTSD2SS*_Int defs require UseAVX -- confirm the difference is intended.
1304let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in {
1305def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
1306                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1307                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1308                    []>, XS, VEX_4V, VEX_LIG, VEX_WIG,
1309                    Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;
1310let mayLoad = 1 in
1311def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
1312                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
1313                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1314                    []>, XS, VEX_4V, VEX_LIG, VEX_WIG, Requires<[HasAVX]>,
1315                    Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
// SSE2: $src1 is tied to $dst, so the asm string only names $src2 and $dst.
1316let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
1317def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
1318                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1319                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
1320                    []>, XS, Requires<[UseSSE2]>,
1321                    Sched<[WriteCvtSS2SD]>;
1322let mayLoad = 1 in
1323def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
1324                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
1325                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
1326                    []>, XS, Requires<[UseSSE2]>,
1327                    Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
1328}
1329} // hasSideEffects = 0
1330
1331// Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and
1332// (v)cvtss2sd intrinsic sequences from clang which produce unnecessary
1333// vmovs{s,d} instructions
// Shape of every pattern in this block: an X86Movss (v4f32) or X86Movsd
// (v2f64) whose first operand is $dst and whose second operand is a
// scalar_to_vector of a freshly converted scalar. The whole expression is
// replaced by the corresponding AVX *_Int instruction with $dst as its first
// operand. The integer-source patterns come in GR64/GR32 register and
// loadi64/loadi32 memory flavours.
1334let Predicates = [UseAVX] in {
// f64 element 0 of $src rounded to f32, inserted into $dst.
1335def : Pat<(v4f32 (X86Movss
1336                   (v4f32 VR128:$dst),
1337                   (v4f32 (scalar_to_vector
1338                     (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
1339          (VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
1340
// f32 element 0 of $src extended to f64, inserted into $dst.
1341def : Pat<(v2f64 (X86Movsd
1342                   (v2f64 VR128:$dst),
1343                   (v2f64 (scalar_to_vector
1344                     (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
1345          (VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
1346
// Signed integer -> f32, inserted into $dst.
1347def : Pat<(v4f32 (X86Movss
1348                   (v4f32 VR128:$dst),
1349                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
1350          (VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
1351
1352def : Pat<(v4f32 (X86Movss
1353                   (v4f32 VR128:$dst),
1354                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
1355          (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>;
1356
1357def : Pat<(v4f32 (X86Movss
1358                   (v4f32 VR128:$dst),
1359                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
1360          (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
1361
1362def : Pat<(v4f32 (X86Movss
1363                   (v4f32 VR128:$dst),
1364                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
1365          (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>;
1366
// Signed integer -> f64, inserted into $dst.
1367def : Pat<(v2f64 (X86Movsd
1368                   (v2f64 VR128:$dst),
1369                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
1370          (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
1371
1372def : Pat<(v2f64 (X86Movsd
1373                   (v2f64 VR128:$dst),
1374                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
1375          (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>;
1376
1377def : Pat<(v2f64 (X86Movsd
1378                   (v2f64 VR128:$dst),
1379                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
1380          (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
1381
1382def : Pat<(v2f64 (X86Movsd
1383                   (v2f64 VR128:$dst),
1384                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
1385          (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>;
1386} // Predicates = [UseAVX]
1387
// SSE2 patterns that fold "convert a scalar, then X86Movss/X86Movsd it into
// $dst" into a single non-VEX *_Int instruction taking $dst as its first
// operand. Covers f64 -> f32 rounding, f32 -> f64 extension, and signed
// integer -> f64 from GR64/GR32 registers or loadi64/loadi32 memory.
1388let Predicates = [UseSSE2] in {
// f64 element 0 of $src rounded to f32, inserted into $dst.
1389def : Pat<(v4f32 (X86Movss
1390                   (v4f32 VR128:$dst),
1391                   (v4f32 (scalar_to_vector
1392                     (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
1393          (CVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
1394
// f32 element 0 of $src extended to f64, inserted into $dst.
1395def : Pat<(v2f64 (X86Movsd
1396                   (v2f64 VR128:$dst),
1397                   (v2f64 (scalar_to_vector
1398                     (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
1399          (CVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
1400
// Signed integer -> f64, inserted into $dst.
1401def : Pat<(v2f64 (X86Movsd
1402                   (v2f64 VR128:$dst),
1403                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
1404          (CVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
1405
1406def : Pat<(v2f64 (X86Movsd
1407                   (v2f64 VR128:$dst),
1408                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
1409          (CVTSI642SDrm_Int VR128:$dst, addr:$src)>;
1410
1411def : Pat<(v2f64 (X86Movsd
1412                   (v2f64 VR128:$dst),
1413                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
1414          (CVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
1415
1416def : Pat<(v2f64 (X86Movsd
1417                   (v2f64 VR128:$dst),
1418                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
1419          (CVTSI2SDrm_Int VR128:$dst, addr:$src)>;
1420} // Predicates = [UseSSE2]
1421
// SSE1 patterns that fold "convert a signed integer to f32, then X86Movss it
// into $dst" into a single CVTSI2SS / CVTSI642SS *_Int instruction taking
// $dst as its first operand. Sources are GR64/GR32 registers or
// loadi64/loadi32 memory.
1422let Predicates = [UseSSE1] in {
1423def : Pat<(v4f32 (X86Movss
1424                   (v4f32 VR128:$dst),
1425                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
1426          (CVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
1427
1428def : Pat<(v4f32 (X86Movss
1429                   (v4f32 VR128:$dst),
1430                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
1431          (CVTSI642SSrm_Int VR128:$dst, addr:$src)>;
1432
1433def : Pat<(v4f32 (X86Movss
1434                   (v4f32 VR128:$dst),
1435                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
1436          (CVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
1437
1438def : Pat<(v4f32 (X86Movss
1439                   (v4f32 VR128:$dst),
1440                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
1441          (CVTSI2SSrm_Int VR128:$dst, addr:$src)>;
1442} // Predicates = [UseSSE1]
1443
// AVX packed single -> doubleword conversion (opcode 0x5B), available when
// HasAVX and NoVLX both hold. There are 128-bit (VR128, v4f32 -> v4i32) and
// 256-bit (VR256 with VEX_L, v8f32 -> v8i32) variants, each with a register
// source and a folded-load source (loadv4f32 / loadv8f32). All match the
// X86cvtp2Int node.
// NOTE(review): the asm strings omit the leading "v"; presumably the VPDI
// class prepends it -- confirm against the VPDI definition.
1444let Predicates = [HasAVX, NoVLX] in {
1445// Convert packed single/double fp to doubleword
1446def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1447                       "cvtps2dq\t{$src, $dst|$dst, $src}",
1448                       [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
1449                       VEX, Sched<[WriteCvtPS2I]>, VEX_WIG, SIMD_EXC;
1450def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1451                       "cvtps2dq\t{$src, $dst|$dst, $src}",
1452                       [(set VR128:$dst,
1453                         (v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>,
1454                       VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG, SIMD_EXC;
1455def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
1456                        "cvtps2dq\t{$src, $dst|$dst, $src}",
1457                        [(set VR256:$dst,
1458                          (v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>,
1459                        VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG, SIMD_EXC;
1460def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
1461                        "cvtps2dq\t{$src, $dst|$dst, $src}",
1462                        [(set VR256:$dst,
1463                          (v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>,
1464                        VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, VEX_WIG, SIMD_EXC;
1465}
// Non-VEX packed single -> doubleword conversion (opcode 0x5B), v4f32 ->
// v4i32 via X86cvtp2Int, with register and folded-memory sources.
// NOTE(review): the memory form uses the memopv4f32 fragment where the AVX
// form uses loadv4f32; presumably memop* encodes the legacy-SSE alignment
// requirement -- confirm against the fragment definitions.
1466def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1467                     "cvtps2dq\t{$src, $dst|$dst, $src}",
1468                     [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
1469                     Sched<[WriteCvtPS2I]>, SIMD_EXC;
1470def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1471                     "cvtps2dq\t{$src, $dst|$dst, $src}",
1472                     [(set VR128:$dst,
1473                       (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>,
1474                     Sched<[WriteCvtPS2ILd]>, SIMD_EXC;
1475
1476
1477// Convert Packed Double FP to Packed DW Integers
// AVX forms (opcode 0xE6). The destination is always VR128/v4i32; the source
// is either v2f64 (XMM) or v4f64 (YMM, VEX_L). The memory forms carry an
// explicit {x}/{y} suffix in their mnemonic, and the InstAliases after the
// let block add the suffixed spellings for the register forms as well.
1478let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1479// The assembler can recognize rr 256-bit instructions by seeing a ymm
1480// register, but the same isn't true when using memory operands instead.
1481// Provide other assembly rr and rm forms to address this explicitly.
1482def VCVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1483                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
1484                       [(set VR128:$dst,
1485                         (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
1486                       VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
1487
1488// XMM only
1489def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1490                      "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
1491                      [(set VR128:$dst,
1492                        (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
1493                      Sched<[WriteCvtPD2ILd]>, VEX_WIG;
1494
1495// YMM only
1496def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1497                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
1498                       [(set VR128:$dst,
1499                         (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
1500                       VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
1501def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1502                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
1503                       [(set VR128:$dst,
1504                         (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
1505                       VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
1506}
1507
// Accept the x/y-suffixed mnemonics with register sources too ("att" only).
1508def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
1509                (VCVTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
1510def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
1511                (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
1512
// Non-VEX packed double -> doubleword conversion (opcode 0xE6): v2f64 ->
// v4i32 via X86cvtp2Int. The memory form (memopv2f64) is listed before the
// register form here.
1513def CVTPD2DQrm  : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1514                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
1515                      [(set VR128:$dst,
1516                        (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>,
1517                      Sched<[WriteCvtPD2ILd]>, SIMD_EXC;
1518def CVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1519                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
1520                      [(set VR128:$dst,
1521                        (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
1522                      Sched<[WriteCvtPD2I]>, SIMD_EXC;
1523
1524// Convert with truncation packed single/double fp to doubleword
1525// SSE2 packed instructions with XS prefix
// Truncating packed single -> doubleword conversion (opcode 0x5B), matching
// the X86any_cvttp2si node. The outer let marks every def, AVX and SSE
// alike, as reading MXCSR and possibly raising an FP exception.
// NOTE(review): the AVX asm strings omit the leading "v"; presumably the
// VS2SI class prepends it -- confirm against the VS2SI definition.
1526let Uses = [MXCSR], mayRaiseFPException = 1 in {
// AVX forms: 128-bit and 256-bit (VEX_L), register and folded-load sources.
1527let Predicates = [HasAVX, NoVLX] in {
1528def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1529                         "cvttps2dq\t{$src, $dst|$dst, $src}",
1530                         [(set VR128:$dst,
1531                           (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
1532                         VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
1533def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1534                         "cvttps2dq\t{$src, $dst|$dst, $src}",
1535                         [(set VR128:$dst,
1536                           (v4i32 (X86any_cvttp2si (loadv4f32 addr:$src))))]>,
1537                         VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
1538def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
1539                          "cvttps2dq\t{$src, $dst|$dst, $src}",
1540                          [(set VR256:$dst,
1541                            (v8i32 (X86any_cvttp2si (v8f32 VR256:$src))))]>,
1542                          VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
1543def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
1544                          "cvttps2dq\t{$src, $dst|$dst, $src}",
1545                          [(set VR256:$dst,
1546                            (v8i32 (X86any_cvttp2si (loadv8f32 addr:$src))))]>,
1547                          VEX, VEX_L,
1548                          Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
1549}
1550
// Non-VEX forms: 128-bit only; the memory form uses memopv4f32.
1551def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1552                       "cvttps2dq\t{$src, $dst|$dst, $src}",
1553                       [(set VR128:$dst,
1554                         (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
1555                       Sched<[WriteCvtPS2I]>;
1556def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1557                       "cvttps2dq\t{$src, $dst|$dst, $src}",
1558                       [(set VR128:$dst,
1559                         (v4i32 (X86any_cvttp2si (memopv4f32 addr:$src))))]>,
1560                       Sched<[WriteCvtPS2ILd]>;
1561}
1562
1563// The assembler can recognize rr 256-bit instructions by seeing a ymm
1564// register, but the same isn't true when using memory operands instead.
1565// Provide other assembly rr and rm forms to address this explicitly.
// AVX truncating packed double -> doubleword conversion (opcode 0xE6),
// matching X86any_cvttp2si. The destination is always VR128/v4i32; the
// memory forms carry {x}/{y} in their mnemonic to disambiguate source width.
// NOTE(review): the asm strings omit the leading "v"; presumably the VPDI
// class prepends it -- confirm against the VPDI definition.
1566let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1567// XMM only
1568def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1569                        "cvttpd2dq\t{$src, $dst|$dst, $src}",
1570                        [(set VR128:$dst,
1571                          (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
1572                        VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
1573def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1574                        "cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
1575                        [(set VR128:$dst,
1576                          (v4i32 (X86any_cvttp2si (loadv2f64 addr:$src))))]>,
1577                        VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG;
1578
1579// YMM only
1580def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1581                         "cvttpd2dq\t{$src, $dst|$dst, $src}",
1582                         [(set VR128:$dst,
1583                           (v4i32 (X86any_cvttp2si (v4f64 VR256:$src))))]>,
1584                         VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
1585def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1586                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
1587                         [(set VR128:$dst,
1588                           (v4i32 (X86any_cvttp2si (loadv4f64 addr:$src))))]>,
1589                         VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
1590} // Predicates = [HasAVX, NoVLX]
1591
// Accept the x/y-suffixed mnemonics with register sources too ("att" only).
1592def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
1593                (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
1594def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
1595                (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
1596
// Generic v4f64 -> v4i32 any_fp_to_sint (register or folded load) is
// selected onto the 256-bit-source instructions.
1597let Predicates = [HasAVX, NoVLX] in {
1598  def : Pat<(v4i32 (any_fp_to_sint (v4f64 VR256:$src))),
1599            (VCVTTPD2DQYrr VR256:$src)>;
1600  def : Pat<(v4i32 (any_fp_to_sint (loadv4f64 addr:$src))),
1601            (VCVTTPD2DQYrm addr:$src)>;
1602}
1603
// Non-VEX truncating packed double -> doubleword conversion (opcode 0xE6):
// v2f64 -> v4i32 via X86any_cvttp2si, with register and memopv2f64 sources.
1604def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1605                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
1606                      [(set VR128:$dst,
1607                        (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
1608                      Sched<[WriteCvtPD2I]>, SIMD_EXC;
1609def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
1610                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
1611                      [(set VR128:$dst,
1612                        (v4i32 (X86any_cvttp2si (memopv2f64 addr:$src))))]>,
1613                      Sched<[WriteCvtPD2ILd]>, SIMD_EXC;
1614
1615// Convert packed single to packed double
// AVX forms (opcode 0x5A, PS prefix). The 128-bit result forms produce v2f64
// from a v4f32 register (X86any_vfpext) or from an f64mem operand via
// extloadv2f32. The 256-bit result forms (VEX_L) produce v4f64 from a VR128
// v4f32 register (any_fpextend) or from an f128mem operand via extloadv4f32.
// The source operand is therefore half the width of the destination in the
// memory and 256-bit forms.
1616let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1617                  // SSE2 instructions without OpSize prefix
1618def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1619                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
1620                    [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
1621                    PS, VEX, Sched<[WriteCvtPS2PD]>, VEX_WIG;
1622def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
1623                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
1624                    [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
1625                    PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, VEX_WIG;
1626def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
1627                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
1628                     [(set VR256:$dst, (v4f64 (any_fpextend (v4f32 VR128:$src))))]>,
1629                     PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, VEX_WIG;
1630def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
1631                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
1632                     [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>,
1633                     PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, VEX_WIG;
1634}
1635
// SSE2 packed single -> packed double conversion (opcode 0x5A, PS prefix).
// The register form matches X86any_vfpext on a v4f32 source; the memory form
// takes an f64mem operand and matches extloadv2f32.
1636let Predicates = [UseSSE2], Uses = [MXCSR], mayRaiseFPException = 1 in {
1637def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1638                   "cvtps2pd\t{$src, $dst|$dst, $src}",
1639                   [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
1640                   PS, Sched<[WriteCvtPS2PD]>;
1641def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
1642                   "cvtps2pd\t{$src, $dst|$dst, $src}",
1643                   [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
1644                   PS, Sched<[WriteCvtPS2PD.Folded]>;
1645}
1646
1647// Convert Packed DW Integers to Packed Double FP
// AVX forms (opcode 0xE6). The 128-bit result forms match X86any_VSintToFP
// on a v4i32 value; the memory form takes an i64mem operand and expresses
// the source as a loadi64 placed in a vector via scalar_to_vector and
// bitcast to v4i32. The 256-bit result forms (VEX_L) match any_sint_to_fp on
// a v4i32 from VR128 or from an i128mem load.
// The brace-less "let hasSideEffects = 0, mayLoad = 1 in" applies only to
// VCVTDQ2PDrm, the def that immediately follows it.
// NOTE(review): the asm strings already contain the "v" while other VS2SI
// users nearby (VCVTTPS2DQ*) omit it -- confirm which class is intended.
1648let Predicates = [HasAVX, NoVLX] in {
1649let hasSideEffects = 0, mayLoad = 1 in
1650def VCVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
1651                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1652                        [(set VR128:$dst,
1653                          (v2f64 (X86any_VSintToFP
1654                                  (bc_v4i32
1655                                   (v2i64 (scalar_to_vector
1656                                           (loadi64 addr:$src)))))))]>,
1657                        VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG;
1658def VCVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1659                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1660                        [(set VR128:$dst,
1661                          (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
1662                        VEX, Sched<[WriteCvtI2PD]>, VEX_WIG;
1663def VCVTDQ2PDYrm  : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
1664                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1665                         [(set VR256:$dst,
1666                           (v4f64 (any_sint_to_fp (loadv4i32 addr:$src))))]>,
1667                         VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>,
1668                         VEX_WIG;
1669def VCVTDQ2PDYrr  : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
1670                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1671                         [(set VR256:$dst,
1672                           (v4f64 (any_sint_to_fp (v4i32 VR128:$src))))]>,
1673                         VEX, VEX_L, Sched<[WriteCvtI2PDY]>, VEX_WIG;
1674}
1675
// Non-VEX packed doubleword -> packed double conversion (opcode 0xE6).
// The memory form takes an i64mem operand and expresses its source as a
// loadi64 placed in a vector and bitcast to v4i32; the register form takes a
// v4i32 in VR128. Both match X86any_VSintToFP.
// The brace-less let below applies only to CVTDQ2PDrm.
1676let hasSideEffects = 0, mayLoad = 1 in
1677def CVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
1678                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
1679                       [(set VR128:$dst,
1680                         (v2f64 (X86any_VSintToFP
1681                                 (bc_v4i32
1682                                  (v2i64 (scalar_to_vector
1683                                          (loadi64 addr:$src)))))))]>,
1684                       Sched<[WriteCvtI2PDLd]>;
1685def CVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1686                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
1687                       [(set VR128:$dst,
1688                         (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
1689                       Sched<[WriteCvtI2PD]>;
1690
1691// AVX register conversion intrinsics
// Despite the heading, the only pattern in this group folds a memory
// operand: an X86vzload64 bitcast to v4i32 feeding X86any_VSintToFP is
// selected onto the memory form VCVTDQ2PDrm.
1692let Predicates = [HasAVX, NoVLX] in {
1693  def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
1694            (VCVTDQ2PDrm addr:$src)>;
1695} // Predicates = [HasAVX, NoVLX]
1696
1697// SSE2 register conversion intrinsics
// Same X86vzload64 shape for SSE2, selected onto the memory form CVTDQ2PDrm.
1698let Predicates = [UseSSE2] in {
1699  def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
1700            (CVTDQ2PDrm addr:$src)>;
1701} // Predicates = [UseSSE2]
1702
1703// Convert packed double to packed single
1704// The assembler can recognize rr 256-bit instructions by seeing a ymm
1705// register, but the same isn't true when using memory operands instead.
1706// Provide other assembly rr and rm forms to address this explicitly.
// AVX forms (opcode 0x5A), matching X86any_vfpround. The destination is
// always VR128; the source is v2f64 (XMM) or a VR256 value (YMM, VEX_L).
// The memory forms carry {x}/{y} in their mnemonic, and the InstAliases
// after the let block add suffixed spellings for the register forms.
// NOTE(review): the asm strings omit the leading "v"; presumably the VPDI
// class prepends it -- confirm against the VPDI definition.
1707let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1708// XMM only
1709def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1710                       "cvtpd2ps\t{$src, $dst|$dst, $src}",
1711                       [(set VR128:$dst, (X86any_vfpround (v2f64 VR128:$src)))]>,
1712                       VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG;
1713def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1714                       "cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
1715                       [(set VR128:$dst, (X86any_vfpround (loadv2f64 addr:$src)))]>,
1716                       VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG;
1717
// YMM source (VEX_L); the result still lands in VR128.
1718def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1719                        "cvtpd2ps\t{$src, $dst|$dst, $src}",
1720                        [(set VR128:$dst, (X86any_vfpround VR256:$src))]>,
1721                        VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG;
1722def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1723                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
1724                        [(set VR128:$dst, (X86any_vfpround (loadv4f64 addr:$src)))]>,
1725                        VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG;
1726} // Predicates = [HasAVX, NoVLX]
1727
// Accept the x/y-suffixed mnemonics with register sources too ("att" only).
1728def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
1729                (VCVTPD2PSrr VR128:$dst, VR128:$src), 0, "att">;
1730def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
1731                (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0, "att">;
1732
// Non-VEX packed double -> packed single conversion (opcode 0x5A), matching
// X86any_vfpround on a v2f64 register or a memopv2f64 memory operand.
1733def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1734                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
1735                     [(set VR128:$dst, (X86any_vfpround (v2f64 VR128:$src)))]>,
1736                     Sched<[WriteCvtPD2PS]>, SIMD_EXC;
1737def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1738                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
1739                     [(set VR128:$dst, (X86any_vfpround (memopv2f64 addr:$src)))]>,
1740                     Sched<[WriteCvtPD2PS.Folded]>, SIMD_EXC;
1741
1742//===----------------------------------------------------------------------===//
1743// SSE 1 & 2 - Compare Instructions
1744//===----------------------------------------------------------------------===//
1745
1746// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
//
// Defines an `rr` and an `rm` record, both opcode 0xC2 with a trailing u8imm
// condition-code operand $cc that is matched as timm:$cc.
//   RC       - register class of $dst, $src1 and (for rr) $src2.
//   x86memop - memory operand type of $src2 in the rm form.
//   OpNode   - compare node applied to ($src1, $src2, $cc).
//   VT       - value type used to type $src1 in the patterns.
//   ld_frag  - load fragment wrapped around addr:$src2 in the rm pattern.
//   asm      - full assembly string, supplied by the instantiation.
//   sched    - scheduling info; rr uses `sched`, rm uses `sched.Folded` and
//              `sched.ReadAfterFold`.
// Both records read MXCSR and may raise an FP exception; only rr is marked
// commutable.
1747multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
1748                            SDNode OpNode, ValueType VT,
1749                            PatFrag ld_frag, string asm,
1750                            X86FoldableSchedWrite sched> {
1751let Uses = [MXCSR], mayRaiseFPException = 1 in {
1752  let isCommutable = 1 in
1753  def rr : SIi8<0xC2, MRMSrcReg,
1754                (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
1755                [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, timm:$cc))]>,
1756                Sched<[sched]>;
1757  def rm : SIi8<0xC2, MRMSrcMem,
1758                (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
1759                [(set RC:$dst, (OpNode (VT RC:$src1),
1760                                         (ld_frag addr:$src2), timm:$cc))]>,
1761                Sched<[sched.Folded, sched.ReadAfterFold]>;
1762}
1763}
1764
// Codegen-only FR32/FR64 scalar compares built from sse12_cmp_scalar with
// the X86cmps node. The ss variants use the XS prefix and SSEPackedSingle
// domain; the sd variants use XD and SSEPackedDouble.
1765let isCodeGenOnly = 1 in {
  // AVX: three-operand assembly ($cc, $src2, $src1, $dst) with VEX_4V.
1766  let ExeDomain = SSEPackedSingle in
1767  defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmps, f32, loadf32,
1768                   "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1769                   SchedWriteFCmpSizes.PS.Scl>, XS, VEX_4V, VEX_LIG, VEX_WIG;
1770  let ExeDomain = SSEPackedDouble in
1771  defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmps, f64, loadf64,
1772                   "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1773                   SchedWriteFCmpSizes.PD.Scl>,
1774                   XD, VEX_4V, VEX_LIG, VEX_WIG;
1775
  // SSE: $src1 is tied to $dst, so the asm string omits $src1.
1776  let Constraints = "$src1 = $dst" in {
1777    let ExeDomain = SSEPackedSingle in
1778    defm CMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmps, f32, loadf32,
1779                    "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1780                    SchedWriteFCmpSizes.PS.Scl>, XS;
1781    let ExeDomain = SSEPackedDouble in
1782    defm CMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmps, f64, loadf64,
1783                    "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1784                    SchedWriteFCmpSizes.PD.Scl>, XD;
1785  }
1786}
1787
// sse12_cmp_scalar_int - intrinsic flavour of the scalar compare.
//
// Same opcode (0xC2) and immediate layout as sse12_cmp_scalar, but every
// register operand is a full VR128 and the selected node is the intrinsic
// `Int` rather than an SDNode:
//   rr_Int - register/register form (MRMSrcReg).
//   rm_Int - register/memory form (MRMSrcMem); the memory operand is matched
//            by the complex pattern mem_cpat and the record is explicitly
//            marked mayLoad.
// Unlike sse12_cmp_scalar, neither record is flagged isCommutable, and the
// second source operand is named $src (not $src2), so instantiating asm
// strings must use that name.
1788multiclass sse12_cmp_scalar_int<Operand memop,
1789                         Intrinsic Int, string asm, X86FoldableSchedWrite sched,
1790                         ComplexPattern mem_cpat> {
1791let Uses = [MXCSR], mayRaiseFPException = 1 in {
1792  def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
1793                      (ins VR128:$src1, VR128:$src, u8imm:$cc), asm,
1794                        [(set VR128:$dst, (Int VR128:$src1,
1795                                               VR128:$src, timm:$cc))]>,
1796           Sched<[sched]>;
1797let mayLoad = 1 in
1798  def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
1799                      (ins VR128:$src1, memop:$src, u8imm:$cc), asm,
1800                        [(set VR128:$dst, (Int VR128:$src1,
1801                                               mem_cpat:$src, timm:$cc))]>,
1802           Sched<[sched.Folded, sched.ReadAfterFold]>;
1803}
1804}
1805
1806// Aliases to match intrinsics which expect XMM operand(s).
    // These reuse the VCMPSS/VCMPSD/CMPSS/CMPSD prefixes; the records they emit
    // carry the rr_Int/rm_Int suffixes from sse12_cmp_scalar_int. Unlike the
    // FR32/FR64 instantiations they are not wrapped in isCodeGenOnly.
1807let ExeDomain = SSEPackedSingle in
1808defm VCMPSS  : sse12_cmp_scalar_int<ssmem, int_x86_sse_cmp_ss,
1809                     "cmpss\t{$cc, $src, $src1, $dst|$dst, $src1, $src, $cc}",
1810                     SchedWriteFCmpSizes.PS.Scl, sse_load_f32>,
1811                     XS, VEX_4V, VEX_LIG, VEX_WIG;
1812let ExeDomain = SSEPackedDouble in
1813defm VCMPSD  : sse12_cmp_scalar_int<sdmem, int_x86_sse2_cmp_sd,
1814                     "cmpsd\t{$cc, $src, $src1, $dst|$dst, $src1, $src, $cc}",
1815                     SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
1816                     XD, VEX_4V, VEX_LIG, VEX_WIG;
    // Legacy SSE encodings: $src1 is tied to $dst.
1817let Constraints = "$src1 = $dst" in {
1818  let ExeDomain = SSEPackedSingle in
1819  defm CMPSS  : sse12_cmp_scalar_int<ssmem, int_x86_sse_cmp_ss,
1820                       "cmpss\t{$cc, $src, $dst|$dst, $src, $cc}",
1821                       SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
1822  let ExeDomain = SSEPackedDouble in
1823  defm CMPSD  : sse12_cmp_scalar_int<sdmem, int_x86_sse2_cmp_sd,
1824                       "cmpsd\t{$cc, $src, $dst|$dst, $src, $cc}",
1825                       SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
1826}
1827
1828
1829// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
    //
    // Emits rr (MRMSrcReg) and rm (MRMSrcMem) records with no register outputs;
    // the pattern's only result is EFLAGS. The mnemonic is OpcodeStr followed by
    // the two source operands. `d` becomes the execution domain and `sched`
    // defaults to WriteFCom. The records are marked hasSideEffects = 0, as
    // reading MXCSR and as possibly raising an FP exception.
    // NOTE: this multiclass does not itself add EFLAGS to Defs; the
    // instantiations in this file are wrapped in `let Defs = [EFLAGS]`.
1830multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
1831                         ValueType vt, X86MemOperand x86memop,
1832                         PatFrag ld_frag, string OpcodeStr, Domain d,
1833                         X86FoldableSchedWrite sched = WriteFCom> {
1834let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1,
1835    ExeDomain = d in {
1836  def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
1837                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1838                     [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
1839          Sched<[sched]>;
1840let mayLoad = 1 in
1841  def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
1842                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1843                     [(set EFLAGS, (OpNode (vt RC:$src1),
1844                                           (ld_frag addr:$src2)))]>,
1845          Sched<[sched.Folded, sched.ReadAfterFold]>;
1846}
1847}
1848
1849// sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
    //
    // Emits rr_Int/rm_Int records. Differences from sse12_ord_cmp visible here:
    // the memory operand is a generic Operand matched through the complex
    // pattern mem_cpat instead of a PatFrag load, and hasSideEffects is not
    // overridden. EFLAGS is again the only pattern result.
1850multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
1851                             ValueType vt, Operand memop,
1852                             ComplexPattern mem_cpat, string OpcodeStr,
1853                             Domain d,
1854                             X86FoldableSchedWrite sched = WriteFCom> {
1855let Uses = [MXCSR], mayRaiseFPException = 1, ExeDomain = d in {
1856  def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
1857                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1858                     [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
1859          Sched<[sched]>;
1860let mayLoad = 1 in
1861  def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
1862                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1863                     [(set EFLAGS, (OpNode (vt RC:$src1),
1864                                           mem_cpat:$src2))]>,
1865          Sched<[sched.Folded, sched.ReadAfterFold]>;
1866}
1867}
1868
// (U)COMISS / (U)COMISD instantiations; every record defines EFLAGS.
// Opcode 0x2E is used for the ucomis* mnemonics and 0x2F for comis*.
// The FR32/FR64 forms select X86any_fcmp (ucomis*) or X86strict_fcmps
// (comis*); the VR128 forms select X86ucomi / X86comi and are isCodeGenOnly.
// VEX-encoded variants are listed first, then the legacy SSE ones.
1869let Defs = [EFLAGS] in {
1870  defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
1871                               "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
1872  defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
1873                               "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
1874  defm VCOMISS  : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
1875                               "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
1876  defm VCOMISD  : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
1877                               "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
1878
      // VEX forms operating on whole XMM registers.
1879  let isCodeGenOnly = 1 in {
1880    defm VUCOMISS  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
1881                      sse_load_f32, "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
1882    defm VUCOMISD  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
1883                      sse_load_f64, "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
1884
1885    defm VCOMISS  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
1886                       sse_load_f32, "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
1887    defm VCOMISD  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
1888                       sse_load_f64, "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
1889  }
      // Legacy SSE forms on FR32/FR64.
1890  defm UCOMISS  : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
1891                                  "ucomiss", SSEPackedSingle>, PS;
1892  defm UCOMISD  : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
1893                                  "ucomisd", SSEPackedDouble>, PD;
1894  defm COMISS   : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
1895                                  "comiss", SSEPackedSingle>, PS;
1896  defm COMISD   : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
1897                                  "comisd", SSEPackedDouble>, PD;
1898
      // Legacy SSE forms operating on whole XMM registers.
1899  let isCodeGenOnly = 1 in {
1900    defm UCOMISS  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
1901                            sse_load_f32, "ucomiss", SSEPackedSingle>, PS;
1902    defm UCOMISD  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
1903                            sse_load_f64, "ucomisd", SSEPackedDouble>, PD;
1904
1905    defm COMISS  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
1906                                sse_load_f32, "comiss", SSEPackedSingle>, PS;
1907    defm COMISD  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
1908                                    sse_load_f64, "comisd", SSEPackedDouble>, PD;
1909  }
1910} // Defs = [EFLAGS]
1911
1912// sse12_cmp_packed - sse 1 & 2 compare packed instructions
    //
    // Emits rri (MRMSrcReg, flagged isCommutable) and rmi (MRMSrcMem) records
    // with opcode 0xC2 and an 8-bit immediate $cc. Both select X86any_cmpp with
    // result type VT; the memory form loads $src2 through ld_frag. The asm
    // string is supplied by the instantiation and `d` is forwarded to PIi8 as
    // the domain. Both forms read MXCSR and may raise an FP exception.
1913multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
1914                            ValueType VT, string asm,
1915                            X86FoldableSchedWrite sched,
1916                            Domain d, PatFrag ld_frag> {
1917let Uses = [MXCSR], mayRaiseFPException = 1 in {
1918  let isCommutable = 1 in
1919  def rri : PIi8<0xC2, MRMSrcReg,
1920             (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
1921             [(set RC:$dst, (VT (X86any_cmpp RC:$src1, RC:$src2, timm:$cc)))], d>,
1922            Sched<[sched]>;
1923  def rmi : PIi8<0xC2, MRMSrcMem,
1924             (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
1925             [(set RC:$dst,
1926               (VT (X86any_cmpp RC:$src1, (ld_frag addr:$src2), timm:$cc)))], d>,
1927            Sched<[sched.Folded, sched.ReadAfterFold]>;
1928}
1929}
1930
// Packed compare instantiations. The VEX forms cover 128-bit (VR128) and
// 256-bit (VR256, VEX_L) vectors and fold memory through the loadv* fragments;
// the legacy SSE forms tie $src1 to $dst and fold through the memopv*
// fragments instead.
1931defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
1932               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1933               SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG;
1934defm VCMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
1935               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1936               SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG;
1937defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, v8f32,
1938               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1939               SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, VEX_WIG;
1940defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, v4f64,
1941               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1942               SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, VEX_WIG;
1943let Constraints = "$src1 = $dst" in {
1944  defm CMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
1945                 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1946                 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS;
1947  defm CMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
1948                 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1949                 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD;
1950}
1951
// CommutableCMPCC - matches a target-constant immediate whose low three bits
// are 0x0, 0x3, 0x4 or 0x7; higher bits are ignored by the mask.
// It is used by the patterns below that swap a load out of the first compare
// operand, so these must be condition codes for which swapping is harmless.
// NOTE(review): presumably these are the EQ / UNORD / NEQ / ORD predicates of
// the compare immediate - confirm against the ISA reference.
1952def CommutableCMPCC : PatLeaf<(timm), [{
1953  uint64_t Imm = N->getZExtValue() & 0x7;
1954  return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07);
1955}]>;
1956
1957// Patterns to select compares with loads in first operand.
    // Each pattern matches a compare whose FIRST operand is the load and emits
    // the rmi/rm instruction with the operands swapped (register first, address
    // second). This is only done when the immediate satisfies CommutableCMPCC.
    // AVX variants: 256-bit and 128-bit packed, then scalar f64/f32.
1958let Predicates = [HasAVX] in {
1959  def : Pat<(v4f64 (X86any_cmpp (loadv4f64 addr:$src2), VR256:$src1,
1960                                CommutableCMPCC:$cc)),
1961            (VCMPPDYrmi VR256:$src1, addr:$src2, timm:$cc)>;
1962
1963  def : Pat<(v8f32 (X86any_cmpp (loadv8f32 addr:$src2), VR256:$src1,
1964                                CommutableCMPCC:$cc)),
1965            (VCMPPSYrmi VR256:$src1, addr:$src2, timm:$cc)>;
1966
1967  def : Pat<(v2f64 (X86any_cmpp (loadv2f64 addr:$src2), VR128:$src1,
1968                                CommutableCMPCC:$cc)),
1969            (VCMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;
1970
1971  def : Pat<(v4f32 (X86any_cmpp (loadv4f32 addr:$src2), VR128:$src1,
1972                                CommutableCMPCC:$cc)),
1973            (VCMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;
1974
1975  def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
1976                          CommutableCMPCC:$cc)),
1977            (VCMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
1978
1979  def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
1980                          CommutableCMPCC:$cc)),
1981            (VCMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
1982}
1983
// SSE2 (non-VEX) double-precision compares with the load as first operand:
// the operands are swapped so the load can be folded into CMPPDrmi / CMPSDrm.
// Only applied for immediates accepted by CommutableCMPCC. The packed pattern
// uses the memopv2f64 fragment, the scalar one uses loadf64.
1984let Predicates = [UseSSE2] in {
1985  def : Pat<(v2f64 (X86any_cmpp (memopv2f64 addr:$src2), VR128:$src1,
1986                                CommutableCMPCC:$cc)),
1987            (CMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;
1988
1989  def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
1990                          CommutableCMPCC:$cc)),
1991            (CMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
1992}
1993
// SSE1 (non-VEX) single-precision compares with the load as first operand:
// the operands are swapped so the load can be folded into CMPPSrmi / CMPSSrm.
// Only applied for immediates accepted by CommutableCMPCC. The packed pattern
// uses the memopv4f32 fragment, the scalar one uses loadf32.
1994let Predicates = [UseSSE1] in {
1995  def : Pat<(v4f32 (X86any_cmpp (memopv4f32 addr:$src2), VR128:$src1,
1996                                CommutableCMPCC:$cc)),
1997            (CMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;
1998
1999  def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
2000                          CommutableCMPCC:$cc)),
2001            (CMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
2002}
2003
2004//===----------------------------------------------------------------------===//
2005// SSE 1 & 2 - Shuffle Instructions
2006//===----------------------------------------------------------------------===//
2007
2008/// sse12_shuffle - sse 1 & 2 fp shuffle instructions
    ///
    /// Emits rmi (MRMSrcMem) and rri (MRMSrcReg) records with opcode 0xC6 that
    /// select X86Shufp with result type `vt` and an i8 immediate $src3. The
    /// memory form loads $src2 through mem_frag. Only the register form takes
    /// the IsCommutable flag (default 0). Note the memory form is defined
    /// before the register form here.
2009multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
2010                         ValueType vt, string asm, PatFrag mem_frag,
2011                         X86FoldableSchedWrite sched, Domain d,
2012                         bit IsCommutable = 0> {
2013  def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
2014                   (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
2015                   [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
2016                                       (i8 timm:$src3))))], d>,
2017            Sched<[sched.Folded, sched.ReadAfterFold]>;
2018  let isCommutable = IsCommutable in
2019  def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
2020                 (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
2021                 [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
2022                                     (i8 timm:$src3))))], d>,
2023            Sched<[sched]>;
2024}
2025
// SHUFPS / SHUFPD instantiations. The VEX forms (128- and 256-bit) are
// predicated on HasAVX and NoVLX and use the loadv* fragments. The legacy SSE
// forms tie $src1 to $dst and use the memopv* fragments.
// Only the legacy SHUFPD passes IsCommutable = 1.
// NOTE(review): presumably the commute hook handles the specific immediate
// for which SHUFPD can be commuted - confirm in the instruction-info code.
2026let Predicates = [HasAVX, NoVLX] in {
2027  defm VSHUFPS  : sse12_shuffle<VR128, f128mem, v4f32,
2028           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2029           loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>,
2030           PS, VEX_4V, VEX_WIG;
2031  defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
2032           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2033           loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>,
2034           PS, VEX_4V, VEX_L, VEX_WIG;
2035  defm VSHUFPD  : sse12_shuffle<VR128, f128mem, v2f64,
2036           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2037           loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>,
2038           PD, VEX_4V, VEX_WIG;
2039  defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
2040           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2041           loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>,
2042           PD, VEX_4V, VEX_L, VEX_WIG;
2043}
2044let Constraints = "$src1 = $dst" in {
2045  defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
2046                    "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
2047                    memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2048  defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
2049                    "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
2050                    memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
2051}
2052
2053//===----------------------------------------------------------------------===//
2054// SSE 1 & 2 - Unpack FP Instructions
2055//===----------------------------------------------------------------------===//
2056
2057/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
    ///
    /// Emits rr (MRMSrcReg) and rm (MRMSrcMem) records for opcode `opc` that
    /// select OpNode with result type `vt`. The memory form loads $src2 through
    /// mem_frag and uses the folded scheduling classes. IsCommutable (default 0)
    /// is applied to the register form only. The complete asm string, operands
    /// included, is supplied by the instantiation.
2058multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
2059                                   PatFrag mem_frag, RegisterClass RC,
2060                                   X86MemOperand x86memop, string asm,
2061                                   X86FoldableSchedWrite sched, Domain d,
2062                                   bit IsCommutable = 0> {
2063    let isCommutable = IsCommutable in
2064    def rr : PI<opc, MRMSrcReg,
2065                (outs RC:$dst), (ins RC:$src1, RC:$src2),
2066                asm, [(set RC:$dst,
2067                           (vt (OpNode RC:$src1, RC:$src2)))], d>,
2068                Sched<[sched]>;
2069    def rm : PI<opc, MRMSrcMem,
2070                (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
2071                asm, [(set RC:$dst,
2072                           (vt (OpNode RC:$src1,
2073                                       (mem_frag addr:$src2))))], d>,
2074             Sched<[sched.Folded, sched.ReadAfterFold]>;
2075}
2076
// VEX-encoded UNPCK{H,L}P{S,D} instantiations, 128-bit first and then 256-bit
// (VEX_L). Opcode 0x15 is paired with X86Unpckh and 0x14 with X86Unpckl. All
// fold memory through the generic `load` fragment. Only the 128-bit VUNPCKHPD
// passes IsCommutable = 1.
2077let Predicates = [HasAVX, NoVLX] in {
2078defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load,
2079      VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2080                     SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
2081defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load,
2082      VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2083                     SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG;
2084defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load,
2085      VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2086                     SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
2087defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load,
2088      VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2089                     SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
2090
2091defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load,
2092      VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2093                     SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
2094defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load,
2095      VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2096                     SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
2097defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load,
2098      VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2099                     SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
2100defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load,
2101      VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2102                     SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
2103}// Predicates = [HasAVX, NoVLX]
2104
// Legacy SSE UNPCK{H,L}P{S,D}: destructive two-operand forms ($src1 tied to
// $dst) that fold memory through the `memop` fragment rather than `load`.
// As with the VEX forms, only UNPCKHPD passes IsCommutable = 1.
2105let Constraints = "$src1 = $dst" in {
2106  defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memop,
2107        VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
2108                       SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2109  defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memop,
2110        VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
2111                       SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
2112  defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop,
2113        VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
2114                       SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2115  defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop,
2116        VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
2117                       SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
2118} // Constraints = "$src1 = $dst"
2119
// With HasAVX1Only, 256-bit integer unpacks are selected to the FP unpack
// instructions of matching element width: v8i32 -> VUNPCK{L,H}PSY and
// v4i64 -> VUNPCK{L,H}PDY, in both register and load-folding forms.
// NOTE(review): presumably because 256-bit integer unpack instructions are
// unavailable without AVX2 - confirm.
2120let Predicates = [HasAVX1Only] in {
2121  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (loadv8i32 addr:$src2))),
2122            (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
2123  def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
2124            (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
2125  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (loadv8i32 addr:$src2))),
2126            (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
2127  def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
2128            (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
2129
2130  def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
2131            (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
2132  def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
2133            (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
2134  def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
2135            (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
2136  def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
2137            (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
2138}
2139
// SSE2 fallback for a v2f64 unpack-low whose second operand is a load matched
// by simple_load: it is selected to MOVHPDrm instead of UNPCKLPDrm.
2140let Predicates = [UseSSE2] in {
2141  // Use MOVHPD if the load isn't aligned enough for UNPCKLPD.
      // NOTE(review): UNPCKLPD's memory form above is matched through `memop`,
      // which presumably enforces the alignment requirement - confirm.
2142  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
2143                              (v2f64 (simple_load addr:$src2)))),
2144            (MOVHPDrm VR128:$src1, addr:$src2)>;
2145}
2146
2147//===----------------------------------------------------------------------===//
2148// SSE 1 & 2 - Extract Floating-Point Sign mask
2149//===----------------------------------------------------------------------===//
2150
2151/// sse12_extr_sign_mask - sse 1 & 2 extract floating-point sign mask
    ///
    /// Emits a single rr record (opcode 0x50, MRMSrcReg) that selects
    /// X86movmsk on a `vt`-typed RC source and writes the result to a
    /// GR32orGR64 destination. `asm` is the bare mnemonic; the operand list is
    /// appended here. There is no memory form. Scheduling uses WriteFMOVMSK.
2152multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
2153                                string asm, Domain d> {
2154  def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
2155              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
2156              [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], d>,
2157              Sched<[WriteFMOVMSK]>;
2158}
2159
// VEX-encoded MOVMSKPS / MOVMSKPD for 128-bit and 256-bit (VEX_L) sources,
// plus patterns that reuse them for integer vectors of the same element width.
2160let Predicates = [HasAVX] in {
2161  defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
2162                                        SSEPackedSingle>, PS, VEX, VEX_WIG;
2163  defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
2164                                        SSEPackedDouble>, PD, VEX, VEX_WIG;
2165  defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
2166                                         SSEPackedSingle>, PS, VEX, VEX_L, VEX_WIG;
2167  defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
2168                                         SSEPackedDouble>, PD, VEX, VEX_L, VEX_WIG;
2169
2170  // Also support integer VTs to avoid an int->fp bitcast in the DAG.
2171  def : Pat<(X86movmsk (v4i32 VR128:$src)),
2172            (VMOVMSKPSrr VR128:$src)>;
2173  def : Pat<(X86movmsk (v2i64 VR128:$src)),
2174            (VMOVMSKPDrr VR128:$src)>;
2175  def : Pat<(X86movmsk (v8i32 VR256:$src)),
2176            (VMOVMSKPSYrr VR256:$src)>;
2177  def : Pat<(X86movmsk (v4i64 VR256:$src)),
2178            (VMOVMSKPDYrr VR256:$src)>;
2179}
2180
// Legacy SSE MOVMSKPS / MOVMSKPD on 128-bit sources. No Predicates are set at
// this level, unlike the VEX forms, which are wrapped in HasAVX.
2181defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
2182                                     SSEPackedSingle>, PS;
2183defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
2184                                     SSEPackedDouble>, PD;
2185
// Legacy SSE2 counterparts of the integer-typed movmsk patterns: v4i32 uses
// MOVMSKPSrr and v2i64 uses MOVMSKPDrr.
2186let Predicates = [UseSSE2] in {
2187  // Also support integer VTs to avoid an int->fp bitcast in the DAG.
2188  def : Pat<(X86movmsk (v4i32 VR128:$src)),
2189            (MOVMSKPSrr VR128:$src)>;
2190  def : Pat<(X86movmsk (v2i64 VR128:$src)),
2191            (MOVMSKPDrr VR128:$src)>;
2192}
2193
2194//===---------------------------------------------------------------------===//
2195// SSE2 - Packed Integer Logical Instructions
2196//===---------------------------------------------------------------------===//
2197
2198let ExeDomain = SSEPackedInt in { // SSE integer instructions
2199
2200/// PDI_binop_rm - Simple SSE2 binary operator.
    ///
    /// Emits rr (MRMSrcReg) and rm (MRMSrcMem) records selecting OpNode with
    /// result type OpVT. The memory form loads $src2 through memop_frag.
    /// IsCommutable applies to the register form only. Is2Addr picks the asm
    /// operand list: when set, only $src2 and $dst are printed; otherwise all
    /// three operands are printed. Is2Addr does not itself add the tied-operand
    /// constraint; the instantiation has to supply that.
2201multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
2202                        ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
2203                        X86MemOperand x86memop, X86FoldableSchedWrite sched,
2204                        bit IsCommutable, bit Is2Addr> {
2205  let isCommutable = IsCommutable in
2206  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
2207       (ins RC:$src1, RC:$src2),
2208       !if(Is2Addr,
2209           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2210           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
2211       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
2212       Sched<[sched]>;
2213  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
2214       (ins RC:$src1, x86memop:$src2),
2215       !if(Is2Addr,
2216           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2217           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
2218       [(set RC:$dst, (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
2219       Sched<[sched.Folded, sched.ReadAfterFold]>;
2220}
2221} // ExeDomain = SSEPackedInt
2222
// PDI_binop_all - instantiates PDI_binop_rm three times for one operator:
//   V<NAME>   - 128-bit VEX form (HasAVX + prd), "v"-prefixed mnemonic,
//               three-operand asm (Is2Addr = 0), folds via `load`.
//   <NAME>    - legacy SSE form, $src1 tied to $dst, two-operand asm
//               (Is2Addr = 1), folds via `memop`. No Predicates are set here.
//   V<NAME>Y  - 256-bit VEX form (HasAVX2 + prd, VEX_L), folds via `load`.
// `sched` supplies the XMM/YMM scheduling classes and `prd` is an additional
// predicate applied to both VEX forms.
2223multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
2224                         ValueType OpVT128, ValueType OpVT256,
2225                         X86SchedWriteWidths sched, bit IsCommutable,
2226                         Predicate prd> {
2227let Predicates = [HasAVX, prd] in
2228  defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
2229                             VR128, load, i128mem, sched.XMM,
2230                             IsCommutable, 0>, VEX_4V, VEX_WIG;
2231
2232let Constraints = "$src1 = $dst" in
2233  defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
2234                           memop, i128mem, sched.XMM, IsCommutable, 1>;
2235
2236let Predicates = [HasAVX2, prd] in
2237  defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
2238                               OpVT256, VR256, load, i256mem, sched.YMM,
2239                               IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG;
2240}
2241
2242// These are ordered here for pattern ordering requirements with the fp versions
    // (the FP logical instructions are defined after this point in the file).
    // Packed-integer logic on v2i64 / v4i64 with the NoVLX predicate. PAND, POR
    // and PXOR are commutable; PANDN (X86andnp) is not.
2243
2244defm PAND  : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
2245                           SchedWriteVecLogic, 1, NoVLX>;
2246defm POR   : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
2247                           SchedWriteVecLogic, 1, NoVLX>;
2248defm PXOR  : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
2249                           SchedWriteVecLogic, 1, NoVLX>;
2250defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
2251                           SchedWriteVecLogic, 0, NoVLX>;
2252
2253//===----------------------------------------------------------------------===//
2254// SSE 1 & 2 - Logical Instructions
2255//===----------------------------------------------------------------------===//
2256
2257/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
2258///
2259/// There are no patterns here because isel prefers integer versions for SSE2
2260/// and later. There are SSE1 v4f32 patterns later.
    ///
    /// Instantiates sse12_fp_packed_logical_rm six times: 256-bit and 128-bit
    /// VEX forms for PS and PD (HasAVX + NoVLX), then the legacy SSE PS and PD
    /// forms with $src1 tied to $dst. The mnemonic is OpcodeStr with a "ps" or
    /// "pd" suffix. Both pattern lists passed down are empty, so OpNode is
    /// accepted but not referenced in this body.
    /// NOTE(review): the trailing 0 on the VEX instantiations presumably selects
    /// the three-operand asm syntax in sse12_fp_packed_logical_rm - confirm.
2261multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
2262                                   SDNode OpNode, X86SchedWriteWidths sched> {
2263  let Predicates = [HasAVX, NoVLX] in {
2264  defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
2265        !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM,
2266        [], [], 0>, PS, VEX_4V, VEX_L, VEX_WIG;
2267
2268  defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
2269        !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM,
2270        [], [], 0>, PD, VEX_4V, VEX_L, VEX_WIG;
2271
2272  defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
2273       !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
2274       [], [], 0>, PS, VEX_4V, VEX_WIG;
2275
2276  defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
2277       !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
2278       [], [], 0>, PD, VEX_4V, VEX_WIG;
2279  }
2280
2281  let Constraints = "$src1 = $dst" in {
2282    defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
2283         !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
2284         [], []>, PS;
2285
2286    defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
2287         !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
2288         [], []>, PD;
2289  }
2290}
2291
// Packed FP logical instantiations (opcodes 0x54, 0x56, 0x57 and 0x55).
// Each expands to the V<NAME>P{S,D}[Y] and <NAME>P{S,D} records described by
// sse12_fp_packed_logical. ANDN is explicitly forced non-commutable.
// NOTE(review): AND/OR/XOR presumably inherit isCommutable from
// sse12_fp_packed_logical_rm, which is not visible here - confirm.
2292defm AND  : sse12_fp_packed_logical<0x54, "and", and, SchedWriteFLogic>;
2293defm OR   : sse12_fp_packed_logical<0x56, "or", or, SchedWriteFLogic>;
2294defm XOR  : sse12_fp_packed_logical<0x57, "xor", xor, SchedWriteFLogic>;
2295let isCommutable = 0 in
2296  defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp, SchedWriteFLogic>;
2297
// With AVX2 (and no VLX), 256-bit and/or/xor/andnp on v32i8, v16i16 and v8i32
// are selected to the VPAND/VPOR/VPXOR/VPANDN Y-register instructions. v4i64
// is not listed because it is the OpVT256 the PDI_binop_all instantiations
// already carry their own patterns for. Register forms come first, followed
// by the forms that fold a load of the second operand.
2298let Predicates = [HasAVX2, NoVLX] in {
2299  def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
2300            (VPANDYrr VR256:$src1, VR256:$src2)>;
2301  def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
2302            (VPANDYrr VR256:$src1, VR256:$src2)>;
2303  def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
2304            (VPANDYrr VR256:$src1, VR256:$src2)>;
2305
2306  def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
2307            (VPORYrr VR256:$src1, VR256:$src2)>;
2308  def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
2309            (VPORYrr VR256:$src1, VR256:$src2)>;
2310  def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
2311            (VPORYrr VR256:$src1, VR256:$src2)>;
2312
2313  def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
2314            (VPXORYrr VR256:$src1, VR256:$src2)>;
2315  def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
2316            (VPXORYrr VR256:$src1, VR256:$src2)>;
2317  def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
2318            (VPXORYrr VR256:$src1, VR256:$src2)>;
2319
2320  def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
2321            (VPANDNYrr VR256:$src1, VR256:$src2)>;
2322  def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
2323            (VPANDNYrr VR256:$src1, VR256:$src2)>;
2324  def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
2325            (VPANDNYrr VR256:$src1, VR256:$src2)>;
2326
      // Load-folding forms; no explicit result type is written on these.
2327  def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
2328            (VPANDYrm VR256:$src1, addr:$src2)>;
2329  def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
2330            (VPANDYrm VR256:$src1, addr:$src2)>;
2331  def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
2332            (VPANDYrm VR256:$src1, addr:$src2)>;
2333
2334  def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
2335            (VPORYrm VR256:$src1, addr:$src2)>;
2336  def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
2337            (VPORYrm VR256:$src1, addr:$src2)>;
2338  def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
2339            (VPORYrm VR256:$src1, addr:$src2)>;
2340
2341  def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
2342            (VPXORYrm VR256:$src1, addr:$src2)>;
2343  def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
2344            (VPXORYrm VR256:$src1, addr:$src2)>;
2345  def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
2346            (VPXORYrm VR256:$src1, addr:$src2)>;
2347
2348  def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
2349            (VPANDNYrm VR256:$src1, addr:$src2)>;
2350  def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
2351            (VPANDNYrm VR256:$src1, addr:$src2)>;
2352  def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
2353            (VPANDNYrm VR256:$src1, addr:$src2)>;
2354}
2355
// AVX1-only targets have no 256-bit integer logic instructions, so 256-bit
// integer and/or/xor/andnp are selected to the floating point (PS) forms.
let Predicates = [HasAVX1Only] in {
  // Register-register forms: one pattern per integer vector type, emitted
  // operation by operation.
  foreach IntVT = [v32i8, v16i16, v8i32, v4i64] in
    def : Pat<(IntVT (and VR256:$src1, VR256:$src2)),
              (VANDPSYrr VR256:$src1, VR256:$src2)>;

  foreach IntVT = [v32i8, v16i16, v8i32, v4i64] in
    def : Pat<(IntVT (or VR256:$src1, VR256:$src2)),
              (VORPSYrr VR256:$src1, VR256:$src2)>;

  foreach IntVT = [v32i8, v16i16, v8i32, v4i64] in
    def : Pat<(IntVT (xor VR256:$src1, VR256:$src2)),
              (VXORPSYrr VR256:$src1, VR256:$src2)>;

  foreach IntVT = [v32i8, v16i16, v8i32, v4i64] in
    def : Pat<(IntVT (X86andnp VR256:$src1, VR256:$src2)),
              (VANDNPSYrr VR256:$src1, VR256:$src2)>;

  // Register-memory forms: the second operand is a folded 256-bit load; the
  // result type follows from the load fragment.
  foreach LdFrag = [loadv32i8, loadv16i16, loadv8i32, loadv4i64] in
    def : Pat<(and VR256:$src1, (LdFrag addr:$src2)),
              (VANDPSYrm VR256:$src1, addr:$src2)>;

  foreach LdFrag = [loadv32i8, loadv16i16, loadv8i32, loadv4i64] in
    def : Pat<(or VR256:$src1, (LdFrag addr:$src2)),
              (VORPSYrm VR256:$src1, addr:$src2)>;

  foreach LdFrag = [loadv32i8, loadv16i16, loadv8i32, loadv4i64] in
    def : Pat<(xor VR256:$src1, (LdFrag addr:$src2)),
              (VXORPSYrm VR256:$src1, addr:$src2)>;

  foreach LdFrag = [loadv32i8, loadv16i16, loadv8i32, loadv4i64] in
    def : Pat<(X86andnp VR256:$src1, (LdFrag addr:$src2)),
              (VANDNPSYrm VR256:$src1, addr:$src2)>;
}
2431
// 128-bit integer logic on AVX targets without VLX: select the VEX-encoded
// VPAND/VPOR/VPXOR/VPANDN instructions for the v16i8, v8i16 and v4i32 types.
let Predicates = [HasAVX, NoVLX] in {
  // Register-register forms.
  foreach IntVT = [v16i8, v8i16, v4i32] in
    def : Pat<(IntVT (and VR128:$src1, VR128:$src2)),
              (VPANDrr VR128:$src1, VR128:$src2)>;

  foreach IntVT = [v16i8, v8i16, v4i32] in
    def : Pat<(IntVT (or VR128:$src1, VR128:$src2)),
              (VPORrr VR128:$src1, VR128:$src2)>;

  foreach IntVT = [v16i8, v8i16, v4i32] in
    def : Pat<(IntVT (xor VR128:$src1, VR128:$src2)),
              (VPXORrr VR128:$src1, VR128:$src2)>;

  foreach IntVT = [v16i8, v8i16, v4i32] in
    def : Pat<(IntVT (X86andnp VR128:$src1, VR128:$src2)),
              (VPANDNrr VR128:$src1, VR128:$src2)>;

  // Register-memory forms: the second operand is a folded 128-bit load.
  foreach LdFrag = [loadv16i8, loadv8i16, loadv4i32] in
    def : Pat<(and VR128:$src1, (LdFrag addr:$src2)),
              (VPANDrm VR128:$src1, addr:$src2)>;

  foreach LdFrag = [loadv16i8, loadv8i16, loadv4i32] in
    def : Pat<(or VR128:$src1, (LdFrag addr:$src2)),
              (VPORrm VR128:$src1, addr:$src2)>;

  foreach LdFrag = [loadv16i8, loadv8i16, loadv4i32] in
    def : Pat<(xor VR128:$src1, (LdFrag addr:$src2)),
              (VPXORrm VR128:$src1, addr:$src2)>;

  foreach LdFrag = [loadv16i8, loadv8i16, loadv4i32] in
    def : Pat<(X86andnp VR128:$src1, (LdFrag addr:$src2)),
              (VPANDNrm VR128:$src1, addr:$src2)>;
}
2489
// 128-bit integer logic for SSE2: select the legacy-encoded PAND/POR/PXOR/
// PANDN instructions for the v16i8, v8i16 and v4i32 types.
let Predicates = [UseSSE2] in {
  // Register-register forms.
  foreach IntVT = [v16i8, v8i16, v4i32] in
    def : Pat<(IntVT (and VR128:$src1, VR128:$src2)),
              (PANDrr VR128:$src1, VR128:$src2)>;

  foreach IntVT = [v16i8, v8i16, v4i32] in
    def : Pat<(IntVT (or VR128:$src1, VR128:$src2)),
              (PORrr VR128:$src1, VR128:$src2)>;

  foreach IntVT = [v16i8, v8i16, v4i32] in
    def : Pat<(IntVT (xor VR128:$src1, VR128:$src2)),
              (PXORrr VR128:$src1, VR128:$src2)>;

  foreach IntVT = [v16i8, v8i16, v4i32] in
    def : Pat<(IntVT (X86andnp VR128:$src1, VR128:$src2)),
              (PANDNrr VR128:$src1, VR128:$src2)>;

  // Register-memory forms: these use the memop fragments rather than the
  // plain load fragments used by the AVX patterns.
  foreach MemFrag = [memopv16i8, memopv8i16, memopv4i32] in
    def : Pat<(and VR128:$src1, (MemFrag addr:$src2)),
              (PANDrm VR128:$src1, addr:$src2)>;

  foreach MemFrag = [memopv16i8, memopv8i16, memopv4i32] in
    def : Pat<(or VR128:$src1, (MemFrag addr:$src2)),
              (PORrm VR128:$src1, addr:$src2)>;

  foreach MemFrag = [memopv16i8, memopv8i16, memopv4i32] in
    def : Pat<(xor VR128:$src1, (MemFrag addr:$src2)),
              (PXORrm VR128:$src1, addr:$src2)>;

  foreach MemFrag = [memopv16i8, memopv8i16, memopv4i32] in
    def : Pat<(X86andnp VR128:$src1, (MemFrag addr:$src2)),
              (PANDNrm VR128:$src1, addr:$src2)>;
}
2547
2548// Patterns for packed operations when we don't have integer type available.
// Register-register forms: the v4f32 X86fand/X86for/X86fxor/X86fandn nodes
// select to ANDPS/ORPS/XORPS/ANDNPS. These patterns sit outside any
// `let Predicates` block visible in this part of the file.
2549def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
2550          (ANDPSrr VR128:$src1, VR128:$src2)>;
2551def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)),
2552          (ORPSrr VR128:$src1, VR128:$src2)>;
2553def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)),
2554          (XORPSrr VR128:$src1, VR128:$src2)>;
2555def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)),
2556          (ANDNPSrr VR128:$src1, VR128:$src2)>;
2557
// Register-memory forms: the second operand is a memopv4f32 load folded into
// the corresponding rm instruction.
2558def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)),
2559          (ANDPSrm VR128:$src1, addr:$src2)>;
2560def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)),
2561          (ORPSrm VR128:$src1, addr:$src2)>;
2562def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)),
2563          (XORPSrm VR128:$src1, addr:$src2)>;
2564def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
2565          (ANDNPSrm VR128:$src1, addr:$src2)>;
2566
2567//===----------------------------------------------------------------------===//
2568// SSE 1 & 2 - Arithmetic Instructions
2569//===----------------------------------------------------------------------===//
2570
2571/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
2572/// vector forms.
2573///
2574/// In addition, we also have a special variant of the scalar form here to
2575/// represent the associated intrinsic operation.  This form is unlike the
2576/// plain scalar form, in that it takes an entire vector (instead of a scalar)
2577/// and leaves the top elements unmodified (therefore these cannot be commuted).
2578///
2579/// These three forms can each be reg+reg or reg+mem.
2580///
2581
2582/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
2583/// classes below
multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
                                  SDNode OpNode, X86SchedWriteSizes sched> {
  // Packed single/double binary operation. Every form reads MXCSR and may
  // raise a floating point exception.
  let Uses = [MXCSR], mayRaiseFPException = 1 in {
    // VEX-encoded forms: 128-bit (PS/PD) and 256-bit (PSY/PDY). They use the
    // plain load fragments and pass 0 as the last sse12_fp_packed argument.
    let Predicates = [HasAVX, NoVLX] in {
      defm V#NAME#PS  : sse12_fp_packed<opc, OpcodeStr#"ps", OpNode,
                                        VR128, v4f32, f128mem, loadv4f32,
                                        SSEPackedSingle, sched.PS.XMM, 0>,
                        PS, VEX_4V, VEX_WIG;
      defm V#NAME#PD  : sse12_fp_packed<opc, OpcodeStr#"pd", OpNode,
                                        VR128, v2f64, f128mem, loadv2f64,
                                        SSEPackedDouble, sched.PD.XMM, 0>,
                        PD, VEX_4V, VEX_WIG;

      defm V#NAME#PSY : sse12_fp_packed<opc, OpcodeStr#"ps", OpNode,
                                        VR256, v8f32, f256mem, loadv8f32,
                                        SSEPackedSingle, sched.PS.YMM, 0>,
                        PS, VEX_4V, VEX_L, VEX_WIG;
      defm V#NAME#PDY : sse12_fp_packed<opc, OpcodeStr#"pd", OpNode,
                                        VR256, v4f64, f256mem, loadv4f64,
                                        SSEPackedDouble, sched.PD.YMM, 0>,
                        PD, VEX_4V, VEX_L, VEX_WIG;
    }

    // Legacy SSE forms: the first source is tied to the destination and the
    // memop fragments are used for the memory operand.
    let Constraints = "$src1 = $dst" in {
      defm PS : sse12_fp_packed<opc, OpcodeStr#"ps", OpNode, VR128,
                                v4f32, f128mem, memopv4f32, SSEPackedSingle,
                                sched.PS.XMM>,
                PS;
      defm PD : sse12_fp_packed<opc, OpcodeStr#"pd", OpNode, VR128,
                                v2f64, f128mem, memopv2f64, SSEPackedDouble,
                                sched.PD.XMM>,
                PD;
    }
  }
}
2613
multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                                  X86SchedWriteSizes sched> {
  // Scalar single/double binary operation on FR32/FR64. Every form reads
  // MXCSR and may raise a floating point exception.
  let Uses = [MXCSR], mayRaiseFPException = 1 in {
    // VEX-encoded forms: Is2Addr is passed as 0, selecting the three-operand
    // assembly string.
    defm V#NAME#SS : sse12_fp_scalar<opc, OpcodeStr#"ss", OpNode, FR32,
                                     f32mem, SSEPackedSingle, sched.PS.Scl, 0>,
                     XS, VEX_4V, VEX_LIG, VEX_WIG;
    defm V#NAME#SD : sse12_fp_scalar<opc, OpcodeStr#"sd", OpNode, FR64,
                                     f64mem, SSEPackedDouble, sched.PD.Scl, 0>,
                     XD, VEX_4V, VEX_LIG, VEX_WIG;

    // Legacy SSE forms: two-address, with $src1 tied to $dst.
    let Constraints = "$src1 = $dst" in {
      defm SS : sse12_fp_scalar<opc, OpcodeStr#"ss", OpNode, FR32, f32mem,
                                SSEPackedSingle, sched.PS.Scl>,
                XS;
      defm SD : sse12_fp_scalar<opc, OpcodeStr#"sd", OpNode, FR64, f64mem,
                                SSEPackedDouble, sched.PD.Scl>,
                XD;
    }
  }
}
2634
multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
                                      SDPatternOperator OpNode,
                                      X86SchedWriteSizes sched> {
  // Intrinsic flavour of the scalar binary operation: operands are whole
  // VR128 vectors (v4f32 / v2f64). Every form reads MXCSR and may raise a
  // floating point exception.
  let Uses = [MXCSR], mayRaiseFPException = 1 in {
    // VEX-encoded forms; 0 is passed as the last sse12_fp_scalar_int argument.
    defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
                                         OpcodeStr#"ss", ssmem, sse_load_f32,
                                         SSEPackedSingle, sched.PS.Scl, 0>,
                     XS, VEX_4V, VEX_LIG, VEX_WIG;
    defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
                                         OpcodeStr#"sd", sdmem, sse_load_f64,
                                         SSEPackedDouble, sched.PD.Scl, 0>,
                     XD, VEX_4V, VEX_LIG, VEX_WIG;

    // Legacy SSE forms with $src1 tied to $dst.
    let Constraints = "$src1 = $dst" in {
      defm SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
                                    OpcodeStr#"ss", ssmem, sse_load_f32,
                                    SSEPackedSingle, sched.PS.Scl>,
                XS;
      defm SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
                                    OpcodeStr#"sd", sdmem, sse_load_f64,
                                    SSEPackedDouble, sched.PD.Scl>,
                XD;
    }
  }
}
2656
2657// Binary Arithmetic instructions
// Each defm combines three multiclasses: the packed forms (_p), the scalar
// forms (_s) and the scalar intrinsic forms (_s_int). ADD, MUL, SUB and DIV
// pass null_frag as the _s_int operator, while MAX and MIN pass X86fmaxs and
// X86fmins. ADD and SUB share SchedWriteFAddSizes; MAX and MIN share
// SchedWriteFCmpSizes.
2658defm ADD : basic_sse12_fp_binop_p<0x58, "add", any_fadd, SchedWriteFAddSizes>,
2659           basic_sse12_fp_binop_s<0x58, "add", any_fadd, SchedWriteFAddSizes>,
2660           basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SchedWriteFAddSizes>;
2661defm MUL : basic_sse12_fp_binop_p<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
2662           basic_sse12_fp_binop_s<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
2663           basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SchedWriteFMulSizes>;
// SUB, DIV, MAX and MIN are instantiated with isCommutable forced to 0.
2664let isCommutable = 0 in {
2665  defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
2666             basic_sse12_fp_binop_s<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
2667             basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SchedWriteFAddSizes>;
2668  defm DIV : basic_sse12_fp_binop_p<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
2669             basic_sse12_fp_binop_s<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
2670             basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>;
2671  defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
2672             basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
2673             basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>;
2674  defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
2675             basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
2676             basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>;
2677}
2678
// MAXC / MINC reuse the opcodes (0x5F / 0x5D) and mnemonics ("max" / "min")
// of MAX / MIN but are matched from the X86fmaxc / X86fminc nodes. They are
// isCodeGenOnly, have no _s_int forms, and are not placed under an
// `isCommutable = 0` override here.
2679let isCodeGenOnly = 1 in {
2680  defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>,
2681             basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>;
2682  defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SchedWriteFCmpSizes>,
2683             basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SchedWriteFCmpSizes>;
2684}
2685
2686// Patterns used to select SSE scalar fp arithmetic instructions from
2687// either:
2688//
2689// (1) a scalar fp operation followed by a blend
2690//
2691// The effect is that the backend no longer emits unnecessary vector
2692// insert instructions immediately after SSE scalar fp instructions
2693// like addss or mulss.
2694//
2695// For example, given the following code:
2696//   __m128 foo(__m128 A, __m128 B) {
2697//     A[0] += B[0];
2698//     return A;
2699//   }
2700//
2701// Previously we generated:
2702//   addss %xmm0, %xmm1
2703//   movss %xmm1, %xmm0
2704//
2705// We now generate:
2706//   addss %xmm1, %xmm0
2707//
2708// (2) a vector packed single/double fp operation followed by a vector insert
2709//
2710// The effect is that the backend converts the packed fp instruction
2711// followed by a vector insert into a single SSE scalar fp instruction.
2712//
2713// For example, given the following code:
2714//   __m128 foo(__m128 A, __m128 B) {
2715//     __m128 C = A + B;
2716//     return (__m128) {C[0], A[1], A[2], A[3]};
2717//   }
2718//
2719// Previously we generated:
2720//   addps %xmm0, %xmm1
2721//   movss %xmm1, %xmm0
2722//
2723// We now generate:
2724//   addss %xmm1, %xmm0
2725
2726// TODO: Some canonicalization in lowering would simplify the number of
2727// patterns we have to try to match.
multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
                                    ValueType VT, ValueType EltTy,
                                    RegisterClass RC, PatFrag ld_frag,
                                    Predicate BasePredicate> {
  // Both pattern pairs match the same shape: element 0 of $dst is extracted,
  // combined with a scalar operand through Op, turned back into a vector and
  // inserted into $dst through Move. The match is replaced by the <Opc>rr_Int
  // instruction (scalar in a register, copied to VR128) or by <Opc>rm_Int
  // (scalar loaded through ld_frag).

  // Legacy SSE encodings, guarded by the caller-supplied predicate.
  let Predicates = [BasePredicate] in {
    def : Pat<(VT (Move (VT VR128:$dst),
                   (VT (scalar_to_vector
                        (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
                            RC:$src))))),
              (!cast<Instruction>(OpcPrefix#rr_Int)
                 VT:$dst, (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;

    def : Pat<(VT (Move (VT VR128:$dst),
                   (VT (scalar_to_vector
                        (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
                            (ld_frag addr:$src)))))),
              (!cast<Instruction>(OpcPrefix#rm_Int)
                 VT:$dst, addr:$src)>;
  }

  // The same two patterns for the "V"-prefixed AVX instructions.
  let Predicates = [UseAVX] in {
    def : Pat<(VT (Move (VT VR128:$dst),
                   (VT (scalar_to_vector
                        (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
                            RC:$src))))),
              (!cast<Instruction>("V"#OpcPrefix#rr_Int)
                 VT:$dst, (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;

    def : Pat<(VT (Move (VT VR128:$dst),
                   (VT (scalar_to_vector
                        (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
                            (ld_frag addr:$src)))))),
              (!cast<Instruction>("V"#OpcPrefix#rm_Int)
                 VT:$dst, addr:$src)>;
  }
}
2763
// Single precision add/sub/mul/div: X86Movss on v4f32 with f32 elements in
// FR32, scalar loads through loadf32, legacy forms guarded by UseSSE1.
2764defm : scalar_math_patterns<any_fadd, "ADDSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2765defm : scalar_math_patterns<any_fsub, "SUBSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2766defm : scalar_math_patterns<any_fmul, "MULSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2767defm : scalar_math_patterns<any_fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2768
// Double precision add/sub/mul/div: X86Movsd on v2f64 with f64 elements in
// FR64, scalar loads through loadf64, legacy forms guarded by UseSSE2.
2769defm : scalar_math_patterns<any_fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2770defm : scalar_math_patterns<any_fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2771defm : scalar_math_patterns<any_fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2772defm : scalar_math_patterns<any_fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2773
2774/// Unop Arithmetic
2775/// In addition, we also have a special variant of the scalar form here to
2776/// represent the associated intrinsic operation.  This form is unlike the
2777/// plain scalar form, in that it takes an entire vector (instead of a
2778/// scalar) and leaves the top elements undefined.
2779///
2780/// And, we have a special variant form for a full-vector intrinsic form.
2781
/// sse_fp_unop_s - scalar SSE unary operations (legacy encoding).
/// Defines the plain scalar forms (r, m) and the intrinsic forms (r_Int,
/// m_Int). The intrinsic forms tie $src1 to $dst because the HW instructions
/// are 2 operand / destructive.
multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
                          ValueType ScalarVT, X86MemOperand x86memop,
                          Operand intmemop, SDNode OpNode, Domain d,
                          X86FoldableSchedWrite sched, Predicate target> {
  // Plain scalar forms operating on RC; codegen-only.
  let isCodeGenOnly = 1, hasSideEffects = 0 in {
    def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
              OpcodeStr#"\t{$src1, $dst|$dst, $src1}",
              [(set RC:$dst, (OpNode RC:$src1))], d>,
            Sched<[sched]>, Requires<[target]>;

    // The load-folding form additionally requires OptForSize.
    let mayLoad = 1 in
    def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1),
              OpcodeStr#"\t{$src1, $dst|$dst, $src1}",
              [(set RC:$dst, (OpNode (load addr:$src1)))], d>,
            Sched<[sched.Folded]>, Requires<[target, OptForSize]>;
  }

  // Intrinsic forms on VR128. They carry no selection pattern of their own
  // (empty pattern list) and take a second, tied, source operand.
  let hasSideEffects = 0, Constraints = "$src1 = $dst", ExeDomain = d in {
    def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
                  (ins VR128:$src1, VR128:$src2),
                  OpcodeStr#"\t{$src2, $dst|$dst, $src2}", []>,
                Sched<[sched]>;

    let mayLoad = 1 in
    def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
                  (ins VR128:$src1, intmemop:$src2),
                  OpcodeStr#"\t{$src2, $dst|$dst, $src2}", []>,
                Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}
2813
multiclass sse_fp_unop_s_intr<RegisterClass RC, ValueType vt,
                              ComplexPattern int_cpat, Intrinsic Intr,
                              Predicate target, string Suffix> {
  // Selection patterns mapping the intrinsic Intr onto the NAME#r_Int and
  // NAME#m_Int instructions. RC and Suffix are not referenced in this body.

  // Register form. The operation is unary, but the instruction is modeled
  // with two source operands because SSE leaves the high elements of the
  // destination unchanged; the same register is passed for both.
  let Predicates = [target] in
    def : Pat<(Intr VR128:$src),
              (!cast<Instruction>(NAME#r_Int) VR128:$src, VR128:$src)>;

  // Memory form. Scalar loads are only folded when optimizing for size: the
  // folded instruction has a partial register update, while the unfolded
  // sequence does not, e.g.
  //   movss mem, %xmm0
  //   rcpss %xmm0, %xmm0
  // which has a clobber before the rcp, vs.
  //   rcpss mem, %xmm0
  let Predicates = [target, OptForSize] in
    def : Pat<(Intr int_cpat:$src2),
              (!cast<Instruction>(NAME#m_Int) (vt (IMPLICIT_DEF)),
                                              addr:$src2)>;
}
2836
multiclass avx_fp_unop_s_intr<RegisterClass RC, ValueType vt,
                              ComplexPattern int_cpat, Intrinsic Intr,
                              Predicate target> {
  // Selection patterns mapping the intrinsic Intr onto the NAME#r_Int and
  // NAME#m_Int instructions. RC is not referenced in this body.

  // Register form: the source register is passed for both operands.
  let Predicates = [target] in
    def : Pat<(Intr VR128:$src),
              (!cast<Instruction>(NAME#r_Int) VR128:$src, VR128:$src)>;

  // Memory form: only used when optimizing for size; the first operand is
  // left undefined.
  let Predicates = [target, OptForSize] in
    def : Pat<(Intr int_cpat:$src2),
              (!cast<Instruction>(NAME#m_Int) (vt (IMPLICIT_DEF)),
                                              addr:$src2)>;
}
2850
multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
                          ValueType ScalarVT, X86MemOperand x86memop,
                          Operand intmemop, SDNode OpNode, Domain d,
                          X86FoldableSchedWrite sched, Predicate target> {
  // Plain scalar forms on RC; codegen-only, three-operand assembly string,
  // no selection pattern attached to the instruction itself.
  let isCodeGenOnly = 1, hasSideEffects = 0 in {
    def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
              OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
              [], d>,
            Sched<[sched]>;

    let mayLoad = 1 in
    def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
              OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
              [], d>,
            Sched<[sched.Folded, sched.ReadAfterFold]>;
  }

  // Intrinsic forms on VR128.
  let hasSideEffects = 0, ExeDomain = d in {
    def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
                  (ins VR128:$src1, VR128:$src2),
                  OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  []>,
                Sched<[sched]>;

    let mayLoad = 1 in
    def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
                  (ins VR128:$src1, intmemop:$src2),
                  OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  []>,
                Sched<[sched.Folded, sched.ReadAfterFold]>;
  }

  // Register pattern: the first source operand is left undefined.
  let Predicates = [target] in
    def : Pat<(OpNode RC:$src),
              (!cast<Instruction>(NAME#r) (ScalarVT (IMPLICIT_DEF)),
                                          RC:$src)>;

  // We don't want to fold scalar loads into these instructions unless
  // optimizing for size. The folded instruction has a partial register
  // update, while the unfolded sequence does not, e.g.
  //   vmovss mem, %xmm0
  //   vrcpss %xmm0, %xmm0, %xmm0
  // which has a clobber before the rcp, vs.
  //   vrcpss mem, %xmm0, %xmm0
  // TODO: In theory, we could fold the load, and avoid the stall caused by
  // the partial register store, either in BreakFalseDeps or with smarter RA.
  let Predicates = [target, OptForSize] in
    def : Pat<(ScalarVT (OpNode (load addr:$src))),
              (!cast<Instruction>(NAME#m) (ScalarVT (IMPLICIT_DEF)),
                                          addr:$src)>;
}
2895
/// sse1_fp_unop_p - packed single precision unary operations: VEX-encoded
/// 128/256-bit forms plus the legacy SSE 128-bit forms.
multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          X86SchedWriteWidths sched, list<Predicate> prds> {
  // VEX forms, guarded by the caller-supplied predicate list. The mnemonic
  // is "v" + OpcodeStr + "ps"; memory forms use the plain load fragments.
  let Predicates = prds in {
    def V#NAME#PSr  : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                          "v"#OpcodeStr#"ps\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
                      VEX, Sched<[sched.XMM]>, VEX_WIG;
    def V#NAME#PSm  : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                          "v"#OpcodeStr#"ps\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>,
                      VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
    def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                          "v"#OpcodeStr#"ps\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>,
                      VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
    def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                          "v"#OpcodeStr#"ps\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))]>,
                      VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
  }

  // Legacy SSE forms; defined outside the `prds` predicate scope. The memory
  // form uses the memop fragment.
  def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                OpcodeStr#"ps\t{$src, $dst|$dst, $src}",
                [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
            Sched<[sched.XMM]>;
  def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                OpcodeStr#"ps\t{$src, $dst|$dst, $src}",
                [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>,
            Sched<[sched.XMM.Folded]>;
}
2931
/// sse2_fp_unop_p - packed double precision unary operations: VEX-encoded
/// 128/256-bit forms plus the legacy SSE2 128-bit forms.
multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
                          SDNode OpNode, X86SchedWriteWidths sched> {
  // VEX forms. The mnemonic is "v" + OpcodeStr + "pd"; memory forms use the
  // plain load fragments.
  let Predicates = [HasAVX, NoVLX] in {
    def V#NAME#PDr  : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                          "v"#OpcodeStr#"pd\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
                      VEX, Sched<[sched.XMM]>, VEX_WIG;
    def V#NAME#PDm  : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                          "v"#OpcodeStr#"pd\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>,
                      VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
    def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                          "v"#OpcodeStr#"pd\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>,
                      VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
    def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                          "v"#OpcodeStr#"pd\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))]>,
                      VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
  }

  // Legacy SSE2 forms; defined outside the AVX predicate scope. The memory
  // form uses the memop fragment.
  def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                OpcodeStr#"pd\t{$src, $dst|$dst, $src}",
                [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
            Sched<[sched.XMM]>;
  def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                OpcodeStr#"pd\t{$src, $dst|$dst, $src}",
                [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>,
            Sched<[sched.XMM.Folded]>;
}
2967
// Intrinsic selection patterns for the single precision scalar unary
// operations. The intrinsic record is looked up by name as
// int_x86_sse_<OpcodeStr>_ss. opc, OpNode and sched are not referenced in
// this body. The name is assembled with the `#` paste operator and quoted
// fragments, matching the paste spelling used elsewhere in this file.
multiclass sse1_fp_unop_s_intr<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          X86SchedWriteWidths sched, Predicate AVXTarget> {
  // Legacy SSE patterns, guarded by UseSSE1.
  defm SS        :  sse_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
                      !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#"_ss"),
                      UseSSE1, "SS">, XS;
  // AVX patterns, guarded by the caller-supplied AVXTarget predicate.
  defm V#NAME#SS  : avx_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
                      !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#"_ss"),
                      AVXTarget>,
                      XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
}
2978
// Single precision scalar unary operation: the legacy SSE form (SS) and the
// VEX-encoded form (V<NAME>SS). The mnemonic is OpcodeStr + "ss", prefixed
// with "v" for the AVX form. It is built with the `#` paste operator and a
// quoted suffix instead of mixing `#` with the legacy `##` spelling and a
// bare identifier.
multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          X86SchedWriteWidths sched, Predicate AVXTarget> {
  // Legacy SSE form, guarded by UseSSE1.
  defm SS        :  sse_fp_unop_s<opc, OpcodeStr#"ss", FR32, f32, f32mem,
                      ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS;
  // AVX form, guarded by the caller-supplied AVXTarget predicate.
  defm V#NAME#SS  : avx_fp_unop_s<opc, "v"#OpcodeStr#"ss", FR32, f32,
                      f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>,
                       XS, VEX_4V, VEX_LIG, VEX_WIG;
}
2987
/// sse2_fp_unop_s - Scalar double-precision (f64 in FR64) unary operation.
/// Mirrors the single-precision scalar multiclass, but uses the "sd" mnemonic
/// suffix, f64mem/sdmem operands, the SSEPackedDouble domain, the XD prefix
/// and UseSSE2 for the non-VEX variant.
2988multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
2989                          X86SchedWriteWidths sched, Predicate AVXTarget> {
      // Non-VEX form: UseSSE2 is passed as the predicate argument.
2990  defm SD         : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, f64, f64mem,
2991                         sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD;
      // VEX form: AVXTarget is passed as the predicate argument.
2992  defm V#NAME#SD  : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, f64,
2993                         f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>,
2994                         XD, VEX_4V, VEX_LIG, VEX_WIG;
2995}
2996
2997// Square root.
// Opcode 0x51, selected from any_fsqrt. All four flavours are instantiated:
// scalar and packed single precision (SchedWriteFSqrt) and scalar and packed
// double precision (SchedWriteFSqrt64). The whole defm is tagged SIMD_EXC.
2998defm SQRT  : sse1_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, UseAVX>,
2999             sse1_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>,
3000             sse2_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64, UseAVX>,
3001             sse2_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64>, SIMD_EXC;
3002
3003// Reciprocal approximations. Note that these typically require refinement
3004// in order to obtain suitable precision.
// RSQRT uses opcode 0x52 and RCP uses opcode 0x53. Only the single-precision
// multiclasses are instantiated, and each additionally gets the
// intrinsic-based scalar variant. Unlike SQRT, the AVX predicates here are
// HasAVX and [HasAVX], and SIMD_EXC is not applied.
3005defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
3006             sse1_fp_unop_s_intr<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
3007             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>;
3008defm RCP   : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
3009             sse1_fp_unop_s_intr<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
3010             sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>;
3011
3012// There is no f64 version of the reciprocal approximation instructions.
3013
/// scalar_unary_math_patterns - Selection patterns for a scalar unary math
/// node applied to element 0 of a vector and merged back with Move.
///   OpNode        - the unary node applied to the extracted element.
///   OpcPrefix     - instruction name stem; "r_Int" is appended to find the
///                   non-VEX instruction and "V" is prepended for the VEX one.
///   Move          - the node that merges the scalar result into $dst.
///   VT            - the vector type of $dst and $src.
///   BasePredicate - predicate guarding the non-VEX pattern.
3014multiclass scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix, SDNode Move,
3015                                      ValueType VT, Predicate BasePredicate> {
3016  let Predicates = [BasePredicate] in {
3017    def : Pat<(VT (Move VT:$dst, (scalar_to_vector
3018                                  (OpNode (extractelt VT:$src, 0))))),
3019              (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3020  }
3021
3022  // Repeat for AVX versions of the instructions.
      // The source pattern is identical; only the predicate (UseAVX) and the
      // "V"-prefixed target instruction differ.
3023  let Predicates = [UseAVX] in {
3024    def : Pat<(VT (Move VT:$dst, (scalar_to_vector
3025                                  (OpNode (extractelt VT:$src, 0))))),
3026              (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3027  }
3028}
3029
// Scalar square-root merge patterns: the f32 form goes through X86Movss on
// v4f32 (UseSSE1), the f64 form through X86Movsd on v2f64 (UseSSE2).
3030defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>;
3031defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>;
3032
/// scalar_unary_math_intr_patterns - Selection patterns for a unary math
/// intrinsic that is applied to a whole vector and merged into $dst with Move.
///   Intr          - the intrinsic applied to $src.
///   OpcPrefix     - instruction name stem; "r_Int" is appended to find the
///                   non-VEX instruction and "V" is prepended for the VEX one.
///   Move          - the node that merges the result into $dst.
///   VT            - the vector type of $dst and $src.
///   BasePredicate - predicate guarding the non-VEX pattern.
3033multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix,
3034                                           SDNode Move, ValueType VT,
3035                                           Predicate BasePredicate> {
3036  let Predicates = [BasePredicate] in {
3037    def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
3038              (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3039  }
3040
3041  // Repeat for AVX versions of the instructions.
      // Same source pattern, guarded by HasAVX and mapped to the "V"-prefixed
      // instruction.
3042  let Predicates = [HasAVX] in {
3043    def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
3044              (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3045  }
3046}
3047
// Intrinsic merge patterns for the single-precision reciprocal and reciprocal
// square-root approximations, both merged with X86Movss on v4f32.
3048defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss,
3049                                       v4f32, UseSSE1>;
3050defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss,
3051                                       v4f32, UseSSE1>;
3052
3053
3054//===----------------------------------------------------------------------===//
3055// SSE 1 & 2 - Non-temporal stores
3056//===----------------------------------------------------------------------===//
3057
// Non-temporal store instructions and their selection patterns. Everything in
// this scope carries AddedComplexity = 400 so that, as the inline note says,
// the non-temporal versions are preferred.
3058let AddedComplexity = 400 in { // Prefer non-temporal versions
// VEX-encoded forms, guarded by [HasAVX, NoVLX].
3059let Predicates = [HasAVX, NoVLX] in {
// 128-bit floating-point stores, opcode 0x2B: v4f32 and v2f64.
3060let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
3061def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
3062                     (ins f128mem:$dst, VR128:$src),
3063                     "movntps\t{$src, $dst|$dst, $src}",
3064                     [(alignednontemporalstore (v4f32 VR128:$src),
3065                                               addr:$dst)]>, VEX, VEX_WIG;
3066def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
3067                     (ins f128mem:$dst, VR128:$src),
3068                     "movntpd\t{$src, $dst|$dst, $src}",
3069                     [(alignednontemporalstore (v2f64 VR128:$src),
3070                                               addr:$dst)]>, VEX, VEX_WIG;
3071} // SchedRW
3072
// 256-bit floating-point stores (VEX_L), opcode 0x2B: v8f32 and v4f64.
3073let SchedRW = [SchedWriteFMoveLSNT.YMM.MR] in {
3074def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
3075                     (ins f256mem:$dst, VR256:$src),
3076                     "movntps\t{$src, $dst|$dst, $src}",
3077                     [(alignednontemporalstore (v8f32 VR256:$src),
3078                                               addr:$dst)]>, VEX, VEX_L, VEX_WIG;
3079def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
3080                     (ins f256mem:$dst, VR256:$src),
3081                     "movntpd\t{$src, $dst|$dst, $src}",
3082                     [(alignednontemporalstore (v4f64 VR256:$src),
3083                                               addr:$dst)]>, VEX, VEX_L, VEX_WIG;
3084} // SchedRW
3085
// Integer-domain stores, opcode 0xE7. The instruction patterns cover only the
// 64-bit element types (v2i64 / v4i64); the other integer element types are
// mapped onto these instructions by the Pat records further down.
3086let ExeDomain = SSEPackedInt in {
3087def VMOVNTDQmr    : VPDI<0xE7, MRMDestMem, (outs),
3088                         (ins i128mem:$dst, VR128:$src),
3089                         "movntdq\t{$src, $dst|$dst, $src}",
3090                         [(alignednontemporalstore (v2i64 VR128:$src),
3091                                                   addr:$dst)]>, VEX, VEX_WIG,
3092                         Sched<[SchedWriteVecMoveLSNT.XMM.MR]>;
3093def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
3094                    (ins i256mem:$dst, VR256:$src),
3095                    "movntdq\t{$src, $dst|$dst, $src}",
3096                    [(alignednontemporalstore (v4i64 VR256:$src),
3097                                              addr:$dst)]>, VEX, VEX_L, VEX_WIG,
3098                    Sched<[SchedWriteVecMoveLSNT.YMM.MR]>;
3099} // ExeDomain
3100} // Predicates
3101
// Non-VEX 128-bit floating-point stores, opcode 0x2B.
3102let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
3103def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3104                    "movntps\t{$src, $dst|$dst, $src}",
3105                    [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
3106def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3107                    "movntpd\t{$src, $dst|$dst, $src}",
3108                    [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>;
3109} // SchedRW
3110
// Non-VEX 128-bit integer store, opcode 0xE7.
// NOTE(review): the memory operand here is f128mem while the VEX form above
// uses i128mem - confirm whether that difference is intentional.
3111let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLSNT.XMM.MR] in
3112def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3113                    "movntdq\t{$src, $dst|$dst, $src}",
3114                    [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)]>;
3115
// General-purpose-register stores, opcode 0xC3, for GR32 and GR64. These
// match plain nontemporalstore rather than alignednontemporalstore and
// require HasSSE2.
3116let SchedRW = [WriteStoreNT] in {
3117// There is no AVX form for instructions below this point
3118def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
3119                 "movnti{l}\t{$src, $dst|$dst, $src}",
3120                 [(nontemporalstore (i32 GR32:$src), addr:$dst)]>,
3121               PS, Requires<[HasSSE2]>;
3122def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
3123                     "movnti{q}\t{$src, $dst|$dst, $src}",
3124                     [(nontemporalstore (i64 GR64:$src), addr:$dst)]>,
3125                  PS, Requires<[HasSSE2]>;
3126} // SchedRW = [WriteStoreNT]
3127
// Map the remaining integer vector types (32-, 16- and 8-bit elements) onto
// the VEX movntdq instructions, 256-bit first and then 128-bit.
3128let Predicates = [HasAVX, NoVLX] in {
3129  def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst),
3130            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3131  def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
3132            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3133  def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
3134            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3135
3136  def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
3137            (VMOVNTDQmr addr:$dst, VR128:$src)>;
3138  def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
3139            (VMOVNTDQmr addr:$dst, VR128:$src)>;
3140  def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
3141            (VMOVNTDQmr addr:$dst, VR128:$src)>;
3142}
3143
// Same mapping for the non-VEX movntdq, guarded by UseSSE2.
3144let Predicates = [UseSSE2] in {
3145  def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
3146            (MOVNTDQmr addr:$dst, VR128:$src)>;
3147  def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
3148            (MOVNTDQmr addr:$dst, VR128:$src)>;
3149  def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
3150            (MOVNTDQmr addr:$dst, VR128:$src)>;
3151}
3152
3153} // AddedComplexity
3154
3155//===----------------------------------------------------------------------===//
3156// SSE 1 & 2 - Prefetch and memory fence
3157//===----------------------------------------------------------------------===//
3158
3159// Prefetch intrinsic.
// All four prefetch variants share opcode 0x18 and differ only in the ModRM
// form (MRM0m..MRM3m). The third operand of the prefetch pattern selects the
// variant: 3 -> prefetcht0, 2 -> prefetcht1, 1 -> prefetcht2,
// 0 -> prefetchnta. The second operand is left as an unconstrained imm and
// the fourth is fixed at (i32 1).
// NOTE(review): the third operand is presumably the locality hint and the
// fourth the data-cache selector - confirm against the prefetch node.
3160let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in {
3161def PREFETCHT0   : I<0x18, MRM1m, (outs), (ins i8mem:$src),
3162    "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>, TB;
3163def PREFETCHT1   : I<0x18, MRM2m, (outs), (ins i8mem:$src),
3164    "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>, TB;
3165def PREFETCHT2   : I<0x18, MRM3m, (outs), (ins i8mem:$src),
3166    "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>, TB;
3167def PREFETCHNTA  : I<0x18, MRM0m, (outs), (ins i8mem:$src),
3168    "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>, TB;
3169}
3170
3171// FIXME: How should flush instruction be modeled?
// For now it is scheduled as a load (WriteLoad).
3172let SchedRW = [WriteLoad] in {
3173// Flush cache
// Opcode 0xAE with ModRM form MRM7m, selected from the int_x86_sse2_clflush
// intrinsic; requires HasSSE2.
3174def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
3175               "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
3176               PS, Requires<[HasSSE2]>;
3177}
3178
// Scheduled as a nop (WriteNop).
3179let SchedRW = [WriteNop] in {
3180// Pause. This "instruction" is encoded as "rep; nop", so even though it
3181// was introduced with SSE2, it's backward compatible.
// Opcode 0x90, RawFrm, no operands, selected from int_x86_sse2_pause. No
// Requires<> predicate is attached to this definition.
3182def PAUSE : I<0x90, RawFrm, (outs), (ins),
3183              "pause", [(int_x86_sse2_pause)]>, OBXS;
3184}
3185
3186let SchedRW = [WriteFence] in {
3187// Load, store, and memory fence
3188// TODO: As with mfence, we may want to ease the availability of sfence/lfence
3189// to include any 64-bit target.
// All three share opcode 0xAE and are distinguished by the fixed ModRM byte
// form: MRM_F8 (sfence), MRM_E8 (lfence), MRM_F0 (mfence). Each is selected
// from its intrinsic; the predicates are HasSSE1, HasSSE2 and HasMFence
// respectively.
3190def SFENCE : I<0xAE, MRM_F8, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
3191               PS, Requires<[HasSSE1]>;
3192def LFENCE : I<0xAE, MRM_E8, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>,
3193               PS, Requires<[HasSSE2]>;
3194def MFENCE : I<0xAE, MRM_F0, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>,
3195               PS, Requires<[HasMFence]>;
3196} // SchedRW
3197
// Select the X86MFence node to the MFENCE instruction.
3198def : Pat<(X86MFence), (MFENCE)>;
3199
3200//===----------------------------------------------------------------------===//
3201// SSE 1 & 2 - Load/Store MXCSR register
3202//===----------------------------------------------------------------------===//
3203
// VEX-encoded ldmxcsr / stmxcsr. Both use opcode 0xAE with a 32-bit memory
// operand: MRM2m for the load form and MRM3m for the store form. They are
// selected from the int_x86_sse_ldmxcsr / int_x86_sse_stmxcsr intrinsics and
// are marked hasSideEffects.
3204let mayLoad=1, hasSideEffects=1 in
3205def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
3206               "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
3207               VEX, Sched<[WriteLDMXCSR]>, VEX_WIG;
3208let mayStore=1, hasSideEffects=1 in
3209def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
3210               "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
3211               VEX, Sched<[WriteSTMXCSR]>, VEX_WIG;
3212
// Non-VEX ldmxcsr / stmxcsr: same opcode (0xAE), ModRM forms (MRM2m / MRM3m)
// and intrinsics as the VEX definitions, built on the plain I class with the
// TB mixin.
3213let mayLoad=1, hasSideEffects=1 in
3214def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
3215              "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
3216              TB, Sched<[WriteLDMXCSR]>;
3217let mayStore=1, hasSideEffects=1 in
3218def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
3219              "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
3220              TB, Sched<[WriteSTMXCSR]>;
3221
3222//===---------------------------------------------------------------------===//
3223// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
3224//===---------------------------------------------------------------------===//
3225
// Aligned (movdqa) and unaligned (movdqu) packed-integer moves. Opcode 0x6F is
// used for the register-destination forms (MRMSrcReg / MRMSrcMem) and 0x7F for
// the store and reversed register forms (MRMDestMem / MRMDestReg).
3226let ExeDomain = SSEPackedInt in { // SSE integer instructions
3227
// VEX register-to-register moves, 128- and 256-bit. No selection patterns.
3228let hasSideEffects = 0 in {
3229def VMOVDQArr  : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3230                      "movdqa\t{$src, $dst|$dst, $src}", []>,
3231                      Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
3232def VMOVDQUrr  : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3233                      "movdqu\t{$src, $dst|$dst, $src}", []>,
3234                      Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
3235def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3236                      "movdqa\t{$src, $dst|$dst, $src}", []>,
3237                      Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
3238def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3239                      "movdqu\t{$src, $dst|$dst, $src}", []>,
3240                      Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
3241}
3242
3243// For Disassembler
// Reversed encodings of the moves above: opcode 0x7F with MRMDestReg. Each is
// isCodeGenOnly with ForceDisassemble set, and names its forward counterpart
// through FoldGenData.
3244let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
3245def VMOVDQArr_REV  : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3246                          "movdqa\t{$src, $dst|$dst, $src}", []>,
3247                          Sched<[SchedWriteVecMoveLS.XMM.RR]>,
3248                          VEX, VEX_WIG, FoldGenData<"VMOVDQArr">;
3249def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
3250                          "movdqa\t{$src, $dst|$dst, $src}", []>,
3251                          Sched<[SchedWriteVecMoveLS.YMM.RR]>,
3252                          VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQAYrr">;
3253def VMOVDQUrr_REV  : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3254                          "movdqu\t{$src, $dst|$dst, $src}", []>,
3255                          Sched<[SchedWriteVecMoveLS.XMM.RR]>,
3256                          VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">;
3257def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
3258                          "movdqu\t{$src, $dst|$dst, $src}", []>,
3259                          Sched<[SchedWriteVecMoveLS.YMM.RR]>,
3260                          VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQUYrr">;
3261}
3262
// VEX loads. Only the 128-bit forms carry a pattern (aligned / unaligned
// v2i64 load); the 256-bit forms have empty pattern lists. The unaligned
// forms are built on the plain I class, so their mnemonic string spells out
// the "v" prefix and XS is applied explicitly.
3263let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
3264    hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
3265def VMOVDQArm  : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3266                      "movdqa\t{$src, $dst|$dst, $src}",
3267                      [(set VR128:$dst, (alignedloadv2i64 addr:$src))]>,
3268                      Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
3269def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
3270                      "movdqa\t{$src, $dst|$dst, $src}", []>,
3271                      Sched<[SchedWriteVecMoveLS.YMM.RM]>,
3272                      VEX, VEX_L, VEX_WIG;
3273def VMOVDQUrm  : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3274                   "vmovdqu\t{$src, $dst|$dst, $src}",
3275                   [(set VR128:$dst, (loadv2i64 addr:$src))]>,
3276                   Sched<[SchedWriteVecMoveLS.XMM.RM]>,
3277                   XS, VEX, VEX_WIG;
3278def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
3279                   "vmovdqu\t{$src, $dst|$dst, $src}", []>,
3280                   Sched<[SchedWriteVecMoveLS.YMM.RM]>,
3281                   XS, VEX, VEX_L, VEX_WIG;
3282}
3283
// VEX stores, laid out like the loads: patterns on the 128-bit forms only
// (aligned / unaligned v2i64 store), empty pattern lists on the 256-bit forms.
3284let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
3285def VMOVDQAmr  : VPDI<0x7F, MRMDestMem, (outs),
3286                      (ins i128mem:$dst, VR128:$src),
3287                      "movdqa\t{$src, $dst|$dst, $src}",
3288                      [(alignedstore (v2i64 VR128:$src), addr:$dst)]>,
3289                      Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_WIG;
3290def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
3291                      (ins i256mem:$dst, VR256:$src),
3292                      "movdqa\t{$src, $dst|$dst, $src}", []>,
3293                     Sched<[SchedWriteVecMoveLS.YMM.MR]>, VEX, VEX_L, VEX_WIG;
3294def VMOVDQUmr  : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3295                   "vmovdqu\t{$src, $dst|$dst, $src}",
3296                   [(store (v2i64 VR128:$src), addr:$dst)]>,
3297                   Sched<[SchedWriteVecMoveLS.XMM.MR]>, XS, VEX, VEX_WIG;
3298def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
3299                   "vmovdqu\t{$src, $dst|$dst, $src}",[]>,
3300                   Sched<[SchedWriteVecMoveLS.YMM.MR]>, XS, VEX, VEX_L, VEX_WIG;
3301}
3302
// Non-VEX register-to-register moves (128-bit only) and their reversed
// encodings. The movdqu forms use the I class with XS and Requires<[UseSSE2]>.
3303let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in {
3304let hasSideEffects = 0 in {
3305def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3306                   "movdqa\t{$src, $dst|$dst, $src}", []>;
3307
3308def MOVDQUrr :   I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3309                   "movdqu\t{$src, $dst|$dst, $src}", []>,
3310                   XS, Requires<[UseSSE2]>;
3311}
3312
3313// For Disassembler
3314let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
3315def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3316                       "movdqa\t{$src, $dst|$dst, $src}", []>,
3317                       FoldGenData<"MOVDQArr">;
3318
3319def MOVDQUrr_REV :   I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3320                       "movdqu\t{$src, $dst|$dst, $src}", []>,
3321                       XS, Requires<[UseSSE2]>, FoldGenData<"MOVDQUrr">;
3322}
3323} // SchedRW
3324
// Non-VEX loads. The selection patterns are commented out, so the pattern
// lists are effectively empty.
3325let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
3326    hasSideEffects = 0, SchedRW = [SchedWriteVecMoveLS.XMM.RM] in {
3327def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3328                   "movdqa\t{$src, $dst|$dst, $src}",
3329                   [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>;
3330def MOVDQUrm :   I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3331                   "movdqu\t{$src, $dst|$dst, $src}",
3332                   [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>,
3333                 XS, Requires<[UseSSE2]>;
3334}
3335
// Non-VEX stores. As with the loads, the selection patterns are commented out.
3336let mayStore = 1, hasSideEffects = 0,
3337    SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
3338def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3339                   "movdqa\t{$src, $dst|$dst, $src}",
3340                   [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>;
3341def MOVDQUmr :   I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3342                   "movdqu\t{$src, $dst|$dst, $src}",
3343                   [/*(store (v2i64 VR128:$src), addr:$dst)*/]>,
3344                 XS, Requires<[UseSSE2]>;
3345}
3346
3347} // ExeDomain = SSEPackedInt
3348
3349// Reversed version with ".s" suffix for GAS compatibility.
// Each alias maps the ".s" mnemonic onto the corresponding VEX _REV
// instruction, for both the 128-bit and 256-bit register classes.
// NOTE(review): the trailing 0 presumably keeps the alias from being used
// when printing - confirm against the InstAlias class.
3350def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
3351                (VMOVDQArr_REV VR128:$dst, VR128:$src), 0>;
3352def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
3353                (VMOVDQAYrr_REV VR256:$dst, VR256:$src), 0>;
3354def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
3355                (VMOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
3356def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
3357                (VMOVDQUYrr_REV VR256:$dst, VR256:$src), 0>;
3358
3359// Reversed version with ".s" suffix for GAS compatibility.
// Non-VEX counterparts: the ".s" mnemonics map onto the 128-bit _REV
// instructions.
3360def : InstAlias<"movdqa.s\t{$src, $dst|$dst, $src}",
3361                (MOVDQArr_REV VR128:$dst, VR128:$src), 0>;
3362def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}",
3363                (MOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
3364
// The VEX movdqa/movdqu instruction patterns only cover v2i64. These Pat
// records extend selection to the 32-, 16- and 8-bit element vector types.
3365let Predicates = [HasAVX, NoVLX] in {
3366  // Additional patterns for other integer sizes.
      // Aligned loads select VMOVDQArm; plain loads select VMOVDQUrm.
3367  def : Pat<(alignedloadv4i32 addr:$src),
3368            (VMOVDQArm addr:$src)>;
3369  def : Pat<(alignedloadv8i16 addr:$src),
3370            (VMOVDQArm addr:$src)>;
3371  def : Pat<(alignedloadv16i8 addr:$src),
3372            (VMOVDQArm addr:$src)>;
3373  def : Pat<(loadv4i32 addr:$src),
3374            (VMOVDQUrm addr:$src)>;
3375  def : Pat<(loadv8i16 addr:$src),
3376            (VMOVDQUrm addr:$src)>;
3377  def : Pat<(loadv16i8 addr:$src),
3378            (VMOVDQUrm addr:$src)>;
3379
      // Aligned stores select VMOVDQAmr; plain stores select VMOVDQUmr.
3380  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
3381            (VMOVDQAmr addr:$dst, VR128:$src)>;
3382  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
3383            (VMOVDQAmr addr:$dst, VR128:$src)>;
3384  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
3385            (VMOVDQAmr addr:$dst, VR128:$src)>;
3386  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
3387            (VMOVDQUmr addr:$dst, VR128:$src)>;
3388  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
3389            (VMOVDQUmr addr:$dst, VR128:$src)>;
3390  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
3391            (VMOVDQUmr addr:$dst, VR128:$src)>;
3392}
3393
3394//===---------------------------------------------------------------------===//
3395// SSE2 - Packed Integer Arithmetic Instructions
3396//===---------------------------------------------------------------------===//
3397
3398let ExeDomain = SSEPackedInt in { // SSE integer instructions
3399
3400/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
///   opc        - opcode byte shared by both forms.
///   OpcodeStr  - mnemonic used as the stem of the assembly string.
///   OpNode     - node matched by the selection patterns.
///   DstVT      - result vector type.
///   SrcVT      - type applied to the first source operand.
///   RC         - register class of the destination and register sources.
///   memop_frag - load fragment used for the memory form of $src2.
///   x86memop   - memory operand class for $src2 in the memory form.
///   sched      - scheduling info; the memory form uses sched.Folded and
///                sched.ReadAfterFold.
///   Is2Addr    - selects the two-operand assembly string when set, the
///                three-operand string otherwise.
3401multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
3402                         ValueType DstVT, ValueType SrcVT, RegisterClass RC,
3403                         PatFrag memop_frag, X86MemOperand x86memop,
3404                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
      // Register-register form; this is the only one marked isCommutable.
3405  let isCommutable = 1 in
3406  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
3407       (ins RC:$src1, RC:$src2),
3408       !if(Is2Addr,
3409           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3410           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3411       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
3412       Sched<[sched]>;
      // Register-memory form: $src2 is loaded through memop_frag.
3413  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
3414       (ins RC:$src1, x86memop:$src2),
3415       !if(Is2Addr,
3416           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3417           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3418       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
3419                                     (memop_frag addr:$src2))))]>,
3420       Sched<[sched.Folded, sched.ReadAfterFold]>;
3421}
3422} // ExeDomain = SSEPackedInt
3423
// Packed integer arithmetic instantiated through PDI_binop_all. The arguments
// are, in order: opcode byte, mnemonic, selection node, the 128-bit and
// 256-bit vector types, the scheduling class, a 0/1 flag and a predicate.
// NOTE(review): the 0/1 flag is presumably the commutable bit (1 for the
// add/mul/min/max/avg families, 0 for every subtract) - confirm against the
// PDI_binop_all definition.
// Byte and word element operations use NoVLX_Or_NoBWI; dword and qword
// element operations use NoVLX.
3424defm PADDB   : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
3425                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3426defm PADDW   : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
3427                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3428defm PADDD   : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
3429                             SchedWriteVecALU, 1, NoVLX>;
3430defm PADDQ   : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
3431                             SchedWriteVecALU, 1, NoVLX>;
3432defm PADDSB  : PDI_binop_all<0xEC, "paddsb", saddsat, v16i8, v32i8,
3433                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3434defm PADDSW  : PDI_binop_all<0xED, "paddsw", saddsat, v8i16, v16i16,
3435                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3436defm PADDUSB : PDI_binop_all<0xDC, "paddusb", uaddsat, v16i8, v32i8,
3437                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3438defm PADDUSW : PDI_binop_all<0xDD, "paddusw", uaddsat, v8i16, v16i16,
3439                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3440defm PMULLW  : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
3441                             SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
3442defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
3443                             SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
3444defm PMULHW  : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
3445                             SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
3446defm PSUBB   : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
3447                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3448defm PSUBW   : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
3449                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3450defm PSUBD   : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
3451                             SchedWriteVecALU, 0, NoVLX>;
3452defm PSUBQ   : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
3453                             SchedWriteVecALU, 0, NoVLX>;
3454defm PSUBSB  : PDI_binop_all<0xE8, "psubsb", ssubsat, v16i8, v32i8,
3455                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3456defm PSUBSW  : PDI_binop_all<0xE9, "psubsw", ssubsat, v8i16, v16i16,
3457                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3458defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", usubsat, v16i8, v32i8,
3459                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3460defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", usubsat, v8i16, v16i16,
3461                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3462defm PMINUB  : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8,
3463                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3464defm PMINSW  : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16,
3465                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3466defm PMAXUB  : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8,
3467                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3468defm PMAXSW  : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
3469                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3470defm PAVGB   : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8,
3471                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3472defm PAVGW   : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
3473                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3474defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64,
3475                             SchedWriteVecIMul, 1, NoVLX>;
3476
// pmaddwd, opcode 0xF5: the result type has half as many elements as the
// source type (v8i16 -> v4i32, v16i16 -> v8i32). The VEX forms pass
// Is2Addr = 0 and fold memory through load; the non-VEX form ties $src1 to
// $dst and folds through memop.
3477let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
3478defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
3479                              load, i128mem, SchedWriteVecIMul.XMM, 0>,
3480                              VEX_4V, VEX_WIG;
3481
3482let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
3483defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
3484                               VR256, load, i256mem, SchedWriteVecIMul.YMM,
3485                               0>, VEX_4V, VEX_L, VEX_WIG;
3486let Constraints = "$src1 = $dst" in
3487defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
3488                             memop, i128mem, SchedWriteVecIMul.XMM>;
3489
// psadbw, opcode 0xF6: byte-vector sources with a 64-bit-element result
// (v16i8 -> v2i64, v32i8 -> v4i64). Same layout as the pmaddwd definitions:
// two VEX forms with Is2Addr = 0 and load, then a tied non-VEX form with
// memop.
3490let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
3491defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
3492                             load, i128mem, SchedWritePSADBW.XMM, 0>,
3493                             VEX_4V, VEX_WIG;
3494let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
3495defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
3496                             load, i256mem, SchedWritePSADBW.YMM, 0>,
3497                             VEX_4V, VEX_L, VEX_WIG;
3498let Constraints = "$src1 = $dst" in
3499defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
3500                            memop, i128mem, SchedWritePSADBW.XMM>;
3501
3502//===---------------------------------------------------------------------===//
3503// SSE2 - Packed Integer Logical Instructions
3504//===---------------------------------------------------------------------===//
3505
/// PDI_binop_rmi - Binary operator with three forms: register (rr), memory
/// (rm) and 8-bit immediate (ri) second operand.
///   opc       - opcode byte for the rr and rm forms.
///   opc2      - opcode byte for the ri form.
///   ImmForm   - instruction format used by the ri form.
///   OpcodeStr - mnemonic used as the stem of the assembly string.
///   OpNode    - node matched by the rr and rm patterns.
///   OpNode2   - node matched by the ri pattern.
///   RC        - register class of $dst and $src1.
///   sched     - scheduling info for rr; rm uses sched.Folded and
///               sched.ReadAfterFold.
///   schedImm  - scheduling info for the ri form.
///   DstVT     - result vector type.
///   SrcVT     - type of the second operand in the rr and rm patterns.
///   ld_frag   - load fragment used by the rm form.
///   Is2Addr   - selects the two-operand assembly string when set, the
///               three-operand string otherwise.
3506multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
3507                         string OpcodeStr, SDNode OpNode,
3508                         SDNode OpNode2, RegisterClass RC,
3509                         X86FoldableSchedWrite sched,
3510                         X86FoldableSchedWrite schedImm,
3511                         ValueType DstVT, ValueType SrcVT,
3512                         PatFrag ld_frag, bit Is2Addr = 1> {
3513  // src2 is always 128-bit
      // ($src2 is VR128 in rr and i128mem in rm, regardless of RC.)
3514  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
3515       (ins RC:$src1, VR128:$src2),
3516       !if(Is2Addr,
3517           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3518           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3519       [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))]>,
3520       Sched<[sched]>;
3521  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
3522       (ins RC:$src1, i128mem:$src2),
3523       !if(Is2Addr,
3524           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3525           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3526       [(set RC:$dst, (DstVT (OpNode RC:$src1,
3527                       (SrcVT (ld_frag addr:$src2)))))]>,
3528       Sched<[sched.Folded, sched.ReadAfterFold]>;
      // Immediate form: uses opc2/ImmForm and matches OpNode2 with an i8 timm.
3529  def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
3530       (ins RC:$src1, u8imm:$src2),
3531       !if(Is2Addr,
3532           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3533           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3534       [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 timm:$src2))))]>,
3535       Sched<[schedImm]>;
3536}
3537
/// PDI_binop_rmi_all - Instantiates PDI_binop_rmi three times: a 128-bit VEX
/// form (V#NAME, [HasAVX, prd]), a 256-bit VEX form (V#NAME#Y, [HasAVX2, prd])
/// and a non-VEX 128-bit form (NAME) with $src1 tied to $dst. The VEX forms
/// use "v" + OpcodeStr, the load fragment and Is2Addr = 0; the non-VEX form
/// uses memop and the default Is2Addr. SrcVT is the same for all three.
3538multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
3539                             string OpcodeStr, SDNode OpNode,
3540                             SDNode OpNode2, ValueType DstVT128,
3541                             ValueType DstVT256, ValueType SrcVT,
3542                             X86SchedWriteWidths sched,
3543                             X86SchedWriteWidths schedImm, Predicate prd> {
3544let Predicates = [HasAVX, prd] in
3545  defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
3546                              OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM,
3547                              DstVT128, SrcVT, load, 0>, VEX_4V, VEX_WIG;
3548let Predicates = [HasAVX2, prd] in
3549  defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
3550                                OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM,
3551                                DstVT256, SrcVT, load, 0>, VEX_4V, VEX_L,
3552                                VEX_WIG;
3553let Constraints = "$src1 = $dst" in
3554  defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
3555                            VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT,
3556                            memop>;
3557}
3558
/// PDI_binop_ri - Binary operator that only has an 8-bit immediate form.
///   opc       - opcode byte.
///   ImmForm   - instruction format.
///   OpcodeStr - mnemonic used as the stem of the assembly string.
///   OpNode    - node matched with RC:$src1 and an i8 timm.
///   RC        - register class of $dst and $src1.
///   VT        - result vector type.
///   sched     - scheduling info.
///   Is2Addr   - selects the two-operand assembly string when set, the
///               three-operand string otherwise.
3559multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
3560                        SDNode OpNode, RegisterClass RC, ValueType VT,
3561                        X86FoldableSchedWrite sched, bit Is2Addr = 1> {
3562  def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
3563       !if(Is2Addr,
3564           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3565           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3566       [(set RC:$dst, (VT (OpNode RC:$src1, (i8 timm:$src2))))]>,
3567       Sched<[sched]>;
3568}
3569
/// PDI_binop_ri_all - Instantiates PDI_binop_ri three times with fixed byte
/// vector types: a 128-bit VEX form (v16i8, [HasAVX, NoVLX_Or_NoBWI]), a
/// 256-bit VEX form (v32i8, [HasAVX2, NoVLX_Or_NoBWI]) and a non-VEX 128-bit
/// form (v16i8) with $src1 tied to $dst. The VEX forms use "v" + OpcodeStr
/// and Is2Addr = 0.
3570multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr,
3571                            SDNode OpNode, X86SchedWriteWidths sched> {
3572let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
3573  defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
3574                             VR128, v16i8, sched.XMM, 0>, VEX_4V, VEX_WIG;
3575let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
3576  defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
3577                               VR256, v32i8, sched.YMM, 0>,
3578                               VEX_4V, VEX_L, VEX_WIG;
3579let Constraints = "$src1 = $dst" in
3580  defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8,
3581                           sched.XMM>;
3582}
3583
// Packed integer shift instantiations. Each PDI_binop_rmi_all line yields the
// tied-operand record plus the V-prefixed 128-bit and 256-bit records. The
// whole-register byte shifts (PSLLDQ / PSRLDQ) go through PDI_binop_ri_all and
// therefore only have a register/immediate record.
let ExeDomain = SSEPackedInt in {
  // Left shifts (X86vshl / X86vshli).
  defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw",
                                 X86vshl, X86vshli,
                                 v8i16, v16i16, v8i16,
                                 SchedWriteVecShift, SchedWriteVecShiftImm,
                                 NoVLX_Or_NoBWI>;
  defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld",
                                 X86vshl, X86vshli,
                                 v4i32, v8i32, v4i32,
                                 SchedWriteVecShift, SchedWriteVecShiftImm,
                                 NoVLX>;
  defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq",
                                 X86vshl, X86vshli,
                                 v2i64, v4i64, v2i64,
                                 SchedWriteVecShift, SchedWriteVecShiftImm,
                                 NoVLX>;

  // Right shifts through X86vsrl / X86vsrli.
  defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw",
                                 X86vsrl, X86vsrli,
                                 v8i16, v16i16, v8i16,
                                 SchedWriteVecShift, SchedWriteVecShiftImm,
                                 NoVLX_Or_NoBWI>;
  defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld",
                                 X86vsrl, X86vsrli,
                                 v4i32, v8i32, v4i32,
                                 SchedWriteVecShift, SchedWriteVecShiftImm,
                                 NoVLX>;
  defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq",
                                 X86vsrl, X86vsrli,
                                 v2i64, v4i64, v2i64,
                                 SchedWriteVecShift, SchedWriteVecShiftImm,
                                 NoVLX>;

  // Right shifts through X86vsra / X86vsrai (word and doubleword only).
  defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw",
                                 X86vsra, X86vsrai,
                                 v8i16, v16i16, v8i16,
                                 SchedWriteVecShift, SchedWriteVecShiftImm,
                                 NoVLX_Or_NoBWI>;
  defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad",
                                 X86vsra, X86vsrai,
                                 v4i32, v8i32, v4i32,
                                 SchedWriteVecShift, SchedWriteVecShiftImm,
                                 NoVLX>;

  // Byte shifts of the full register; both share opcode 0x73 and differ in
  // the MRM form (MRM7r vs. MRM3r).
  defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq,
                                 SchedWriteShuffle>;
  defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq,
                                 SchedWriteShuffle>;
} // ExeDomain = SSEPackedInt
3617
3618//===---------------------------------------------------------------------===//
3619// SSE2 - Packed Integer Comparison Instructions
3620//===---------------------------------------------------------------------===//
3621
// Equality compares for byte / word / doubleword elements. The `1` argument
// is passed where the greater-than compares below pass `0`; presumably this is
// PDI_binop_all's commutable flag -- confirm against its definition.
defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq,
                             v16i8, v32i8,
                             SchedWriteVecALU, 1, TruePredicate>;
defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq,
                             v8i16, v16i16,
                             SchedWriteVecALU, 1, TruePredicate>;
defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq,
                             v4i32, v8i32,
                             SchedWriteVecALU, 1, TruePredicate>;

// Greater-than compares for the same three element widths.
defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt,
                             v16i8, v32i8,
                             SchedWriteVecALU, 0, TruePredicate>;
defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt,
                             v8i16, v16i16,
                             SchedWriteVecALU, 0, TruePredicate>;
defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt,
                             v4i32, v8i32,
                             SchedWriteVecALU, 0, TruePredicate>;
3634
3635//===---------------------------------------------------------------------===//
3636// SSE2 - Packed Integer Shuffle Instructions
3637//===---------------------------------------------------------------------===//
3638
let ExeDomain = SSEPackedInt in {
/// sse2_pshuffle - Shuffle controlled by an 8-bit immediate, opcode 0x70.
/// Produces six records per instantiation: register (`ri`) and memory (`mi`)
/// sources for the VEX 128-bit form, the VEX 256-bit form and the UseSSE2
/// form. All of them use the three-operand assembly string; the VEX records
/// fold memory with `load`, the UseSSE2 record with `memop`.
multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
                         SDNode OpNode, X86SchedWriteWidths sched,
                         Predicate prd> {
  // VEX, 128-bit.
  let Predicates = [HasAVX, prd] in {
    def V#NAME#ri : Ii8<0x70, MRMSrcReg,
                        (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
                        "v" # OpcodeStr #
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                              (vt128 (OpNode VR128:$src1,
                                             (i8 timm:$src2))))]>,
                    VEX, Sched<[sched.XMM]>, VEX_WIG;
    def V#NAME#mi : Ii8<0x70, MRMSrcMem,
                        (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
                        "v" # OpcodeStr #
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                              (vt128 (OpNode (load addr:$src1),
                                             (i8 timm:$src2))))]>,
                    VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
  }

  // VEX, 256-bit.
  let Predicates = [HasAVX2, prd] in {
    def V#NAME#Yri : Ii8<0x70, MRMSrcReg,
                         (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2),
                         "v" # OpcodeStr #
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                         [(set VR256:$dst,
                               (vt256 (OpNode VR256:$src1,
                                              (i8 timm:$src2))))]>,
                     VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
    def V#NAME#Ymi : Ii8<0x70, MRMSrcMem,
                         (outs VR256:$dst), (ins i256mem:$src1, u8imm:$src2),
                         "v" # OpcodeStr #
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                         [(set VR256:$dst,
                               (vt256 (OpNode (load addr:$src1),
                                              (i8 timm:$src2))))]>,
                     VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
  }

  // Non-VEX form.
  let Predicates = [UseSSE2] in {
    def ri : Ii8<0x70, MRMSrcReg,
                 (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
                 OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                 [(set VR128:$dst,
                       (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
             Sched<[sched.XMM]>;
    def mi : Ii8<0x70, MRMSrcMem,
                 (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
                 OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                 [(set VR128:$dst,
                       (vt128 (OpNode (memop addr:$src1),
                                      (i8 timm:$src2))))]>,
             Sched<[sched.XMM.Folded]>;
  }
}
} // ExeDomain = SSEPackedInt
3698
// The three shuffles share opcode 0x70 inside sse2_pshuffle; they are told
// apart by the prefix class attached here (PD / XS / XD).
defm PSHUFD  : sse2_pshuffle<"pshufd", v4i32, v8i32,
                             X86PShufd, SchedWriteShuffle, NoVLX>,
               PD;
defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16,
                             X86PShufhw, SchedWriteShuffle, NoVLX_Or_NoBWI>,
               XS;
defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16,
                             X86PShuflw, SchedWriteShuffle, NoVLX_Or_NoBWI>,
               XD;
3705
3706//===---------------------------------------------------------------------===//
3707// Packed Integer Pack Instructions (SSE & AVX)
3708//===---------------------------------------------------------------------===//
3709
3710let ExeDomain = SSEPackedInt in {
/// sse2_pack - Pack instruction built on the PDI instruction class.
/// Defines `rr` (both sources in RC) and `rm` (second source loaded through
/// \p ld_frag from \p x86memop). Sources are typed \p ArgVT, the result
/// \p OutVT. \p Is2Addr picks the two- or three-operand assembly string.
multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                     ValueType ArgVT, SDNode OpNode, RegisterClass RC,
                     X86MemOperand x86memop, X86FoldableSchedWrite sched,
                     PatFrag ld_frag, bit Is2Addr = 1> {
  // Register-register.
  def rr : PDI<opc, MRMSrcReg,
               (outs RC:$dst), (ins RC:$src1, RC:$src2),
               !if(Is2Addr,
                   OpcodeStr # "\t{$src2, $dst|$dst, $src2}",
                   OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set RC:$dst,
                     (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
           Sched<[sched]>;

  // Register-memory.
  def rm : PDI<opc, MRMSrcMem,
               (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
               !if(Is2Addr,
                   OpcodeStr # "\t{$src2, $dst|$dst, $src2}",
                   OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set RC:$dst,
                     (OutVT (OpNode (ArgVT RC:$src1),
                                    (ld_frag addr:$src2))))]>,
           Sched<[sched.Folded, sched.ReadAfterFold]>;
}
3735
/// sse4_pack - Same shape as sse2_pack (an `rr` and an `rm` record with
/// identical operands and patterns) but built on the SS48I instruction class
/// instead of PDI.
multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                     ValueType ArgVT, SDNode OpNode, RegisterClass RC,
                     X86MemOperand x86memop, X86FoldableSchedWrite sched,
                     PatFrag ld_frag, bit Is2Addr = 1> {
  // Register-register.
  def rr : SS48I<opc, MRMSrcReg,
                 (outs RC:$dst), (ins RC:$src1, RC:$src2),
                 !if(Is2Addr,
                     OpcodeStr # "\t{$src2, $dst|$dst, $src2}",
                     OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set RC:$dst,
                       (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
           Sched<[sched]>;

  // Register-memory.
  def rm : SS48I<opc, MRMSrcMem,
                 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                 !if(Is2Addr,
                     OpcodeStr # "\t{$src2, $dst|$dst, $src2}",
                     OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set RC:$dst,
                       (OutVT (OpNode (ArgVT RC:$src1),
                                      (ld_frag addr:$src2))))]>,
           Sched<[sched.Folded, sched.ReadAfterFold]>;
}
3760
// 128-bit VEX pack instructions (three-operand syntax, `load` folding).
// VPACKUSDW is the only one built from sse4_pack, and the only one without
// VEX_WIG.
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss,
                             VR128, i128mem, SchedWriteShuffle.XMM, load, 0>,
                   VEX_4V, VEX_WIG;
  defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss,
                             VR128, i128mem, SchedWriteShuffle.XMM, load, 0>,
                   VEX_4V, VEX_WIG;

  defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus,
                             VR128, i128mem, SchedWriteShuffle.XMM, load, 0>,
                   VEX_4V, VEX_WIG;
  defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus,
                             VR128, i128mem, SchedWriteShuffle.XMM, load, 0>,
                   VEX_4V;
}
3776
// 256-bit VEX pack instructions: same opcodes and nodes as the 128-bit
// group, on VR256 / i256mem with VEX_L added.
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss,
                              VR256, i256mem, SchedWriteShuffle.YMM, load, 0>,
                    VEX_4V, VEX_L, VEX_WIG;
  defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss,
                              VR256, i256mem, SchedWriteShuffle.YMM, load, 0>,
                    VEX_4V, VEX_L, VEX_WIG;

  defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus,
                              VR256, i256mem, SchedWriteShuffle.YMM, load, 0>,
                    VEX_4V, VEX_L, VEX_WIG;
  defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus,
                              VR256, i256mem, SchedWriteShuffle.YMM, load, 0>,
                    VEX_4V, VEX_L;
}
3792
// Non-VEX pack instructions: $src1 is tied to $dst, memory operands are
// folded with `memop`, and Is2Addr keeps its default of 1.
let Constraints = "$src1 = $dst" in {
  defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss,
                            VR128, i128mem, SchedWriteShuffle.XMM, memop>;
  defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss,
                            VR128, i128mem, SchedWriteShuffle.XMM, memop>;

  defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus,
                            VR128, i128mem, SchedWriteShuffle.XMM, memop>;

  defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus,
                            VR128, i128mem, SchedWriteShuffle.XMM, memop>;
}
3805} // ExeDomain = SSEPackedInt
3806
3807//===---------------------------------------------------------------------===//
3808// SSE2 - Packed Integer Unpack Instructions
3809//===---------------------------------------------------------------------===//
3810
3811let ExeDomain = SSEPackedInt in {
/// sse2_unpack - Unpack/interleave instruction on the PDI class.
/// Defines `rr` and `rm`; the result is typed \p vt and the memory source of
/// `rm` is read through \p ld_frag. \p Is2Addr picks the two- or
/// three-operand assembly string.
multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
                       SDNode OpNode, RegisterClass RC, X86MemOperand x86memop,
                       X86FoldableSchedWrite sched, PatFrag ld_frag,
                       bit Is2Addr = 1> {
  // Register-register.
  def rr : PDI<opc, MRMSrcReg,
               (outs RC:$dst), (ins RC:$src1, RC:$src2),
               !if(Is2Addr,
                   OpcodeStr # "\t{$src2, $dst|$dst, $src2}",
                   OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
           Sched<[sched]>;

  // Register-memory.
  def rm : PDI<opc, MRMSrcMem,
               (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
               !if(Is2Addr,
                   OpcodeStr # "\t{$src2, $dst|$dst, $src2}",
                   OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set RC:$dst,
                     (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
           Sched<[sched.Folded, sched.ReadAfterFold]>;
}
3831
// 128-bit VEX unpacks with byte and word elements (gated on NoVLX_Or_NoBWI).
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPUNPCKLBW  : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl,
                                 VR128, i128mem, SchedWriteShuffle.XMM,
                                 load, 0>,
                     VEX_4V, VEX_WIG;
  defm VPUNPCKLWD  : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl,
                                 VR128, i128mem, SchedWriteShuffle.XMM,
                                 load, 0>,
                     VEX_4V, VEX_WIG;
  defm VPUNPCKHBW  : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh,
                                 VR128, i128mem, SchedWriteShuffle.XMM,
                                 load, 0>,
                     VEX_4V, VEX_WIG;
  defm VPUNPCKHWD  : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh,
                                 VR128, i128mem, SchedWriteShuffle.XMM,
                                 load, 0>,
                     VEX_4V, VEX_WIG;
}
3846
// 128-bit VEX unpacks with doubleword and quadword elements (gated on NoVLX
// only).
let Predicates = [HasAVX, NoVLX] in {
  defm VPUNPCKLDQ  : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl,
                                 VR128, i128mem, SchedWriteShuffle.XMM,
                                 load, 0>,
                     VEX_4V, VEX_WIG;
  defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl,
                                 VR128, i128mem, SchedWriteShuffle.XMM,
                                 load, 0>,
                     VEX_4V, VEX_WIG;
  defm VPUNPCKHDQ  : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh,
                                 VR128, i128mem, SchedWriteShuffle.XMM,
                                 load, 0>,
                     VEX_4V, VEX_WIG;
  defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh,
                                 VR128, i128mem, SchedWriteShuffle.XMM,
                                 load, 0>,
                     VEX_4V, VEX_WIG;
}
3861
// 256-bit VEX unpacks with byte and word elements (VEX_L added).
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPUNPCKLBWY  : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl,
                                  VR256, i256mem, SchedWriteShuffle.YMM,
                                  load, 0>,
                      VEX_4V, VEX_L, VEX_WIG;
  defm VPUNPCKLWDY  : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl,
                                  VR256, i256mem, SchedWriteShuffle.YMM,
                                  load, 0>,
                      VEX_4V, VEX_L, VEX_WIG;
  defm VPUNPCKHBWY  : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh,
                                  VR256, i256mem, SchedWriteShuffle.YMM,
                                  load, 0>,
                      VEX_4V, VEX_L, VEX_WIG;
  defm VPUNPCKHWDY  : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh,
                                  VR256, i256mem, SchedWriteShuffle.YMM,
                                  load, 0>,
                      VEX_4V, VEX_L, VEX_WIG;
}
3876
// 256-bit VEX unpacks with doubleword and quadword elements (VEX_L added).
let Predicates = [HasAVX2, NoVLX] in {
  defm VPUNPCKLDQY  : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl,
                                  VR256, i256mem, SchedWriteShuffle.YMM,
                                  load, 0>,
                      VEX_4V, VEX_L, VEX_WIG;
  defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl,
                                  VR256, i256mem, SchedWriteShuffle.YMM,
                                  load, 0>,
                      VEX_4V, VEX_L, VEX_WIG;
  defm VPUNPCKHDQY  : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh,
                                  VR256, i256mem, SchedWriteShuffle.YMM,
                                  load, 0>,
                      VEX_4V, VEX_L, VEX_WIG;
  defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh,
                                  VR256, i256mem, SchedWriteShuffle.YMM,
                                  load, 0>,
                      VEX_4V, VEX_L, VEX_WIG;
}
3891
// Non-VEX unpacks: $src1 tied to $dst, `memop` folding, default Is2Addr.
let Constraints = "$src1 = $dst" in {
  // Low-half interleaves.
  defm PUNPCKLBW  : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl,
                                VR128, i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKLWD  : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl,
                                VR128, i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKLDQ  : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl,
                                VR128, i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl,
                                VR128, i128mem, SchedWriteShuffle.XMM, memop>;

  // High-half interleaves.
  defm PUNPCKHBW  : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh,
                                VR128, i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKHWD  : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh,
                                VR128, i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKHDQ  : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh,
                                VR128, i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh,
                                VR128, i128mem, SchedWriteShuffle.XMM, memop>;
}
3911} // ExeDomain = SSEPackedInt
3912
3913//===---------------------------------------------------------------------===//
3914// SSE2 - Packed Integer Extract and Insert
3915//===---------------------------------------------------------------------===//
3916
3917let ExeDomain = SSEPackedInt in {
/// sse2_pinsrw - Word insert at opcode 0xC4.
/// `rr` takes the value from GR32orGR64:$src2, `rm` from a 16-bit memory
/// operand read with extloadi16; $src3 is the 8-bit immediate operand.
/// \p Is2Addr selects between the "pinsrw" two-source string and the
/// "vpinsrw" three-source string.
multiclass sse2_pinsrw<bit Is2Addr = 1> {
  // Value comes from a general-purpose register.
  def rr : Ii8<0xC4, MRMSrcReg,
               (outs VR128:$dst),
               (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
               !if(Is2Addr,
                   "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                   "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
               [(set VR128:$dst,
                     (X86pinsrw VR128:$src1, GR32orGR64:$src2,
                                imm:$src3))]>,
           Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;

  // Value comes from memory.
  def rm : Ii8<0xC4, MRMSrcMem,
               (outs VR128:$dst),
               (ins VR128:$src1, i16mem:$src2, u8imm:$src3),
               !if(Is2Addr,
                   "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                   "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
               [(set VR128:$dst,
                     (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
                                imm:$src3))]>,
           Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}
3939
// Extract a word from a v8i16 register into a general-purpose register.
// The Predicates list covers VPEXTRWrr only; PEXTRWrr is defined outside it.
let Predicates = [HasAVX, NoBWI] in {
  def VPEXTRWrr : Ii8<0xC5, MRMSrcReg,
                      (outs GR32orGR64:$dst),
                      (ins VR128:$src1, u8imm:$src2),
                      "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      [(set GR32orGR64:$dst,
                            (X86pextrw (v8i16 VR128:$src1), imm:$src2))]>,
                  PD, VEX, VEX_WIG, Sched<[WriteVecExtract]>;
}

def PEXTRWrr : PDIi8<0xC5, MRMSrcReg,
                     (outs GR32orGR64:$dst),
                     (ins VR128:$src1, u8imm:$src2),
                     "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set GR32orGR64:$dst,
                           (X86pextrw (v8i16 VR128:$src1), imm:$src2))]>,
               Sched<[WriteVecExtract]>;
3954
// Insert: VEX form (three-operand string, Is2Addr = 0) ...
let Predicates = [HasAVX, NoBWI] in {
  defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V, VEX_WIG;
}

// ... and the UseSSE2 form with $src1 tied to $dst.
let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in {
  defm PINSRW : sse2_pinsrw, PD;
}
3961
3962} // ExeDomain = SSEPackedInt
3963
3964//===---------------------------------------------------------------------===//
3965// SSE2 - Packed Mask Creation
3966//===---------------------------------------------------------------------===//
3967
// Byte mask extraction: X86movmsk of a byte vector into a general-purpose
// register. Only the 256-bit record carries an explicit Predicates list.
let ExeDomain = SSEPackedInt in {

def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg,
                       (outs GR32orGR64:$dst), (ins VR128:$src),
                       "pmovmskb\t{$src, $dst|$dst, $src}",
                       [(set GR32orGR64:$dst,
                             (X86movmsk (v16i8 VR128:$src)))]>,
                  Sched<[WriteVecMOVMSK]>, VEX, VEX_WIG;

let Predicates = [HasAVX2] in
def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg,
                        (outs GR32orGR64:$dst), (ins VR256:$src),
                        "pmovmskb\t{$src, $dst|$dst, $src}",
                        [(set GR32orGR64:$dst,
                              (X86movmsk (v32i8 VR256:$src)))]>,
                   Sched<[WriteVecMOVMSKY]>, VEX, VEX_L, VEX_WIG;

def PMOVMSKBrr : PDI<0xD7, MRMSrcReg,
                     (outs GR32orGR64:$dst), (ins VR128:$src),
                     "pmovmskb\t{$src, $dst|$dst, $src}",
                     [(set GR32orGR64:$dst,
                           (X86movmsk (v16i8 VR128:$src)))]>,
                 Sched<[WriteVecMOVMSK]>;

} // ExeDomain = SSEPackedInt
3990
3991//===---------------------------------------------------------------------===//
3992// SSE2 - Conditional Store
3993//===---------------------------------------------------------------------===//
3994
// Masked store selected from int_x86_sse2_maskmov_dqu. Each encoding exists
// twice: one record uses EDI (Not64BitMode), the other RDI (In64BitMode); the
// register is both listed in Uses and passed as the intrinsic's third operand.
let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
// VEX records.
let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in {
  def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg,
                         (outs), (ins VR128:$src, VR128:$mask),
                         "maskmovdqu\t{$mask, $src|$src, $mask}",
                         [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask,
                                                    EDI)]>,
                    VEX, VEX_WIG;
}
let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in {
  def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg,
                           (outs), (ins VR128:$src, VR128:$mask),
                           "maskmovdqu\t{$mask, $src|$src, $mask}",
                           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask,
                                                      RDI)]>,
                      VEX, VEX_WIG;
}

// Non-VEX records.
let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in {
  def MASKMOVDQU : PDI<0xF7, MRMSrcReg,
                       (outs), (ins VR128:$src, VR128:$mask),
                       "maskmovdqu\t{$mask, $src|$src, $mask}",
                       [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask,
                                                  EDI)]>;
}
let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in {
  def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg,
                         (outs), (ins VR128:$src, VR128:$mask),
                         "maskmovdqu\t{$mask, $src|$src, $mask}",
                         [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask,
                                                    RDI)]>;
}

} // ExeDomain = SSEPackedInt
4019
4020//===---------------------------------------------------------------------===//
4021// SSE2 - Move Doubleword/Quadword
4022//===---------------------------------------------------------------------===//
4023
4024//===---------------------------------------------------------------------===//
4025// Move Int Doubleword to Packed Double Int
4026//
// Opcode 0x6E moves from a general-purpose register or memory into an XMM
// (or FR64) register. The 32-bit records use "movd", the 64-bit ones "movq".
// The *64toPQIrm records carry no pattern, and the *64toSDrr records are
// isCodeGenOnly bitconverts from GR64 to FR64.
let ExeDomain = SSEPackedInt in {
// --- VEX records -----------------------------------------------------------
def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg,
                        (outs VR128:$dst), (ins GR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                              (v4i32 (scalar_to_vector GR32:$src)))]>,
                   VEX, Sched<[WriteVecMoveFromGpr]>;
def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem,
                        (outs VR128:$dst), (ins i32mem:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                              (v4i32 (scalar_to_vector
                                       (loadi32 addr:$src))))]>,
                   VEX, Sched<[WriteVecLoad]>;
def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg,
                          (outs VR128:$dst), (ins GR64:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst,
                                (v2i64 (scalar_to_vector GR64:$src)))]>,
                    VEX, Sched<[WriteVecMoveFromGpr]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    mayLoad = 1 in {
  def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem,
                            (outs VR128:$dst), (ins i64mem:$src),
                            "movq\t{$src, $dst|$dst, $src}", []>,
                      VEX, Sched<[WriteVecLoad]>;
}
let isCodeGenOnly = 1 in {
  def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg,
                           (outs FR64:$dst), (ins GR64:$src),
                           "movq\t{$src, $dst|$dst, $src}",
                           [(set FR64:$dst, (bitconvert GR64:$src))]>,
                     VEX, Sched<[WriteVecMoveFromGpr]>;
}

// --- Non-VEX records -------------------------------------------------------
def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg,
                      (outs VR128:$dst), (ins GR32:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                            (v4i32 (scalar_to_vector GR32:$src)))]>,
                  Sched<[WriteVecMoveFromGpr]>;
def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem,
                      (outs VR128:$dst), (ins i32mem:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                            (v4i32 (scalar_to_vector
                                     (loadi32 addr:$src))))]>,
                  Sched<[WriteVecLoad]>;
def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg,
                        (outs VR128:$dst), (ins GR64:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                              (v2i64 (scalar_to_vector GR64:$src)))]>,
                   Sched<[WriteVecMoveFromGpr]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    mayLoad = 1 in {
  def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem,
                          (outs VR128:$dst), (ins i64mem:$src),
                          "movq\t{$src, $dst|$dst, $src}", []>,
                     Sched<[WriteVecLoad]>;
}
let isCodeGenOnly = 1 in {
  def MOV64toSDrr : RS2I<0x6E, MRMSrcReg,
                         (outs FR64:$dst), (ins GR64:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(set FR64:$dst, (bitconvert GR64:$src))]>,
                    Sched<[WriteVecMoveFromGpr]>;
}
} // ExeDomain = SSEPackedInt
4078
4079//===---------------------------------------------------------------------===//
4080// Move Int Doubleword to Single Scalar
4081//
// isCodeGenOnly records that select (bitconvert GR32) into FR32 using the
// "movd" encoding at opcode 0x6E; one VEX record, one non-VEX record.
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
  def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg,
                         (outs FR32:$dst), (ins GR32:$src),
                         "movd\t{$src, $dst|$dst, $src}",
                         [(set FR32:$dst, (bitconvert GR32:$src))]>,
                    VEX, Sched<[WriteVecMoveFromGpr]>;

  def MOVDI2SSrr : S2I<0x6E, MRMSrcReg,
                       (outs FR32:$dst), (ins GR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set FR32:$dst, (bitconvert GR32:$src))]>,
                   Sched<[WriteVecMoveFromGpr]>;

} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4094
4095//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int first element to Doubleword Int (GR32 or memory)
4097//
// Opcode 0x7E "movd": element 0 of a v4i32 register goes to a GR32 (`rr`) or
// is stored as an i32 to memory (`mr`). VEX records first, then non-VEX.
let ExeDomain = SSEPackedInt in {
def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg,
                        (outs GR32:$dst), (ins VR128:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set GR32:$dst,
                              (extractelt (v4i32 VR128:$src), (iPTR 0)))]>,
                   VEX, Sched<[WriteVecMoveToGpr]>;
def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem,
                        (outs), (ins i32mem:$dst, VR128:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(store (i32 (extractelt (v4i32 VR128:$src),
                                                 (iPTR 0))),
                                addr:$dst)]>,
                   VEX, Sched<[WriteVecStore]>;
def MOVPDI2DIrr : S2I<0x7E, MRMDestReg,
                      (outs GR32:$dst), (ins VR128:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set GR32:$dst,
                            (extractelt (v4i32 VR128:$src), (iPTR 0)))]>,
                  Sched<[WriteVecMoveToGpr]>;
def MOVPDI2DImr : S2I<0x7E, MRMDestMem,
                      (outs), (ins i32mem:$dst, VR128:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(store (i32 (extractelt (v4i32 VR128:$src),
                                               (iPTR 0))),
                              addr:$dst)]>,
                  Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt
4121
4122//===---------------------------------------------------------------------===//
// Move Packed Quadword Int first element to Quadword Int
4124//
// Opcode 0x7E "movq": element 0 of a v2i64 register goes to a GR64. The
// memory-destination records have no selection pattern; they are
// isCodeGenOnly with ForceDisassemble set.
let ExeDomain = SSEPackedInt in {
// Register destinations share one scheduling class.
let SchedRW = [WriteVecMoveToGpr] in {
  def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg,
                            (outs GR64:$dst), (ins VR128:$src),
                            "movq\t{$src, $dst|$dst, $src}",
                            [(set GR64:$dst,
                                  (extractelt (v2i64 VR128:$src),
                                              (iPTR 0)))]>,
                      VEX;

  def MOVPQIto64rr : RS2I<0x7E, MRMDestReg,
                          (outs GR64:$dst), (ins VR128:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(set GR64:$dst,
                                (extractelt (v2i64 VR128:$src),
                                            (iPTR 0)))]>;
} //SchedRW

// Memory destinations: identical flags on both records, so one `let` covers
// the pair.
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    mayStore = 1 in {
  def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem,
                            (outs), (ins i64mem:$dst, VR128:$src),
                            "movq\t{$src, $dst|$dst, $src}", []>,
                      VEX, Sched<[WriteVecStore]>;
  def MOVPQIto64mr : RS2I<0x7E, MRMDestMem,
                          (outs), (ins i64mem:$dst, VR128:$src),
                          "movq\t{$src, $dst|$dst, $src}", []>,
                     Sched<[WriteVecStore]>;
}
} // ExeDomain = SSEPackedInt
4149
4150//===---------------------------------------------------------------------===//
4151// Bitcast FR64 <-> GR64
4152//
// Codegen-only movq forms that select a bitconvert from FR64 to GR64.
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
  def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg,
                           (outs GR64:$dst), (ins FR64:$src),
                           "movq\t{$src, $dst|$dst, $src}",
                           [(set GR64:$dst, (bitconvert FR64:$src))]>,
                     VEX, Sched<[WriteVecMoveToGpr]>;

  def MOVSDto64rr : RS2I<0x7E, MRMDestReg,
                         (outs GR64:$dst), (ins FR64:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(set GR64:$dst, (bitconvert FR64:$src))]>,
                    Sched<[WriteVecMoveToGpr]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4164
4165//===---------------------------------------------------------------------===//
// Move Scalar Single to Doubleword Int
4167//
// Codegen-only movd forms that select a bitconvert from FR32 to GR32.
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
  def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg,
                         (outs GR32:$dst), (ins FR32:$src),
                         "movd\t{$src, $dst|$dst, $src}",
                         [(set GR32:$dst, (bitconvert FR32:$src))]>,
                    VEX, Sched<[WriteVecMoveToGpr]>;

  def MOVSS2DIrr : S2I<0x7E, MRMDestReg,
                       (outs GR32:$dst), (ins FR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set GR32:$dst, (bitconvert FR32:$src))]>,
                   Sched<[WriteVecMoveToGpr]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4178
let Predicates = [UseAVX] in {
  // X86vzmovl of a scalar_to_vector'd GPR selects the plain GPR -> XMM move.
  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
            (VMOVDI2PDIrr GR32:$src)>;
  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
            (VMOV64toPQIrr GR64:$src)>;

  // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
  // These instructions also write zeros in the high part of a 256-bit
  // register, which is why the v8i32 case only needs a SUBREG_TO_REG wrapper.
  def : Pat<(v2i64 (X86vzmovl
                    (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
            (VMOVDI2PDIrm addr:$src)>;
  def : Pat<(v4i32 (X86vzload32 addr:$src)), (VMOVDI2PDIrm addr:$src)>;
  def : Pat<(v8i32 (X86vzload32 addr:$src)),
            (SUBREG_TO_REG (i64 0),
                           (v4i32 (VMOVDI2PDIrm addr:$src)),
                           sub_xmm)>;
}
4195
// SSE2 counterparts of the zero-extending movd/movq selection patterns.
let Predicates = [UseSSE2] in {
  // Register sources.
  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
            (MOVDI2PDIrr GR32:$src)>;
  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
            (MOV64toPQIrr GR64:$src)>;

  // Memory sources: both forms select the 32-bit movd load.
  def : Pat<(v2i64 (X86vzmovl
                    (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
            (MOVDI2PDIrm addr:$src)>;
  def : Pat<(v4i32 (X86vzload32 addr:$src)), (MOVDI2PDIrm addr:$src)>;
}
4207
// Before LLVM had an MC layer, clang emitted "movd" rather than "movq" for the
// 64-bit GPR <-> XMM moves because of a MacOS assembler parsing limitation.
// These aliases let that old assembly still be parsed.
def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
                (MOV64toPQIrr VR128:$dst, GR64:$src),
                0>;
def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
                (MOVPQIto64rr GR64:$dst, VR128:$src),
                0>;

// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                (VMOV64toPQIrr VR128:$dst, GR64:$src),
                0>;
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                (VMOVPQIto64rr GR64:$dst, VR128:$src),
                0>;
4220
4221//===---------------------------------------------------------------------===//
4222// SSE2 - Move Quadword
4223//===---------------------------------------------------------------------===//
4224
4225//===---------------------------------------------------------------------===//
4226// Move Quadword Int to Packed Quadword Int
4227//
4228
// movq load forms: an i64 load placed into a v2i64 via scalar_to_vector.
let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in {
  def VMOVQI2PQIrm : I<0x7E, MRMSrcMem,
                       (outs VR128:$dst), (ins i64mem:$src),
                       "vmovq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
                     XS, VEX, Requires<[UseAVX]>, VEX_WIG;

  def MOVQI2PQIrm : I<0x7E, MRMSrcMem,
                      (outs VR128:$dst), (ins i64mem:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
                    XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
} // ExeDomain, SchedRW
4241
4242//===---------------------------------------------------------------------===//
4243// Move Packed Quadword Int to Quadword Int
4244//
// movq store forms: element 0 of the v2i64 view of $src is stored as an i64.
let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in {
  def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem,
                          (outs), (ins i64mem:$dst, VR128:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(store (i64 (extractelt (v2i64 VR128:$src),
                                                   (iPTR 0))),
                                  addr:$dst)]>,
                     VEX, VEX_WIG;

  def MOVPQI2QImr : S2I<0xD6, MRMDestMem,
                        (outs), (ins i64mem:$dst, VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(store (i64 (extractelt (v2i64 VR128:$src),
                                                 (iPTR 0))),
                                addr:$dst)]>;
} // ExeDomain, SchedRW
4256
// For disassembler only: register-to-register encodings of opcode 0xD6.
// They carry no selection pattern.
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [SchedWriteVecLogic.XMM] in {
  def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg,
                          (outs VR128:$dst), (ins VR128:$src),
                          "movq\t{$src, $dst|$dst, $src}", []>,
                     VEX, VEX_WIG;
  def MOVPQI2QIrr : S2I<0xD6, MRMDestReg,
                        (outs VR128:$dst), (ins VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}", []>;
}
4265
// ".s" mnemonic spellings that map to the 0xD6 register-register records.
def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
                (VMOVPQI2QIrr VR128:$dst, VR128:$src),
                0>;
def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
                (MOVPQI2QIrr VR128:$dst, VR128:$src),
                0>;
4270
let Predicates = [UseAVX] in {
  // 64-bit zero-extending vector loads select the vmovq load; the 256-bit
  // result wraps the 128-bit instruction in SUBREG_TO_REG.
  def : Pat<(v2i64 (X86vzload64 addr:$src)), (VMOVQI2PQIrm addr:$src)>;
  def : Pat<(v4i64 (X86vzload64 addr:$src)),
            (SUBREG_TO_REG (i64 0),
                           (v2i64 (VMOVQI2PQIrm addr:$src)),
                           sub_xmm)>;

  // 64-bit extract-and-store selects the vmovq store.
  def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
            (VMOVPQI2QImr addr:$dst, VR128:$src)>;
}
4280
// SSE2 selection of the movq load and store for the 64-bit vzload and
// extract-store nodes.
let Predicates = [UseSSE2] in {
  def : Pat<(v2i64 (X86vzload64 addr:$src)),
            (MOVQI2PQIrm addr:$src)>;
  def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
            (MOVPQI2QImr addr:$dst, VR128:$src)>;
}
4287
4288//===---------------------------------------------------------------------===//
4289// Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
4290// IA32 document. movq xmm1, xmm2 does clear the high bits.
4291//
// XMM -> XMM movq, selected for X86vzmovl on v2i64.
let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
  def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg,
                           (outs VR128:$dst), (ins VR128:$src),
                           "vmovq\t{$src, $dst|$dst, $src}",
                           [(set VR128:$dst,
                             (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
                         XS, VEX, Requires<[UseAVX]>, VEX_WIG;

  def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg,
                          (outs VR128:$dst), (ins VR128:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst,
                            (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
                        XS, Requires<[UseSSE2]>;
} // ExeDomain, SchedRW
4302
// X86vzmovl on v2f64 reuses the integer XMM -> XMM movq records.
let Predicates = [UseAVX] in
  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
            (VMOVZPQILo2PQIrr VR128:$src)>;

let Predicates = [UseSSE2] in
  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
            (MOVZPQILo2PQIrr VR128:$src)>;
4311
// 256-bit X86vzmovl: take the low XMM subregister, run it through the 128-bit
// vmovq, and wrap the result back up with SUBREG_TO_REG.
let Predicates = [UseAVX] in {
  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
            (SUBREG_TO_REG
              (i32 0),
              (v2f64 (VMOVZPQILo2PQIrr
                       (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
              sub_xmm)>;

  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
            (SUBREG_TO_REG
              (i32 0),
              (v2i64 (VMOVZPQILo2PQIrr
                       (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
              sub_xmm)>;
}
4324
4325//===---------------------------------------------------------------------===//
4326// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
4327//===---------------------------------------------------------------------===//
4328
/// sse3_replicate_sfp - Unary single-precision replicate instruction with a
/// register-source (rr) and a memory-source (rm) form. The rm form loads its
/// operand through mem_frag and is scheduled with sched.Folded.
multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
                              ValueType vt, RegisterClass RC, PatFrag mem_frag,
                              X86MemOperand x86memop, X86FoldableSchedWrite sched> {
  def rr : S3SI<op, MRMSrcReg,
                (outs RC:$dst), (ins RC:$src),
                !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                [(set RC:$dst, (vt (OpNode RC:$src)))]>,
           Sched<[sched]>;

  def rm : S3SI<op, MRMSrcMem,
                (outs RC:$dst), (ins x86memop:$src),
                !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>,
           Sched<[sched.Folded]>;
}
4341
// VEX-encoded MOVSHDUP/MOVSLDUP, 128-bit and 256-bit (VEX_L) variants.
let Predicates = [HasAVX, NoVLX] in {
  defm VMOVSHDUP  : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", v4f32,
                                       VR128, loadv4f32, f128mem,
                                       SchedWriteFShuffle.XMM>,
                    VEX, VEX_WIG;
  defm VMOVSLDUP  : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", v4f32,
                                       VR128, loadv4f32, f128mem,
                                       SchedWriteFShuffle.XMM>,
                    VEX, VEX_WIG;
  defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", v8f32,
                                       VR256, loadv8f32, f256mem,
                                       SchedWriteFShuffle.YMM>,
                    VEX, VEX_L, VEX_WIG;
  defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", v8f32,
                                       VR256, loadv8f32, f256mem,
                                       SchedWriteFShuffle.YMM>,
                    VEX, VEX_L, VEX_WIG;
}
// Legacy-encoded forms; memory operands are matched through memopv4f32.
defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup",
                                   v4f32, VR128, memopv4f32, f128mem,
                                   SchedWriteFShuffle.XMM>;
defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup",
                                   v4f32, VR128, memopv4f32, f128mem,
                                   SchedWriteFShuffle.XMM>;
4360
// Integer-typed X86Movshdup/X86Movsldup nodes select the same VEX records as
// the floating-point typed ones.
let Predicates = [HasAVX, NoVLX] in {
  // 128-bit.
  def : Pat<(v4i32 (X86Movshdup VR128:$src)), (VMOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (load addr:$src))), (VMOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)), (VMOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (load addr:$src))), (VMOVSLDUPrm addr:$src)>;

  // 256-bit.
  def : Pat<(v8i32 (X86Movshdup VR256:$src)), (VMOVSHDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movshdup (load addr:$src))), (VMOVSHDUPYrm addr:$src)>;
  def : Pat<(v8i32 (X86Movsldup VR256:$src)), (VMOVSLDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movsldup (load addr:$src))), (VMOVSLDUPYrm addr:$src)>;
}
4379
// SSE3 selection of MOVSHDUP/MOVSLDUP for the v4i32-typed nodes; memory
// operands go through memop rather than load.
let Predicates = [UseSSE3] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)), (MOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (memop addr:$src))), (MOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)), (MOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (memop addr:$src))), (MOVSLDUPrm addr:$src)>;
}
4390
4391//===---------------------------------------------------------------------===//
4392// SSE3 - Replicate Double FP - MOVDDUP
4393//===---------------------------------------------------------------------===//
4394
/// sse3_replicate_dfp - 128-bit MOVDDUP (opcode 0x12). The memory form takes
/// an f64mem operand and matches a scalar f64 load fed to X86Movddup through
/// scalar_to_vector.
multiclass sse3_replicate_dfp<string OpcodeStr, X86SchedWriteWidths sched> {
  def rr : S3DI<0x12, MRMSrcReg,
                (outs VR128:$dst), (ins VR128:$src),
                !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))]>,
           Sched<[sched.XMM]>;

  def rm : S3DI<0x12, MRMSrcMem,
                (outs VR128:$dst), (ins f64mem:$src),
                !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst,
                  (v2f64 (X86Movddup
                           (scalar_to_vector (loadf64 addr:$src)))))]>,
           Sched<[sched.XMM.Folded]>;
}
4407
// FIXME: Merge with above classes when there are patterns for the ymm version
/// sse3_replicate_dfp_y - 256-bit MOVDDUP (opcode 0x12). Unlike the 128-bit
/// class, the memory form takes a full f256mem operand matched with loadv4f64.
multiclass sse3_replicate_dfp_y<string OpcodeStr, X86SchedWriteWidths sched> {
  def rr : S3DI<0x12, MRMSrcReg,
                (outs VR256:$dst), (ins VR256:$src),
                !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
           Sched<[sched.YMM]>;

  def rm : S3DI<0x12, MRMSrcMem,
                (outs VR256:$dst), (ins f256mem:$src),
                !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                [(set VR256:$dst,
                  (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
           Sched<[sched.YMM.Folded]>;
}
4420
// VEX-encoded MOVDDUP: 128-bit and 256-bit (VEX_L) instantiations.
let Predicates = [HasAVX, NoVLX] in {
  defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup", SchedWriteFShuffle>,
                   VEX, VEX_WIG;
  defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup", SchedWriteFShuffle>,
                   VEX, VEX_L, VEX_WIG;
}

// Legacy-encoded MOVDDUP.
defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;
4429
4430
// Fold a v2f64 simple_load or a 64-bit zero-extending vector load feeding
// X86Movddup into the memory form of VMOVDDUP.
//
// The patterns are gated solely by the enclosing `let Predicates`. A trailing
// Requires<[HasAVX]> on each pattern used to sit here, but a top-level `let`
// is applied after a record's superclasses, so it was overridden by
// [HasAVX, NoVLX] and only obscured which predicate list is in effect.
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(X86Movddup (v2f64 (simple_load addr:$src))),
            (VMOVDDUPrm addr:$src)>;
  def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
            (VMOVDDUPrm addr:$src)>;
}
4437
// SSE3 folding of loads into MOVDDUPrm.
// No need for aligned memory as this only loads 64-bits.
let Predicates = [UseSSE3] in {
  def : Pat<(X86Movddup (v2f64 (simple_load addr:$src))),
            (MOVDDUPrm addr:$src)>;

  def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
            (MOVDDUPrm addr:$src)>;
}
4445
4446//===---------------------------------------------------------------------===//
4447// SSE3 - Move Unaligned Integer
4448//===---------------------------------------------------------------------===//
4449
// VEX-encoded LDDQU, selected from the ldu_dq intrinsics (128- and 256-bit).
let Predicates = [HasAVX] in {
  def VLDDQUrm  : S3DI<0xF0, MRMSrcMem,
                       (outs VR128:$dst), (ins i128mem:$src),
                       "vlddqu\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
                  Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;

  def VLDDQUYrm : S3DI<0xF0, MRMSrcMem,
                       (outs VR256:$dst), (ins i256mem:$src),
                       "vlddqu\t{$src, $dst|$dst, $src}",
                       [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
                  Sched<[SchedWriteVecMoveLS.YMM.RM]>, VEX, VEX_L, VEX_WIG;
} // Predicates
4460
// Legacy-encoded LDDQU, selected from int_x86_sse3_ldu_dq.
def LDDQUrm : S3DI<0xF0, MRMSrcMem,
                   (outs VR128:$dst), (ins i128mem:$src),
                   "lddqu\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
              Sched<[SchedWriteVecMoveLS.XMM.RM]>;
4465
4466//===---------------------------------------------------------------------===//
4467// SSE3 - Arithmetic
4468//===---------------------------------------------------------------------===//
4469
/// sse3_addsub - ADDSUB instruction (opcode 0xD0) with rr and rm forms.
/// Is2Addr selects the two-operand assembly string (destination tied to
/// $src1 by the instantiation) versus the three-operand one. Both forms read
/// MXCSR and are marked as possibly raising FP exceptions.
multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC,
                       X86MemOperand x86memop, X86FoldableSchedWrite sched,
                       PatFrag ld_frag, bit Is2Addr = 1> {
  let Uses = [MXCSR], mayRaiseFPException = 1 in {
    def rr : I<0xD0, MRMSrcReg,
               (outs RC:$dst), (ins RC:$src1, RC:$src2),
               !strconcat(OpcodeStr,
                          !if(Is2Addr,
                              "\t{$src2, $dst|$dst, $src2}",
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))]>,
             Sched<[sched]>;

    def rm : I<0xD0, MRMSrcMem,
               (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
               !strconcat(OpcodeStr,
                          !if(Is2Addr,
                              "\t{$src2, $dst|$dst, $src2}",
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst,
                 (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>,
             Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}
4490
// VEX-encoded ADDSUBPS/ADDSUBPD: three-operand syntax (Is2Addr = 0), 128-bit
// and 256-bit (VEX_L) variants, grouped by execution domain.
let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VADDSUBPS  : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem,
                                  SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>,
                      XD, VEX_4V, VEX_WIG;
    defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem,
                                  SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>,
                      XD, VEX_4V, VEX_L, VEX_WIG;
  }

  let ExeDomain = SSEPackedDouble in {
    defm VADDSUBPD  : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem,
                                  SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>,
                      PD, VEX_4V, VEX_WIG;
    defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem,
                                  SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>,
                      PD, VEX_4V, VEX_L, VEX_WIG;
  }
}
// Legacy-encoded ADDSUBPS/ADDSUBPD: two-address form, so $dst is tied to
// $src1; memory operands are matched through the memop fragments.
let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
  let ExeDomain = SSEPackedSingle in {
    defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem,
                                SchedWriteFAddSizes.PS.XMM, memopv4f32>,
                    XD;
  }
  let ExeDomain = SSEPackedDouble in {
    defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem,
                                SchedWriteFAddSizes.PD.XMM, memopv2f64>,
                    PD;
  }
}
4517
4518//===---------------------------------------------------------------------===//
4519// SSE3 Instructions
4520//===---------------------------------------------------------------------===//
4521
// Horizontal ops
/// S3D_Int - Binary op built on the S3DI format, with rr and rm forms.
/// Is2Addr picks the two-operand or three-operand assembly string. Both forms
/// read MXCSR and are marked as possibly raising FP exceptions.
multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                   X86MemOperand x86memop, SDNode OpNode,
                   X86FoldableSchedWrite sched, PatFrag ld_frag,
                   bit Is2Addr = 1> {
  let Uses = [MXCSR], mayRaiseFPException = 1 in {
    def rr : S3DI<o, MRMSrcReg,
                  (outs RC:$dst), (ins RC:$src1, RC:$src2),
                  !strconcat(OpcodeStr,
                             !if(Is2Addr,
                                 "\t{$src2, $dst|$dst, $src2}",
                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                  [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
             Sched<[sched]>;

    def rm : S3DI<o, MRMSrcMem,
                  (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                  !strconcat(OpcodeStr,
                             !if(Is2Addr,
                                 "\t{$src2, $dst|$dst, $src2}",
                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                  [(set RC:$dst,
                    (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
             Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}
/// S3_Int - Same shape as S3D_Int but built on the S3I format: a binary op
/// with rr and rm forms whose assembly string is chosen by Is2Addr. Both forms
/// read MXCSR and are marked as possibly raising FP exceptions.
multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                  X86MemOperand x86memop, SDNode OpNode,
                  X86FoldableSchedWrite sched, PatFrag ld_frag,
                  bit Is2Addr = 1> {
  let Uses = [MXCSR], mayRaiseFPException = 1 in {
    def rr : S3I<o, MRMSrcReg,
                 (outs RC:$dst), (ins RC:$src1, RC:$src2),
                 !strconcat(OpcodeStr,
                            !if(Is2Addr,
                                "\t{$src2, $dst|$dst, $src2}",
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
             Sched<[sched]>;

    def rm : S3I<o, MRMSrcMem,
                 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                 !strconcat(OpcodeStr,
                            !if(Is2Addr,
                                "\t{$src2, $dst|$dst, $src2}",
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set RC:$dst,
                   (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
             Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}
4563
// VEX-encoded horizontal FP add/sub. The PS forms instantiate S3D_Int and the
// PD forms S3_Int; every one uses three-operand syntax (Is2Addr = 0).
let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VHADDPS  : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem, X86fhadd,
                            WriteFHAdd, loadv4f32, 0>,
                    VEX_4V, VEX_WIG;
    defm VHSUBPS  : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem, X86fhsub,
                            WriteFHAdd, loadv4f32, 0>,
                    VEX_4V, VEX_WIG;
    defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem, X86fhadd,
                            WriteFHAddY, loadv8f32, 0>,
                    VEX_4V, VEX_L, VEX_WIG;
    defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem, X86fhsub,
                            WriteFHAddY, loadv8f32, 0>,
                    VEX_4V, VEX_L, VEX_WIG;
  }

  let ExeDomain = SSEPackedDouble in {
    defm VHADDPD  : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem, X86fhadd,
                           WriteFHAdd, loadv2f64, 0>,
                    VEX_4V, VEX_WIG;
    defm VHSUBPD  : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem, X86fhsub,
                           WriteFHAdd, loadv2f64, 0>,
                    VEX_4V, VEX_WIG;
    defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem, X86fhadd,
                           WriteFHAddY, loadv4f64, 0>,
                    VEX_4V, VEX_L, VEX_WIG;
    defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem, X86fhsub,
                           WriteFHAddY, loadv4f64, 0>,
                    VEX_4V, VEX_L, VEX_WIG;
  }
}
4586
// Legacy-encoded horizontal FP add/sub: two-address, so $dst is tied to $src1.
let Constraints = "$src1 = $dst" in {
  let ExeDomain = SSEPackedSingle in {
    defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem,
                          X86fhadd, WriteFHAdd, memopv4f32>;
    defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem,
                          X86fhsub, WriteFHAdd, memopv4f32>;
  }

  let ExeDomain = SSEPackedDouble in {
    defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem,
                         X86fhadd, WriteFHAdd, memopv2f64>;
    defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem,
                         X86fhsub, WriteFHAdd, memopv2f64>;
  }
}
4601
4602//===---------------------------------------------------------------------===//
4603// SSSE3 - Packed Absolute Instructions
4604//===---------------------------------------------------------------------===//
4605
/// SS3I_unop_rm - Simple 128-bit SSSE3 unary op whose type can be
/// v*{i8,i16,i32}. The memory form loads its operand through ld_frag.
multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
                        SDNode OpNode, X86SchedWriteWidths sched, PatFrag ld_frag> {
  def rr : SS38I<opc, MRMSrcReg,
                 (outs VR128:$dst), (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (vt (OpNode VR128:$src)))]>,
           Sched<[sched.XMM]>;

  def rm : SS38I<opc, MRMSrcMem,
                 (outs VR128:$dst), (ins i128mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (vt (OpNode (ld_frag addr:$src))))]>,
           Sched<[sched.XMM.Folded]>;
}
4622
/// SS3I_unop_rm_y - 256-bit counterpart of the simple SSSE3 unary op whose
/// type can be v*{i8,i16,i32}. Defines Yrr/Yrm; the memory form always uses
/// the plain `load` fragment.
multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
                          SDNode OpNode, X86SchedWriteWidths sched> {
  def Yrr : SS38I<opc, MRMSrcReg,
                  (outs VR256:$dst), (ins VR256:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
            Sched<[sched.YMM]>;

  def Yrm : SS38I<opc, MRMSrcMem,
                  (outs VR256:$dst), (ins i256mem:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR256:$dst, (vt (OpNode (load addr:$src))))]>,
            Sched<[sched.YMM.Folded]>;
}
4639
// VEX-encoded PABS. The byte/word forms are predicated on NoVLX_Or_NoBWI and
// the dword forms on NoVLX; 256-bit forms additionally require AVX2.

// 128-bit.
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs,
                             SchedWriteVecALU, load>,
                VEX, VEX_WIG;
  defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs,
                             SchedWriteVecALU, load>,
                VEX, VEX_WIG;
}
let Predicates = [HasAVX, NoVLX] in {
  defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs,
                             SchedWriteVecALU, load>,
                VEX, VEX_WIG;
}

// 256-bit (the multiclass appends Yrr/Yrm to the same VPABS* prefixes).
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>,
                VEX, VEX_L, VEX_WIG;
  defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SchedWriteVecALU>,
                VEX, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX] in {
  defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SchedWriteVecALU>,
                VEX, VEX_L, VEX_WIG;
}
4660
// Legacy-encoded PABSB/PABSW/PABSD; memory operands are matched via memop.
defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs,
                          SchedWriteVecALU, memop>;
defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs,
                          SchedWriteVecALU, memop>;
defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs,
                          SchedWriteVecALU, memop>;
4667
4668//===---------------------------------------------------------------------===//
4669// SSSE3 - Packed Binary Operator Instructions
4670//===---------------------------------------------------------------------===//
4671
/// SS3I_binop_rm - Simple SSSE3 bin op with separate result (DstVT) and
/// operand (OpVT) value types. The rr form is marked commutable inside the
/// multiclass; Is2Addr picks the two- or three-operand assembly string.
multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType DstVT, ValueType OpVT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS38I<opc, MRMSrcReg,
                 (outs RC:$dst), (ins RC:$src1, RC:$src2),
                 !strconcat(OpcodeStr,
                            !if(Is2Addr,
                                "\t{$src2, $dst|$dst, $src2}",
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set RC:$dst,
                   (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))]>,
           Sched<[sched]>;

  def rm : SS38I<opc, MRMSrcMem,
                 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                 !strconcat(OpcodeStr,
                            !if(Is2Addr,
                                "\t{$src2, $dst|$dst, $src2}",
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set RC:$dst,
                   (DstVT (OpNode (OpVT RC:$src1),
                                  (memop_frag addr:$src2))))]>,
           Sched<[sched.Folded, sched.ReadAfterFold]>;
}
4694
/// SS3I_binop_rm_int - Simple 128-bit SSSE3 bin op selected from an intrinsic
/// (IntId128); its type can be v*{i8,i16,i32}. The rr form is marked
/// commutable inside the multiclass.
multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
                             Intrinsic IntId128, X86FoldableSchedWrite sched,
                             PatFrag ld_frag, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS38I<opc, MRMSrcReg,
                 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                 !strconcat(OpcodeStr,
                            !if(Is2Addr,
                                "\t{$src2, $dst|$dst, $src2}",
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
           Sched<[sched]>;

  def rm : SS38I<opc, MRMSrcMem,
                 (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
                 !strconcat(OpcodeStr,
                            !if(Is2Addr,
                                "\t{$src2, $dst|$dst, $src2}",
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set VR128:$dst,
                   (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
           Sched<[sched.Folded, sched.ReadAfterFold]>;
}
4716
/// SS3I_binop_rm_int_y - 256-bit intrinsic-selected SSSE3 bin op. Always uses
/// the three-operand assembly string and the plain `load` fragment; defines
/// Yrr (marked commutable) and Yrm.
multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
                               Intrinsic IntId256,
                               X86FoldableSchedWrite sched> {
  let isCommutable = 1 in
  def Yrr : SS38I<opc, MRMSrcReg,
                  (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
                  !strconcat(OpcodeStr,
                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
            Sched<[sched]>;

  def Yrm : SS38I<opc, MRMSrcMem,
                  (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
                  !strconcat(OpcodeStr,
                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set VR256:$dst,
                    (IntId256 VR256:$src1, (load addr:$src2)))]>,
            Sched<[sched.Folded, sched.ReadAfterFold]>;
}
4733
// 128-bit VEX-encoded SSSE3 bin ops predicated on NoVLX_Or_NoBWI.
// VPSHUFB and VPMADDUBSW are instantiated under isCommutable = 0;
// VPMULHRSW is instantiated outside that scope.
let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  let isCommutable = 0 in {
    defm VPSHUFB    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb,
                                    v16i8, v16i8, VR128, load, i128mem,
                                    SchedWriteVarShuffle.XMM, 0>,
                      VEX_4V, VEX_WIG;
    defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw,
                                    v8i16, v16i8, VR128, load, i128mem,
                                    SchedWriteVecIMul.XMM, 0>,
                      VEX_4V, VEX_WIG;
  }

  defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs,
                                 v8i16, v8i16, VR128, load, i128mem,
                                 SchedWriteVecIMul.XMM, 0>,
                   VEX_4V, VEX_WIG;
}
4747
// 128-bit VEX-encoded SSSE3 horizontal add/sub and sign ops. All are
// instantiated under isCommutable = 0 and use three-operand syntax
// (Is2Addr = 0).
//
// Every record carries VEX_WIG. VPHSUBD previously lacked it, unlike its
// siblings VPHADDW/VPHADDD/VPHSUBW; the Intel SDM lists VPHSUBD as
// VEX.128.66.0F38.WIG 06 /r, so VEX.W is ignored for it as well.
let ImmT = NoImm, Predicates = [HasAVX] in {
let isCommutable = 0 in {
  // SDNode-selected horizontal add/sub.
  defm VPHADDW    : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
                                  load, i128mem,
                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
  defm VPHADDD    : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
                                  load, i128mem,
                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
  defm VPHSUBW    : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
                                  load, i128mem,
                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
  defm VPHSUBD    : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
                                  load, i128mem,
                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;

  // Intrinsic-selected sign and saturating horizontal add/sub.
  defm VPSIGNB    : SS3I_binop_rm_int<0x08, "vpsignb",
                                      int_x86_ssse3_psign_b_128,
                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
  defm VPSIGNW    : SS3I_binop_rm_int<0x09, "vpsignw",
                                      int_x86_ssse3_psign_w_128,
                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
  defm VPSIGND    : SS3I_binop_rm_int<0x0A, "vpsignd",
                                      int_x86_ssse3_psign_d_128,
                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
  defm VPHADDSW   : SS3I_binop_rm_int<0x03, "vphaddsw",
                                      int_x86_ssse3_phadd_sw_128,
                                      SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
  defm VPHSUBSW   : SS3I_binop_rm_int<0x07, "vphsubsw",
                                      int_x86_ssse3_phsub_sw_128,
                                      SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
}
}
4779
// VEX-encoded 256-bit (VEX_L) vpshufb / vpmaddubsw / vpmulhrsw, instantiated
// from SS3I_binop_rm with VR256 registers and i256mem memory operands.
// Guarded by HasAVX2 together with NoVLX_Or_NoBWI.
4780let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
// Only the two defms inside this inner scope get isCommutable = 0.
4781let isCommutable = 0 in {
4782  defm VPSHUFBY   : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
4783                                  VR256, load, i256mem,
4784                                  SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4785  defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
4786                                   v32i8, VR256, load, i256mem,
4787                                   SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4788}
// VPMULHRSWY is defined outside the isCommutable = 0 scope above.
4789defm VPMULHRSWY   : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
4790                                  VR256, load, i256mem,
4791                                  SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4792}
4793
// VEX-encoded 256-bit (VEX_L) horizontal add/sub and sign instructions,
// guarded by HasAVX2 only. The first four are built from SS3I_binop_rm on an
// SDNode (X86hadd / X86hsub); the rest are built from SS3I_binop_rm_int_y on
// the corresponding int_x86_avx2_* intrinsic. Every defm in this group is
// non-commutable and carries VEX_4V, VEX_L, VEX_WIG.
4794let ImmT = NoImm, Predicates = [HasAVX2] in {
4795let isCommutable = 0 in {
4796  defm VPHADDWY   : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
4797                                  VR256, load, i256mem,
4798                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4799  defm VPHADDDY   : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
4800                                  load, i256mem,
4801                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4802  defm VPHSUBWY   : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
4803                                  VR256, load, i256mem,
4804                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
  // VPHSUBDY is marked VEX_WIG like its siblings; the SDM lists the encoding
  // as VEX.256.66.0F38.WIG 06 /r.
4805  defm VPHSUBDY   : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
4806                                  load, i256mem,
4807                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4808  defm VPSIGNB   : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
4809                                       SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
4810  defm VPSIGNW   : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
4811                                       SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
4812  defm VPSIGND   : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
4813                                       SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
4814  defm VPHADDSW  : SS3I_binop_rm_int_y<0x03, "vphaddsw",
4815                                       int_x86_avx2_phadd_sw,
4816                                       SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
4817  defm VPHSUBSW  : SS3I_binop_rm_int_y<0x07, "vphsubsw",
4818                                       int_x86_avx2_phsub_sw,
4819                                       SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
4820}
4821}
4822
// Legacy (non-VEX) SSSE3 forms. Each defm ties $src1 to $dst via the
// Constraints string and uses the memop fragment for its memory operand,
// where the VEX forms above use load.
4823// None of these have i8 immediate fields.
4824let ImmT = NoImm, Constraints = "$src1 = $dst" in {
// Everything in this inner scope is marked non-commutable.
4825let isCommutable = 0 in {
4826  defm PHADDW    : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
4827                                 memop, i128mem, SchedWritePHAdd.XMM>;
4828  defm PHADDD    : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
4829                                 memop, i128mem, SchedWritePHAdd.XMM>;
4830  defm PHSUBW    : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
4831                                 memop, i128mem, SchedWritePHAdd.XMM>;
4832  defm PHSUBD    : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
4833                                 memop, i128mem, SchedWritePHAdd.XMM>;
4834  defm PSIGNB    : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
4835                                     SchedWriteVecALU.XMM, memop>;
4836  defm PSIGNW    : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
4837                                     SchedWriteVecALU.XMM, memop>;
4838  defm PSIGND    : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
4839                                     SchedWriteVecALU.XMM, memop>;
4840  defm PSHUFB    : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
4841                                 memop, i128mem, SchedWriteVarShuffle.XMM>;
4842  defm PHADDSW   : SS3I_binop_rm_int<0x03, "phaddsw",
4843                                     int_x86_ssse3_phadd_sw_128,
4844                                     SchedWritePHAdd.XMM, memop>;
4845  defm PHSUBSW   : SS3I_binop_rm_int<0x07, "phsubsw",
4846                                     int_x86_ssse3_phsub_sw_128,
4847                                     SchedWritePHAdd.XMM, memop>;
4848  defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
4849                                 v16i8, VR128, memop, i128mem,
4850                                 SchedWriteVecIMul.XMM>;
4851}
// PMULHRSW is defined outside the isCommutable = 0 scope above.
4852defm PMULHRSW    : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
4853                                 VR128, memop, i128mem, SchedWriteVecIMul.XMM>;
4854}
4855
4856//===---------------------------------------------------------------------===//
4857// SSSE3 - Packed Align Instruction Patterns
4858//===---------------------------------------------------------------------===//
4859
// ssse3_palignr - Defines the register (rri) and memory (rmi) forms of a
// PALIGNR-style instruction with opcode 0x0F.
//   asm        - mnemonic used in the assembly string.
//   VT         - result value type of the X86PAlignr pattern.
//   RC         - register class for $dst, $src1 and (rri only) $src2.
//   memop_frag - load fragment applied to addr:$src2 in the rmi pattern.
//   x86memop   - memory operand kind for $src2 in rmi.
//   sched      - scheduling class; rmi uses its Folded / ReadAfterFold parts.
//   Is2Addr    - when set, the asm string omits $src1; otherwise all three
//                sources plus $dst are printed.
4860multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
4861                         PatFrag memop_frag, X86MemOperand x86memop,
4862                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
4863  let hasSideEffects = 0 in {
  // Register-register form; the immediate is matched as (i8 timm).
4864  def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst),
4865      (ins RC:$src1, RC:$src2, u8imm:$src3),
4866      !if(Is2Addr,
4867        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
4868        !strconcat(asm,
4869                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
4870      [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 timm:$src3))))]>,
4871      Sched<[sched]>;
  // Register-memory form; $src2 is loaded through memop_frag.
4872  let mayLoad = 1 in
4873  def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst),
4874      (ins RC:$src1, x86memop:$src2, u8imm:$src3),
4875      !if(Is2Addr,
4876        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
4877        !strconcat(asm,
4878                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
4879      [(set RC:$dst, (VT (X86PAlignr RC:$src1,
4880                                     (memop_frag addr:$src2),
4881                                     (i8 timm:$src3))))]>,
4882      Sched<[sched.Folded, sched.ReadAfterFold]>;
4883  }
4884}
4885
// Instantiations: VEX 128-bit, VEX 256-bit (VEX_L), and the legacy form,
// which ties $src1 to $dst and keeps the default Is2Addr = 1.
4886let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
4887  defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem,
4888                                SchedWriteShuffle.XMM, 0>, VEX_4V, VEX_WIG;
4889let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
4890  defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem,
4891                                 SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4892let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
4893  defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem,
4894                               SchedWriteShuffle.XMM>;
4895
4896//===---------------------------------------------------------------------===//
4897// SSE3 - Thread synchronization
4898//===---------------------------------------------------------------------===//
4899
// MONITOR / MWAIT. All three defs are scheduled as WriteSystem and require
// HasSSE3. None has explicit operands; inputs are modelled through the
// implicit register lists in Uses.
4900let SchedRW = [WriteSystem] in {
// The 32- and 64-bit MONITOR variants share the same opcode (0x01, MRM_C8)
// and differ in the first implicit register (EAX vs RAX) and mode predicate.
4901let Uses = [EAX, ECX, EDX] in
4902def MONITOR32rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
4903                     TB, Requires<[HasSSE3, Not64BitMode]>;
4904let Uses = [RAX, ECX, EDX] in
4905def MONITOR64rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
4906                     TB, Requires<[HasSSE3, In64BitMode]>;
4907
// MWAIT is selected from the int_x86_sse3_mwait intrinsic with ECX and EAX
// as its arguments; it has no mode predicate.
4908let Uses = [ECX, EAX] in
4909def MWAITrr   : I<0x01, MRM_C9, (outs), (ins), "mwait",
4910                  [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
4911} // SchedRW
4912
// Assembler aliases that accept mwait / monitor written with their registers
// spelled out as operands. Each alias maps to the operand-less instruction
// definition; separate 32-bit-register and 64-bit-register spellings are
// provided for Not64BitMode and In64BitMode respectively.
4913def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
4914def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;
4915
4916def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITOR32rrr)>,
4917      Requires<[Not64BitMode]>;
4918def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITOR64rrr)>,
4919      Requires<[In64BitMode]>;
4920
4921//===----------------------------------------------------------------------===//
4922// SSE4.1 - Packed Move with Sign/Zero Extend
4923// NOTE: Any Extend is promoted to Zero Extend in X86ISelDAGToDAG.cpp
4924//===----------------------------------------------------------------------===//
4925
// SS41I_pmovx_rrrm - Defines the register (rr) and memory (rm) forms of one
// packed-move-with-extend instruction. Both defs have an empty pattern list;
// no selection patterns are attached here.
//   opc       - opcode byte.
//   OpcodeStr - mnemonic.
//   MemOp     - memory operand kind for the rm form.
//   OutRC     - destination register class.
//   InRC      - source register class for the rr form.
//   sched     - scheduling class; rm uses sched.Folded.
4926multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
4927                            RegisterClass OutRC, RegisterClass InRC,
4928                            X86FoldableSchedWrite sched> {
4929  def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
4930                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
4931                 Sched<[sched]>;
4932
4933  def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
4934                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
4935                 Sched<[sched.Folded]>;
4936}
4937
// SS41I_pmovx_rm_all - Expands one extend instruction into three families:
//   NAME     - unprefixed 128-bit form; no Predicates are set at this level.
//   V#NAME   - VEX 128-bit form with a "v"-prefixed mnemonic, under
//              [HasAVX, prd].
//   V#NAME#Y - VEX_L form writing VR256 from a VR128 source, using MemYOp as
//              the memory operand, under [HasAVX2, prd].
4938multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
4939                              X86MemOperand MemOp, X86MemOperand MemYOp,
4940                              Predicate prd> {
4941  defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128,
4942                               SchedWriteShuffle.XMM>;
4943  let Predicates = [HasAVX, prd] in
4944    defm V#NAME   : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
4945                                     VR128, VR128, SchedWriteShuffle.XMM>,
4946                                     VEX, VEX_WIG;
4947  let Predicates = [HasAVX2, prd] in
4948    defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
4949                                     VR256, VR128, WriteShuffle256>,
4950                                     VEX, VEX_L, VEX_WIG;
4951}
4952
// SS41I_pmovx_rm - Instantiates the sign-extending (PMOVSX) and
// zero-extending (PMOVZX) variants for one element-size pair. The PMOVZX
// opcode is derived as opc + 0x10.
4953multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
4954                          X86MemOperand MemYOp, Predicate prd> {
4955  defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
4956                                        MemOp, MemYOp, prd>;
4957  defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
4958                                        !strconcat("pmovzx", OpcodeStr),
4959                                        MemOp, MemYOp, prd>;
4960}
4961
// One instantiation per source/destination element pair. The two memory
// operand arguments are MemOp (128-bit forms) and MemYOp (Y forms). Only BW
// uses NoVLX_Or_NoBWI; the others use NoVLX.
4962defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
4963defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>;
4964defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>;
4965
4966defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>;
4967defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;
4968
4969defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;
4970
// Selection patterns mapping extend nodes onto the 256-bit (Y) PMOVSX/PMOVZX
// instructions.
//   OpcPrefix - instruction name prefix ("VPMOVSX" / "VPMOVZX"); the pattern
//               results are looked up by name via !cast<I>.
//   ExtTy     - "s" or "z"; concatenated with "extloadvi8/16/32" to pick the
//               extending-load fragment.
//   ExtOp     - extend node used when the source element count equals the
//               result element count.
//   InVecOp   - in-vector extend node used when only part of the 128-bit
//               source is consumed.
// BW patterns are guarded by NoVLX_Or_NoBWI; all others by NoVLX.
4971// AVX2 Patterns
4972multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy,
4973                                     SDNode ExtOp, SDNode InVecOp> {
4974  // Register-Register patterns
4975  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4976  def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
4977            (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
4978  }
4979  let Predicates = [HasAVX2, NoVLX] in {
4980  def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))),
4981            (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
4982  def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))),
4983            (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;
4984
4985  def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
4986            (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
4987  def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))),
4988            (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;
4989
4990  def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
4991            (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
4992  }
4993
4994  // Simple Register-Memory patterns
4995  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4996  def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
4997            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
4998
4999  def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
5000            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
5001  }
5002
5003  let Predicates = [HasAVX2, NoVLX] in {
5004  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5005            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5006  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5007            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5008
5009  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5010            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
5011  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5012            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5013
5014  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
5015            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
5016  }
5017
5018  // AVX2 Register-Memory patterns
  // These fold an explicit load (full vector load, scalar load wrapped in
  // scalar_to_vector, or X86vzload64) feeding the extend into the rm form.
5019  let Predicates = [HasAVX2, NoVLX] in {
5020  def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
5021            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
5022
5023  def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5024            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5025  def : Pat<(v8i32 (InVecOp (v16i8 (X86vzload64 addr:$src)))),
5026            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5027
5028  def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
5029            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
5030
5031  def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5032            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5033  def : Pat<(v4i64 (InVecOp (v16i8 (X86vzload64 addr:$src)))),
5034            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5035
5036  def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5037            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5038  def : Pat<(v4i64 (InVecOp (v8i16 (X86vzload64 addr:$src)))),
5039            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5040  }
5041}
5042
// Sign- and zero-extending instantiations.
5043defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>;
5044defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>;
5045
// Selection patterns mapping extend nodes onto the 128-bit PMOVSX/PMOVZX
// instructions.
//   OpcPrefix - instruction name prefix; pattern results are looked up by
//               name via !cast<I>.
//   ExtTy     - "s" or "z"; concatenated with "extloadvi8/16/32" to pick the
//               extending-load fragment.
//   ExtOp     - extend node matched on the 128-bit source.
// The multiclass sets HasAVX-based Predicates internally; BW patterns use
// NoVLX_Or_NoBWI and the others use NoVLX.
5046// SSE4.1/AVX patterns.
5047multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
5048                                SDNode ExtOp> {
  // Register-register patterns.
5049  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5050  def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
5051            (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
5052  }
5053  let Predicates = [HasAVX, NoVLX] in {
5054  def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
5055            (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
5056  def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
5057            (!cast<I>(OpcPrefix#BQrr) VR128:$src)>;
5058
5059  def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
5060            (!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
5061  def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
5062            (!cast<I>(OpcPrefix#WQrr) VR128:$src)>;
5063
5064  def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
5065            (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;
5066  }
  // Extending-load patterns, selected to the rm form.
5067  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5068  def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5069            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5070  }
5071  let Predicates = [HasAVX, NoVLX] in {
5072  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5073            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5074  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5075            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5076
5077  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5078            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5079  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5080            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5081
5082  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
5083            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5084  }
  // Load-folding patterns: the extend's source is a scalar load wrapped in
  // scalar_to_vector, an X86vzload, or a full vector load.
5085  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5086  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5087            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5088  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5089            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5090  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
5091            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5092  def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))),
5093            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5094  }
5095  let Predicates = [HasAVX, NoVLX] in {
5096  def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5097            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5098  def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
5099            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5100  def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))),
5101            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5102
5103  def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
5104            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5105  def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))),
5106            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5107
5108  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5109            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5110  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5111            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5112  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
5113            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5114  def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))),
5115            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5116
5117  def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5118            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5119  def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))),
5120            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5121  def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))),
5122            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5123
5124  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5125            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5126  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5127            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5128  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
5129            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5130  def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))),
5131            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5132  }
5133}
5134
// VEX-prefixed instantiations use the predicates set inside the multiclass.
5135defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec>;
5136defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec>;
5137
// Legacy SSE4.1 instantiations, wrapped in a UseSSE41 predicate scope.
// NOTE(review): the multiclass also sets HasAVX-based Predicates internally;
// presumably this outer let takes precedence for these defms -- confirm.
5138let Predicates = [UseSSE41] in {
5139  defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec>;
5140  defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec>;
5141}
5142
5143//===----------------------------------------------------------------------===//
5144// SSE4.1 - Extract Instructions
5145//===----------------------------------------------------------------------===//
5146
5147/// SS41I_extract8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
5148multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
  // Register form: X86pextrb of the v16i8 source at index $src2, written to
  // a GR32orGR64 destination.
5149  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5150                 (ins VR128:$src1, u8imm:$src2),
5151                 !strconcat(OpcodeStr,
5152                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5153                 [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
5154                                         imm:$src2))]>,
5155                  Sched<[WriteVecExtract]>;
  // Memory form: the X86pextrb result is truncated to i8 and stored to $dst.
5156  let hasSideEffects = 0, mayStore = 1 in
5157  def mr : SS4AIi8<opc, MRMDestMem, (outs),
5158                 (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
5159                 !strconcat(OpcodeStr,
5160                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5161                 [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))),
5162                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5163}
5164
// VEX form is guarded by HasAVX, NoBWI; the legacy form sets no predicate
// here.
5165let Predicates = [HasAVX, NoBWI] in
5166  defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX, VEX_WIG;
5167
5168defm PEXTRB      : SS41I_extract8<0x14, "pextrb">;
5169
5170
5171/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
5172multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
  // Register-destination encoding with no selection pattern: it is
  // isCodeGenOnly but ForceDisassemble, and is tied to NAME#rr through
  // FoldGenData.
5173  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
5174  def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5175                   (ins VR128:$src1, u8imm:$src2),
5176                   !strconcat(OpcodeStr,
5177                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
5178                   Sched<[WriteVecExtract]>, FoldGenData<NAME#rr>;
5179
  // Memory form: the X86pextrw result is truncated to i16 and stored to
  // $dst.
5180  let hasSideEffects = 0, mayStore = 1 in
5181  def mr : SS4AIi8<opc, MRMDestMem, (outs),
5182                 (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
5183                 !strconcat(OpcodeStr,
5184                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5185                 [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), imm:$src2))),
5186                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5187}
5188
// VEX form is guarded by HasAVX, NoBWI; the legacy form sets no predicate
// here.
5189let Predicates = [HasAVX, NoBWI] in
5190  defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX, VEX_WIG;
5191
5192defm PEXTRW      : SS41I_extract16<0x15, "pextrw">;
5193
5194
5195/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
5196multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
  // Register form: extractelt of the v4i32 source at index $src2 into GR32.
5197  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
5198                 (ins VR128:$src1, u8imm:$src2),
5199                 !strconcat(OpcodeStr,
5200                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5201                 [(set GR32:$dst,
5202                  (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
5203                  Sched<[WriteVecExtract]>;
  // Memory form: the extracted element is stored directly to $dst.
5204  def mr : SS4AIi8<opc, MRMDestMem, (outs),
5205                 (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
5206                 !strconcat(OpcodeStr,
5207                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5208                 [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
5209                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5210}
5211
// VEX form is guarded by HasAVX, NoDQI and is not marked VEX_WIG.
5212let Predicates = [HasAVX, NoDQI] in
5213  defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;
5214
5215defm PEXTRD      : SS41I_extract32<0x16, "pextrd">;
5216
5217/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
5218multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
  // Register form: extractelt of the v2i64 source at index $src2 into GR64.
5219  def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
5220                 (ins VR128:$src1, u8imm:$src2),
5221                 !strconcat(OpcodeStr,
5222                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5223                 [(set GR64:$dst,
5224                  (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
5225                  Sched<[WriteVecExtract]>;
  // Memory form: the extracted element is stored directly to $dst.
5226  def mr : SS4AIi8<opc, MRMDestMem, (outs),
5227                 (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
5228                 !strconcat(OpcodeStr,
5229                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5230                 [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
5231                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5232}
5233
// Both forms use opcode 0x16; the VEX form adds VEX_W and the legacy form
// adds REX_W.
5234let Predicates = [HasAVX, NoDQI] in
5235  defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;
5236
5237defm PEXTRQ      : SS41I_extract64<0x16, "pextrq">, REX_W;
5238
5239/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
5240/// destination
5241multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
  // Register form: the v4f32 source is bitcast to v4i32 (bc_v4i32) and the
  // selected element is written to a GR32orGR64 destination.
5242  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5243                   (ins VR128:$src1, u8imm:$src2),
5244                   !strconcat(OpcodeStr,
5245                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5246                   [(set GR32orGR64:$dst,
5247                      (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
5248                   Sched<[WriteVecExtract]>;
  // Memory form: same bitcast-and-extract, stored to an f32mem destination.
5249  def mr : SS4AIi8<opc, MRMDestMem, (outs),
5250                   (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
5251                   !strconcat(OpcodeStr,
5252                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5253                   [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
5254                            addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5255}
5256
// Both forms are placed in the SSEPackedSingle execution domain; only the
// VEX form sets a predicate (UseAVX) here.
5257let ExeDomain = SSEPackedSingle in {
5258  let Predicates = [UseAVX] in
5259    defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, VEX_WIG;
5260  defm EXTRACTPS   : SS41I_extractf32<0x17, "extractps">;
5261}
5262
5263//===----------------------------------------------------------------------===//
5264// SSE4.1 - Insert Instructions
5265//===----------------------------------------------------------------------===//
5266
// SS41I_insert8 - Insert a byte into a VR128 at the index given by $src3,
// via the X86pinsrb node. Is2Addr selects the asm string: when set, $src1 is
// omitted from the printed operands.
5267multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
  // Register form: the inserted value comes from a GR32orGR64 register.
5268  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5269      (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
5270      !if(Is2Addr,
5271        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5272        !strconcat(asm,
5273                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5274      [(set VR128:$dst,
5275        (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
5276      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  // Memory form: the inserted value is an extloadi8 from $src2.
5277  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5278      (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
5279      !if(Is2Addr,
5280        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5281        !strconcat(asm,
5282                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5283      [(set VR128:$dst,
5284        (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), imm:$src3))]>,
5285                   Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
5286}
5287
// The VEX form passes Is2Addr = 0; the legacy form ties $src1 to $dst.
5288let Predicates = [HasAVX, NoBWI] in
5289  defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V, VEX_WIG;
5290let Constraints = "$src1 = $dst" in
5291  defm PINSRB  : SS41I_insert8<0x20, "pinsrb">;
5292
// SS41I_insert32 - Insert a 32-bit integer into a v4i32 VR128 at the index
// given by $src3, via insertelt. Is2Addr selects the asm string: when set,
// $src1 is omitted from the printed operands.
5293multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
  // Register form: the inserted value comes from a GR32 register.
5294  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5295      (ins VR128:$src1, GR32:$src2, u8imm:$src3),
5296      !if(Is2Addr,
5297        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5298        !strconcat(asm,
5299                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5300      [(set VR128:$dst,
5301        (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
5302      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  // Memory form: the inserted value is a loadi32 from $src2.
5303  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5304      (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
5305      !if(Is2Addr,
5306        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5307        !strconcat(asm,
5308                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5309      [(set VR128:$dst,
5310        (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), imm:$src3)))]>,
5311                   Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
5312}
5313
// The VEX form passes Is2Addr = 0 and is not marked VEX_WIG; the legacy form
// ties $src1 to $dst.
5314let Predicates = [HasAVX, NoDQI] in
5315  defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
5316let Constraints = "$src1 = $dst" in
5317  defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
5318
// SS41I_insert64 - Insert a 64-bit integer into a v2i64 VR128 at the index
// given by $src3, via insertelt. Is2Addr selects the asm string: when set,
// $src1 is omitted from the printed operands.
5319multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
  // Register form: the inserted value comes from a GR64 register.
5320  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5321      (ins VR128:$src1, GR64:$src2, u8imm:$src3),
5322      !if(Is2Addr,
5323        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5324        !strconcat(asm,
5325                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5326      [(set VR128:$dst,
5327        (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
5328      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  // Memory form: the inserted value is a loadi64 from $src2.
5329  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5330      (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
5331      !if(Is2Addr,
5332        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5333        !strconcat(asm,
5334                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5335      [(set VR128:$dst,
5336        (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), imm:$src3)))]>,
5337                   Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
5338}
5339
// Both forms use opcode 0x22; the VEX form adds VEX_W and the legacy form
// adds REX_W and ties $src1 to $dst.
5340let Predicates = [HasAVX, NoDQI] in
5341  defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
5342let Constraints = "$src1 = $dst" in
5343  defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
5344
5345// insertps has a few different modes, there's the first two here below which
5346// are optimized inserts that won't zero arbitrary elements in the destination
5347// vector. The next one matches the intrinsic and could zero arbitrary elements
5348// in the target vector.
//
// Is2Addr selects the asm string: when set, $src1 is omitted from the
// printed operands.
5349multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
  // Register form, marked isCommutable = 1; matches X86insertps with a timm
  // immediate.
5350  let isCommutable = 1 in
5351  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5352      (ins VR128:$src1, VR128:$src2, u8imm:$src3),
5353      !if(Is2Addr,
5354        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5355        !strconcat(asm,
5356                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5357      [(set VR128:$dst,
5358        (X86insertps VR128:$src1, VR128:$src2, timm:$src3))]>,
5359      Sched<[SchedWriteFShuffle.XMM]>;
  // Memory form: the second source is a loadf32 wrapped in scalar_to_vector
  // (v4f32).
5360  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5361      (ins VR128:$src1, f32mem:$src2, u8imm:$src3),
5362      !if(Is2Addr,
5363        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5364        !strconcat(asm,
5365                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5366      [(set VR128:$dst,
5367        (X86insertps VR128:$src1,
5368                   (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
5369                    timm:$src3))]>,
5370      Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
5371}
5372
// Both forms are placed in the SSEPackedSingle execution domain. The VEX
// form passes Is2Addr = 0; the legacy form ties $src1 to $dst.
5373let ExeDomain = SSEPackedSingle in {
5374  let Predicates = [UseAVX] in
5375    defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>,
5376                     VEX_4V, VEX_WIG;
5377  let Constraints = "$src1 = $dst" in
5378    defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>;
5379}
5380
5381//===----------------------------------------------------------------------===//
5382// SSE4.1 - Round Instructions
5383//===----------------------------------------------------------------------===//
5384
/// sse41_fp_unop_p - Packed unary FP operation taking an 8-bit immediate.
/// Defines a register-source record (r) and a memory-source record (m).
///   opc       - opcode byte forwarded to SS4AIi8.
///   OpcodeStr - mnemonic placed in front of the operand list.
///   x86memop  - memory operand class used by the m form.
///   RC        - register class of the destination and register source.
///   VT        - value type the selected node is typed as.
///   mem_frag  - load fragment matched for the m form.
///   OpNode    - SelectionDAG node both patterns select from.
///   sched     - scheduling class; the m form uses sched.Folded.
5385multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
5386                           X86MemOperand x86memop, RegisterClass RC,
5387                           ValueType VT, PatFrag mem_frag, SDNode OpNode,
5388                           X86FoldableSchedWrite sched> {
5389  // Both forms are marked as reading MXCSR and possibly raising FP exceptions.
5390  // Vector intrinsic operation, reg
5391let Uses = [MXCSR], mayRaiseFPException = 1 in {
5392  def r : SS4AIi8<opc, MRMSrcReg,
5393                  (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
5394                  !strconcat(OpcodeStr,
5395                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5396                  [(set RC:$dst, (VT (OpNode RC:$src1, timm:$src2)))]>,
5397                  Sched<[sched]>;
5398
5399  // Vector intrinsic operation, mem
5400  def m : SS4AIi8<opc, MRMSrcMem,
5401                  (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
5402                  !strconcat(OpcodeStr,
5403                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5404                  [(set RC:$dst,
5405                        (VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>,
5406                  Sched<[sched.Folded]>;
5407} // Uses = [MXCSR], mayRaiseFPException = 1
5408}
5409
/// avx_fp_unop_rm - Scalar FR32/FR64 records with three printed sources
/// ($src1, $src2, immediate $src3). All four records carry an empty pattern
/// list, hasSideEffects = 0 and isCodeGenOnly = 1.
///   opcss / opcsd - opcode bytes for the "ss" and "sd" records.
///   OpcodeStr     - mnemonic stem; "ss" or "sd" is appended.
///   sched         - scheduling class; memory forms use sched.Folded and
///                   sched.ReadAfterFold.
5410multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
5411                          string OpcodeStr, X86FoldableSchedWrite sched> {
5412let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
5413  def SSr : SS4AIi8<opcss, MRMSrcReg,
5414        (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
5415        !strconcat(OpcodeStr,
5416            "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5417      []>, Sched<[sched]>;
5418
  // No pattern is attached, so the load is declared explicitly.
5419  let mayLoad = 1 in
5420  def SSm : SS4AIi8<opcss, MRMSrcMem,
5421        (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
5422        !strconcat(OpcodeStr,
5423             "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5424        []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5425} // ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1
5426
5427let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
5428  def SDr : SS4AIi8<opcsd, MRMSrcReg,
5429        (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
5430        !strconcat(OpcodeStr,
5431              "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5432        []>, Sched<[sched]>;
5433
  // No pattern is attached, so the load is declared explicitly.
5434  let mayLoad = 1 in
5435  def SDm : SS4AIi8<opcsd, MRMSrcMem,
5436        (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
5437        !strconcat(OpcodeStr,
5438             "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5439        []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5440} // ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1
5441}
5442
/// sse41_fp_unop_s - Scalar FR32/FR64 records with a single source ($src1)
/// and an immediate ($src2). All four records carry an empty pattern list,
/// hasSideEffects = 0 and isCodeGenOnly = 1, and are marked as reading MXCSR
/// and possibly raising FP exceptions.
///   opcss / opcsd - opcode bytes for the "ss" and "sd" records.
///   OpcodeStr     - mnemonic stem; "ss" or "sd" is appended.
///   sched         - scheduling class; memory forms use sched.Folded and
///                   sched.ReadAfterFold.
5443multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
5444                           string OpcodeStr, X86FoldableSchedWrite sched> {
5445let Uses = [MXCSR], mayRaiseFPException = 1 in {
5446let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
5447  def SSr : SS4AIi8<opcss, MRMSrcReg,
5448                    (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
5449                    !strconcat(OpcodeStr,
5450                               "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5451                    []>, Sched<[sched]>;
5452
  // No pattern is attached, so the load is declared explicitly.
5453  let mayLoad = 1 in
5454  def SSm : SS4AIi8<opcss, MRMSrcMem,
5455                    (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
5456                    !strconcat(OpcodeStr,
5457                               "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5458                    []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5459} // ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1
5460
5461let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
5462  def SDr : SS4AIi8<opcsd, MRMSrcReg,
5463                    (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
5464                    !strconcat(OpcodeStr,
5465                               "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5466                    []>, Sched<[sched]>;
5467
  // No pattern is attached, so the load is declared explicitly.
5468  let mayLoad = 1 in
5469  def SDm : SS4AIi8<opcsd, MRMSrcMem,
5470                    (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
5471                    !strconcat(OpcodeStr,
5472                               "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5473                    []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5474} // ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1
5475} // Uses = [MXCSR], mayRaiseFPException = 1
5476}
5477
/// sse41_fp_binop_s - Scalar FP operation on whole VR128 registers with an
/// 8-bit immediate. Defines SSr_Int/SSm_Int and SDr_Int/SDm_Int, all selected
/// from OpNode and marked as reading MXCSR and possibly raising FP exceptions.
///   opcss / opcsd - opcode bytes for the "ss" and "sd" records.
///   OpcodeStr     - mnemonic stem; "ss" or "sd" is appended.
///   sched         - scheduling class; memory forms use sched.Folded and
///                   sched.ReadAfterFold.
///   VT32 / VT64   - value types the register-form results are typed as.
///   OpNode        - SelectionDAG node matched with ($src1, $src2, imm).
///   Is2Addr       - when set, the asm string omits $src1 (two-operand
///                   syntax); otherwise $src1 is printed separately.
5478multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
5479                            string OpcodeStr, X86FoldableSchedWrite sched,
5480                            ValueType VT32, ValueType VT64,
5481                            SDNode OpNode, bit Is2Addr = 1> {
5482let Uses = [MXCSR], mayRaiseFPException = 1 in {
5483let ExeDomain = SSEPackedSingle in {
5484  def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
5485        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
5486        !if(Is2Addr,
5487            !strconcat(OpcodeStr,
5488                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5489            !strconcat(OpcodeStr,
5490                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5491        [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
5492        Sched<[sched]>;
5493
  // Memory form: $src2 is an ssmem operand matched through sse_load_f32.
5494  def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
5495        (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
5496        !if(Is2Addr,
5497            !strconcat(OpcodeStr,
5498                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5499            !strconcat(OpcodeStr,
5500                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5501        [(set VR128:$dst,
5502             (OpNode VR128:$src1, sse_load_f32:$src2, timm:$src3))]>,
5503        Sched<[sched.Folded, sched.ReadAfterFold]>;
5504} // ExeDomain = SSEPackedSingle
5505
5506let ExeDomain = SSEPackedDouble in {
5507  def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
5508        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
5509        !if(Is2Addr,
5510            !strconcat(OpcodeStr,
5511                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5512            !strconcat(OpcodeStr,
5513                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5514        [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
5515        Sched<[sched]>;
5516
  // Memory form: $src2 is an sdmem operand matched through sse_load_f64.
5517  def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
5518        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
5519        !if(Is2Addr,
5520            !strconcat(OpcodeStr,
5521                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5522            !strconcat(OpcodeStr,
5523                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5524        [(set VR128:$dst,
5525              (OpNode VR128:$src1, sse_load_f64:$src2, timm:$src3))]>,
5526        Sched<[sched.Folded, sched.ReadAfterFold]>;
5527} // ExeDomain = SSEPackedDouble
5528} // Uses = [MXCSR], mayRaiseFPException = 1
5529}
5530
5531// FP round - roundss, roundps, roundsd, roundpd
// Packed VEX forms (opcodes 0x08 / 0x09), 128-bit and 256-bit (VEX_L),
// selected from X86any_VRndScale and gated on [HasAVX, NoVLX].
5532let Predicates = [HasAVX, NoVLX] in {
5533  let ExeDomain = SSEPackedSingle, Uses = [MXCSR], mayRaiseFPException = 1 in {
5534    // Intrinsic form
5535    defm VROUNDPS  : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32,
5536                                     loadv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>,
5537                                   VEX, VEX_WIG;
5538    defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32,
5539                                     loadv8f32, X86any_VRndScale, SchedWriteFRnd.YMM>,
5540                                   VEX, VEX_L, VEX_WIG;
5541  }
5542
5543  let ExeDomain = SSEPackedDouble, Uses = [MXCSR], mayRaiseFPException = 1 in {
5544    defm VROUNDPD  : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64,
5545                                     loadv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>,
5546                                   VEX, VEX_WIG;
5547    defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64,
5548                                     loadv4f64, X86any_VRndScale, SchedWriteFRnd.YMM>,
5549                                   VEX, VEX_L, VEX_WIG;
5550  }
5551}
// Scalar VEX forms (opcodes 0x0A / 0x0B). The VROUND prefix is used twice:
// sse41_fp_binop_s contributes the VR128 *_Int records selected from
// X86RndScales, and avx_fp_unop_rm contributes the pattern-less FR32/FR64
// records (SSr/SSm/SDr/SDm).
5552let Predicates = [UseAVX] in {
5553  defm VROUND  : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
5554                                  v4f32, v2f64, X86RndScales, 0>,
5555                                  VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC;
5556  defm VROUND  : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>,
5557                                VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC;
5558}
5559
// Select scalar X86any_VRndScale on FR32/FR64 to the three-operand
// VROUNDSSr/VROUNDSDr records; the first source operand is an IMPLICIT_DEF.
5560let Predicates = [UseAVX] in {
5561  def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
5562            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>;
5563  def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
5564            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>;
5565}
5566
// Load-folding variants of the patterns above; only enabled together with
// the OptForSize predicate.
5567let Predicates = [UseAVX, OptForSize] in {
5568  def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
5569            (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
5570  def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
5571            (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
5572}
5573
// Non-VEX packed round (opcodes 0x08 / 0x09). The memory forms use the memop
// fragments (memopv4f32 / memopv2f64) rather than the load fragments used by
// the VEX definitions.
5574let ExeDomain = SSEPackedSingle in
5575defm ROUNDPS  : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32,
5576                                memopv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>;
5577let ExeDomain = SSEPackedDouble in
5578defm ROUNDPD  : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64,
5579                                memopv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>;
5580
// Non-VEX scalar round, pattern-less FR32/FR64 records (ROUNDSSr, ...).
5581defm ROUND  : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>;
5582
// Non-VEX scalar round on VR128 (ROUNDSSr_Int, ...), $src1 tied to $dst.
5583let Constraints = "$src1 = $dst" in
5584defm ROUND  : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
5585                               v4f32, v2f64, X86RndScales>;
5586
// Select scalar X86any_VRndScale on FR32/FR64 to the single-source
// ROUNDSSr/ROUNDSDr records.
5587let Predicates = [UseSSE41] in {
5588  def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
5589            (ROUNDSSr FR32:$src1, timm:$src2)>;
5590  def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
5591            (ROUNDSDr FR64:$src1, timm:$src2)>;
5592}
5593
// Load-folding variants of the patterns above; only enabled together with
// the OptForSize predicate.
5594let Predicates = [UseSSE41, OptForSize] in {
5595  def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
5596            (ROUNDSSm addr:$src1, timm:$src2)>;
5597  def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
5598            (ROUNDSDm addr:$src1, timm:$src2)>;
5599}
5600
5601//===----------------------------------------------------------------------===//
5602// SSE4.1 - Packed Bit Test
5603//===----------------------------------------------------------------------===//
5604
5605// The ptest instruction: we lower to this in X86ISelLowering, primarily from
5606// the Intel intrinsic that corresponds to it.
// VEX forms (opcode 0x17), 128-bit and 256-bit (VEX_L). They have no register
// outputs; the only result is EFLAGS, set from X86ptest.
5607let Defs = [EFLAGS], Predicates = [HasAVX] in {
5608def VPTESTrr  : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
5609                "vptest\t{$src2, $src1|$src1, $src2}",
5610                [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
5611                Sched<[SchedWriteVecTest.XMM]>, VEX, VEX_WIG;
5612def VPTESTrm  : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
5613                "vptest\t{$src2, $src1|$src1, $src2}",
5614                [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
5615                Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>,
5616                VEX, VEX_WIG;
5617
5618def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
5619                "vptest\t{$src2, $src1|$src1, $src2}",
5620                [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
5621                Sched<[SchedWriteVecTest.YMM]>, VEX, VEX_L, VEX_WIG;
5622def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
5623                "vptest\t{$src2, $src1|$src1, $src2}",
5624                [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
5625                Sched<[SchedWriteVecTest.YMM.Folded, SchedWriteVecTest.YMM.ReadAfterFold]>,
5626                VEX, VEX_L, VEX_WIG;
5627}
5628
// Non-VEX ptest (opcode 0x17). No register outputs; EFLAGS is set from
// X86ptest. The memory form matches memopv2i64.
5629let Defs = [EFLAGS] in {
5630def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
5631              "ptest\t{$src2, $src1|$src1, $src2}",
5632              [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
5633              Sched<[SchedWriteVecTest.XMM]>;
5634def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
5635              "ptest\t{$src2, $src1|$src1, $src2}",
5636              [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
5637              Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>;
5638}
5639
5640// The bit test instructions below are AVX only
// avx_bittest defines a register (rr) and a memory (rm) record whose only
// result is EFLAGS, set from X86testp. Both records carry the VEX mixin.
// The multiclass does not set Defs itself; the instantiation site does.
//   opc       - opcode byte forwarded to SS48I.
//   OpcodeStr - mnemonic placed in front of the operand list.
//   RC        - register class of $src1 (and of $src2 in the rr form).
//   x86memop  - memory operand class of $src2 in the rm form.
//   mem_frag  - load fragment matched for the rm form.
//   vt        - value type $src2 is typed as in the rr pattern.
//   sched     - scheduling class; rm uses sched.Folded/sched.ReadAfterFold.
5641multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
5642                       X86MemOperand x86memop, PatFrag mem_frag, ValueType vt,
5643                       X86FoldableSchedWrite sched> {
5644  def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
5645            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
5646            [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
5647            Sched<[sched]>, VEX;
5648  def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
5649            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
5650            [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
5651            Sched<[sched.Folded, sched.ReadAfterFold]>, VEX;
5652}
5653
// vtestps (0x0E) and vtestpd (0x0F), 128-bit and 256-bit (VEX_L). EFLAGS is
// declared as an implicit def here because avx_bittest does not declare it.
5654let Defs = [EFLAGS], Predicates = [HasAVX] in {
5655let ExeDomain = SSEPackedSingle in {
5656defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32,
5657                            SchedWriteFTest.XMM>;
5658defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32,
5659                            SchedWriteFTest.YMM>, VEX_L;
5660}
5661let ExeDomain = SSEPackedDouble in {
5662defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64,
5663                            SchedWriteFTest.XMM>;
5664defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64,
5665                            SchedWriteFTest.YMM>, VEX_L;
5666}
5667}
5668
5669//===----------------------------------------------------------------------===//
5670// SSE4.1 - Misc Instructions
5671//===----------------------------------------------------------------------===//
5672
// popcnt (opcode 0xB8, XS mixin) for 16/32/64-bit GPRs, register and memory
// sources. Each record selects from ctpop and lists EFLAGS as an implicit
// result; all are gated on HasPOPCNT.
5673let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
  // 16-bit forms (OpSize16).
5674  def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
5675                     "popcnt{w}\t{$src, $dst|$dst, $src}",
5676                     [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
5677                     Sched<[WritePOPCNT]>, OpSize16, XS;
5678  def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
5679                     "popcnt{w}\t{$src, $dst|$dst, $src}",
5680                     [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
5681                      (implicit EFLAGS)]>,
5682                      Sched<[WritePOPCNT.Folded]>, OpSize16, XS;
5683
  // 32-bit forms (OpSize32).
5684  def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
5685                     "popcnt{l}\t{$src, $dst|$dst, $src}",
5686                     [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>,
5687                     Sched<[WritePOPCNT]>, OpSize32, XS;
5688
5689  def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
5690                     "popcnt{l}\t{$src, $dst|$dst, $src}",
5691                     [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
5692                      (implicit EFLAGS)]>,
5693                      Sched<[WritePOPCNT.Folded]>, OpSize32, XS;
5694
  // 64-bit forms use the RI class instead of I and carry no OpSize mixin.
5695  def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
5696                      "popcnt{q}\t{$src, $dst|$dst, $src}",
5697                      [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>,
5698                      Sched<[WritePOPCNT]>, XS;
5699  def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
5700                      "popcnt{q}\t{$src, $dst|$dst, $src}",
5701                      [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
5702                       (implicit EFLAGS)]>,
5703                       Sched<[WritePOPCNT.Folded]>, XS;
5704}
5705
5706// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
// Defines a register (rr) and a memory (rm) record on VR128.
//   opc       - opcode byte forwarded to SS48I.
//   OpcodeStr - mnemonic placed in front of the operand list.
//   OpNode    - SelectionDAG node both patterns select from.
//   ld_frag   - load fragment matched for the rm form.
//   Sched     - scheduling class; rm uses Sched.Folded.
5707multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
5708                                 SDNode OpNode, PatFrag ld_frag,
5709                                 X86FoldableSchedWrite Sched> {
5710  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
5711                 (ins VR128:$src),
5712                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5713                 [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>,
5714                 Sched<[Sched]>;
5715  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
5716                  (ins i128mem:$src),
5717                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5718                  [(set VR128:$dst,
5719                    (v8i16 (OpNode (ld_frag addr:$src))))]>,
5720                 Sched<[Sched.Folded]>;
5721}
5722
5723// PHMIN has the same profile as PSAD, thus we use the same scheduling
5724// model, although the naming is misleading.
// phminposuw (opcode 0x41), selected from X86phminpos. The VEX form is gated
// on HasAVX and folds `load`; the non-VEX form folds `memop`.
5725let Predicates = [HasAVX] in
5726defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw",
5727                                         X86phminpos, load,
5728                                         WritePHMINPOS>, VEX, VEX_WIG;
5729defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
5730                                         X86phminpos, memop,
5731                                         WritePHMINPOS>;
5732
5733/// SS48I_binop_rm - Simple SSE41 binary operator.
/// Defines a register (rr, marked isCommutable) and a memory (rm) record.
///   opc        - opcode byte forwarded to SS48I.
///   OpcodeStr  - mnemonic placed in front of the operand list.
///   OpNode     - SelectionDAG node both patterns select from.
///   OpVT       - value type the result is typed as.
///   RC         - register class of $dst, $src1 and the rr form's $src2.
///   memop_frag - load fragment matched for the rm form.
///   x86memop   - memory operand class of $src2 in the rm form.
///   sched      - scheduling class; rm uses sched.Folded/sched.ReadAfterFold.
///   Is2Addr    - when set, the asm string omits $src1 (two-operand syntax);
///                otherwise $src1 is printed separately.
5734multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
5735                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
5736                          X86MemOperand x86memop, X86FoldableSchedWrite sched,
5737                          bit Is2Addr = 1> {
5738  let isCommutable = 1 in
5739  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
5740       (ins RC:$src1, RC:$src2),
5741       !if(Is2Addr,
5742           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5743           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5744       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
5745       Sched<[sched]>;
5746  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
5747       (ins RC:$src1, x86memop:$src2),
5748       !if(Is2Addr,
5749           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5750           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5751       [(set RC:$dst,
5752         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
5753       Sched<[sched.Folded, sched.ReadAfterFold]>;
5754}
5755
// 128-bit VEX integer min/max and pmuldq, three-operand syntax (Is2Addr = 0).
// Dword-element forms and vpmuldq are gated on [HasAVX, NoVLX].
5756let Predicates = [HasAVX, NoVLX] in {
5757  defm VPMINSD   : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
5758                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5759                                  VEX_4V, VEX_WIG;
5760  defm VPMINUD   : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
5761                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5762                                  VEX_4V, VEX_WIG;
5763  defm VPMAXSD   : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
5764                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5765                                  VEX_4V, VEX_WIG;
5766  defm VPMAXUD   : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
5767                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5768                                  VEX_4V, VEX_WIG;
5769  defm VPMULDQ   : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128,
5770                                  load, i128mem, SchedWriteVecIMul.XMM, 0>,
5771                                  VEX_4V, VEX_WIG;
5772}
// Byte- and word-element forms are gated on [HasAVX, NoVLX_Or_NoBWI].
5773let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5774  defm VPMINSB   : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
5775                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5776                                  VEX_4V, VEX_WIG;
5777  defm VPMINUW   : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
5778                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5779                                  VEX_4V, VEX_WIG;
5780  defm VPMAXSB   : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
5781                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5782                                  VEX_4V, VEX_WIG;
5783  defm VPMAXUW   : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
5784                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5785                                  VEX_4V, VEX_WIG;
5786}
5787
// 256-bit (VEX_L, VR256) integer min/max and pmuldq, three-operand syntax.
// Dword-element forms and vpmuldq are gated on [HasAVX2, NoVLX].
5788let Predicates = [HasAVX2, NoVLX] in {
5789  defm VPMINSDY  : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
5790                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5791                                  VEX_4V, VEX_L, VEX_WIG;
5792  defm VPMINUDY  : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
5793                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5794                                  VEX_4V, VEX_L, VEX_WIG;
5795  defm VPMAXSDY  : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
5796                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5797                                  VEX_4V, VEX_L, VEX_WIG;
5798  defm VPMAXUDY  : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
5799                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5800                                  VEX_4V, VEX_L, VEX_WIG;
5801  defm VPMULDQY  : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256,
5802                                  load, i256mem, SchedWriteVecIMul.YMM, 0>,
5803                                  VEX_4V, VEX_L, VEX_WIG;
5804}
// Byte- and word-element forms are gated on [HasAVX2, NoVLX_Or_NoBWI].
5805let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
5806  defm VPMINSBY  : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
5807                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5808                                  VEX_4V, VEX_L, VEX_WIG;
5809  defm VPMINUWY  : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
5810                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5811                                  VEX_4V, VEX_L, VEX_WIG;
5812  defm VPMAXSBY  : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
5813                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5814                                  VEX_4V, VEX_L, VEX_WIG;
5815  defm VPMAXUWY  : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
5816                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5817                                  VEX_4V, VEX_L, VEX_WIG;
5818}
5819
// Non-VEX integer min/max and pmuldq: two-operand syntax (Is2Addr = 1) with
// $src1 tied to $dst; memory forms fold `memop`.
5820let Constraints = "$src1 = $dst" in {
5821  defm PMINSB   : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
5822                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5823  defm PMINSD   : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
5824                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5825  defm PMINUD   : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
5826                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5827  defm PMINUW   : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
5828                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5829  defm PMAXSB   : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
5830                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5831  defm PMAXSD   : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
5832                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5833  defm PMAXUD   : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
5834                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5835  defm PMAXUW   : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
5836                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5837  defm PMULDQ   : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128,
5838                                 memop, i128mem, SchedWriteVecIMul.XMM, 1>;
5839}
5840
// VEX vpmulld (0x40, selected from mul) and vpcmpeqq (0x29, selected from
// X86pcmpeq), 128-bit then 256-bit. Note the differing predicates: vpmulld
// additionally requires NoVLX, vpcmpeqq does not.
5841let Predicates = [HasAVX, NoVLX] in
5842  defm VPMULLD  : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
5843                                 load, i128mem, SchedWritePMULLD.XMM, 0>,
5844                                 VEX_4V, VEX_WIG;
5845let Predicates = [HasAVX] in
5846  defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
5847                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
5848                                 VEX_4V, VEX_WIG;
5849
5850let Predicates = [HasAVX2, NoVLX] in
5851  defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
5852                                  load, i256mem, SchedWritePMULLD.YMM, 0>,
5853                                  VEX_4V, VEX_L, VEX_WIG;
5854let Predicates = [HasAVX2] in
5855  defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
5856                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5857                                  VEX_4V, VEX_L, VEX_WIG;
5858
// Non-VEX pmulld and pcmpeqq: two-operand syntax with $src1 tied to $dst;
// memory forms fold `memop`.
5859let Constraints = "$src1 = $dst" in {
5860  defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
5861                                memop, i128mem, SchedWritePMULLD.XMM, 1>;
5862  defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
5863                                memop, i128mem, SchedWriteVecALU.XMM, 1>;
5864}
5865
5866/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
/// Intrinsic-based variant: both patterns select directly from IntId.
/// Defines rri (register source, marked isCommutable) and rmi (memory source).
///   opc        - opcode byte forwarded to SS4AIi8.
///   OpcodeStr  - mnemonic placed in front of the operand list.
///   IntId      - intrinsic matched with ($src1, $src2, imm).
///   RC         - register class of $dst, $src1 and the rri form's $src2.
///   memop_frag - load fragment matched for the rmi form.
///   x86memop   - memory operand class of $src2 in the rmi form.
///   Is2Addr    - when set, the asm string omits $src1 (two-operand syntax);
///                otherwise $src1 is printed separately. No default value.
///   sched      - scheduling class; rmi uses sched.Folded/sched.ReadAfterFold.
5867multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
5868                 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
5869                 X86MemOperand x86memop, bit Is2Addr,
5870                 X86FoldableSchedWrite sched> {
5871  let isCommutable = 1 in
5872  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
5873        (ins RC:$src1, RC:$src2, u8imm:$src3),
5874        !if(Is2Addr,
5875            !strconcat(OpcodeStr,
5876                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5877            !strconcat(OpcodeStr,
5878                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5879        [(set RC:$dst, (IntId RC:$src1, RC:$src2, timm:$src3))]>,
5880        Sched<[sched]>;
5881  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
5882        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
5883        !if(Is2Addr,
5884            !strconcat(OpcodeStr,
5885                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5886            !strconcat(OpcodeStr,
5887                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5888        [(set RC:$dst,
5889          (IntId RC:$src1, (memop_frag addr:$src2), timm:$src3))]>,
5890        Sched<[sched.Folded, sched.ReadAfterFold]>;
5891}
5892
5893/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
/// SDNode-based variant: both patterns select from OpNode and type the
/// result as OpVT. Defines rri (register source, marked isCommutable) and
/// rmi (memory source).
///   opc        - opcode byte forwarded to SS4AIi8.
///   OpcodeStr  - mnemonic placed in front of the operand list.
///   OpNode     - SelectionDAG node matched with ($src1, $src2, imm).
///   OpVT       - value type the result is typed as.
///   RC         - register class of $dst, $src1 and the rri form's $src2.
///   memop_frag - load fragment matched for the rmi form.
///   x86memop   - memory operand class of $src2 in the rmi form.
///   Is2Addr    - when set, the asm string omits $src1 (two-operand syntax);
///                otherwise $src1 is printed separately. No default value.
///   sched      - scheduling class; rmi uses sched.Folded/sched.ReadAfterFold.
5894multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
5895                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
5896                           X86MemOperand x86memop, bit Is2Addr,
5897                           X86FoldableSchedWrite sched> {
5898  let isCommutable = 1 in
5899  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
5900        (ins RC:$src1, RC:$src2, u8imm:$src3),
5901        !if(Is2Addr,
5902            !strconcat(OpcodeStr,
5903                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5904            !strconcat(OpcodeStr,
5905                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5906        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
5907        Sched<[sched]>;
5908  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
5909        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
5910        !if(Is2Addr,
5911            !strconcat(OpcodeStr,
5912                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5913            !strconcat(OpcodeStr,
5914                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5915        [(set RC:$dst,
5916          (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
5917        Sched<[sched.Folded, sched.ReadAfterFold]>;
5918}
5919
// Keep the low two bits of the immediate and return them inverted.
def BlendCommuteImm2 : SDNodeXForm<timm, [{
  const uint8_t LaneBits = 0x03;
  uint8_t Kept = N->getZExtValue() & LaneBits;
  return getI8Imm(Kept ^ LaneBits, SDLoc(N));
}]>;
5924
// Keep the low four bits of the immediate and return them inverted.
def BlendCommuteImm4 : SDNodeXForm<timm, [{
  const uint8_t LaneBits = 0x0f;
  uint8_t Kept = N->getZExtValue() & LaneBits;
  return getI8Imm(Kept ^ LaneBits, SDLoc(N));
}]>;
5929
// Keep the low eight bits of the immediate and return them inverted.
def BlendCommuteImm8 : SDNodeXForm<timm, [{
  const uint8_t LaneBits = 0xff;
  uint8_t Kept = N->getZExtValue() & LaneBits;
  return getI8Imm(Kept ^ LaneBits, SDLoc(N));
}]>;
5934
// Widen a 4-bit blendi immediate to the 8-bit form used with pblendw: each
// set source bit i becomes the bit pair (2*i, 2*i+1) in the result.
def BlendScaleImm4 : SDNodeXForm<timm, [{
  uint8_t Narrow = N->getZExtValue();
  uint8_t Wide = 0;
  for (unsigned Bit = 0; Bit < 4; ++Bit)
    if ((Narrow >> Bit) & 1)
      Wide |= 0x3 << (2 * Bit);
  return getI8Imm(Wide, SDLoc(N));
}]>;
5945
// Widen a 2-bit blendi immediate to the 8-bit form used with pblendw: each
// set source bit i becomes the nibble at bits 4*i .. 4*i+3 in the result.
def BlendScaleImm2 : SDNodeXForm<timm, [{
  uint8_t Narrow = N->getZExtValue();
  uint8_t Wide = 0;
  for (unsigned Bit = 0; Bit < 2; ++Bit)
    if ((Narrow >> Bit) & 1)
      Wide |= 0xf << (4 * Bit);
  return getI8Imm(Wide, SDLoc(N));
}]>;
5956
// Widen a 2-bit blendi immediate to the 4-bit form used with pblendd: each
// set source bit i becomes the bit pair (2*i, 2*i+1) in the result.
def BlendScaleImm2to4 : SDNodeXForm<timm, [{
  uint8_t Narrow = N->getZExtValue();
  uint8_t Wide = 0;
  for (unsigned Bit = 0; Bit < 2; ++Bit)
    if ((Narrow >> Bit) & 1)
      Wide |= 0x3 << (2 * Bit);
  return getI8Imm(Wide, SDLoc(N));
}]>;
5967
// Widen a 4-bit blendi immediate to the 8-bit form used with pblendw (each
// set source bit i becomes the bit pair 2*i, 2*i+1), then invert all 8 bits.
def BlendScaleCommuteImm4 : SDNodeXForm<timm, [{
  uint8_t Narrow = N->getZExtValue();
  uint8_t Wide = 0;
  for (unsigned Bit = 0; Bit < 4; ++Bit)
    if ((Narrow >> Bit) & 1)
      Wide |= 0x3 << (2 * Bit);
  return getI8Imm(Wide ^ 0xff, SDLoc(N));
}]>;
5978
// Widen a 2-bit blendi immediate to the 8-bit form used with pblendw (each
// set source bit i becomes the nibble at bits 4*i .. 4*i+3), then invert all
// 8 bits.
def BlendScaleCommuteImm2 : SDNodeXForm<timm, [{
  uint8_t Narrow = N->getZExtValue();
  uint8_t Wide = 0;
  for (unsigned Bit = 0; Bit < 2; ++Bit)
    if ((Narrow >> Bit) & 1)
      Wide |= 0xf << (4 * Bit);
  return getI8Imm(Wide ^ 0xff, SDLoc(N));
}]>;
5989
// Widen a 2-bit blendi immediate to the 4-bit form used with pblendd (each
// set source bit i becomes the bit pair 2*i, 2*i+1), then invert the low
// 4 bits.
def BlendScaleCommuteImm2to4 : SDNodeXForm<timm, [{
  uint8_t Narrow = N->getZExtValue();
  uint8_t Wide = 0;
  for (unsigned Bit = 0; Bit < 2; ++Bit)
    if ((Narrow >> Bit) & 1)
      Wide |= 0x3 << (2 * Bit);
  return getI8Imm(Wide ^ 0xf, SDLoc(N));
}]>;
6000
// 128-bit VEX vmpsadbw / vdpps / vdppd and 256-bit vdpps, all intrinsic-based
// and using three-operand syntax (Is2Addr = 0).
6001let Predicates = [HasAVX] in {
  // The enclosing let sets isCommutable = 0 on the vmpsadbw records, whereas
  // SS41I_binop_rmi_int marks its rri record isCommutable = 1.
6002  let isCommutable = 0 in {
6003    defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
6004                                        VR128, load, i128mem, 0,
6005                                        SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG;
6006  }
6007
// The dot-product forms are marked as reading MXCSR and possibly raising FP
// exceptions.
6008let Uses = [MXCSR], mayRaiseFPException = 1 in {
6009  let ExeDomain = SSEPackedSingle in
6010  defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
6011                                   VR128, load, f128mem, 0,
6012                                   SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG;
6013  let ExeDomain = SSEPackedDouble in
6014  defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
6015                                   VR128, load, f128mem, 0,
6016                                   SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG;
6017  let ExeDomain = SSEPackedSingle in
6018  defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
6019                                    VR256, load, i256mem, 0,
6020                                    SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG;
6021} // Uses = [MXCSR], mayRaiseFPException = 1
6022}
6023
// 256-bit VEX-encoded mpsadbw; requires AVX2 and is not commutable.
6024let Predicates = [HasAVX2] in {
6025  let isCommutable = 0 in {
6026  defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
6027                                  VR256, load, i256mem, 0,
6028                                  SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, VEX_WIG;
6029  }
6030}
6031
// Legacy (non-VEX) mpsadbw/dpps/dppd. $src1 is tied to $dst, and memory
// operands are matched with memop rather than load.
6032let Constraints = "$src1 = $dst" in {
6033  let isCommutable = 0 in {
6034  defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
6035                                     VR128, memop, i128mem, 1,
6036                                     SchedWriteMPSAD.XMM>;
6037  }
6038
  // The dot products carry SIMD_EXC (defined outside this chunk), whereas the
  // VEX forms set Uses = [MXCSR] / mayRaiseFPException explicitly.
6039  let ExeDomain = SSEPackedSingle in
6040  defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
6041                                  VR128, memop, f128mem, 1,
6042                                  SchedWriteDPPS.XMM>, SIMD_EXC;
6043  let ExeDomain = SSEPackedDouble in
6044  defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
6045                                  VR128, memop, f128mem, 1,
6046                                  SchedWriteDPPD.XMM>, SIMD_EXC;
6047}
6048
6049/// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate
///
/// Defines a register-register form (rri), a register-memory form (rmi) and an
/// anonymous pattern that folds a load appearing as the first blend source.
///   opc          - opcode byte passed to SS4AIi8.
///   OpcodeStr    - mnemonic used to build the assembly string.
///   OpNode       - blend node matched by the patterns.
///   OpVT         - value type of the blend result.
///   RC           - register class of the destination and register sources.
///   memop_frag   - load fragment matched for the folded memory operand.
///   x86memop     - memory operand type of the rmi form.
///   Is2Addr      - when set, ties $src1 to $dst and uses the two-operand
///                  assembly string.
///   d            - execution domain applied to rri and rmi.
///   sched        - scheduling class; rmi uses its Folded/ReadAfterFold parts.
///   commuteXForm - immediate transform applied when the sources are swapped.
6050multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
6051                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
6052                           X86MemOperand x86memop, bit Is2Addr, Domain d,
6053                           X86FoldableSchedWrite sched, SDNodeXForm commuteXForm> {
6054let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
  // Only the register-register form is marked commutable.
6055  let isCommutable = 1 in
6056  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
6057        (ins RC:$src1, RC:$src2, u8imm:$src3),
6058        !if(Is2Addr,
6059            !strconcat(OpcodeStr,
6060                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6061            !strconcat(OpcodeStr,
6062                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6063        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
6064        Sched<[sched]>;
6065  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
6066        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
6067        !if(Is2Addr,
6068            !strconcat(OpcodeStr,
6069                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6070            !strconcat(OpcodeStr,
6071                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6072        [(set RC:$dst,
6073          (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
6074        Sched<[sched.Folded, sched.ReadAfterFold]>;
6075}
6076
6077  // Pattern to commute if load is in first source.
  // The register becomes $src1 of rmi and the immediate is rewritten with
  // commuteXForm to account for the swapped sources.
6078  def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, timm:$src3)),
6079            (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
6080                                            (commuteXForm timm:$src3))>;
6081}
6082
// VEX-encoded immediate blends. Is2Addr is 0, so these use the three-operand
// form with no tied source. Each instantiation passes the BlendCommuteImm<N>
// transform (defined outside this chunk) whose N equals the element count of
// its value type.
6083let Predicates = [HasAVX] in {
6084  defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
6085                                  VR128, load, f128mem, 0, SSEPackedSingle,
6086                                  SchedWriteFBlend.XMM, BlendCommuteImm4>,
6087                                  VEX_4V, VEX_WIG;
6088  defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
6089                                   VR256, load, f256mem, 0, SSEPackedSingle,
6090                                   SchedWriteFBlend.YMM, BlendCommuteImm8>,
6091                                   VEX_4V, VEX_L, VEX_WIG;
6092  defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
6093                                  VR128, load, f128mem, 0, SSEPackedDouble,
6094                                  SchedWriteFBlend.XMM, BlendCommuteImm2>,
6095                                  VEX_4V, VEX_WIG;
6096  defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
6097                                   VR256, load, f256mem, 0, SSEPackedDouble,
6098                                   SchedWriteFBlend.YMM, BlendCommuteImm4>,
6099                                   VEX_4V, VEX_L, VEX_WIG;
6100  defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
6101                                  VR128, load, i128mem, 0, SSEPackedInt,
6102                                  SchedWriteBlend.XMM, BlendCommuteImm8>,
6103                                  VEX_4V, VEX_WIG;
6104}
6105
// 256-bit vpblendw on v16i16; requires AVX2. It still passes the 8-bit
// BlendCommuteImm8 transform, like the 128-bit v8i16 form.
6106let Predicates = [HasAVX2] in {
6107  defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
6108                                   VR256, load, i256mem, 0, SSEPackedInt,
6109                                   SchedWriteBlend.YMM, BlendCommuteImm8>,
6110                                   VEX_4V, VEX_L, VEX_WIG;
6111}
6112
6113// Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw.
6114// ExecutionDomainFixPass will cleanup domains later on.
// Each group has three patterns: register-register, load as second source,
// and load as first source (sources swapped, immediate commuted).
6115let Predicates = [HasAVX1Only] in {
6116def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
6117          (VBLENDPDYrri VR256:$src1, VR256:$src2, timm:$src3)>;
6118def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
6119          (VBLENDPDYrmi VR256:$src1, addr:$src2, timm:$src3)>;
6120def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
6121          (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 timm:$src3))>;
6122
6123// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6124// it from becoming movsd via commuting under optsize.
// The 2-bit immediate is widened to pblendw's 8-bit immediate.
6125def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
6126          (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
6127def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
6128          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
6129def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
6130          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;
6131
6132def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), timm:$src3),
6133          (VBLENDPSYrri VR256:$src1, VR256:$src2, timm:$src3)>;
6134def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), timm:$src3),
6135          (VBLENDPSYrmi VR256:$src1, addr:$src2, timm:$src3)>;
6136def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, timm:$src3),
6137          (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 timm:$src3))>;
6138
6139// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6140// it from becoming movss via commuting under optsize.
// The 4-bit immediate is widened to pblendw's 8-bit immediate.
6141def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
6142          (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
6143def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), timm:$src3),
6144          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
6145def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, timm:$src3),
6146          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
6147}
6148
// Legacy (non-VEX) immediate blends. Is2Addr is 1, so $src1 is tied to $dst
// and the two-operand assembly string is used; memory operands use memop.
6149defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
6150                               VR128, memop, f128mem, 1, SSEPackedSingle,
6151                               SchedWriteFBlend.XMM, BlendCommuteImm4>;
6152defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64,
6153                               VR128, memop, f128mem, 1, SSEPackedDouble,
6154                               SchedWriteFBlend.XMM, BlendCommuteImm2>;
6155defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
6156                               VR128, memop, i128mem, 1, SSEPackedInt,
6157                               SchedWriteBlend.XMM, BlendCommuteImm8>;
6158
6159let Predicates = [UseSSE41] in {
6160// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6161// it from becoming movsd/movss via commuting under optsize.
// v2i64 blends widen the 2-bit immediate; v4i32 blends widen the 4-bit one.
6162def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
6163          (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
6164def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), timm:$src3),
6165          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
6166def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, timm:$src3),
6167          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;
6168
6169def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
6170          (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
6171def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), timm:$src3),
6172          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
6173def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, timm:$src3),
6174          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
6175}
6176
6177// For insertion into the zero index (low half) of a 256-bit vector, it is
6178// more efficient to generate a blend with immediate instead of an insert*128.
// The 128-bit value is first placed in the low part of an otherwise undefined
// 256-bit register via INSERT_SUBREG.
6179let Predicates = [HasAVX] in {
// Register forms: the widened subvector is the second blend source and the
// immediate (0x3 / 0xf) covers the low half of the elements.
6180def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)),
6181          (VBLENDPDYrri VR256:$src1,
6182                        (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
6183                                       VR128:$src2, sub_xmm), 0x3)>;
6184def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
6185          (VBLENDPSYrri VR256:$src1,
6186                        (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
6187                                       VR128:$src2, sub_xmm), 0xf)>;
6188
// Load forms: the 256-bit vector is the folded memory (second) source, so the
// immediate (0xc / 0xf0) covers the high half of the elements instead.
6189def : Pat<(insert_subvector (loadv4f64 addr:$src2), (v2f64 VR128:$src1), (iPTR 0)),
6190          (VBLENDPDYrmi (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
6191                                       VR128:$src1, sub_xmm), addr:$src2, 0xc)>;
6192def : Pat<(insert_subvector (loadv8f32 addr:$src2), (v4f32 VR128:$src1), (iPTR 0)),
6193          (VBLENDPSYrmi (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
6194                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
6195}
6196
6197/// SS41I_quaternary_avx - AVX SSE 4.1 with 4 operands
///
/// Defines rr and rm forms taking three sources ($src1, $src2, $src3) plus the
/// destination. The selection patterns pass the sources to OpNode in reverse
/// order: (OpNode $src3, $src2, $src1).
///   VT       - value type of the result.
///   mem_frag - load fragment matched for the folded $src2 in the rm form.
///   sched    - scheduling class; rm uses its Folded/ReadAfterFold parts.
6198multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC,
6199                                X86MemOperand x86memop, ValueType VT,
6200                                PatFrag mem_frag, SDNode OpNode,
6201                                X86FoldableSchedWrite sched> {
6202  def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
6203                  (ins RC:$src1, RC:$src2, RC:$src3),
6204                  !strconcat(OpcodeStr,
6205                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
6206                  [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))],
6207                  SSEPackedInt>, TAPD, VEX_4V,
6208                Sched<[sched]>;
6209
6210  def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
6211                  (ins RC:$src1, x86memop:$src2, RC:$src3),
6212                  !strconcat(OpcodeStr,
6213                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
6214                  [(set RC:$dst,
6215                        (OpNode RC:$src3, (mem_frag addr:$src2),
6216                                RC:$src1))], SSEPackedInt>, TAPD, VEX_4V,
6217                Sched<[sched.Folded, sched.ReadAfterFold,
6218                       // x86memop:$src2
6219                       ReadDefault, ReadDefault, ReadDefault, ReadDefault,
6220                       ReadDefault,
6221                       // RC::$src3
6222                       sched.ReadAfterFold]>;
6223}
6224
// AVX variable blends. The multiclass defaults the domain to SSEPackedInt;
// the surrounding ExeDomain lets set the FP domain for the pd/ps forms.
6225let Predicates = [HasAVX] in {
6226let ExeDomain = SSEPackedDouble in {
6227defm VBLENDVPD  : SS41I_quaternary_avx<0x4B, "vblendvpd", VR128, f128mem,
6228                                       v2f64, loadv2f64, X86Blendv,
6229                                       SchedWriteFVarBlend.XMM>;
6230defm VBLENDVPDY : SS41I_quaternary_avx<0x4B, "vblendvpd", VR256, f256mem,
6231                                       v4f64, loadv4f64, X86Blendv,
6232                                       SchedWriteFVarBlend.YMM>, VEX_L;
6233} // ExeDomain = SSEPackedDouble
6234let ExeDomain = SSEPackedSingle in {
6235defm VBLENDVPS  : SS41I_quaternary_avx<0x4A, "vblendvps", VR128, f128mem,
6236                                       v4f32, loadv4f32, X86Blendv,
6237                                       SchedWriteFVarBlend.XMM>;
6238defm VBLENDVPSY : SS41I_quaternary_avx<0x4A, "vblendvps", VR256, f256mem,
6239                                       v8f32, loadv8f32, X86Blendv,
6240                                       SchedWriteFVarBlend.YMM>, VEX_L;
6241} // ExeDomain = SSEPackedSingle
6242defm VPBLENDVB  : SS41I_quaternary_avx<0x4C, "vpblendvb", VR128, i128mem,
6243                                       v16i8, loadv16i8, X86Blendv,
6244                                       SchedWriteVarBlend.XMM>;
6245}
6246
// 256-bit byte variable blend on v32i8; requires AVX2.
6247let Predicates = [HasAVX2] in {
6248defm VPBLENDVBY : SS41I_quaternary_avx<0x4C, "vpblendvb", VR256, i256mem,
6249                                       v32i8, loadv32i8, X86Blendv,
6250                                       SchedWriteVarBlend.YMM>, VEX_L;
6251}
6252
// Select vXi32/vXi64 variable blends to the floating-point vblendvps/vblendvpd
// instructions. The instruction operands are listed in the reverse of the
// X86Blendv node order (mask, src1, src2).
6253let Predicates = [HasAVX] in {
6254  def : Pat<(v4i32 (X86Blendv (v4i32 VR128:$mask), (v4i32 VR128:$src1),
6255                              (v4i32 VR128:$src2))),
6256            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
6257  def : Pat<(v2i64 (X86Blendv (v2i64 VR128:$mask), (v2i64 VR128:$src1),
6258                              (v2i64 VR128:$src2))),
6259            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
6260  def : Pat<(v8i32 (X86Blendv (v8i32 VR256:$mask), (v8i32 VR256:$src1),
6261                              (v8i32 VR256:$src2))),
6262            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
6263  def : Pat<(v4i64 (X86Blendv (v4i64 VR256:$mask), (v4i64 VR256:$src1),
6264                              (v4i64 VR256:$src2))),
6265            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
6266}
6267
6268// Prefer a movss or movsd over a blendps when optimizing for size. These
6269// patterns select blends because blends have better throughput on Sandy Bridge
6270// and Haswell, but movs[s/d] are 1-2 bytes shorter, so they are limited to
// OptForSpeed.
6271let Predicates = [HasAVX, OptForSpeed] in {
  // Zero the upper elements by blending the low element into an all-zero
  // vector. The integer form uses vpblendw, where immediate 3 covers the two
  // low 16-bit elements.
6272  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
6273            (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
6274  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
6275            (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
6276
  // When the load is the first Movss operand the sources are swapped and the
  // immediate becomes 0xe, the 4-bit complement of 1.
6277  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
6278            (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
6279  def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
6280            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
6281  def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
6282            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
6283
  // Same scheme for Movsd; the swapped immediate is 2, the 2-bit complement
  // of 1.
6284  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
6285            (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
6286  def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
6287            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
6288  def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
6289            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
6290
6291  // Move low f32 and clear high bits.
  // The blend is done on the low 128 bits; SUBREG_TO_REG then widens the
  // result back to a 256-bit value.
6292  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
6293            (SUBREG_TO_REG (i32 0),
6294             (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
6295                          (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
6296                          (i8 1))), sub_xmm)>;
6297  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
6298            (SUBREG_TO_REG (i32 0),
6299             (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
6300                          (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
6301                          (i8 3))), sub_xmm)>;
6302}
6303
6304// Prefer a movss or movsd over a blendps when optimizing for size. These
6305// patterns select blends because blends have better throughput on Sandy Bridge
6306// and Haswell, but movs[s/d] are 1-2 bytes shorter, so they are limited to
// OptForSpeed.
6307let Predicates = [UseSSE41, OptForSpeed] in {
6308  // With SSE41 we can use blends for these patterns.
6309  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
6310            (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
6311  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
6312            (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
6313
  // When the load is the first Movss operand the sources are swapped and the
  // immediate becomes 0xe, the 4-bit complement of 1.
6314  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
6315            (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
6316  def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))),
6317            (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
6318  def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
6319            (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
6320
  // Same scheme for Movsd; the swapped immediate is 2, the 2-bit complement
  // of 1.
6321  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
6322            (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
6323  def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))),
6324            (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
6325  def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
6326            (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
6327}
6328
6329
6330/// SS41I_ternary - SSE 4.1 ternary operator
///
/// Defines rr0 and rm0 forms whose third operand is the implicit XMM0 register
/// (listed in Uses, and written out in the assembly string). $src1 is tied to
/// $dst. The patterns pass (XMM0, $src2, $src1) to OpNode.
///   VT       - value type of the result.
///   mem_frag - load fragment matched for the folded $src2 in rm0.
///   sched    - scheduling class; rm0 uses its Folded/ReadAfterFold parts.
6331let Uses = [XMM0], Constraints = "$src1 = $dst" in {
6332  multiclass SS41I_ternary<bits<8> opc, string OpcodeStr, ValueType VT,
6333                           PatFrag mem_frag, X86MemOperand x86memop,
6334                           SDNode OpNode, X86FoldableSchedWrite sched> {
6335    def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
6336                    (ins VR128:$src1, VR128:$src2),
6337                    !strconcat(OpcodeStr,
6338                     "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6339                    [(set VR128:$dst,
6340                      (VT (OpNode XMM0, VR128:$src2, VR128:$src1)))]>,
6341                    Sched<[sched]>;
6342
6343    def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
6344                    (ins VR128:$src1, x86memop:$src2),
6345                    !strconcat(OpcodeStr,
6346                     "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6347                    [(set VR128:$dst,
6348                      (OpNode XMM0, (mem_frag addr:$src2), VR128:$src1))]>,
6349                    Sched<[sched.Folded, sched.ReadAfterFold]>;
6350  }
6351}
6352
// Legacy (non-VEX) variable blends with the implicit XMM0 operand. Only the
// pd/ps forms set an execution domain here; PBLENDVB sets none.
6353let ExeDomain = SSEPackedDouble in
6354defm BLENDVPD : SS41I_ternary<0x15, "blendvpd", v2f64, memopv2f64, f128mem,
6355                              X86Blendv, SchedWriteFVarBlend.XMM>;
6356let ExeDomain = SSEPackedSingle in
6357defm BLENDVPS : SS41I_ternary<0x14, "blendvps", v4f32, memopv4f32, f128mem,
6358                              X86Blendv, SchedWriteFVarBlend.XMM>;
6359defm PBLENDVB : SS41I_ternary<0x10, "pblendvb", v16i8, memopv16i8, i128mem,
6360                              X86Blendv, SchedWriteVarBlend.XMM>;
6361
6362// Aliases with the implicit xmm0 argument
// These accept the two-operand spelling, with xmm0 omitted, for both the
// register and memory forms of each instruction.
// NOTE(review): the trailing 0 is presumably the InstAlias emit flag, making
// these parse-only aliases -- confirm against the InstAlias class.
6363def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
6364                (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>;
6365def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
6366                (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>;
6367def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
6368                (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>;
6369def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
6370                (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>;
6371def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
6372                (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>;
6373def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
6374                (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>;
6375
// Select v4i32/v2i64 variable blends whose mask is XMM0 to the floating-point
// blendvps/blendvpd. The two value operands are passed in swapped order,
// matching the (XMM0, $src2, $src1) order used by the rr0 pattern.
6376let Predicates = [UseSSE41] in {
6377  def : Pat<(v4i32 (X86Blendv (v4i32 XMM0), (v4i32 VR128:$src1),
6378                              (v4i32 VR128:$src2))),
6379            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
6380  def : Pat<(v2i64 (X86Blendv (v2i64 XMM0), (v2i64 VR128:$src1),
6381                              (v2i64 VR128:$src2))),
6382            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
6383}
6384
// Non-temporal aligned vector loads ((v)movntdqa). The instruction records
// carry an empty pattern list; selection is done by the Pat records below,
// which map alignednontemporalload of every 128/256-bit vector type to the
// same instruction.
6385let AddedComplexity = 400 in { // Prefer non-temporal versions
6386
6387let Predicates = [HasAVX, NoVLX] in
6388def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
6389                        "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
6390                        Sched<[SchedWriteVecMoveLSNT.XMM.RM]>, VEX, VEX_WIG;
6391let Predicates = [HasAVX2, NoVLX] in
6392def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
6393                         "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
6394                         Sched<[SchedWriteVecMoveLSNT.YMM.RM]>, VEX, VEX_L, VEX_WIG;
// The legacy form has no Predicates attached to the record itself; its
// selection patterns below are guarded by UseSSE41.
6395def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
6396                       "movntdqa\t{$src, $dst|$dst, $src}", []>,
6397                       Sched<[SchedWriteVecMoveLSNT.XMM.RM]>;
6398
// 256-bit non-temporal loads.
6399let Predicates = [HasAVX2, NoVLX] in {
6400  def : Pat<(v8f32 (alignednontemporalload addr:$src)),
6401            (VMOVNTDQAYrm addr:$src)>;
6402  def : Pat<(v4f64 (alignednontemporalload addr:$src)),
6403            (VMOVNTDQAYrm addr:$src)>;
6404  def : Pat<(v4i64 (alignednontemporalload addr:$src)),
6405            (VMOVNTDQAYrm addr:$src)>;
6406  def : Pat<(v8i32 (alignednontemporalload addr:$src)),
6407            (VMOVNTDQAYrm addr:$src)>;
6408  def : Pat<(v16i16 (alignednontemporalload addr:$src)),
6409            (VMOVNTDQAYrm addr:$src)>;
6410  def : Pat<(v32i8 (alignednontemporalload addr:$src)),
6411            (VMOVNTDQAYrm addr:$src)>;
6412}
6413
// 128-bit non-temporal loads, VEX-encoded.
6414let Predicates = [HasAVX, NoVLX] in {
6415  def : Pat<(v4f32 (alignednontemporalload addr:$src)),
6416            (VMOVNTDQArm addr:$src)>;
6417  def : Pat<(v2f64 (alignednontemporalload addr:$src)),
6418            (VMOVNTDQArm addr:$src)>;
6419  def : Pat<(v2i64 (alignednontemporalload addr:$src)),
6420            (VMOVNTDQArm addr:$src)>;
6421  def : Pat<(v4i32 (alignednontemporalload addr:$src)),
6422            (VMOVNTDQArm addr:$src)>;
6423  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
6424            (VMOVNTDQArm addr:$src)>;
6425  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
6426            (VMOVNTDQArm addr:$src)>;
6427}
6428
// 128-bit non-temporal loads, legacy encoding.
6429let Predicates = [UseSSE41] in {
6430  def : Pat<(v4f32 (alignednontemporalload addr:$src)),
6431            (MOVNTDQArm addr:$src)>;
6432  def : Pat<(v2f64 (alignednontemporalload addr:$src)),
6433            (MOVNTDQArm addr:$src)>;
6434  def : Pat<(v2i64 (alignednontemporalload addr:$src)),
6435            (MOVNTDQArm addr:$src)>;
6436  def : Pat<(v4i32 (alignednontemporalload addr:$src)),
6437            (MOVNTDQArm addr:$src)>;
6438  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
6439            (MOVNTDQArm addr:$src)>;
6440  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
6441            (MOVNTDQArm addr:$src)>;
6442}
6443
6444} // AddedComplexity
6445
6446//===----------------------------------------------------------------------===//
6447// SSE4.2 - Compare Instructions
6448//===----------------------------------------------------------------------===//
6449
6450/// SS42I_binop_rm - Simple SSE 4.2 binary operator
///
/// Defines a register-register (rr) and a register-memory (rm) form.
///   OpNode     - node matched by the selection patterns.
///   OpVT       - value type of the result.
///   memop_frag - load fragment matched for the folded $src2 in rm.
///   sched      - scheduling class; rm uses its Folded/ReadAfterFold parts.
///   Is2Addr    - selects the two-operand assembly string (default) or the
///                three-operand one. Unlike SS41I_blend_rmi it does not add a
///                tied-operand constraint; callers supply that themselves.
6451multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
6452                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
6453                          X86MemOperand x86memop, X86FoldableSchedWrite sched,
6454                          bit Is2Addr = 1> {
6455  def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
6456       (ins RC:$src1, RC:$src2),
6457       !if(Is2Addr,
6458           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
6459           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
6460       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
6461       Sched<[sched]>;
6462  def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
6463       (ins RC:$src1, x86memop:$src2),
6464       !if(Is2Addr,
6465           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
6466           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
6467       [(set RC:$dst,
6468         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
6469       Sched<[sched.Folded, sched.ReadAfterFold]>;
6470}
6471
// 64-bit element signed greater-than compare. The VEX forms pass Is2Addr = 0
// (three-operand syntax); the legacy form keeps the default Is2Addr = 1 and
// ties $src1 to $dst.
6472let Predicates = [HasAVX] in
6473  defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
6474                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
6475                                 VEX_4V, VEX_WIG;
6476
6477let Predicates = [HasAVX2] in
6478  defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
6479                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
6480                                  VEX_4V, VEX_L, VEX_WIG;
6481
6482let Constraints = "$src1 = $dst" in
6483  defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
6484                                memop, i128mem, SchedWriteVecALU.XMM>;
6485
6486//===----------------------------------------------------------------------===//
6487// SSE4.2 - String/text Processing Instructions
6488//===----------------------------------------------------------------------===//
6489
// (v)pcmpistrm: rr and rm forms with no explicit outputs and no selection
// patterns (empty pattern list), so mayLoad is set by hand on the rm form.
6490multiclass pcmpistrm_SS42AI<string asm> {
6491  def rr : SS42AI<0x62, MRMSrcReg, (outs),
6492    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6493    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6494    []>, Sched<[WritePCmpIStrM]>;
6495  let mayLoad = 1 in
6496  def rm :SS42AI<0x62, MRMSrcMem, (outs),
6497    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6498    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6499    []>, Sched<[WritePCmpIStrM.Folded, WritePCmpIStrM.ReadAfterFold]>;
6500}
6501
// Results are produced implicitly in XMM0 and EFLAGS.
6502let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
6503  let Predicates = [HasAVX] in
6504  defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
6505  defm PCMPISTRM  : pcmpistrm_SS42AI<"pcmpistrm"> ;
6506}
6507
// (v)pcmpestrm: rr and rm forms with no explicit outputs and no selection
// patterns (empty pattern list), so mayLoad is set by hand on the rm form.
// The explicit operands are named $src1, $src3 and $src5.
6508multiclass SS42AI_pcmpestrm<string asm> {
6509  def rr : SS42AI<0x60, MRMSrcReg, (outs),
6510    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
6511    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6512    []>, Sched<[WritePCmpEStrM]>;
6513  let mayLoad = 1 in
6514  def rm : SS42AI<0x60, MRMSrcMem, (outs),
6515    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
6516    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6517    []>, Sched<[WritePCmpEStrM.Folded, WritePCmpEStrM.ReadAfterFold]>;
6518}
6519
// Results are produced implicitly in XMM0 and EFLAGS; EAX and EDX are
// implicit inputs.
6520let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
6521  let Predicates = [HasAVX] in
6522  defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
6523  defm PCMPESTRM :  SS42AI_pcmpestrm<"pcmpestrm">;
6524}
6525
// (v)pcmpistri: rr and rm forms with no explicit outputs and no selection
// patterns (empty pattern list), so mayLoad is set by hand on the rm form.
6526multiclass SS42AI_pcmpistri<string asm> {
6527  def rr : SS42AI<0x63, MRMSrcReg, (outs),
6528    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6529    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6530    []>, Sched<[WritePCmpIStrI]>;
6531  let mayLoad = 1 in
6532  def rm : SS42AI<0x63, MRMSrcMem, (outs),
6533    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6534    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6535    []>, Sched<[WritePCmpIStrI.Folded, WritePCmpIStrI.ReadAfterFold]>;
6536}
6537
// Results are produced implicitly in ECX and EFLAGS.
6538let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
6539  let Predicates = [HasAVX] in
6540  defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
6541  defm PCMPISTRI  : SS42AI_pcmpistri<"pcmpistri">;
6542}
6543
// (v)pcmpestri: rr and rm forms with no explicit outputs and no selection
// patterns (empty pattern list), so mayLoad is set by hand on the rm form.
// The explicit operands are named $src1, $src3 and $src5.
6544multiclass SS42AI_pcmpestri<string asm> {
6545  def rr : SS42AI<0x61, MRMSrcReg, (outs),
6546    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
6547    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6548    []>, Sched<[WritePCmpEStrI]>;
6549  let mayLoad = 1 in
6550  def rm : SS42AI<0x61, MRMSrcMem, (outs),
6551    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
6552    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6553    []>, Sched<[WritePCmpEStrI.Folded, WritePCmpEStrI.ReadAfterFold]>;
6554}
6555
// Results are produced implicitly in ECX and EFLAGS; EAX and EDX are
// implicit inputs.
6556let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
6557  let Predicates = [HasAVX] in
6558  defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
6559  defm PCMPESTRI  : SS42AI_pcmpestri<"pcmpestri">;
6560}
6561
6562//===----------------------------------------------------------------------===//
6563// SSE4.2 - CRC Instructions
6564//===----------------------------------------------------------------------===//
6565
6566// No CRC instructions have AVX equivalents
6567
6568// crc intrinsic instruction
6569// This set of instructions is only rm; the only difference is the size
6570// of r and m.
// Register-source form: RCOut is the class of the destination and of the
// accumulator $src1, RCIn the class of the data operand $src2, and Int the
// operator matched by the selection pattern.
6571class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
6572                   RegisterClass RCIn, SDPatternOperator Int> :
6573  SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
6574         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
6575         [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))]>,
6576         Sched<[WriteCRC32]>;
6577
// Memory-source form of the crc32 instruction: $src2 is a memory operand of
// type x86memop whose load is folded into the selection pattern.
6578class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
6579                   X86MemOperand x86memop, SDPatternOperator Int> :
6580  SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
6581         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
6582         [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))]>,
6583         Sched<[WriteCRC32.Folded, WriteCRC32.ReadAfterFold]>;
6584
// crc32 instruction records, named CRC32r<dst width><r|m><src width>. The
// accumulator $src1 is tied to $dst. Opcode 0xF0 is used for 8-bit sources
// and 0xF1 for 16/32/64-bit sources, which are told apart by the
// OpSize16/OpSize32/REX_W modifiers.
6585let Constraints = "$src1 = $dst" in {
6586  def CRC32r32m8  : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
6587                                 int_x86_sse42_crc32_32_8>;
6588  def CRC32r32r8  : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
6589                                 int_x86_sse42_crc32_32_8>;
6590  def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
6591                                 int_x86_sse42_crc32_32_16>, OpSize16;
6592  def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
6593                                 int_x86_sse42_crc32_32_16>, OpSize16;
6594  def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
6595                                 int_x86_sse42_crc32_32_32>, OpSize32;
6596  def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
6597                                 int_x86_sse42_crc32_32_32>, OpSize32;
6598  def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
6599                                 int_x86_sse42_crc32_64_64>, REX_W;
6600  def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
6601                                 int_x86_sse42_crc32_64_64>, REX_W;
  // The 64-bit-destination, 8-bit-source forms pass null_frag instead of an
  // intrinsic, so hasSideEffects/mayLoad are stated explicitly here.
  // NOTE(review): presumably null_frag means no selection pattern is
  // generated for these two records -- confirm.
6602  let hasSideEffects = 0 in {
6603    let mayLoad = 1 in
6604    def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
6605                                   null_frag>, REX_W;
6606    def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
6607                                   null_frag>, REX_W;
6608  }
6609}
6610
6611//===----------------------------------------------------------------------===//
6612// SHA-NI Instructions
6613//===----------------------------------------------------------------------===//
6614
6615// FIXME: Is there a better scheduler class for SHA than WriteVecIMul?
// SHAI_binop - Defines the register (rr) and memory (rm) forms of a
// two-source SHA instruction operating on VR128.
//   Opc       - opcode byte.
//   OpcodeStr - mnemonic.
//   IntId     - intrinsic used in the selection pattern.
//   sched     - scheduling class; the rm form uses its Folded and
//               ReadAfterFold members.
//   UsesXMM0  - when set, the asm string spells out xmm0 as an extra operand
//               and the pattern passes XMM0 as a third intrinsic argument.
// Neither asm string prints $src1.
6616multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
6617                      X86FoldableSchedWrite sched, bit UsesXMM0 = 0> {
6618  def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
6619             (ins VR128:$src1, VR128:$src2),
6620             !if(UsesXMM0,
6621                 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6622                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
6623             [!if(UsesXMM0,
6624                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
6625                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>,
6626             T8, Sched<[sched]>;
6627
  // Memory form: $src2 is read through the memop fragment.
6628  def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
6629             (ins VR128:$src1, i128mem:$src2),
6630             !if(UsesXMM0,
6631                 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6632                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
6633             [!if(UsesXMM0,
6634                  (set VR128:$dst, (IntId VR128:$src1,
6635                    (memop addr:$src2), XMM0)),
6636                  (set VR128:$dst, (IntId VR128:$src1,
6637                    (memop addr:$src2))))]>, T8,
6638             Sched<[sched.Folded, sched.ReadAfterFold]>;
6639}
6640
// SHA-NI instruction definitions. All are predicated on HasSHA and tie $src1
// to $dst; none of the asm strings print $src1.
6641let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
  // SHA1RNDS4 is defined directly (not via SHAI_binop) because it takes an
  // additional 8-bit immediate operand ($src3).
6642  def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
6643                         (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6644                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6645                         [(set VR128:$dst,
6646                           (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
6647                            (i8 timm:$src3)))]>, TA,
6648                         Sched<[SchedWriteVecIMul.XMM]>;
6649  def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
6650                         (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6651                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6652                         [(set VR128:$dst,
6653                           (int_x86_sha1rnds4 VR128:$src1,
6654                            (memop addr:$src2),
6655                            (i8 timm:$src3)))]>, TA,
6656                         Sched<[SchedWriteVecIMul.XMM.Folded,
6657                                SchedWriteVecIMul.XMM.ReadAfterFold]>;
6658
6659  defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte,
6660                              SchedWriteVecIMul.XMM>;
6661  defm SHA1MSG1  : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1,
6662                              SchedWriteVecIMul.XMM>;
6663  defm SHA1MSG2  : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2,
6664                              SchedWriteVecIMul.XMM>;
6665
  // SHA256RNDS2 declares XMM0 as an implicit use and is instantiated with
  // UsesXMM0 = 1, so its asm string and pattern both mention XMM0.
6666  let Uses=[XMM0] in
6667  defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2,
6668                                SchedWriteVecIMul.XMM, 1>;
6669
6670  defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1,
6671                               SchedWriteVecIMul.XMM>;
6672  defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2,
6673                               SchedWriteVecIMul.XMM>;
6674}
6675
6676// Aliases that omit %xmm0. The SHA256RNDS2rr/rm asm strings spell the
// register out explicitly, so the two-operand spellings are provided here as
// aliases of the rr and rm forms.
// NOTE(review): the trailing 0 is presumably the InstAlias emit flag (alias
// accepted when parsing but not used when printing) - confirm.
6677def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
6678                (SHA256RNDS2rr VR128:$dst, VR128:$src2), 0>;
6679def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
6680                (SHA256RNDS2rm VR128:$dst, i128mem:$src2), 0>;
6681
6682//===----------------------------------------------------------------------===//
6683// AES-NI Instructions
6684//===----------------------------------------------------------------------===//
6685
// AESI_binop_rm_int - Defines the register (rr) and memory (rm) forms of a
// two-source AES instruction selected from an intrinsic.
//   opc       - opcode byte.
//   OpcodeStr - mnemonic.
//   IntId     - intrinsic used in the selection pattern.
//   ld_frag   - load fragment wrapped around addr:$src2 in the rm pattern.
//   Is2Addr   - selects the two-operand asm string ($src2, $dst) instead of
//               the three-operand one ($src2, $src1, $dst).
//   RC, MemOp - register class and memory operand kind; default to
//               VR128 / i128mem.
6686multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
6687                             Intrinsic IntId, PatFrag ld_frag,
6688                             bit Is2Addr = 0, RegisterClass RC = VR128,
6689                             X86MemOperand MemOp = i128mem> {
  // The asm string is supplied through this let; both defs pass "" as their
  // own asm argument.
6690  let AsmString = OpcodeStr##
6691                  !if(Is2Addr, "\t{$src2, $dst|$dst, $src2}",
6692                               "\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
6693    def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst),
6694                   (ins RC:$src1, RC:$src2), "",
6695                   [(set RC:$dst, (IntId RC:$src1, RC:$src2))]>,
6696                   Sched<[WriteAESDecEnc]>;
6697    def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst),
6698                   (ins RC:$src1, MemOp:$src2), "",
6699                   [(set RC:$dst, (IntId RC:$src1, (ld_frag addr:$src2)))]>,
6700                   Sched<[WriteAESDecEnc.Folded, WriteAESDecEnc.ReadAfterFold]>;
6701  }
6702}
6703
6704// Perform One Round of an AES Encryption/Decryption Flow
// VEX-encoded 128-bit forms: three-operand syntax (Is2Addr keeps its default
// of 0), default VR128/i128mem operands, and the `load` fragment for the
// memory form.
6705let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in {
6706  defm VAESENC          : AESI_binop_rm_int<0xDC, "vaesenc",
6707                         int_x86_aesni_aesenc, load>, VEX_4V, VEX_WIG;
6708  defm VAESENCLAST      : AESI_binop_rm_int<0xDD, "vaesenclast",
6709                         int_x86_aesni_aesenclast, load>, VEX_4V, VEX_WIG;
6710  defm VAESDEC          : AESI_binop_rm_int<0xDE, "vaesdec",
6711                         int_x86_aesni_aesdec, load>, VEX_4V, VEX_WIG;
6712  defm VAESDECLAST      : AESI_binop_rm_int<0xDF, "vaesdeclast",
6713                         int_x86_aesni_aesdeclast, load>, VEX_4V, VEX_WIG;
6714}
6715
// VEX-encoded 256-bit forms of the same four AES round instructions: same
// opcodes and mnemonics as the 128-bit forms, but using the *_256
// intrinsics, VR256/i256mem operands and VEX_L. Predicated on NoVLX and
// HasVAES.
6716let Predicates = [NoVLX, HasVAES] in {
6717  defm VAESENCY         : AESI_binop_rm_int<0xDC, "vaesenc",
6718                         int_x86_aesni_aesenc_256, load, 0, VR256,
6719                         i256mem>, VEX_4V, VEX_L, VEX_WIG;
6720  defm VAESENCLASTY     : AESI_binop_rm_int<0xDD, "vaesenclast",
6721                         int_x86_aesni_aesenclast_256, load, 0, VR256,
6722                         i256mem>, VEX_4V, VEX_L, VEX_WIG;
6723  defm VAESDECY         : AESI_binop_rm_int<0xDE, "vaesdec",
6724                         int_x86_aesni_aesdec_256, load, 0, VR256,
6725                         i256mem>, VEX_4V, VEX_L, VEX_WIG;
6726  defm VAESDECLASTY     : AESI_binop_rm_int<0xDF, "vaesdeclast",
6727                         int_x86_aesni_aesdeclast_256, load, 0, VR256,
6728                         i256mem>, VEX_4V, VEX_L, VEX_WIG;
6729}
6730
// Non-VEX forms of the four AES round instructions: two-operand syntax
// (Is2Addr = 1) with $src1 tied to $dst, and the `memop` fragment for the
// memory form.
// NOTE(review): no Predicates are set here; presumably they are supplied by
// the AES8I class - confirm.
6731let Constraints = "$src1 = $dst" in {
6732  defm AESENC          : AESI_binop_rm_int<0xDC, "aesenc",
6733                         int_x86_aesni_aesenc, memop, 1>;
6734  defm AESENCLAST      : AESI_binop_rm_int<0xDD, "aesenclast",
6735                         int_x86_aesni_aesenclast, memop, 1>;
6736  defm AESDEC          : AESI_binop_rm_int<0xDE, "aesdec",
6737                         int_x86_aesni_aesdec, memop, 1>;
6738  defm AESDECLAST      : AESI_binop_rm_int<0xDF, "aesdeclast",
6739                         int_x86_aesni_aesdeclast, memop, 1>;
6740}
6741
6742// Perform the AES InvMixColumn Transformation
// Single-source instruction (opcode 0xDB), selected from
// int_x86_aesni_aesimc. The VEX forms (vaesimc) are predicated on
// [HasAVX, HasAES] and use the `load` fragment.
6743let Predicates = [HasAVX, HasAES] in {
6744  def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
6745      (ins VR128:$src1),
6746      "vaesimc\t{$src1, $dst|$dst, $src1}",
6747      [(set VR128:$dst,
6748        (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
6749      VEX, VEX_WIG;
6750  def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
6751      (ins i128mem:$src1),
6752      "vaesimc\t{$src1, $dst|$dst, $src1}",
6753      [(set VR128:$dst, (int_x86_aesni_aesimc (load addr:$src1)))]>,
6754      Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG;
6755}
// Non-VEX forms (aesimc); the memory form uses the `memop` fragment.
6756def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
6757  (ins VR128:$src1),
6758  "aesimc\t{$src1, $dst|$dst, $src1}",
6759  [(set VR128:$dst,
6760    (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>;
6761def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
6762  (ins i128mem:$src1),
6763  "aesimc\t{$src1, $dst|$dst, $src1}",
6764  [(set VR128:$dst, (int_x86_aesni_aesimc (memop addr:$src1)))]>,
6765  Sched<[WriteAESIMC.Folded]>;
6766
6767// AES Round Key Generation Assist
// One vector source ($src1) plus an 8-bit immediate ($src2), opcode 0xDF,
// selected from int_x86_aesni_aeskeygenassist. The VEX forms
// (vaeskeygenassist) are predicated on [HasAVX, HasAES] and use the `load`
// fragment.
6768let Predicates = [HasAVX, HasAES] in {
6769  def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
6770      (ins VR128:$src1, u8imm:$src2),
6771      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6772      [(set VR128:$dst,
6773        (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
6774      Sched<[WriteAESKeyGen]>, VEX, VEX_WIG;
6775  def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
6776      (ins i128mem:$src1, u8imm:$src2),
6777      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6778      [(set VR128:$dst,
6779        (int_x86_aesni_aeskeygenassist (load addr:$src1), timm:$src2))]>,
6780      Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG;
6781}
// Non-VEX forms (aeskeygenassist); the memory form uses the `memop`
// fragment.
6782def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
6783  (ins VR128:$src1, u8imm:$src2),
6784  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6785  [(set VR128:$dst,
6786    (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
6787  Sched<[WriteAESKeyGen]>;
6788def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
6789  (ins i128mem:$src1, u8imm:$src2),
6790  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6791  [(set VR128:$dst,
6792    (int_x86_aesni_aeskeygenassist (memop addr:$src1), timm:$src2))]>,
6793  Sched<[WriteAESKeyGen.Folded]>;
6794
6795//===----------------------------------------------------------------------===//
6796// PCLMUL Instructions
6797//===----------------------------------------------------------------------===//
6798
6799// Immediate transform to help with commuting.
// Swaps the two 4-bit halves of the 8-bit immediate (bits 7:4 <-> bits 3:0)
// and returns the result as an i8 immediate. Used by the patterns that fold
// a load appearing in the first pclmulqdq operand.
6800def PCLMULCommuteImm : SDNodeXForm<timm, [{
6801  uint8_t Imm = N->getZExtValue();
6802  return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N));
6803}]>;
6804
6805// SSE carry-less Multiplication instructions
// Non-VEX pclmulqdq: register and memory forms with $src1 tied to $dst and
// an 8-bit immediate ($src3), selected from int_x86_pclmulqdq.
6806let Predicates = [NoAVX, HasPCLMUL] in {
6807  let Constraints = "$src1 = $dst" in {
    // The brace-less let marks only the register form as commutable.
6808    let isCommutable = 1 in
6809    def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
6810              (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6811              "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6812              [(set VR128:$dst,
6813                (int_x86_pclmulqdq VR128:$src1, VR128:$src2, timm:$src3))]>,
6814                Sched<[WriteCLMul]>;
6815
6816    def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
6817              (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6818              "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6819              [(set VR128:$dst,
6820                 (int_x86_pclmulqdq VR128:$src1, (memop addr:$src2),
6821                  timm:$src3))]>,
6822              Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
6823  } // Constraints = "$src1 = $dst"
6824
  // A load in the first intrinsic operand is selected to the rm form with
  // the sources swapped and the immediate passed through PCLMULCommuteImm.
6825  def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1,
6826                                (i8 timm:$src3)),
6827            (PCLMULQDQrm VR128:$src1, addr:$src2,
6828                          (PCLMULCommuteImm timm:$src3))>;
6829} // Predicates = [NoAVX, HasPCLMUL]
6830
6831// SSE aliases
// Generates the four mnemonics "pclmul" # HI # LO # "dq" (HI, LO in
// {hq, lq}) for both the rr and rm forms, with the immediate implied by the
// mnemonic: bit 4 is set when LO is "hq" and bit 0 is set when HI is "hq".
6832foreach HI = ["hq","lq"] in
6833foreach LO = ["hq","lq"] in {
6834  def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
6835                  (PCLMULQDQrr VR128:$dst, VR128:$src,
6836                   !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
6837  def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
6838                  (PCLMULQDQrm VR128:$dst, i128mem:$src,
6839                   !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
6840}
6841
6842// AVX carry-less Multiplication instructions
// vpclmulqdq - Defines the three-operand register (rr) and memory (rm) forms
// plus a pattern for a load in the first operand.
//   RC     - register class of $dst, $src1 and (for rr) $src2.
//   MemOp  - memory operand kind of $src2 in the rm form.
//   LdFrag - load fragment used in the rm pattern and the commuted pattern.
//   IntId  - intrinsic used in the selection patterns.
6843multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
6844                      PatFrag LdFrag, Intrinsic IntId> {
  // The brace-less let marks only the register form as commutable.
6845  let isCommutable = 1 in
6846  def rr : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst),
6847            (ins RC:$src1, RC:$src2, u8imm:$src3),
6848            "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
6849            [(set RC:$dst,
6850              (IntId RC:$src1, RC:$src2, timm:$src3))]>,
6851            Sched<[WriteCLMul]>;
6852
6853  def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst),
6854            (ins RC:$src1, MemOp:$src2, u8imm:$src3),
6855            "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
6856            [(set RC:$dst,
6857               (IntId RC:$src1, (LdFrag addr:$src2), timm:$src3))]>,
6858            Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
6859
6860  // We can commute a load in the first operand by swapping the sources and
6861  // rotating the immediate.
6862  def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 timm:$src3)),
6863            (!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2,
6864                                           (PCLMULCommuteImm timm:$src3))>;
6865}
6866
// 128-bit VEX form: VR128/i128mem with int_x86_pclmulqdq.
6867let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in
6868defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, load,
6869                             int_x86_pclmulqdq>, VEX_4V, VEX_WIG;
6870
// 256-bit VEX form: VR256/i256mem with int_x86_pclmulqdq_256 and VEX_L.
6871let Predicates = [NoVLX, HasVPCLMULQDQ] in
6872defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load,
6873                              int_x86_pclmulqdq_256>, VEX_4V, VEX_L, VEX_WIG;
6874
// vpclmulqdq_aliases_impl - Emits one "vpclmul" # Hi # Lo # "dq" alias for
// the rr form and one for the rm form of the instruction named InstStr.
//   InstStr - base instruction name; "rr"/"rm" is appended and the result
//             is looked up with !cast<Instruction>.
//   RC      - register class of $dst, $src1 and (for rr) $src2.
//   MemOp   - memory operand kind of $src2 in the rm alias.
//   Hi, Lo  - "hq" or "lq"; the implied immediate has bit 4 set when Lo is
//             "hq" and bit 0 set when Hi is "hq".
6875multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC,
6876                                   X86MemOperand MemOp, string Hi, string Lo> {
6877  def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6878                  (!cast<Instruction>(InstStr # "rr") RC:$dst, RC:$src1, RC:$src2,
6879                        !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
6880  def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6881                  (!cast<Instruction>(InstStr # "rm") RC:$dst, RC:$src1, MemOp:$src2,
6882                        !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
6883}
6884
// vpclmulqdq_aliases - Instantiates vpclmulqdq_aliases_impl for all four
// Hi/Lo combinations of "hq" and "lq", forwarding InstStr, RC and MemOp
// unchanged.
6885multiclass vpclmulqdq_aliases<string InstStr, RegisterClass RC,
6886                              X86MemOperand MemOp> {
6887  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "hq">;
6888  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "lq">;
6889  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "hq">;
6890  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "lq">;
6891}
6892
6893// AVX aliases
// One set for the 128-bit instruction (VPCLMULQDQ) and one for the 256-bit
// instruction (VPCLMULQDQY).
6894defm : vpclmulqdq_aliases<"VPCLMULQDQ", VR128, i128mem>;
6895defm : vpclmulqdq_aliases<"VPCLMULQDQY", VR256, i256mem>;
6896
6897//===----------------------------------------------------------------------===//
6898// SSE4A Instructions
6899//===----------------------------------------------------------------------===//
6900
// Everything in this block is predicated on HasSSE4A: the extrq/insertq
// bit-field instructions followed by the scalar non-temporal stores.
6901let Predicates = [HasSSE4A] in {
6902
6903let ExeDomain = SSEPackedInt in {
// All four instructions below tie $src to $dst, so the asm strings name
// $src where the destination is printed.
6904let Constraints = "$src = $dst" in {
// Immediate form: length and index are given as two 8-bit immediates.
6905def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
6906                 (ins VR128:$src, u8imm:$len, u8imm:$idx),
6907                 "extrq\t{$idx, $len, $src|$src, $len, $idx}",
6908                 [(set VR128:$dst, (X86extrqi VR128:$src, timm:$len,
6909                                    timm:$idx))]>,
6910                 PD, Sched<[SchedWriteVecALU.XMM]>;
// Register form: the second operand is a VR128 named $mask.
6911def EXTRQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
6912              (ins VR128:$src, VR128:$mask),
6913              "extrq\t{$mask, $src|$src, $mask}",
6914              [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
6915                                 VR128:$mask))]>,
6916              PD, Sched<[SchedWriteVecALU.XMM]>;
6917
// Immediate form: second vector source plus length/index immediates.
6918def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
6919                   (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
6920                   "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
6921                   [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
6922                                      timm:$len, timm:$idx))]>,
6923                   XD, Sched<[SchedWriteVecALU.XMM]>;
// Register form: the second operand is a VR128 named $mask.
6924def INSERTQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
6925                 (ins VR128:$src, VR128:$mask),
6926                 "insertq\t{$mask, $src|$src, $mask}",
6927                 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
6928                                    VR128:$mask))]>,
6929                 XD, Sched<[SchedWriteVecALU.XMM]>;
6930}
6931} // ExeDomain = SSEPackedInt
6932
6933// Non-temporal (unaligned) scalar stores.
6934let AddedComplexity = 400 in { // Prefer non-temporal versions
// The instruction defs carry no patterns of their own ([]), so mayStore and
// hasSideEffects are stated explicitly; selection happens through the two
// Pat records below.
6935let hasSideEffects = 0, mayStore = 1, SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in {
6936def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
6937                "movntss\t{$src, $dst|$dst, $src}", []>, XS;
6938
6939def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
6940                "movntsd\t{$src, $dst|$dst, $src}", []>, XD;
6941} // SchedRW
6942
// The instructions take a VR128 source, so the FR32/FR64 value is first
// copied into the VR128 register class.
6943def : Pat<(nontemporalstore FR32:$src, addr:$dst),
6944          (MOVNTSS addr:$dst, (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
6945
6946def : Pat<(nontemporalstore FR64:$src, addr:$dst),
6947          (MOVNTSD addr:$dst, (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
6948
6949} // AddedComplexity
6950} // HasSSE4A
6951
6952//===----------------------------------------------------------------------===//
6953// AVX Instructions
6954//===----------------------------------------------------------------------===//
6955
6956//===----------------------------------------------------------------------===//
6957// VBROADCAST - Load from memory and broadcast to all elements of the
6958//              destination operand
6959//
// avx_broadcast_rm - Memory-source broadcast instruction.
//   opc        - opcode byte.
//   OpcodeStr  - mnemonic.
//   RC         - destination register class.
//   x86memop   - memory operand kind of $src.
//   VT         - result value type of the pattern.
//   bcast_frag - fragment applied to addr:$src in the pattern.
//   Sched      - scheduling write placed in the Sched<[...]> list (the
//                parameter has the same name as the Sched class it is
//                passed to).
6960class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
6961                           X86MemOperand x86memop, ValueType VT,
6962                           PatFrag bcast_frag, SchedWrite Sched> :
6963  AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
6964        !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
6965        [(set RC:$dst, (VT (bcast_frag addr:$src)))]>,
6966        Sched<[Sched]>, VEX;
6967
6968// AVX2 adds register forms
// avx2_broadcast_rr - Register-source broadcast; the source is always VR128.
//   opc       - opcode byte.
//   OpcodeStr - mnemonic.
//   RC        - destination register class.
//   ResVT     - result value type of the X86VBroadcast node.
//   OpVT      - value type given to the VR128 source in the pattern.
//   Sched     - scheduling write placed in the Sched<[...]> list.
6969class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
6970                        ValueType ResVT, ValueType OpVT, SchedWrite Sched> :
6971  AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
6972         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
6973         [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
6974         Sched<[Sched]>, VEX;
6975
// Memory-source scalar broadcasts, predicated on [HasAVX, NoVLX]. The
// 256-bit (Y) forms add VEX_L; all three use
// SchedWriteFShuffle.XMM.Folded as their scheduling write.
6976let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in {
6977  def VBROADCASTSSrm  : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
6978                                         f32mem, v4f32, X86VBroadcastld32,
6979                                         SchedWriteFShuffle.XMM.Folded>;
6980  def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
6981                                         f32mem, v8f32, X86VBroadcastld32,
6982                                         SchedWriteFShuffle.XMM.Folded>, VEX_L;
6983}
// vbroadcastsd is only defined with a 256-bit destination here.
6984let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in
6985def VBROADCASTSDYrm  : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
6986                                        v4f64, X86VBroadcastld64,
6987                                        SchedWriteFShuffle.XMM.Folded>, VEX_L;
6988
// Register-source scalar broadcasts, predicated on [HasAVX2, NoVLX]. Same
// opcodes and mnemonics as the memory forms; the source is a VR128 register.
6989let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in {
6990  def VBROADCASTSSrr  : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
6991                                          v4f32, v4f32, SchedWriteFShuffle.XMM>;
6992  def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
6993                                          v8f32, v4f32, WriteFShuffle256>, VEX_L;
6994}
6995let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in
6996def VBROADCASTSDYrr  : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
6997                                         v4f64, v2f64, WriteFShuffle256>, VEX_L;
6998
6999//===----------------------------------------------------------------------===//
7000// VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both
7001//                  halves of a 256-bit vector.
7002//
// Both instructions are defined without a pattern ([]), so mayLoad and
// hasSideEffects are stated explicitly. The integer form requires HasAVX2.
7003let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in
7004def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
7005                           (ins i128mem:$src),
7006                           "vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
7007                           Sched<[WriteShuffleLd]>, VEX, VEX_L;
7008
// The floating-point form requires only HasAVX; it is selected through the
// X86SubVBroadcast patterns that follow in this file.
7009let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX],
7010    ExeDomain = SSEPackedSingle in
7011def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
7012                           (ins f128mem:$src),
7013                           "vbroadcastf128\t{$src, $dst|$dst, $src}", []>,
7014                           Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L;
7015
// Select a subvector broadcast of a loaded 128-bit floating-point vector to
// VBROADCASTF128.
7016let Predicates = [HasAVX, NoVLX] in {
7017def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
7018          (VBROADCASTF128 addr:$src)>;
7019def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))),
7020          (VBROADCASTF128 addr:$src)>;
7021}
7022
7023// NOTE: We're using FP instructions here, but execution domain fixing can
7024// convert to integer when profitable.
// The same selection for every 128-bit integer element type.
7025let Predicates = [HasAVX, NoVLX] in {
7026def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
7027          (VBROADCASTF128 addr:$src)>;
7028def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))),
7029          (VBROADCASTF128 addr:$src)>;
7030def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))),
7031          (VBROADCASTF128 addr:$src)>;
7032def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
7033          (VBROADCASTF128 addr:$src)>;
7034}
7035
7036//===----------------------------------------------------------------------===//
7037// VINSERTF128 - Insert packed floating-point values
7038//
// Register and memory forms of vinsertf128: a VR256 source, a 128-bit
// source (VR128 or f128mem) and an 8-bit immediate. Both are defined without
// patterns ([]); hasSideEffects/mayLoad are therefore stated explicitly.
7039let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
7040def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
7041          (ins VR256:$src1, VR128:$src2, u8imm:$src3),
7042          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7043          []>, Sched<[WriteFShuffle256]>, VEX_4V, VEX_L;
// The brace-less let applies mayLoad to the memory form only.
7044let mayLoad = 1 in
7045def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
7046          (ins VR256:$src1, f128mem:$src2, u8imm:$src3),
7047          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7048          []>, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
7049}
7050
7051// To create a 256-bit all ones value, we should produce VCMPTRUEPS
7052// with YMM register containing zero.
7053// FIXME: Avoid producing vxorps to clear the fake inputs.
// Both compare inputs are AVX_SET0.
// NOTE(review): immediate 0xf is presumably the always-true compare
// predicate referred to above as VCMPTRUEPS - confirm.
7054let Predicates = [HasAVX1Only] in {
7055def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>;
7056}
7057
// vinsert_lowering - Patterns selecting vinsert128_insert of a 128-bit
// vector into a 256-bit vector to the rr/rm forms of an insert instruction.
//   InstrStr   - base instruction name; rr/rm is pasted on and looked up
//                with !cast<Instruction>.
//   From       - value type of the 128-bit vector being inserted.
//   To         - value type of the 256-bit vector inserted into.
//   memop_frag - load fragment matched for the memory form.
// The instruction immediate is produced by applying
// INSERT_get_vinsert128_imm to the matched $ins node.
7058multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To,
7059                            PatFrag memop_frag> {
7060  def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2),
7061                                   (iPTR imm)),
7062            (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2,
7063                                       (INSERT_get_vinsert128_imm VR256:$ins))>;
7064  def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
7065                                    (From (memop_frag addr:$src2)),
7066                                    (iPTR imm)),
7067            (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
7068                                       (INSERT_get_vinsert128_imm VR256:$ins))>;
7069}
7070
// Floating-point element types use VINSERTF128 whenever AVX is available
// (and VLX is not).
7071let Predicates = [HasAVX, NoVLX] in {
7072  defm : vinsert_lowering<"VINSERTF128", v4f32, v8f32, loadv4f32>;
7073  defm : vinsert_lowering<"VINSERTF128", v2f64, v4f64, loadv2f64>;
7074}
7075
// Integer element types use VINSERTF128 only under HasAVX1Only.
7076let Predicates = [HasAVX1Only] in {
7077  defm : vinsert_lowering<"VINSERTF128", v2i64, v4i64,  loadv2i64>;
7078  defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32,  loadv4i32>;
7079  defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv8i16>;
7080  defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8,  loadv16i8>;
7081}
7082
7083//===----------------------------------------------------------------------===//
7084// VEXTRACTF128 - Extract packed floating-point values
7085//
// Register-destination and memory-destination forms of vextractf128: a
// VR256 source and an 8-bit immediate. Both are defined without patterns
// ([]); hasSideEffects/mayStore are therefore stated explicitly.
7086let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
7087def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
7088          (ins VR256:$src1, u8imm:$src2),
7089          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7090          []>, Sched<[WriteFShuffle256]>, VEX, VEX_L;
// The brace-less let applies mayStore to the memory form only.
7091let mayStore = 1 in
7092def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
7093          (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
7094          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7095          []>, Sched<[WriteFStoreX]>, VEX, VEX_L;
7096}
7097
// vextract_lowering - Patterns selecting vextract128_extract from a 256-bit
// vector to the rr form, and a store of such an extract to the mr form.
//   InstrStr - base instruction name; rr/mr is pasted on and looked up with
//              !cast<Instruction>.
//   From     - value type of the 256-bit source vector.
//   To       - value type of the extracted 128-bit vector.
// The instruction immediate is produced by applying
// EXTRACT_get_vextract128_imm to the matched $ext node.
7098multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
7099  def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
7100            (To (!cast<Instruction>(InstrStr#rr)
7101                                    (From VR256:$src1),
7102                                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
7103  def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1),
7104                                                 (iPTR imm))), addr:$dst),
7105            (!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1,
7106             (EXTRACT_get_vextract128_imm VR128:$ext))>;
7107}
7108
7109// AVX1 patterns
// Floating-point element types use VEXTRACTF128 whenever AVX is available
// (and VLX is not).
7110let Predicates = [HasAVX, NoVLX] in {
7111  defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>;
7112  defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>;
7113}
7114
// Integer element types use VEXTRACTF128 only under HasAVX1Only.
7115let Predicates = [HasAVX1Only] in {
7116  defm : vextract_lowering<"VEXTRACTF128", v4i64,  v2i64>;
7117  defm : vextract_lowering<"VEXTRACTF128", v8i32,  v4i32>;
7118  defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
7119  defm : vextract_lowering<"VEXTRACTF128", v32i8,  v16i8>;
7120}
7121
7122//===----------------------------------------------------------------------===//
7123// VMASKMOV - Conditional SIMD Packed Loads and Stores
7124//
// avx_movmask_rm - Defines the 128-bit and 256-bit masked load (rm, Yrm) and
// masked store (mr, Ymr) forms of a vmaskmov instruction.
//   opc_rm, opc_mr  - opcode bytes of the load and store forms.
//   OpcodeStr       - mnemonic.
//   IntLd, IntLd256 - load intrinsics, called as (addr:$src2, $src1).
//   IntSt, IntSt256 - store intrinsics, called as (addr:$dst, $src1, $src2).
//   schedX, schedY  - scheduling classes for the 128-bit and 256-bit forms;
//                     their RM member is used for loads and MR for stores.
7125multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
7126                          Intrinsic IntLd, Intrinsic IntLd256,
7127                          Intrinsic IntSt, Intrinsic IntSt256,
7128                          X86SchedWriteMaskMove schedX,
7129                          X86SchedWriteMaskMove schedY> {
7130  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
7131             (ins VR128:$src1, f128mem:$src2),
7132             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7133             [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
7134             VEX_4V, Sched<[schedX.RM]>;
7135  def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
7136             (ins VR256:$src1, f256mem:$src2),
7137             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7138             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
7139             VEX_4V, VEX_L, Sched<[schedY.RM]>;
  // Store forms: no register outputs; $dst is the memory operand.
7140  def mr  : AVX8I<opc_mr, MRMDestMem, (outs),
7141             (ins f128mem:$dst, VR128:$src1, VR128:$src2),
7142             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7143             [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>,
7144             VEX_4V, Sched<[schedX.MR]>;
7145  def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
7146             (ins f256mem:$dst, VR256:$src1, VR256:$src2),
7147             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7148             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
7149             VEX_4V, VEX_L, Sched<[schedY.MR]>;
7150}
7151
// Single-precision masked moves: load opcode 0x2C, store opcode 0x2E.
7152let ExeDomain = SSEPackedSingle in
7153defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
7154                                 int_x86_avx_maskload_ps,
7155                                 int_x86_avx_maskload_ps_256,
7156                                 int_x86_avx_maskstore_ps,
7157                                 int_x86_avx_maskstore_ps_256,
7158                                 WriteFMaskMove32, WriteFMaskMove32Y>;
// Double-precision masked moves: load opcode 0x2D, store opcode 0x2F.
7159let ExeDomain = SSEPackedDouble in
7160defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
7161                                 int_x86_avx_maskload_pd,
7162                                 int_x86_avx_maskload_pd_256,
7163                                 int_x86_avx_maskstore_pd,
7164                                 int_x86_avx_maskstore_pd_256,
7165                                 WriteFMaskMove64, WriteFMaskMove64Y>;
7166
7167//===----------------------------------------------------------------------===//
7168// VPERMIL - Permute Single and Double Floating-Point Values
7169//
7170
// avx_permil - Defines the four forms of a vpermilps/vpermilpd instruction:
// variable control in a register (rr) or in memory (rm), and immediate
// control with a register source (ri) or a memory source (mi).
//   opc_rm     - opcode byte of the variable-control forms (rr, rm).
//   opc_rmi    - opcode byte of the immediate-control forms (ri, mi).
//   OpcodeStr  - mnemonic.
//   RC         - register class of $dst and the register sources.
//   x86memop_f - memory operand kind of the data source in the mi form.
//   x86memop_i - memory operand kind of the control vector in the rm form.
//   f_vt, i_vt - value types of the data vector and of the control vector.
//   sched      - scheduling class of the immediate-control forms.
//   varsched   - scheduling class of the variable-control forms.
7171multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
7172                      RegisterClass RC, X86MemOperand x86memop_f,
7173                      X86MemOperand x86memop_i,
7174                      ValueType f_vt, ValueType i_vt,
7175                      X86FoldableSchedWrite sched,
7176                      X86FoldableSchedWrite varsched> {
7177  let Predicates = [HasAVX, NoVLX] in {
7178    def rr  : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
7179               (ins RC:$src1, RC:$src2),
7180               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7181               [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V,
7182               Sched<[varsched]>;
    // NOTE(review): Folded is taken from varsched but ReadAfterFold from
    // sched; every other folded form in this file takes both from the same
    // class. Confirm the mix is intentional.
7183    def rm  : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
7184               (ins RC:$src1, x86memop_i:$src2),
7185               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7186               [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
7187                              (i_vt (load addr:$src2)))))]>, VEX_4V,
7188               Sched<[varsched.Folded, sched.ReadAfterFold]>;
7189
    // Immediate-control forms: $src2 is an 8-bit immediate and the encoding
    // uses VEX rather than VEX_4V.
7190    def ri  : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
7191             (ins RC:$src1, u8imm:$src2),
7192             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7193             [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 timm:$src2))))]>, VEX,
7194             Sched<[sched]>;
7195    def mi  : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
7196             (ins x86memop_f:$src1, u8imm:$src2),
7197             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7198             [(set RC:$dst,
7199               (f_vt (X86VPermilpi (load addr:$src1), (i8 timm:$src2))))]>, VEX,
7200             Sched<[sched.Folded]>;
7201  }// Predicates = [HasAVX, NoVLX]
7202}
7203
// vpermilps: variable-control opcode 0x0C, immediate-control opcode 0x04.
// The Y variants use VR256, 256-bit memory operands, the YMM scheduling
// classes and VEX_L.
7204let ExeDomain = SSEPackedSingle in {
7205  defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
7206                               v4f32, v4i32, SchedWriteFShuffle.XMM,
7207                               SchedWriteFVarShuffle.XMM>;
7208  defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
7209                               v8f32, v8i32, SchedWriteFShuffle.YMM,
7210                               SchedWriteFVarShuffle.YMM>, VEX_L;
7211}
// vpermilpd: variable-control opcode 0x0D, immediate-control opcode 0x05.
7212let ExeDomain = SSEPackedDouble in {
7213  defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
7214                               v2f64, v2i64, SchedWriteFShuffle.XMM,
7215                               SchedWriteFVarShuffle.XMM>;
7216  defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
7217                               v4f64, v4i64, SchedWriteFShuffle.YMM,
7218                               SchedWriteFVarShuffle.YMM>, VEX_L;
7219}
7220
7221//===----------------------------------------------------------------------===//
7222// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
7223//
7224
// Register and memory forms of vperm2f128.  Both carry the packed-single
// execution domain; the selection patterns are typed v4f64.
let ExeDomain = SSEPackedSingle in {
  // Only the register-register form is flagged commutable.
  let isCommutable = 1 in {
    def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg,
        (outs VR256:$dst),
        (ins VR256:$src1, VR256:$src2, u8imm:$src3),
        "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
        [(set VR256:$dst,
           (v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
                                 (i8 timm:$src3))))]>,
        VEX_4V, VEX_L,
        Sched<[WriteFShuffle256]>;
  }

  // Second source folded from memory.
  def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem,
      (outs VR256:$dst),
      (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
      "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
      [(set VR256:$dst,
         (X86VPerm2x128 VR256:$src1, (loadv4f64 addr:$src2),
                        (i8 timm:$src3)))]>,
      VEX_4V, VEX_L,
      Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle
7240
// Immediate transform used by the commuted-load patterns: the incoming
// immediate is XORed with 0x22 and handed to getI8Imm.
def Perm2XCommuteImm : SDNodeXForm<timm, [{
  uint64_t Imm = N->getZExtValue();
  return getI8Imm(Imm ^ 0x22, SDLoc(N));
}]>;
7245
// v4f64 vperm2x128 whose *first* operand is the load: select the rm form with
// the operands swapped and the immediate rewritten by Perm2XCommuteImm.
let Predicates = [HasAVX] in
def : Pat<(v4f64 (X86VPerm2x128 (loadv4f64 addr:$src2), VR256:$src1,
                                (i8 timm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2,
                        (Perm2XCommuteImm timm:$imm))>;
7252
// With AVX1 only, v4i64 vperm2x128 is selected to the vperm2f128 forms.
let Predicates = [HasAVX1Only] in {
  // Both sources in registers.
  def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))),
            (VPERM2F128rr VR256:$src1, VR256:$src2, timm:$imm)>;

  // Load in the second operand.
  def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
                                  (i8 timm:$imm))),
            (VPERM2F128rm VR256:$src1, addr:$src2, timm:$imm)>;

  // Load in the first operand: swap operands and rewrite the immediate.
  def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2), VR256:$src1,
                                  (i8 timm:$imm))),
            (VPERM2F128rm VR256:$src1, addr:$src2,
                          (Perm2XCommuteImm timm:$imm))>;
} // Predicates = [HasAVX1Only]
7264
7265//===----------------------------------------------------------------------===//
7266// VZERO - Zero YMM registers
// Note: These instructions do not affect YMM16-YMM31.
7268//
7269
// vzeroall / vzeroupper.  Both share opcode 0x77 and differ in VEX_L; both
// are scheduled as WriteSystem and list YMM0-YMM15 as implicit defs.
let SchedRW = [WriteSystem],
    Defs = [YMM0, YMM1, YMM2,  YMM3,  YMM4,  YMM5,  YMM6,  YMM7,
            YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
  // Zero All YMM registers
  def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
                   [(int_x86_avx_vzeroall)]>,
                 PS, VEX, VEX_L, Requires<[HasAVX]>, VEX_WIG;

  // Zero Upper bits of YMM registers
  def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
                     [(int_x86_avx_vzeroupper)]>,
                   PS, VEX, Requires<[HasAVX]>, VEX_WIG;
} // SchedRW, Defs
7284
7285//===----------------------------------------------------------------------===//
7286// Half precision conversion instructions
7287//
7288
/// f16c_ph2ps - vcvtph2ps register (rr) and memory (rm) forms.
///   RC       - destination register class.
///   x86memop - memory operand used by the rm form.
///   sched    - scheduling class; the rm form uses its .Folded variant.
/// The register source is always VR128.
multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop,
                      X86FoldableSchedWrite sched> {
  def rr : I<0x13, MRMSrcReg,
             (outs RC:$dst), (ins VR128:$src),
             "vcvtph2ps\t{$src, $dst|$dst, $src}",
             [(set RC:$dst, (X86cvtph2ps VR128:$src))]>,
           T8PD, VEX, Sched<[sched]>;

  let hasSideEffects = 0, mayLoad = 1 in {
    def rm : I<0x13, MRMSrcMem,
               (outs RC:$dst), (ins x86memop:$src),
               "vcvtph2ps\t{$src, $dst|$dst, $src}",
               [(set RC:$dst, (X86cvtph2ps (loadv8i16 addr:$src)))]>,
             T8PD, VEX, Sched<[sched.Folded]>;
  }
}
7301
/// f16c_ps2ph - vcvtps2ph register (rr) and store (mr) forms.
///   RC       - source register class.
///   x86memop - memory operand written by the mr form.
///   RR / MR  - scheduling writes for the rr and mr forms respectively.
/// The mr form has no selection pattern of its own.
multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
                      SchedWrite RR, SchedWrite MR> {
  def rr : Ii8<0x1D, MRMDestReg,
               (outs VR128:$dst), (ins RC:$src1, i32u8imm:$src2),
               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               [(set VR128:$dst, (X86cvtps2ph RC:$src1, timm:$src2))]>,
           TAPD, VEX, Sched<[RR]>;

  let hasSideEffects = 0, mayStore = 1 in {
    def mr : Ii8<0x1D, MRMDestMem,
                 (outs), (ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
                 "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
             TAPD, VEX, Sched<[MR]>;
  }
}
7315
// F16C conversion instructions (VEX encoded, used when VLX is unavailable)
// plus the load/store folding patterns that go with them.
let Predicates = [HasF16C, NoVLX] in {
  defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem,  WriteCvtPH2PS>,
                    SIMD_EXC;
  defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>,
                    VEX_L, SIMD_EXC;
  defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem,
                               WriteCvtPS2PH, WriteCvtPS2PHSt>,
                    SIMD_EXC;
  defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem,
                               WriteCvtPS2PHY, WriteCvtPS2PHYSt>,
                    VEX_L, SIMD_EXC;

  // Pattern match vcvtph2ps of a scalar i64 load, seen either as a
  // zero-extending vector load or as scalar_to_vector of an i64 load.
  def : Pat<(v4f32 (X86cvtph2ps
                     (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
            (VCVTPH2PSrm addr:$src)>;
  def : Pat<(v4f32 (X86cvtph2ps
                     (bc_v8i16
                       (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (VCVTPH2PSrm addr:$src)>;

  // Store of element 0 of the 128-bit conversion result, viewed as f64 or
  // as i64: fold into the store form.
  def : Pat<(store
              (f64 (extractelt
                     (bc_v2f64
                       (v8i16 (X86cvtps2ph VR128:$src1, timm:$src2))),
                     (iPTR 0))),
              addr:$dst),
            (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
  def : Pat<(store
              (i64 (extractelt
                     (bc_v2i64
                       (v8i16 (X86cvtps2ph VR128:$src1, timm:$src2))),
                     (iPTR 0))),
              addr:$dst),
            (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;

  // Store of the whole v8i16 result of the 256-bit conversion.
  def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, timm:$src2)),
                   addr:$dst),
            (VCVTPS2PHYmr addr:$dst, VR256:$src1, timm:$src2)>;
} // Predicates = [HasF16C, NoVLX]
7342
// Patterns for matching conversions from float to half-float and vice versa.
let Predicates = [HasF16C, NoVLX] in {
  // Use MXCSR.RC for rounding instead of explicitly specifying the default
  // rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the
  // configurations we support (the default). However, falling back to MXCSR is
  // more consistent with other instructions, which are always controlled by it.
  // It's encoded as 0b100.

  // float -> half: convert in a vector register, move the low dword to a GPR
  // and take its 16-bit subregister.
  def : Pat<(fp_to_f16 FR32:$src),
            (i16 (EXTRACT_SUBREG
                   (VMOVPDI2DIrr
                     (v8i16 (VCVTPS2PHrr
                              (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)),
                              4))),
                   sub_16bit))>;

  // half -> float: widen the GR16 with MOVSX32rr16, copy into VR128, convert,
  // then copy the result to FR32.
  def : Pat<(f16_to_fp GR16:$src),
            (f32 (COPY_TO_REGCLASS
                   (v4f32 (VCVTPH2PSrr
                            (v4i32 (COPY_TO_REGCLASS
                                     (MOVSX32rr16 GR16:$src), VR128)))),
                   FR32))>;

  // float -> half -> float round trip: chain both conversions without
  // leaving the vector registers.
  def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))),
            (f32 (COPY_TO_REGCLASS
                   (v4f32 (VCVTPH2PSrr
                            (v8i16 (VCVTPS2PHrr
                                     (v4f32 (COPY_TO_REGCLASS FR32:$src,
                                                              VR128)),
                                     4)))),
                   FR32))>;
} // Predicates = [HasF16C, NoVLX]
7362
7363//===----------------------------------------------------------------------===//
7364// AVX2 Instructions
7365//===----------------------------------------------------------------------===//
7366
/// AVX2_blend_rmi - AVX2 blend with 8-bit immediate.
///   opc / OpcodeStr - opcode byte and mnemonic.
///   OpNode / OpVT   - selection node and the vector type it is matched at.
///   sched           - scheduling class (rmi uses .Folded/.ReadAfterFold).
///   RC / x86memop   - register class and memory operand of the sources.
///   commuteXForm    - immediate transform applied when the load appears as
///                     the first source and the operands are swapped.
multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, X86FoldableSchedWrite sched,
                          RegisterClass RC,
                          X86MemOperand x86memop, SDNodeXForm commuteXForm> {
  let isCommutable = 1 in {
    def rri : AVX2AIi8<opc, MRMSrcReg,
          (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$src3),
          OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set RC:$dst,
             (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
          Sched<[sched]>, VEX_4V;
  }

  def rmi : AVX2AIi8<opc, MRMSrcMem,
        (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
        [(set RC:$dst,
           (OpVT (OpNode RC:$src1, (load addr:$src2), timm:$src3)))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V;

  // Pattern to commute if load is in first source.
  def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, timm:$src3)),
            (!cast<Instruction>(NAME#"rmi")
                RC:$src1, addr:$src2, (commuteXForm timm:$src3))>;
}
7392
let Predicates = [HasAVX2] in {
  // vpblendd, XMM and YMM (VEX_L) forms.
  defm VPBLENDD  : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
                                  SchedWriteBlend.XMM, VR128, i128mem,
                                  BlendCommuteImm4>;
  defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
                                  SchedWriteBlend.YMM, VR256, i256mem,
                                  BlendCommuteImm8>, VEX_L;

  // v4i64 blends are selected to the 256-bit vpblendd with the immediate
  // rewritten by BlendScaleImm4 (BlendScaleCommuteImm4 when the load is the
  // first source and the operands are swapped).
  def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2),
                       timm:$src3),
            (VPBLENDDYrri VR256:$src1, VR256:$src2,
                          (BlendScaleImm4 timm:$src3))>;
  def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
            (VPBLENDDYrmi VR256:$src1, addr:$src2,
                          (BlendScaleImm4 timm:$src3))>;
  def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
            (VPBLENDDYrmi VR256:$src1, addr:$src2,
                          (BlendScaleCommuteImm4 timm:$src3))>;

  // v2i64 blends are selected to the 128-bit vpblendd, using
  // BlendScaleImm2to4 / BlendScaleCommuteImm2to4 for the immediate.
  def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2),
                       timm:$src3),
            (VPBLENDDrri VR128:$src1, VR128:$src2,
                         (BlendScaleImm2to4 timm:$src3))>;
  def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
            (VPBLENDDrmi VR128:$src1, addr:$src2,
                         (BlendScaleImm2to4 timm:$src3))>;
  def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
            (VPBLENDDrmi VR128:$src1, addr:$src2,
                         (BlendScaleCommuteImm2to4 timm:$src3))>;
} // Predicates = [HasAVX2]
7415
7416// For insertion into the zero index (low half) of a 256-bit vector, it is
7417// more efficient to generate a blend with immediate instead of an insert*128.
// NOTE: We're using FP instructions here, but execution domain fixing should
7419// take care of using integer instructions when profitable.
let Predicates = [HasAVX] in {
  // Destination vector in a register: widen the 128-bit value with
  // INSERT_SUBREG on an IMPLICIT_DEF and blend it into $src1 with
  // immediate 0xF.  One pattern per 256-bit integer type.
  def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2),
                              (iPTR 0)),
            (VBLENDPSYrri VR256:$src1,
               (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src2, sub_xmm),
               0xF)>;
  def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2),
                              (iPTR 0)),
            (VBLENDPSYrri VR256:$src1,
               (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src2, sub_xmm),
               0xF)>;
  def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2),
                              (iPTR 0)),
            (VBLENDPSYrri VR256:$src1,
               (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src2, sub_xmm),
               0xF)>;
  def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2),
                              (iPTR 0)),
            (VBLENDPSYrri VR256:$src1,
               (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src2, sub_xmm),
               0xF)>;

  // Destination vector loaded from memory: the widened 128-bit value becomes
  // the register source, the load is folded as the second source, and the
  // immediate is 0xF0.
  def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1),
                              (iPTR 0)),
            (VBLENDPSYrmi
               (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src1, sub_xmm),
               addr:$src2, 0xF0)>;
  def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1),
                              (iPTR 0)),
            (VBLENDPSYrmi
               (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src1, sub_xmm),
               addr:$src2, 0xF0)>;
  def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1),
                              (iPTR 0)),
            (VBLENDPSYrmi
               (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src1, sub_xmm),
               addr:$src2, 0xF0)>;
  def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1),
                              (iPTR 0)),
            (VBLENDPSYrmi
               (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src1, sub_xmm),
               addr:$src2, 0xF0)>;
} // Predicates = [HasAVX]
7451
7452//===----------------------------------------------------------------------===//
7453// VPBROADCAST - Load from memory and broadcast to all elements of the
7454//               destination operand
7455//
/// avx2_broadcast - element broadcast, 128-bit (rr/rm) and 256-bit (Yrr/Yrm)
/// destinations.
///   x86memop   - scalar memory operand for the rm/Yrm forms.
///   bcast_frag - pattern fragment matched by the memory forms.
///   OpVT128 / OpVT256 - 128-bit and 256-bit vector types.
///   prd        - extra predicate combined with HasAVX2.
/// The register source of rr and Yrr is always VR128.
multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
                          X86MemOperand x86memop, PatFrag bcast_frag,
                          ValueType OpVT128, ValueType OpVT256, Predicate prd> {
  let Predicates = [HasAVX2, prd] in {
    // 128-bit destination.
    def rr : AVX28I<opc, MRMSrcReg,
                    (outs VR128:$dst), (ins VR128:$src),
                    OpcodeStr#"\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst,
                       (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
             Sched<[SchedWriteShuffle.XMM]>, VEX;
    def rm : AVX28I<opc, MRMSrcMem,
                    (outs VR128:$dst), (ins x86memop:$src),
                    OpcodeStr#"\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (OpVT128 (bcast_frag addr:$src)))]>,
             Sched<[SchedWriteShuffle.XMM.Folded]>, VEX;

    // 256-bit destination.
    def Yrr : AVX28I<opc, MRMSrcReg,
                     (outs VR256:$dst), (ins VR128:$src),
                     OpcodeStr#"\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst,
                        (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>,
              Sched<[WriteShuffle256]>, VEX, VEX_L;
    def Yrm : AVX28I<opc, MRMSrcMem,
                     (outs VR256:$dst), (ins x86memop:$src),
                     OpcodeStr#"\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst, (OpVT256 (bcast_frag addr:$src)))]>,
              Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L;

    // Provide aliases for broadcast from the same register class that
    // automatically does the extract.
    def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))),
              (!cast<Instruction>(NAME#"Yrr")
                  (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src), sub_xmm)))>;
  }
}
7488
// Byte/word/dword/qword integer broadcasts.  The byte and word forms are
// gated on NoVLX_Or_NoBWI, the dword and qword forms on NoVLX.
defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem,
                                   X86VBroadcastld8, v16i8, v32i8,
                                   NoVLX_Or_NoBWI>;
defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem,
                                   X86VBroadcastld16, v8i16, v16i16,
                                   NoVLX_Or_NoBWI>;
defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem,
                                   X86VBroadcastld32, v4i32, v8i32,
                                   NoVLX>;
defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem,
                                   X86VBroadcastld64, v2i64, v4i64,
                                   NoVLX>;
7497
// Extra load-folding patterns for the dword/qword broadcasts.
let Predicates = [HasAVX2, NoVLX] in {
  // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD.
  def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
            (VPBROADCASTQrm  addr:$src)>;
  def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
            (VPBROADCASTQYrm addr:$src)>;

  // FIXME this is to handle aligned extloads from i8/i16.
  def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
            (VPBROADCASTDrm  addr:$src)>;
  def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
            (VPBROADCASTDYrm addr:$src)>;
} // Predicates = [HasAVX2, NoVLX]
// Extra load-folding patterns for the word broadcast.
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
  // This means we'll encounter truncated i32 loads; match that here.
  def : Pat<(v8i16  (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
            (VPBROADCASTWrm  addr:$src)>;
  def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
            (VPBROADCASTWYrm addr:$src)>;

  // Same, for any-extending and zero-extending 16-bit loads.
  def : Pat<(v8i16  (X86VBroadcast
                      (i16 (trunc (i32 (extloadi16 addr:$src)))))),
            (VPBROADCASTWrm  addr:$src)>;
  def : Pat<(v8i16  (X86VBroadcast
                      (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
            (VPBROADCASTWrm  addr:$src)>;
  def : Pat<(v16i16 (X86VBroadcast
                      (i16 (trunc (i32 (extloadi16 addr:$src)))))),
            (VPBROADCASTWYrm addr:$src)>;
  def : Pat<(v16i16 (X86VBroadcast
                      (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
            (VPBROADCASTWYrm addr:$src)>;

  // FIXME this is to handle aligned extloads from i8.
  def : Pat<(v8i16  (X86VBroadcast (loadi16 addr:$src))),
            (VPBROADCASTWrm  addr:$src)>;
  def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))),
            (VPBROADCASTWYrm addr:$src)>;
} // Predicates = [HasAVX2, NoVLX_Or_NoBWI]
7537
let Predicates = [HasAVX2, NoVLX] in {
  // Provide fallback in case the load node that is used in the patterns above
  // is used by additional users, which prevents the pattern selection.
  // The scalar FP register is copied into VR128 and broadcast from there.
  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
            (VBROADCASTSSrr
               (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
            (VBROADCASTSSYrr
               (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
            (VBROADCASTSDYrr
               (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
} // Predicates = [HasAVX2, NoVLX]
7548
// Broadcast of an 8-bit or 16-bit GPR: place it in the low subregister of an
// otherwise undefined i32, move that to a vector register, then broadcast.
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
            (VPBROADCASTBrr
               (VMOVDI2PDIrr
                  (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                      GR8:$src, sub_8bit))))>;
  def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
            (VPBROADCASTBYrr
               (VMOVDI2PDIrr
                  (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                      GR8:$src, sub_8bit))))>;

  def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
            (VPBROADCASTWrr
               (VMOVDI2PDIrr
                  (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                      GR16:$src, sub_16bit))))>;
  def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
            (VPBROADCASTWYrr
               (VMOVDI2PDIrr
                  (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                      GR16:$src, sub_16bit))))>;
} // Predicates = [HasAVX2, NoVLX_Or_NoBWI]
// Broadcast of a 32-bit or 64-bit GPR: move it to a vector register, then
// broadcast.
let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
            (VPBROADCASTDrr  (VMOVDI2PDIrr GR32:$src))>;
  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
            (VPBROADCASTDYrr (VMOVDI2PDIrr GR32:$src))>;
  def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
            (VPBROADCASTQrr  (VMOV64toPQIrr GR64:$src))>;
  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
            (VPBROADCASTQYrr (VMOV64toPQIrr GR64:$src))>;
} // Predicates = [HasAVX2, NoVLX]
7578
// AVX1 broadcast patterns: integer broadcast-loads are selected to the
// vbroadcastss / vbroadcastsd memory forms.
let Predicates = [HasAVX1Only] in {
  def : Pat<(v8i32 (X86VBroadcastld32 addr:$src)),
            (VBROADCASTSSYrm addr:$src)>;
  def : Pat<(v4i64 (X86VBroadcastld64 addr:$src)),
            (VBROADCASTSDYrm addr:$src)>;
  def : Pat<(v4i32 (X86VBroadcastld32 addr:$src)),
            (VBROADCASTSSrm addr:$src)>;
} // Predicates = [HasAVX1Only]
7588
// Provide fallback in case the load node that is used in the patterns above
// is used by additional users, which prevents the pattern selection.
let Predicates = [HasAVX, NoVLX] in {
  // 128bit broadcasts:
  // Scalar f64 source, in a register or as a broadcast-load.
  def : Pat<(v2f64 (X86VBroadcast f64:$src)),
            (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
  def : Pat<(v2f64 (X86VBroadcastld64 addr:$src)),
            (VMOVDDUPrm addr:$src)>;

  // v2f64 source: register, simple load, or zero-extending 64-bit load.
  def : Pat<(v2f64 (X86VBroadcast v2f64:$src)),
            (VMOVDDUPrr VR128:$src)>;
  def : Pat<(v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))),
            (VMOVDDUPrm addr:$src)>;
  def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))),
            (VMOVDDUPrm addr:$src)>;
} // Predicates = [HasAVX, NoVLX]
7605
// AVX1-only register broadcasts.  128-bit results use a single shuffle;
// 256-bit results build the 128-bit splat and place it in both halves with
// INSERT_SUBREG (low) and VINSERTF128rr index 1 (high).
let Predicates = [HasAVX1Only] in {
  // Scalar FP sources.
  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
            (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>;
  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
            (VINSERTF128rr
               (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
                  (v4f32 (VPERMILPSri
                            (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)),
                  sub_xmm),
               (v4f32 (VPERMILPSri
                         (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)),
               1)>;
  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
            (VINSERTF128rr
               (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
                  (v2f64 (VMOVDDUPrr
                            (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))),
                  sub_xmm),
               (v2f64 (VMOVDDUPrr
                         (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))),
               1)>;

  // GPR sources.
  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
            (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)>;
  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
            (VINSERTF128rr
               (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                  (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)),
                  sub_xmm),
               (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)),
               1)>;
  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
            (VINSERTF128rr
               (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
                  (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)),
                  sub_xmm),
               (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)),
               1)>;

  // 128-bit i64 broadcasts, from a register and from memory.
  def : Pat<(v2i64 (X86VBroadcast i64:$src)),
            (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)>;
  def : Pat<(v2i64 (X86VBroadcastld64 addr:$src)),
            (VMOVDDUPrm addr:$src)>;
} // Predicates = [HasAVX1Only]
7634
7635//===----------------------------------------------------------------------===//
7636// VPERM - Permute instructions
7637//
7638
/// avx2_perm - 256-bit variable permute (X86VPermv), register (Yrr) and
/// memory (Yrm) forms.
///   OpVT  - vector type the pattern is matched at.
///   sched - scheduling class (Yrm uses .Folded/.ReadAfterFold).
///   memOp - memory operand for the Yrm form.
multiclass avx2_perm<bits<8> opc, string OpcodeStr,
                     ValueType OpVT, X86FoldableSchedWrite sched,
                     X86MemOperand memOp> {
  let Predicates = [HasAVX2, NoVLX] in {
    def Yrr : AVX28I<opc, MRMSrcReg,
                     (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
                     OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set VR256:$dst,
                        (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
              Sched<[sched]>, VEX_4V, VEX_L;

    def Yrm : AVX28I<opc, MRMSrcMem,
                     (outs VR256:$dst), (ins VR256:$src1, memOp:$src2),
                     OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set VR256:$dst,
                        (OpVT (X86VPermv VR256:$src1, (load addr:$src2))))]>,
              Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V, VEX_L;
  }
}
7660
// Variable dword permutes: integer (vpermd) and packed-single (vpermps).
defm VPERMD  : avx2_perm<0x36, "vpermd",  v8i32, WriteVarShuffle256,
                         i256mem>;
let ExeDomain = SSEPackedSingle in {
  defm VPERMPS : avx2_perm<0x16, "vpermps", v8f32, WriteFVarShuffle256,
                           f256mem>;
}
7664
/// avx2_perm_imm - 256-bit permute by immediate (X86VPermi), register (Yri)
/// and memory (Ymi) forms.
///   mem_frag - load fragment matched by the Ymi form.
///   OpVT     - vector type the pattern is matched at.
///   Sched    - scheduling class; Ymi uses its .Folded variant.
///   memOp    - memory operand for the Ymi form.
multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                         ValueType OpVT, X86FoldableSchedWrite Sched,
                         X86MemOperand memOp> {
  let Predicates = [HasAVX2, NoVLX] in {
    def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
                       (ins VR256:$src1, u8imm:$src2),
                       !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
                         (OpVT (X86VPermi VR256:$src1, (i8 timm:$src2))))]>,
                       Sched<[Sched]>, VEX, VEX_L;
    // The only non-immediate source is the memory operand, so there is no
    // register source for a ReadAfterFold read to describe.  Listing one
    // would attach it to the first use operand, i.e. the address base
    // register.  Only the folded write is listed, as in the unary `mi` form
    // of avx_permil.
    def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
                       (ins memOp:$src1, u8imm:$src2),
                       !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
                         (OpVT (X86VPermi (mem_frag addr:$src1),
                                (i8 timm:$src2))))]>,
                       Sched<[Sched.Folded]>, VEX, VEX_L;
  }
}
7686
// Qword permutes by immediate: integer (vpermq) and packed-double (vpermpd).
// Both set VEX_W.
defm VPERMQ  : avx2_perm_imm<0x00, "vpermq",  loadv4i64, v4i64,
                             WriteShuffle256, i256mem>, VEX_W;
let ExeDomain = SSEPackedDouble in {
  defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
                               WriteFShuffle256, f256mem>, VEX_W;
}
7692
7693//===----------------------------------------------------------------------===//
// VPERM2I128 - Permute Integer Values in 128-bit chunks
7695//
// Register-register form of vperm2i128, matched at v4i64 and flagged
// commutable.
let isCommutable = 1 in {
  def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg,
      (outs VR256:$dst),
      (ins VR256:$src1, VR256:$src2, u8imm:$src3),
      "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
      [(set VR256:$dst,
         (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
                               (i8 timm:$src3))))]>,
      Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
}
// Memory form of vperm2i128: the second source is a folded v4i64 load.
// The memory operand is i256mem, consistent with the other integer-domain
// memory forms in this file (e.g. VPERMQ, VPBLENDDY, VINSERTI128rm).
def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, i256mem:$src2, u8imm:$src3),
          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
                             (i8 timm:$src3)))]>,
          Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
7709
// v4i64 vperm2x128 whose first operand is the load: select the rm form with
// the operands swapped and the immediate rewritten by Perm2XCommuteImm.
let Predicates = [HasAVX2] in {
  def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2), VR256:$src1,
                                  (i8 timm:$imm))),
            (VPERM2I128rm VR256:$src1, addr:$src2,
                          (Perm2XCommuteImm timm:$imm))>;
}
7714
7715
7716//===----------------------------------------------------------------------===//
7717// VINSERTI128 - Insert packed integer values
7718//
// vinserti128 register and memory forms.  Neither carries a selection
// pattern here, so the side-effect and load flags are stated explicitly.
let hasSideEffects = 0 in {
  def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg,
      (outs VR256:$dst),
      (ins VR256:$src1, VR128:$src2, u8imm:$src3),
      "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
      []>,
      Sched<[WriteShuffle256]>, VEX_4V, VEX_L;

  let mayLoad = 1 in {
    def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem,
        (outs VR256:$dst),
        (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
        "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
        []>,
        Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>,
        VEX_4V, VEX_L;
  }
} // hasSideEffects = 0
7730
// Selection patterns for VINSERTI128, one instantiation per integer
// element width.
let Predicates = [HasAVX2, NoVLX] in {
  defm : vinsert_lowering<"VINSERTI128", v2i64, v4i64,  loadv2i64>;
  defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32,  loadv4i32>;
  defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv8i16>;
  defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8,  loadv16i8>;
} // Predicates = [HasAVX2, NoVLX]
7737
7738//===----------------------------------------------------------------------===//
7739// VEXTRACTI128 - Extract packed integer values
7740//
// vextracti128 register and store forms.  Neither carries a selection
// pattern here.
def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg,
    (outs VR128:$dst),
    (ins VR256:$src1, u8imm:$src2),
    "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
    Sched<[WriteShuffle256]>, VEX, VEX_L;

let hasSideEffects = 0, mayStore = 1 in {
  def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem,
      (outs),
      (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
      "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
      Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_L;
}
7750
// Selection patterns for VEXTRACTI128, one instantiation per integer
// element width.
let Predicates = [HasAVX2, NoVLX] in {
  defm : vextract_lowering<"VEXTRACTI128", v4i64,  v2i64>;
  defm : vextract_lowering<"VEXTRACTI128", v8i32,  v4i32>;
  defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
  defm : vextract_lowering<"VEXTRACTI128", v32i8,  v16i8>;
} // Predicates = [HasAVX2, NoVLX]
7757
7758//===----------------------------------------------------------------------===//
7759// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
7760//
/// avx2_pmovmask - masked integer load (rm/Yrm, opcode 0x8C) and masked
/// store (mr/Ymr, opcode 0x8E), each in 128-bit and 256-bit form.
///   IntLd128 / IntLd256 - intrinsics matched by the load forms.
///   IntSt128 / IntSt256 - intrinsics matched by the store forms.
multiclass avx2_pmovmask<string OpcodeStr,
                         Intrinsic IntLd128, Intrinsic IntLd256,
                         Intrinsic IntSt128, Intrinsic IntSt256> {
  // Masked loads: $src1 is the register operand, $src2 the address.
  def rm  : AVX28I<0x8C, MRMSrcMem,
                   (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
                   OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                   [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>,
            VEX_4V, Sched<[WriteVecMaskedLoad]>;
  def Yrm : AVX28I<0x8C, MRMSrcMem,
                   (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
                   OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                   [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
            VEX_4V, VEX_L, Sched<[WriteVecMaskedLoadY]>;

  // Masked stores: $dst is the address, $src1 and $src2 are registers.
  def mr  : AVX28I<0x8E, MRMDestMem,
                   (outs), (ins i128mem:$dst, VR128:$src1, VR128:$src2),
                   OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                   [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>,
            VEX_4V, Sched<[WriteVecMaskedStore]>;
  def Ymr : AVX28I<0x8E, MRMDestMem,
                   (outs), (ins i256mem:$dst, VR256:$src1, VR256:$src2),
                   OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                   [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
            VEX_4V, VEX_L, Sched<[WriteVecMaskedStoreY]>;
}
7785
// Dword and qword masked moves; the qword variant additionally sets VEX_W.
defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
                                int_x86_avx2_maskload_d,
                                int_x86_avx2_maskload_d_256,
                                int_x86_avx2_maskstore_d,
                                int_x86_avx2_maskstore_d_256>;

defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
                                int_x86_avx2_maskload_q,
                                int_x86_avx2_maskload_q_256,
                                int_x86_avx2_maskstore_q,
                                int_x86_avx2_maskstore_q_256>,
                  VEX_W;
7796
/// maskmov_lowering - select generic masked_store / masked_load nodes to the
/// "mr" / "rm" forms of the instruction named by InstrStr.
///   RC     - register class of both the data and the mask.
///   VT     - data vector type.
///   MaskVT - mask vector type.
/// Loads are matched only when the pass-through operand is undef or all
/// zeros.
multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
                          ValueType MaskVT> {
  // masked store
  def : Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)),
            (!cast<Instruction>(InstrStr#"mr")
                addr:$ptr, RC:$mask, RC:$src)>;

  // masked load, pass-through undef
  def : Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)),
            (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;

  // masked load, pass-through all zeros
  def : Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask),
                             (VT immAllZerosV))),
            (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
}
// FP masked loads/stores map to vmaskmovps / vmaskmovpd.
let Predicates = [HasAVX] in {
  defm : maskmov_lowering<"VMASKMOVPS",  VR128, v4f32, v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD",  VR128, v2f64, v2i64>;
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64>;
} // Predicates = [HasAVX]

let Predicates = [HasAVX1Only] in {
  // load/store i32/i64 not supported use ps/pd version
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64>;
  defm : maskmov_lowering<"VMASKMOVPS",  VR128, v4i32, v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD",  VR128, v2i64, v2i64>;
} // Predicates = [HasAVX1Only]

// With AVX2 the integer types use vpmaskmovd / vpmaskmovq.
let Predicates = [HasAVX2] in {
  defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32>;
  defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64>;
  defm : maskmov_lowering<"VPMASKMOVD",  VR128, v4i32, v4i32>;
  defm : maskmov_lowering<"VPMASKMOVQ",  VR128, v2i64, v2i64>;
} // Predicates = [HasAVX2]
7828
7829//===----------------------------------------------------------------------===//
7830// SubVector Broadcasts
7831// Provide fallback in case the load node that is used in the patterns above
7832// is used by additional users, which prevents the pattern selection.
7833
// Register-source X86SubVBroadcast lowering.  The 128-bit source is first
// placed into the sub_xmm subregister of an otherwise undefined 256-bit value
// (INSERT_SUBREG over IMPLICIT_DEF) and is then inserted a second time with
// VINSERTF128rr using immediate 1.
//   DstVT - 256-bit result type
//   SrcVT - 128-bit source type
multiclass avx_subvbroadcast_reg_lowering<ValueType DstVT, ValueType SrcVT> {
  def : Pat<(DstVT (X86SubVBroadcast (SrcVT VR128:$src))),
            (VINSERTF128rr
               (INSERT_SUBREG (DstVT (IMPLICIT_DEF)), VR128:$src, sub_xmm),
               (SrcVT VR128:$src), 1)>;
}

let Predicates = [HasAVX, NoVLX] in {
  // Floating-point element types.
  defm : avx_subvbroadcast_reg_lowering<v4f64, v2f64>;
  defm : avx_subvbroadcast_reg_lowering<v8f32, v4f32>;

  // Integer element types.
  // NOTE: We're using FP instructions here, but execution domain fixing can
  // convert to integer when profitable.
  defm : avx_subvbroadcast_reg_lowering<v4i64,  v2i64>;
  defm : avx_subvbroadcast_reg_lowering<v8i32,  v4i32>;
  defm : avx_subvbroadcast_reg_lowering<v16i16, v8i16>;
  defm : avx_subvbroadcast_reg_lowering<v32i8,  v16i8>;
}
7859
7860//===----------------------------------------------------------------------===//
7861// Variable Bit Shifts
7862//
// AVX2 per-element variable shift.  Emits four records: rr / rm for the
// 128-bit form and Yrr / Yrm for the 256-bit form (the latter add VEX_L).
//   opc       - opcode byte
//   OpcodeStr - assembly mnemonic
//   OpNode    - SelectionDAG node matched by the patterns
//   vt128     - value type used by the 128-bit patterns
//   vt256     - value type used by the 256-bit patterns
multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType vt128, ValueType vt256> {
  // 128-bit, second source in a register.
  def rr  : AVX28I<opc, MRMSrcReg,
                   (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                   OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                   [(set VR128:$dst,
                         (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
            VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>;

  // 128-bit, second source loaded from memory.
  def rm  : AVX28I<opc, MRMSrcMem,
                   (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
                   OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                   [(set VR128:$dst,
                         (vt128 (OpNode VR128:$src1,
                                        (vt128 (load addr:$src2)))))]>,
            VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded,
                           SchedWriteVarVecShift.XMM.ReadAfterFold]>;

  // 256-bit, second source in a register.
  def Yrr : AVX28I<opc, MRMSrcReg,
                   (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
                   OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                   [(set VR256:$dst,
                         (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
            VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>;

  // 256-bit, second source loaded from memory.
  def Yrm : AVX28I<opc, MRMSrcMem,
                   (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
                   OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                   [(set VR256:$dst,
                         (vt256 (OpNode VR256:$src1,
                                        (vt256 (load addr:$src2)))))]>,
            VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded,
                                  SchedWriteVarVecShift.YMM.ReadAfterFold]>;
}
7894
// Variable shift instantiations.  The dword and qword flavours of a given
// node share an opcode byte; the qword flavour adds VEX_W.  X86vsrav is only
// instantiated for dword elements here.
let Predicates = [HasAVX2, NoVLX] in {
  // X86vshlv
  defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", X86vshlv, v4i32, v8i32>;
  defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", X86vshlv, v2i64, v4i64>,
                 VEX_W;

  // X86vsrlv
  defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", X86vsrlv, v4i32, v8i32>;
  defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", X86vsrlv, v2i64, v4i64>,
                 VEX_W;

  // X86vsrav
  defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", X86vsrav, v4i32, v8i32>;
}
7902
7903//===----------------------------------------------------------------------===//
7904// VGATHER - GATHER Operations
7905
// FIXME: Improve scheduling of gather instructions.
//
// AVX2 gather.  Emits a 128-bit record (rm) and a second record (Yrm, with
// VEX_L) whose register class is RC256.  Both records define two results:
// the gathered data ($dst) and the written-back mask ($mask_wb).
//   opc           - opcode byte
//   OpcodeStr     - assembly mnemonic
//   VTx / VTy     - data value type for the rm / Yrm record
//   GatherNode128 - pattern fragment matched by the rm record
//   GatherNode256 - pattern fragment matched by the Yrm record
//   RC256         - register class of the Yrm data and mask operands
//   memop128      - memory operand type of the rm record
//   memop256      - memory operand type of the Yrm record
//   MTx / MTy     - value type of the written-back mask; defaults to the
//                   corresponding data type
multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx,
                       ValueType VTy, PatFrag GatherNode128,
                       PatFrag GatherNode256, RegisterClass RC256,
                       X86MemOperand memop128, X86MemOperand memop256,
                       ValueType MTx = VTx, ValueType MTy = VTy> {
  def rm  : AVX28I<opc, MRMSrcMem4VOp3,
                   (outs VR128:$dst, VR128:$mask_wb),
                   (ins VR128:$src1, memop128:$src2, VR128:$mask),
                   OpcodeStr # "\t{$mask, $src2, $dst|$dst, $src2, $mask}",
                   [(set (VTx VR128:$dst), (MTx VR128:$mask_wb),
                         (GatherNode128 VR128:$src1, VR128:$mask,
                                        vectoraddr:$src2))]>,
            VEX, Sched<[WriteLoad]>;

  def Yrm : AVX28I<opc, MRMSrcMem4VOp3,
                   (outs RC256:$dst, RC256:$mask_wb),
                   (ins RC256:$src1, memop256:$src2, RC256:$mask),
                   OpcodeStr # "\t{$mask, $src2, $dst|$dst, $src2, $mask}",
                   [(set (VTy RC256:$dst), (MTy RC256:$mask_wb),
                         (GatherNode256 RC256:$src1, RC256:$mask,
                                        vectoraddr:$src2))]>,
            VEX, VEX_L, Sched<[WriteLoad]>;
}
7929
// Gather instantiations.  Every record is a load without other side effects.
// Both results are early-clobber, $dst is tied to $src1, and $mask_wb is
// tied to $mask.
let Predicates = [HasAVX2], mayLoad = 1, hasSideEffects = 0,
    Constraints = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb" in {
  // Integer data.  The mask type defaults to the data type.
  defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64,
                                mgatherv4i32, mgatherv4i32,
                                VR256, vx128mem, vx256mem>, VEX_W;
  defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64,
                                mgatherv2i64, mgatherv4i64,
                                VR256, vx128mem, vy256mem>, VEX_W;
  defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32,
                                mgatherv4i32, mgatherv8i32,
                                VR256, vx128mem, vy256mem>;
  defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32,
                                mgatherv2i64, mgatherv4i64,
                                VR128, vx64mem, vy128mem>;

  // Double-precision data with an explicit integer mask type.
  let ExeDomain = SSEPackedDouble in {
    defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64,
                                  mgatherv4i32, mgatherv4i32,
                                  VR256, vx128mem, vx256mem,
                                  v2i64, v4i64>, VEX_W;
    defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64,
                                  mgatherv2i64, mgatherv4i64,
                                  VR256, vx128mem, vy256mem,
                                  v2i64, v4i64>, VEX_W;
  }

  // Single-precision data with an explicit integer mask type.
  let ExeDomain = SSEPackedSingle in {
    defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32,
                                  mgatherv4i32, mgatherv8i32,
                                  VR256, vx128mem, vy256mem,
                                  v4i32, v8i32>;
    defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32,
                                  mgatherv2i64, mgatherv4i64,
                                  VR128, vx64mem, vy128mem,
                                  v4i32, v4i32>;
  }
}
7962
7963//===----------------------------------------------------------------------===//
7964// GFNI instructions
7965//===----------------------------------------------------------------------===//
7966
// GF2P8MULB register/register (rr) and register/memory (rm) forms, opcode
// 0xCF.
//   OpcodeStr - assembly mnemonic
//   OpVT      - value type of the result
//   RC        - register class of $dst and the register sources
//   MemOpFrag - pattern fragment used to match the memory source
//   X86MemOp  - memory operand type of the rm form
//   Is2Addr   - when set, the two-operand assembly syntax is used ($src1 is
//               not printed)
// NOTE(review): both records use the SchedWriteVecALU.XMM scheduling classes
// regardless of RC.
multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
                        RegisterClass RC, PatFrag MemOpFrag,
                        X86MemOperand X86MemOp, bit Is2Addr = 0> {
  // The asm string is supplied here, so the records pass "" for it.
  let ExeDomain = SSEPackedInt,
      AsmString = !strconcat(OpcodeStr,
                    !if(Is2Addr,
                        "\t{$src2, $dst|$dst, $src2}",
                        "\t{$src2, $src1, $dst|$dst, $src1, $src2}")) in {
    // Only the register form is marked commutable.
    let isCommutable = 1 in {
      def rr : PDI<0xCF, MRMSrcReg,
                   (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
                   [(set RC:$dst,
                         (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
               Sched<[SchedWriteVecALU.XMM]>, T8PD;
    }

    def rm : PDI<0xCF, MRMSrcMem,
                 (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
                 [(set RC:$dst,
                       (OpVT (X86GF2P8mulb RC:$src1,
                                           (MemOpFrag addr:$src2))))]>,
             Sched<[SchedWriteVecALU.XMM.Folded,
                    SchedWriteVecALU.XMM.ReadAfterFold]>, T8PD;
  }
}
7985
// GF2P8AFFINE-style instruction with an 8-bit immediate: register/register
// (rri) and register/memory (rmi) forms.
//   Op        - opcode byte
//   OpStr     - assembly mnemonic
//   OpVT      - value type of the result
//   OpNode    - SelectionDAG node matched by the patterns
//   RC        - register class of $dst and the register sources
//   MemOpFrag - pattern fragment used to match the memory source
//   X86MemOp  - memory operand type of the rmi form
//   Is2Addr   - when set, the shorter assembly syntax is used ($src1 is not
//               printed)
multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
                           SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag,
                           X86MemOperand X86MemOp, bit Is2Addr = 0> {
  // The asm string is supplied here, so the records pass "" for it.
  let AsmString = !strconcat(OpStr,
                    !if(Is2Addr,
                        "\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                        "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")) in {
    def rri : Ii8<Op, MRMSrcReg,
                  (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$src3), "",
                  [(set RC:$dst,
                        (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))],
                  SSEPackedInt>,
              Sched<[SchedWriteVecALU.XMM]>;

    def rmi : Ii8<Op, MRMSrcMem,
                  (outs RC:$dst),
                  (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
                  [(set RC:$dst,
                        (OpVT (OpNode RC:$src1,
                                      (MemOpFrag addr:$src2),
                                      timm:$src3)))],
                  SSEPackedInt>,
              Sched<[SchedWriteVecALU.XMM.Folded,
                     SchedWriteVecALU.XMM.ReadAfterFold]>;
  }
}
8004
// Instantiates the three encodings of a GF2P8AFFINE-style instruction:
//   NAME       - legacy SSE encoding, 128-bit, destructive ($src1 tied to
//                $dst, two-address assembly syntax)
//   V<NAME>    - VEX encoding, 128-bit
//   V<NAME>Y   - VEX encoding, 256-bit (VEX_L)
// Parameters:
//   Op     - opcode byte
//   OpStr  - assembly mnemonic of the legacy form; the VEX forms prepend "v"
//   OpNode - SelectionDAG node matched by the patterns
multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
  // The legacy SSE form uses `memop` rather than `load` as its memory
  // fragment, matching the legacy SSE GF2P8MULB definition in this file.
  // Non-VEX 128-bit memory operands must be 16-byte aligned, so an
  // arbitrary (possibly unaligned) load must not be folded into it.
  let Constraints = "$src1 = $dst",
      Predicates  = [HasGFNI, UseSSE2] in
  defm NAME         : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
                                      VR128, memop, i128mem, 1>;
  // The VEX forms keep the plain `load` fragment.
  let Predicates  = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
    defm V##NAME    : GF2P8AFFINE_rmi<Op, "v"##OpStr, v16i8, OpNode, VR128,
                                      load, i128mem>, VEX_4V, VEX_W;
    defm V##NAME##Y : GF2P8AFFINE_rmi<Op, "v"##OpStr, v32i8, OpNode, VR256,
                                      load, i256mem>, VEX_4V, VEX_L, VEX_W;
  }
}
8017
// GF2P8MULB
// Legacy SSE form: destructive ($src1 tied to $dst), two-address assembly
// syntax, memory source matched through `memop`.
let Constraints = "$src1 = $dst", Predicates = [HasGFNI, UseSSE2] in {
  defm GF2P8MULB : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop, i128mem, 1>;
}

// VEX forms: 128-bit and 256-bit (VEX_L), memory source matched through
// `load`.
let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
  defm VGF2P8MULB  : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load, i128mem>,
                     VEX_4V;
  defm VGF2P8MULBY : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load, i256mem>,
                     VEX_4V, VEX_L;
}
// GF2P8AFFINEINVQB, GF2P8AFFINEQB
// Both are explicitly marked non-commutable and share the TAPD prefix; they
// differ only in opcode byte, mnemonic and SelectionDAG node.
let isCommutable = 0 in {
  defm GF2P8AFFINEINVQB
      : GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb", X86GF2P8affineinvqb>,
        TAPD;
  defm GF2P8AFFINEQB
      : GF2P8AFFINE_common<0xCE, "gf2p8affineqb", X86GF2P8affineqb>,
        TAPD;
}
8036
8037