xref: /freebsd/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSSE.td (revision ca53e5aedfebcc1b4091b68e01b2d5cae923f85e)
1//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file describes the X86 SSE instruction set, defining the instructions,
10// and properties of the instructions which are needed for code generation,
11// machine code emission, and analysis.
12//
13//===----------------------------------------------------------------------===//
14
15//===----------------------------------------------------------------------===//
16// SSE 1 & 2 Instructions Classes
17//===----------------------------------------------------------------------===//
18
/// sse12_fp_scalar - Scalar FP instruction forms shared by SSE1 and SSE2.
///
/// Produces two codegen-only records:
///   rr - register/register form, flagged commutable.
///   rm - register/memory form whose second source is a folded load.
/// Is2Addr picks the two-operand asm string; otherwise the three-operand
/// string (with an explicit $src1) is used.
multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, X86MemOperand x86memop,
                           Domain d, X86FoldableSchedWrite sched,
                           bit Is2Addr = 1> {
  let isCodeGenOnly = 1, isCommutable = 1 in
  def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
              !if(Is2Addr,
                  OpcodeStr # "\t{$src2, $dst|$dst, $src2}",
                  OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], d>,
           Sched<[sched]>;

  let isCodeGenOnly = 1 in
  def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
              !if(Is2Addr,
                  OpcodeStr # "\t{$src2, $dst|$dst, $src2}",
                  OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>,
           Sched<[sched.Folded, sched.ReadAfterFold]>;
}
41
/// sse12_fp_scalar_int - Intrinsic (_Int) scalar FP forms for SSE1 and SSE2.
///
/// Both records are marked hasSideEffects = 0; the memory form additionally
/// sets mayLoad and reads its second source through mem_frags. Is2Addr
/// selects the two-operand asm string over the three-operand one.
multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
                               SDPatternOperator OpNode, RegisterClass RC,
                               ValueType VT, string asm, Operand memopr,
                               PatFrags mem_frags, Domain d,
                               X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let hasSideEffects = 0 in
  def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst),
                      (ins RC:$src1, RC:$src2),
                      !if(Is2Addr,
                          asm # "\t{$src2, $dst|$dst, $src2}",
                          asm # "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], d>,
               Sched<[sched]>;

  let hasSideEffects = 0, mayLoad = 1 in
  def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst),
                      (ins RC:$src1, memopr:$src2),
                      !if(Is2Addr,
                          asm # "\t{$src2, $dst|$dst, $src2}",
                          asm # "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set RC:$dst,
                            (VT (OpNode RC:$src1, (mem_frags addr:$src2))))],
                      d>,
               Sched<[sched.Folded, sched.ReadAfterFold]>;
}
64
/// sse12_fp_packed - Packed FP instruction forms shared by SSE1 and SSE2.
///
///   rr - register/register form, flagged commutable; result typed as vt.
///   rm - register/memory form (mayLoad); second source read via mem_frag.
/// Is2Addr selects the two-operand asm string over the three-operand one.
multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, ValueType vt,
                           X86MemOperand x86memop, PatFrag mem_frag,
                           Domain d, X86FoldableSchedWrite sched,
                           bit Is2Addr = 1> {
  let isCommutable = 1 in {
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
                !if(Is2Addr,
                    OpcodeStr # "\t{$src2, $dst|$dst, $src2}",
                    OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>,
             Sched<[sched]>;
  }

  let mayLoad = 1 in {
    def rm : PI<opc, MRMSrcMem, (outs RC:$dst),
                (ins RC:$src1, x86memop:$src2),
                !if(Is2Addr,
                    OpcodeStr # "\t{$src2, $dst|$dst, $src2}",
                    OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))], d>,
             Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}
87
/// sse12_fp_packed_logical_rm - Packed SSE1/SSE2 forms whose selection
/// patterns are supplied by the caller.
///
///   rr - register/register form (commutable, no side effects); uses pat_rr.
///   rm - register/memory form (mayLoad, no side effects); uses pat_rm.
/// Is2Addr selects the two-operand asm string over the three-operand one.
multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
                                      string OpcodeStr, X86MemOperand x86memop,
                                      X86FoldableSchedWrite sched,
                                      list<dag> pat_rr, list<dag> pat_rm,
                                      bit Is2Addr = 1> {
  let hasSideEffects = 0, isCommutable = 1 in {
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
                !if(Is2Addr,
                    OpcodeStr # "\t{$src2, $dst|$dst, $src2}",
                    OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                pat_rr, d>,
             Sched<[sched]>;
  }

  let hasSideEffects = 0, mayLoad = 1 in {
    def rm : PI<opc, MRMSrcMem, (outs RC:$dst),
                (ins RC:$src1, x86memop:$src2),
                !if(Is2Addr,
                    OpcodeStr # "\t{$src2, $dst|$dst, $src2}",
                    OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                pat_rm, d>,
             Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}
109
110
// Pseudo instructions that materialize a floating-point zero (fld0). They
// stand in for xorps (SSE) or vxorps (AVX) and are expanded by
// ExpandPostRAPseudos.
let isPseudo = 1, SchedRW = [WriteZero], isReMaterializable = 1,
    isAsCheapAsAMove = 1, canFoldAsLoad = 1 in {
  // f32 zero in an FR32 register.
  def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
                   [(set FR32:$dst, fp32imm0)]>,
                 Requires<[HasSSE1, NoAVX512]>;

  // f64 zero in an FR64 register.
  def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
                   [(set FR64:$dst, fp64imm0)]>,
                 Requires<[HasSSE2, NoAVX512]>;

  // f128 zero in a VR128 register.
  def FsFLD0F128 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                     [(set VR128:$dst, fp128imm0)]>,
                   Requires<[HasSSE1, NoAVX512]>;
}
122
123//===----------------------------------------------------------------------===//
124// AVX & SSE - Zero/One Vectors
125//===----------------------------------------------------------------------===//
126
// V_SET0 - Pseudo for an all-zeros 128-bit vector (pxor / xorp* on SSE).
// ExpandPostRAPseudos turns it into xorps / vxorps, and ExecutionDomainFix
// may then swizzle that to pxor.
// canFoldAsLoad is set because the pseudo can instead become a constant-pool
// load of an all-zeros value when folding it is beneficial.
let Predicates = [NoAVX512], isPseudo = 1, SchedRW = [WriteZero],
    isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1 in
def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
               [(set VR128:$dst, (v4f32 immAllZerosV))]>;
137
// Select V_SET0 for all-zeros vectors of the remaining 128-bit types (v4f32
// is matched by the pattern attached to V_SET0 itself).
let Predicates = [NoAVX512] in {
  foreach ZeroVT = [v16i8, v8i16, v4i32, v2i64, v2f64] in
    def : Pat<(ZeroVT immAllZerosV), (V_SET0)>;
}
145
146
// AVX_SET0 - 256-bit counterpart of V_SET0. The 256-bit AVX1 ISA doesn't
// support packed-integer instructions, and doesn't need them here: on Sandy
// Bridge the register is zeroed at the rename stage without using any
// execution unit, so this FP-domain zeroing pseudo can feed vector integer
// instructions without penalty.
let Predicates = [NoAVX512], isPseudo = 1, SchedRW = [WriteZero],
    isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1 in
def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                 [(set VR256:$dst, (v8i32 immAllZerosV))]>;
156
// Select AVX_SET0 for all-zeros vectors of the remaining 256-bit types (v8i32
// is matched by the pattern attached to AVX_SET0 itself).
let Predicates = [NoAVX512] in {
  foreach ZeroVT = [v32i8, v16i16, v4i64, v8f32, v4f64] in
    def : Pat<(ZeroVT immAllZerosV), (AVX_SET0)>;
}
164
// All-ones vector pseudos. canFoldAsLoad is set because each of them can be
// converted to a constant-pool load of an all-ones value if folding it would
// be beneficial.
let isPseudo = 1, SchedRW = [WriteZero], isReMaterializable = 1,
    isAsCheapAsAMove = 1, canFoldAsLoad = 1 in {
  // 128-bit all-ones.
  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;

  // 256-bit all-ones, AVX1-only targets when optimizing for minimum size.
  let Predicates = [HasAVX1Only, OptForMinSize] in
  def AVX1_SETALLONES: I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;

  // 256-bit all-ones when AVX2 is available.
  let Predicates = [HasAVX2] in {
    def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                            [(set VR256:$dst, (v8i32 immAllOnesV))]>;
  }
}
179
180//===----------------------------------------------------------------------===//
181// SSE 1 & 2 - Move FP Scalar Instructions
182//
// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
// register copies because it's a partial register update. Register-to-register
// movss/movsd is also not modeled as an INSERT_SUBREG, because INSERT_SUBREG
// requires that the insert be implementable in terms of a copy and, as just
// mentioned, we don't use movss/movsd for copies.
188//===----------------------------------------------------------------------===//
189
/// sse12_move_rr - Register/register movss/movsd forms on VR128.
///
///   rr     - opcode 0x10 (MRMSrcReg), commutable, selected via OpNode.
///   rr_REV - opcode 0x11 (MRMDestReg), pattern-less; exists for the
///            disassembler and is tied back to Name#rr through FoldGenData.
/// The asm string is base_opc followed by asm_opr.
multiclass sse12_move_rr<SDNode OpNode, ValueType vt,
                         X86MemOperand x86memop, string base_opc,
                         string asm_opr, Domain d, string Name> {
  let isCommutable = 1 in {
    def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
                (ins VR128:$src1, VR128:$src2), base_opc # asm_opr,
                [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], d>,
             Sched<[SchedWriteFShuffle.XMM]>;
  }

  // For the disassembler
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
    def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2), base_opc # asm_opr, []>,
                 Sched<[SchedWriteFShuffle.XMM]>, FoldGenData<Name#rr>;
  }
}
207
/// sse12_move - movss/movsd register moves and stores, VEX and legacy forms.
///
/// For an instantiation named NAME this emits:
///   V#NAME rr / rr_REV  - VEX three-operand forms (UseAVX, OptForSize).
///   V#NAME#mr           - VEX store of RC to x86memop.
///   NAME rr / rr_REV    - legacy two-operand forms with $src1 tied to $dst
///                         (pred, NoSSE41_Or_OptForSize).
///   NAME#mr             - legacy store of RC to x86memop.
/// plus ".s"-suffixed assembler aliases that map onto the rr_REV records.
multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
                      X86MemOperand x86memop, string OpcodeStr,
                      Domain d, string Name, Predicate pred> {
  // AVX register/register forms.
  let Predicates = [UseAVX, OptForSize] in {
    defm V#NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                                d, "V"#Name>,
                  VEX_4V, VEX_LIG, VEX_WIG;
  }

  // AVX store.
  def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                     OpcodeStr # "\t{$src, $dst|$dst, $src}",
                     [(store RC:$src, addr:$dst)], d>,
                  VEX, VEX_LIG, Sched<[WriteFStore]>, VEX_WIG;

  // SSE1 & 2 register/register forms; the destination doubles as $src1.
  let Constraints = "$src1 = $dst",
      Predicates = [pred, NoSSE41_Or_OptForSize] in
  defm NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
                            "\t{$src2, $dst|$dst, $src2}", d, Name>;

  // SSE1 & 2 store.
  def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                   OpcodeStr # "\t{$src, $dst|$dst, $src}",
                   [(store RC:$src, addr:$dst)], d>,
                Sched<[WriteFStore]>;

  // ".s" spellings resolve to the reversed-encoding (rr_REV) records.
  def : InstAlias<!strconcat("v", OpcodeStr,
                     ".s\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  (!cast<Instruction>("V"#NAME#"rr_REV")
                     VR128:$dst, VR128:$src1, VR128:$src2),
                  0>;
  def : InstAlias<!strconcat(OpcodeStr, ".s\t{$src2, $dst|$dst, $src2}"),
                  (!cast<Instruction>(NAME#"rr_REV")
                     VR128:$dst, VR128:$src2),
                  0>;
}
241
/// sse12_move_rm - movss/movsd loads from memory.
///
/// The VR128 forms (V#NAME#rm, NAME#rm) match vzloadfrag, i.e. a load that
/// zeroes the upper bits. The codegen-only _alt forms produce the scalar
/// register class RC (FR32/FR64) and match mem_pat instead.
multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
                         PatFrag mem_pat, PatFrag vzloadfrag, string OpcodeStr,
                         Domain d> {
  // Vector-register destinations.
  def V#NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                     OpcodeStr # "\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
                  VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;

  def NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                   OpcodeStr # "\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
                Sched<[WriteFLoad]>;

  // _alt version uses FR32/FR64 register class.
  let isCodeGenOnly = 1 in
  def V#NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                         OpcodeStr # "\t{$src, $dst|$dst, $src}",
                         [(set RC:$dst, (mem_pat addr:$src))], d>,
                      VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;

  let isCodeGenOnly = 1 in
  def NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                       OpcodeStr # "\t{$src, $dst|$dst, $src}",
                       [(set RC:$dst, (mem_pat addr:$src))], d>,
                    Sched<[WriteFLoad]>;
}
267
// Register/register and store forms of movss / movsd.
defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem,
                        "movss", SSEPackedSingle, "MOVSS", UseSSE1>, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem,
                        "movsd", SSEPackedDouble, "MOVSD", UseSSE2>, XD;

// Load forms of movss / movsd; foldable as loads and rematerializable.
let canFoldAsLoad = 1, isReMaterializable = 1 in
defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32,
                           "movss", SSEPackedSingle>, XS;

let canFoldAsLoad = 1, isReMaterializable = 1 in
defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, X86vzload64,
                           "movsd", SSEPackedDouble>, XD;
279
// Selection patterns for the VEX scalar loads.
let Predicates = [UseAVX] in {
  // A scalar load placed into a vector is a single VMOVSS / VMOVSD load.
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (VMOVSSrm
               addr:$src)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (VMOVSDrm
               addr:$src)>;

  // The 256-bit zero-extending loads: do the 128-bit load and wrap the
  // result with SUBREG_TO_REG at sub_xmm.
  def : Pat<(v8f32 (X86vzload32 addr:$src)),
            (SUBREG_TO_REG (i32 0),
                           (VMOVSSrm addr:$src),
                           sub_xmm)>;
  def : Pat<(v4f64 (X86vzload64 addr:$src)),
            (SUBREG_TO_REG (i32 0),
                           (VMOVSDrm addr:$src),
                           sub_xmm)>;
}
294
let Predicates = [UseAVX, OptForSize] in {
  // X86vzmovl on a 128-bit vector: zero a VR128 with V_SET0, then VMOVSS the
  // source's low element into it.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VMOVSSrr (v4f32 (V_SET0)),
                      VR128:$src)>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VMOVSSrr (v4i32 (V_SET0)),
                      VR128:$src)>;

  // X86vzmovl on a 256-bit vector: run the same sequence on the low xmm
  // subregister and wrap the result with SUBREG_TO_REG.
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (SUBREG_TO_REG
               (i32 0),
               (v4f32 (VMOVSSrr
                         (v4f32 (V_SET0)),
                         (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))),
               sub_xmm)>;
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (SUBREG_TO_REG
               (i32 0),
               (v4i32 (VMOVSSrr
                         (v4i32 (V_SET0)),
                         (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))),
               sub_xmm)>;
}
313
// X86vzmovl for legacy SSE: zero a VR128 with V_SET0, then MOVSS the source's
// low element into it.
let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (MOVSSrr (v4f32 (V_SET0)),
                     VR128:$src)>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (MOVSSrr (v4i32 (V_SET0)),
                     VR128:$src)>;
}
322
// A scalar f64 load placed into a v2f64 is a single MOVSD load.
let Predicates = [UseSSE2] in {
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (MOVSDrm
               addr:$src)>;
}

// A scalar f32 load placed into a v4f32 is a single MOVSS load.
let Predicates = [UseSSE1] in {
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (MOVSSrm
               addr:$src)>;
}
330
331//===----------------------------------------------------------------------===//
332// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
333//===----------------------------------------------------------------------===//
334
/// sse12_mov_packed - Packed FP move: register copy plus load.
///
///   rr - register/register move (isMoveReg, no side effects, no pattern).
///   rm - load selected through ld_frag; foldable as a load and
///        rematerializable.
multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
                            X86MemOperand x86memop, PatFrag ld_frag,
                            string asm, Domain d,
                            X86SchedWriteMoveLS sched> {
  let hasSideEffects = 0, isMoveReg = 1 in {
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
                asm # "\t{$src, $dst|$dst, $src}", [], d>,
             Sched<[sched.RR]>;
  }

  let canFoldAsLoad = 1, isReMaterializable = 1 in {
    def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                asm # "\t{$src, $dst|$dst, $src}",
                [(set RC:$dst, (ld_frag addr:$src))], d>,
             Sched<[sched.RM]>;
  }
}
349
// VEX packed FP moves on 128-bit registers (rr and rm forms).
let Predicates = [HasAVX, NoVLX] in {
  defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
                                  "movaps", SSEPackedSingle,
                                  SchedWriteFMoveLS.XMM>, PS, VEX, VEX_WIG;
  defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
                                  "movapd", SSEPackedDouble,
                                  SchedWriteFMoveLS.XMM>, PD, VEX, VEX_WIG;
  defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
                                  "movups", SSEPackedSingle,
                                  SchedWriteFMoveLS.XMM>, PS, VEX, VEX_WIG;
  defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
                                  "movupd", SSEPackedDouble,
                                  SchedWriteFMoveLS.XMM>, PD, VEX, VEX_WIG;
}

// VEX packed FP moves on 256-bit registers (rr and rm forms).
let Predicates = [HasAVX, NoVLX] in {
  defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
                                   "movaps", SSEPackedSingle,
                                   SchedWriteFMoveLS.YMM>,
                  PS, VEX, VEX_L, VEX_WIG;
  defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64,
                                   "movapd", SSEPackedDouble,
                                   SchedWriteFMoveLS.YMM>,
                  PD, VEX, VEX_L, VEX_WIG;
  defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
                                   "movups", SSEPackedSingle,
                                   SchedWriteFMoveLS.YMM>,
                  PS, VEX, VEX_L, VEX_WIG;
  defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
                                   "movupd", SSEPackedDouble,
                                   SchedWriteFMoveLS.YMM>,
                  PD, VEX, VEX_L, VEX_WIG;
}
377
// Legacy single-precision packed moves (rr and rm forms).
let Predicates = [UseSSE1] in {
  defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
                                 "movaps", SSEPackedSingle,
                                 SchedWriteFMoveLS.XMM>, PS;
  defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
                                 "movups", SSEPackedSingle,
                                 SchedWriteFMoveLS.XMM>, PS;
}

// Legacy double-precision packed moves (rr and rm forms).
let Predicates = [UseSSE2] in {
  defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
                                 "movapd", SSEPackedDouble,
                                 SchedWriteFMoveLS.XMM>, PD;
  defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
                                 "movupd", SSEPackedDouble,
                                 SchedWriteFMoveLS.XMM>, PD;
}
394
// VEX packed FP stores from 128-bit registers. The aligned forms match
// alignedstore; the unaligned forms match a plain store.
let Predicates = [HasAVX, NoVLX], SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
  def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs),
                       (ins f128mem:$dst, VR128:$src),
                       "movaps\t{$src, $dst|$dst, $src}",
                       [(alignedstore (v4f32 VR128:$src), addr:$dst)]>,
                  VEX, VEX_WIG;
  def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs),
                       (ins f128mem:$dst, VR128:$src),
                       "movapd\t{$src, $dst|$dst, $src}",
                       [(alignedstore (v2f64 VR128:$src), addr:$dst)]>,
                  VEX, VEX_WIG;
  def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs),
                       (ins f128mem:$dst, VR128:$src),
                       "movups\t{$src, $dst|$dst, $src}",
                       [(store (v4f32 VR128:$src), addr:$dst)]>,
                  VEX, VEX_WIG;
  def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs),
                       (ins f128mem:$dst, VR128:$src),
                       "movupd\t{$src, $dst|$dst, $src}",
                       [(store (v2f64 VR128:$src), addr:$dst)]>,
                  VEX, VEX_WIG;
} // Predicates, SchedRW

// VEX packed FP stores from 256-bit registers.
let Predicates = [HasAVX, NoVLX], SchedRW = [SchedWriteFMoveLS.YMM.MR] in {
  def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs),
                        (ins f256mem:$dst, VR256:$src),
                        "movaps\t{$src, $dst|$dst, $src}",
                        [(alignedstore (v8f32 VR256:$src), addr:$dst)]>,
                   VEX, VEX_L, VEX_WIG;
  def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs),
                        (ins f256mem:$dst, VR256:$src),
                        "movapd\t{$src, $dst|$dst, $src}",
                        [(alignedstore (v4f64 VR256:$src), addr:$dst)]>,
                   VEX, VEX_L, VEX_WIG;
  def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs),
                        (ins f256mem:$dst, VR256:$src),
                        "movups\t{$src, $dst|$dst, $src}",
                        [(store (v8f32 VR256:$src), addr:$dst)]>,
                   VEX, VEX_L, VEX_WIG;
  def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs),
                        (ins f256mem:$dst, VR256:$src),
                        "movupd\t{$src, $dst|$dst, $src}",
                        [(store (v4f64 VR256:$src), addr:$dst)]>,
                   VEX, VEX_L, VEX_WIG;
} // Predicates, SchedRW
434
// Reversed-encoding (MRMDestReg) register moves, present for the
// disassembler. They carry no patterns and are linked to the forward rr
// records via FoldGenData.

// 128-bit forms.
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
  def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg,
                           (outs VR128:$dst), (ins VR128:$src),
                           "movaps\t{$src, $dst|$dst, $src}", []>,
                      VEX, VEX_WIG, FoldGenData<"VMOVAPSrr">;
  def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg,
                           (outs VR128:$dst), (ins VR128:$src),
                           "movapd\t{$src, $dst|$dst, $src}", []>,
                      VEX, VEX_WIG, FoldGenData<"VMOVAPDrr">;
  def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg,
                           (outs VR128:$dst), (ins VR128:$src),
                           "movups\t{$src, $dst|$dst, $src}", []>,
                      VEX, VEX_WIG, FoldGenData<"VMOVUPSrr">;
  def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg,
                           (outs VR128:$dst), (ins VR128:$src),
                           "movupd\t{$src, $dst|$dst, $src}", []>,
                      VEX, VEX_WIG, FoldGenData<"VMOVUPDrr">;
} // isCodeGenOnly, ForceDisassemble, hasSideEffects, isMoveReg, SchedRW

// 256-bit forms.
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.YMM.RR] in {
  def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg,
                            (outs VR256:$dst), (ins VR256:$src),
                            "movaps\t{$src, $dst|$dst, $src}", []>,
                       VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPSYrr">;
  def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg,
                            (outs VR256:$dst), (ins VR256:$src),
                            "movapd\t{$src, $dst|$dst, $src}", []>,
                       VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPDYrr">;
  def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg,
                            (outs VR256:$dst), (ins VR256:$src),
                            "movups\t{$src, $dst|$dst, $src}", []>,
                       VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPSYrr">;
  def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg,
                            (outs VR256:$dst), (ins VR256:$src),
                            "movupd\t{$src, $dst|$dst, $src}", []>,
                       VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPDYrr">;
} // isCodeGenOnly, ForceDisassemble, hasSideEffects, isMoveReg, SchedRW
476
// ".s"-suffixed mnemonics (GAS compatibility) select the reversed-encoding
// records. 128-bit operands:
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                (VMOVAPSrr_REV VR128:$dst, VR128:$src),
                0>;
def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                (VMOVAPDrr_REV VR128:$dst, VR128:$src),
                0>;
def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                (VMOVUPSrr_REV VR128:$dst, VR128:$src),
                0>;
def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                (VMOVUPDrr_REV VR128:$dst, VR128:$src),
                0>;
// 256-bit operands:
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                (VMOVAPSYrr_REV VR256:$dst, VR256:$src),
                0>;
def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                (VMOVAPDYrr_REV VR256:$dst, VR256:$src),
                0>;
def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                (VMOVUPSYrr_REV VR256:$dst, VR256:$src),
                0>;
def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                (VMOVUPDYrr_REV VR256:$dst, VR256:$src),
                0>;
494
// Legacy packed FP stores. The aligned forms match alignedstore; the
// unaligned forms match a plain store.
let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
  def MOVAPSmr : PSI<0x29, MRMDestMem, (outs),
                     (ins f128mem:$dst, VR128:$src),
                     "movaps\t{$src, $dst|$dst, $src}",
                     [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
  def MOVAPDmr : PDI<0x29, MRMDestMem, (outs),
                     (ins f128mem:$dst, VR128:$src),
                     "movapd\t{$src, $dst|$dst, $src}",
                     [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
  def MOVUPSmr : PSI<0x11, MRMDestMem, (outs),
                     (ins f128mem:$dst, VR128:$src),
                     "movups\t{$src, $dst|$dst, $src}",
                     [(store (v4f32 VR128:$src), addr:$dst)]>;
  def MOVUPDmr : PDI<0x11, MRMDestMem, (outs),
                     (ins f128mem:$dst, VR128:$src),
                     "movupd\t{$src, $dst|$dst, $src}",
                     [(store (v2f64 VR128:$src), addr:$dst)]>;
} // SchedRW
509
// Reversed-encoding (MRMDestReg) legacy register moves, present for the
// disassembler. They carry no patterns and are linked to the forward rr
// records via FoldGenData.
let SchedRW = [SchedWriteFMoveLS.XMM.RR], isMoveReg = 1, hasSideEffects = 0,
    isCodeGenOnly = 1, ForceDisassemble = 1 in {
  def MOVAPSrr_REV : PSI<0x29, MRMDestReg,
                         (outs VR128:$dst), (ins VR128:$src),
                         "movaps\t{$src, $dst|$dst, $src}", []>,
                     FoldGenData<"MOVAPSrr">;
  def MOVAPDrr_REV : PDI<0x29, MRMDestReg,
                         (outs VR128:$dst), (ins VR128:$src),
                         "movapd\t{$src, $dst|$dst, $src}", []>,
                     FoldGenData<"MOVAPDrr">;
  def MOVUPSrr_REV : PSI<0x11, MRMDestReg,
                         (outs VR128:$dst), (ins VR128:$src),
                         "movups\t{$src, $dst|$dst, $src}", []>,
                     FoldGenData<"MOVUPSrr">;
  def MOVUPDrr_REV : PDI<0x11, MRMDestReg,
                         (outs VR128:$dst), (ins VR128:$src),
                         "movupd\t{$src, $dst|$dst, $src}", []>,
                     FoldGenData<"MOVUPDrr">;
}
526
// ".s"-suffixed mnemonics (GAS compatibility) select the reversed-encoding
// legacy records.
def : InstAlias<"movaps.s\t{$src, $dst|$dst, $src}",
                (MOVAPSrr_REV VR128:$dst, VR128:$src),
                0>;
def : InstAlias<"movapd.s\t{$src, $dst|$dst, $src}",
                (MOVAPDrr_REV VR128:$dst, VR128:$src),
                0>;
def : InstAlias<"movups.s\t{$src, $dst|$dst, $src}",
                (MOVUPSrr_REV VR128:$dst, VR128:$src),
                0>;
def : InstAlias<"movupd.s\t{$src, $dst|$dst, $src}",
                (MOVUPDrr_REV VR128:$dst, VR128:$src),
                0>;
536
// 256-bit integer vector loads and stores are selected to the floating-point
// moves, because AVX2 may be unavailable. Execution domain fixing converts
// them to integer instructions when AVX2 is present and switching domains is
// beneficial.
let Predicates = [HasAVX, NoVLX] in {
  // Aligned integer loads.
  foreach LdFrag = [alignedloadv4i64, alignedloadv8i32,
                    alignedloadv16i16, alignedloadv32i8] in
    def : Pat<(LdFrag addr:$src), (VMOVAPSYrm addr:$src)>;

  // Unaligned integer loads.
  foreach LdFrag = [loadv4i64, loadv8i32, loadv16i16, loadv32i8] in
    def : Pat<(LdFrag addr:$src), (VMOVUPSYrm addr:$src)>;

  // Aligned integer stores.
  foreach IntVT = [v4i64, v8i32, v16i16, v32i8] in
    def : Pat<(alignedstore (IntVT VR256:$src), addr:$dst),
              (VMOVAPSYmr addr:$dst, VR256:$src)>;

  // Unaligned integer stores.
  foreach IntVT = [v4i64, v8i32, v16i16, v32i8] in
    def : Pat<(store (IntVT VR256:$src), addr:$dst),
              (VMOVUPSYmr addr:$dst, VR256:$src)>;
}
575
// SSE integer vector loads and stores are selected to movaps / movups, which
// are one byte shorter. The SSE domain pass later converts the selected
// instructions to MOVDQA / MOVDQU.
let Predicates = [UseSSE1] in {
  // Aligned integer loads.
  foreach LdFrag = [alignedloadv2i64, alignedloadv4i32,
                    alignedloadv8i16, alignedloadv16i8] in
    def : Pat<(LdFrag addr:$src), (MOVAPSrm addr:$src)>;

  // Unaligned integer loads.
  foreach LdFrag = [loadv2i64, loadv4i32, loadv8i16, loadv16i8] in
    def : Pat<(LdFrag addr:$src), (MOVUPSrm addr:$src)>;

  // Aligned integer stores.
  foreach IntVT = [v2i64, v4i32, v8i16, v16i8] in
    def : Pat<(alignedstore (IntVT VR128:$src), addr:$dst),
              (MOVAPSmr addr:$dst, VR128:$src)>;

  // Unaligned integer stores.
  foreach IntVT = [v2i64, v4i32, v8i16, v16i8] in
    def : Pat<(store (IntVT VR128:$src), addr:$dst),
              (MOVUPSmr addr:$dst, VR128:$src)>;
}
614
615//===----------------------------------------------------------------------===//
616// SSE 1 & 2 - Move Low packed FP Instructions
617//===----------------------------------------------------------------------===//
618
/// sse12_mov_hilo_packed_base - Memory forms of the packed half-register
/// moves.
///
///   PSrm - mnemonic base_opc + "s"; no selection pattern.
///   PDrm - mnemonic base_opc + "d"; selects pdnode applied to $src1 and a
///          scalar f64 load placed into a vector.
/// asm_opr is appended to the mnemonic as the operand string.
multiclass sse12_mov_hilo_packed_base<bits<8>opc,  SDNode pdnode,
                                      string base_opc, string asm_opr> {
  // No pattern here: the high and low variants need to be special-cased.
  let hasSideEffects = 0, mayLoad = 1 in {
    def PSrm : PI<opc, MRMSrcMem, (outs VR128:$dst),
                  (ins VR128:$src1, f64mem:$src2),
                  base_opc # "s" # asm_opr, [], SSEPackedSingle>,
               PS,
               Sched<[SchedWriteFShuffle.XMM.Folded,
                      SchedWriteFShuffle.XMM.ReadAfterFold]>;
  }

  def PDrm : PI<opc, MRMSrcMem, (outs VR128:$dst),
                (ins VR128:$src1, f64mem:$src2),
                base_opc # "d" # asm_opr,
                [(set VR128:$dst,
                      (v2f64 (pdnode VR128:$src1,
                                (scalar_to_vector (loadf64 addr:$src2)))))],
                SSEPackedDouble>,
             PD,
             Sched<[SchedWriteFShuffle.XMM.Folded,
                    SchedWriteFShuffle.XMM.ReadAfterFold]>;
}
637
/// sse12_mov_hilo_packed - Instantiates sse12_mov_hilo_packed_base twice:
///   V#NAME - VEX-encoded three-operand form, guarded by UseAVX.
///   NAME   - legacy two-operand form with $src1 tied to $dst.
///   opc      - opcode byte shared by both encodings.
///   pdnode   - DAG node forwarded to the PDrm pattern.
///   base_opc - un-prefixed mnemonic stem (e.g. "movlp").
638multiclass sse12_mov_hilo_packed<bits<8>opc, SDPatternOperator pdnode,
639                                 string base_opc> {
  // The base multiclass builds the mnemonic purely from base_opc, so the "v"
  // prefix of the VEX form has to be supplied here; otherwise the AVX records
  // would print and parse with the legacy SSE mnemonic.
640  let Predicates = [UseAVX] in
641    defm V#NAME : sse12_mov_hilo_packed_base<opc, pdnode, !strconcat("v", base_opc),
642                                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
643                                    VEX_4V, VEX_WIG;
644
645  let Constraints = "$src1 = $dst" in
646    defm NAME : sse12_mov_hilo_packed_base<opc,  pdnode, base_opc,
647                                    "\t{$src2, $dst|$dst, $src2}">;
648}
649
// movlps/movlpd: the load forms come from sse12_mov_hilo_packed (opcode 0x12);
// the store forms (opcode 0x13) are defined explicitly below. Only the PD
// stores carry a pattern: a store of element 0 of a v2f64. Each brace-less
// "let mayStore = 1, hasSideEffects = 0 in" covers just the pattern-less PS
// store that immediately follows it.
650defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;
651
652let SchedRW = [WriteFStore] in {
653let Predicates = [UseAVX] in {
654let mayStore = 1, hasSideEffects = 0 in
655def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
656                     "movlps\t{$src, $dst|$dst, $src}",
657                     []>,
658                     VEX, VEX_WIG;
659def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
660                     "movlpd\t{$src, $dst|$dst, $src}",
661                     [(store (f64 (extractelt (v2f64 VR128:$src),
662                                   (iPTR 0))), addr:$dst)]>,
663                     VEX, VEX_WIG;
664}// UseAVX
665let mayStore = 1, hasSideEffects = 0 in
666def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
667                   "movlps\t{$src, $dst|$dst, $src}",
668                   []>;
669def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
670                   "movlpd\t{$src, $dst|$dst, $src}",
671                   [(store (f64 (extractelt (v2f64 VR128:$src),
672                                 (iPTR 0))), addr:$dst)]>;
673} // SchedRW
674
// MOVLPS selection patterns: two X86Shufp-with-load shapes, an X86vzload64
// producing v4f32, and the 64-bit extract-store.
675let Predicates = [UseSSE1] in {
676  // This pattern helps select MOVLPS on SSE1 only targets. With SSE2 we'll
677  // end up with a movsd or blend instead of shufp.
678  // No need for aligned load, we're only loading 64-bits.
679  def : Pat<(X86Shufp (v4f32 (simple_load addr:$src2)), VR128:$src1,
680                      (i8 -28)),
681            (MOVLPSrm VR128:$src1, addr:$src2)>;
682  def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)), VR128:$src1, (i8 -28)),
683            (MOVLPSrm VR128:$src1, addr:$src2)>;
684
  // A stand-alone X86vzload64 is emitted as MOVLPS into a V_SET0 register.
685  def : Pat<(v4f32 (X86vzload64 addr:$src)),
686            (MOVLPSrm (v4f32 (V_SET0)), addr:$src)>;
687  def : Pat<(X86vextractstore64 (v4f32 VR128:$src), addr:$dst),
688            (MOVLPSmr addr:$dst, VR128:$src)>;
689}
690
691//===----------------------------------------------------------------------===//
692// SSE 1 & 2 - Move Hi packed FP Instructions
693//===----------------------------------------------------------------------===//
694
// movhps/movhpd: the load forms come from sse12_mov_hilo_packed (opcode 0x16);
// the store forms (opcode 0x17) are defined explicitly below. Only the PD
// stores carry a pattern: a store of element 0 of X86Unpckh(src, src). Each
// brace-less "let mayStore = 1, hasSideEffects = 0 in" covers just the
// pattern-less PS store that immediately follows it.
695defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">;
696
697let SchedRW = [WriteFStore] in {
698// v2f64 extract element 1 is always custom lowered to unpack high to low
699// and extract element 0 so the non-store version isn't too horrible.
700let Predicates = [UseAVX] in {
701let mayStore = 1, hasSideEffects = 0 in
702def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
703                   "movhps\t{$src, $dst|$dst, $src}",
704                   []>, VEX, VEX_WIG;
705def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
706                   "movhpd\t{$src, $dst|$dst, $src}",
707                   [(store (f64 (extractelt
708                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
709                                 (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
710} // UseAVX
711let mayStore = 1, hasSideEffects = 0 in
712def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
713                   "movhps\t{$src, $dst|$dst, $src}",
714                   []>;
715def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
716                   "movhpd\t{$src, $dst|$dst, $src}",
717                   [(store (f64 (extractelt
718                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
719                                 (iPTR 0))), addr:$dst)]>;
720} // SchedRW
721
// Extra AVX selection patterns for VMOVHPD/VMOVLPD: X86vzload64 operands for
// the load forms, and a store of element 0 of X86VPermilpi(src, 1) for the
// VMOVHPD store form.
722let Predicates = [UseAVX] in {
723  // MOVHPD patterns
724  def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
725            (VMOVHPDrm VR128:$src1, addr:$src2)>;
726
727  def : Pat<(store (f64 (extractelt
728                          (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
729                          (iPTR 0))), addr:$dst),
730            (VMOVHPDmr addr:$dst, VR128:$src)>;
731
732  // MOVLPD patterns
733  def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
734            (VMOVLPDrm VR128:$src1, addr:$src2)>;
735}
736
// MOVHPS selection patterns: X86Movlhps with a loaded second operand selects
// the load form; an extract-store of X86Movhlps(src, src) selects the store
// form.
737let Predicates = [UseSSE1] in {
738  // This pattern helps select MOVHPS on SSE1 only targets. With SSE2 we'll
739  // end up with a movsd or blend instead of shufp.
740  // No need for aligned load, we're only loading 64-bits.
741  def : Pat<(X86Movlhps VR128:$src1, (v4f32 (simple_load addr:$src2))),
742            (MOVHPSrm VR128:$src1, addr:$src2)>;
743  def : Pat<(X86Movlhps VR128:$src1, (v4f32 (X86vzload64 addr:$src2))),
744            (MOVHPSrm VR128:$src1, addr:$src2)>;
745
746  def : Pat<(X86vextractstore64 (v4f32 (X86Movhlps VR128:$src, VR128:$src)),
747                                addr:$dst),
748            (MOVHPSmr addr:$dst, VR128:$src)>;
749}
750
// Extra SSE2 selection patterns for MOVHPD/MOVLPD. These mirror the AVX
// patterns, except that the MOVHPD store matches X86Shufp(src, src, 1) rather
// than X86VPermilpi.
751let Predicates = [UseSSE2] in {
752  // MOVHPD patterns
753  def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
754            (MOVHPDrm VR128:$src1, addr:$src2)>;
755
756  def : Pat<(store (f64 (extractelt
757                          (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
758                          (iPTR 0))), addr:$dst),
759            (MOVHPDmr addr:$dst, VR128:$src)>;
760
761  // MOVLPD patterns
762  def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
763            (MOVLPDrm VR128:$src1, addr:$src2)>;
764}
765
// Full-vector-load variant of the MOVLPD pattern, additionally gated on
// NoSSE41_Or_OptForSize.
766let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in {
767  // Use MOVLPD to load into the low bits from a full vector unless we can use
768  // BLENDPD.
769  def : Pat<(X86Movsd VR128:$src1, (v2f64 (simple_load addr:$src2))),
770            (MOVLPDrm VR128:$src1, addr:$src2)>;
771}
772
773//===----------------------------------------------------------------------===//
774// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
775//===----------------------------------------------------------------------===//
776
// Register-register movlhps (opcode 0x16) and movhlps (opcode 0x12), matching
// X86Movlhps / X86Movhlps on v4f32. The AVX records use the three-operand
// syntax; the SSE records tie $src1 to $dst. Only the movhlps records are
// marked isCommutable and NotMemoryFoldable.
777let Predicates = [UseAVX] in {
778  def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
779                                       (ins VR128:$src1, VR128:$src2),
780                      "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
781                      [(set VR128:$dst,
782                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
783                      VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG;
784  let isCommutable = 1 in
785  def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
786                                       (ins VR128:$src1, VR128:$src2),
787                      "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
788                      [(set VR128:$dst,
789                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
790                      VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG,
791                      NotMemoryFoldable;
792}
793let Constraints = "$src1 = $dst" in {
794  def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
795                                       (ins VR128:$src1, VR128:$src2),
796                      "movlhps\t{$src2, $dst|$dst, $src2}",
797                      [(set VR128:$dst,
798                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
799                      Sched<[SchedWriteFShuffle.XMM]>;
800  let isCommutable = 1 in
801  def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
802                                       (ins VR128:$src1, VR128:$src2),
803                      "movhlps\t{$src2, $dst|$dst, $src2}",
804                      [(set VR128:$dst,
805                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
806                      Sched<[SchedWriteFShuffle.XMM]>, NotMemoryFoldable;
807}
808
809//===----------------------------------------------------------------------===//
810// SSE 1 & 2 - Conversion Instructions
811//===----------------------------------------------------------------------===//
812
/// sse12_cvt_s - Scalar conversion with a register form (rr) and a memory
/// form (rm); both match OpNode applied to a single source operand.
///   SrcRC / DstRC - source and destination register classes.
///   ld_frag       - load fragment matched by the rm pattern.
///   asm / mem     - mnemonic used in the rr / rm assembly string.
///   d             - execution domain applied to both forms.
///   Int2Fpu       - extra SchedRead on the rr form (ReadDefault unless
///                   overridden).
813multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
814                     SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
815                     string asm, string mem, X86FoldableSchedWrite sched,
816                     Domain d,
817                     SchedRead Int2Fpu = ReadDefault> {
818  let ExeDomain = d in {
819  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
820              !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
821              [(set DstRC:$dst, (OpNode SrcRC:$src))]>,
822              Sched<[sched, Int2Fpu]>;
823  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
824              mem#"\t{$src, $dst|$dst, $src}",
825              [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
826              Sched<[sched.Folded]>;
827  }
828}
829
/// sse12_cvt_p - Packed conversion matching any_sint_to_fp from SrcTy to
/// DstTy, in register (rr) and memory (rm) forms. Both forms are marked as
/// using MXCSR and as possibly raising FP exceptions.
///   asm     - complete assembly string (mnemonic and operands).
///   ld_frag - load fragment matched by the rm pattern.
830multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
831                       ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
832                       string asm, Domain d, X86FoldableSchedWrite sched> {
833let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in {
834  def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
835             [(set RC:$dst, (DstTy (any_sint_to_fp (SrcTy RC:$src))))], d>,
836             Sched<[sched]>;
837  let mayLoad = 1 in
838  def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
839             [(set RC:$dst, (DstTy (any_sint_to_fp
840                                    (SrcTy (ld_frag addr:$src)))))], d>,
841             Sched<[sched.Folded]>;
842}
843}
844
/// sse12_vcvt_avx - Pattern-less three-operand AVX conversion in register
/// (rr) and memory (rm) forms. The extra $src1 operand has the destination
/// register class.
///   asm - mnemonic shared by both forms.
///   mem - suffix spliced into the rm mnemonic inside braces ("{l}" / "{q}").
845multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
846                          X86MemOperand x86memop, string asm, string mem,
847                          X86FoldableSchedWrite sched, Domain d> {
848let hasSideEffects = 0, Predicates = [UseAVX], ExeDomain = d in {
849  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
850              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
851              Sched<[sched, ReadDefault, ReadInt2Fpu]>;
852  let mayLoad = 1 in
853  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
854              (ins DstRC:$src1, x86memop:$src),
855              asm#"{"#mem#"}\t{$src, $src1, $dst|$dst, $src1, $src}", []>,
856           Sched<[sched.Folded, sched.ReadAfterFold]>;
857} // hasSideEffects = 0
858}
859
// Codegen-only AVX scalar FP -> integer conversions on FR32/FR64. The opcode
// 0x2C forms match any_fp_to_sint; the opcode 0x2D forms match lrint (GR32
// destination) or llrint (GR64 destination). All use MXCSR and may raise FP
// exceptions.
860let isCodeGenOnly = 1, Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
861defm VCVTTSS2SI   : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
862                                "cvttss2si", "cvttss2si",
863                                WriteCvtSS2I, SSEPackedSingle>,
864                                XS, VEX, VEX_LIG;
865defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
866                                "cvttss2si", "cvttss2si",
867                                WriteCvtSS2I, SSEPackedSingle>,
868                                XS, VEX, VEX_W, VEX_LIG;
869defm VCVTTSD2SI   : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
870                                "cvttsd2si", "cvttsd2si",
871                                WriteCvtSD2I, SSEPackedDouble>,
872                                XD, VEX, VEX_LIG;
873defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
874                                "cvttsd2si", "cvttsd2si",
875                                WriteCvtSD2I, SSEPackedDouble>,
876                                XD, VEX, VEX_W, VEX_LIG;
877
878defm VCVTSS2SI   : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
879                               "cvtss2si", "cvtss2si",
880                               WriteCvtSS2I, SSEPackedSingle>,
881                               XS, VEX, VEX_LIG;
882defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
883                               "cvtss2si", "cvtss2si",
884                               WriteCvtSS2I, SSEPackedSingle>,
885                               XS, VEX, VEX_W, VEX_LIG;
886defm VCVTSD2SI   : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
887                               "cvtsd2si", "cvtsd2si",
888                               WriteCvtSD2I, SSEPackedDouble>,
889                               XD, VEX, VEX_LIG;
890defm VCVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
891                               "cvtsd2si", "cvtsd2si",
892                               WriteCvtSD2I, SSEPackedDouble>,
893                               XD, VEX, VEX_W, VEX_LIG;
894}
895
896// The assembler can recognize rr 64-bit instructions by seeing a rxx
897// register, but the same isn't true when only using memory operands.
898// Provide explicit "l" and "q" assembly forms to address this where
899// appropriate.
// Codegen-only AVX integer -> scalar FP conversions built from sse12_vcvt_avx
// (opcode 0x2A, no patterns on the records themselves). The GR64 variants add
// VEX_W and use the "q" memory suffix. VCVTSI2SD is the only one not tagged
// SIMD_EXC.
900let isCodeGenOnly = 1 in {
901defm VCVTSI2SS   : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss", "l",
902                                  WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V,
903                                  VEX_LIG, SIMD_EXC;
904defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss", "q",
905                                  WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V,
906                                  VEX_W, VEX_LIG, SIMD_EXC;
907defm VCVTSI2SD   : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd", "l",
908                                  WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V,
909                                  VEX_LIG;
910defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd", "q",
911                                  WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V,
912                                  VEX_W, VEX_LIG, SIMD_EXC;
913} // isCodeGenOnly = 1
914
// Selection patterns for the AVX scalar conversions. any_sint_to_fp from a
// loaded or register i32/i64 selects the matching VCVTSI2SS/SD form, with
// IMPLICIT_DEF supplied for the extra $src1 operand. lrint with an i64 result
// selects the 64-bit VCVTSS2SI64 / VCVTSD2SI64 forms.
915let Predicates = [UseAVX] in {
916  def : Pat<(f32 (any_sint_to_fp (loadi32 addr:$src))),
917            (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
918  def : Pat<(f32 (any_sint_to_fp (loadi64 addr:$src))),
919            (VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
920  def : Pat<(f64 (any_sint_to_fp (loadi32 addr:$src))),
921            (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
922  def : Pat<(f64 (any_sint_to_fp (loadi64 addr:$src))),
923            (VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
924
925  def : Pat<(f32 (any_sint_to_fp GR32:$src)),
926            (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
927  def : Pat<(f32 (any_sint_to_fp GR64:$src)),
928            (VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
929  def : Pat<(f64 (any_sint_to_fp GR32:$src)),
930            (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
931  def : Pat<(f64 (any_sint_to_fp GR64:$src)),
932            (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
933
934  def : Pat<(i64 (lrint FR32:$src)), (VCVTSS2SI64rr FR32:$src)>;
935  def : Pat<(i64 (lrint (loadf32 addr:$src))), (VCVTSS2SI64rm addr:$src)>;
936
937  def : Pat<(i64 (lrint FR64:$src)), (VCVTSD2SI64rr FR64:$src)>;
938  def : Pat<(i64 (lrint (loadf64 addr:$src))), (VCVTSD2SI64rm addr:$src)>;
939}
940
// Codegen-only SSE scalar conversions built from sse12_cvt_s:
//   0x2C - cvttss2si/cvttsd2si, matching any_fp_to_sint;
//   0x2D - cvtss2si/cvtsd2si, matching lrint (GR32) or llrint (GR64);
//   0x2A - cvtsi2ss/cvtsi2sd, matching any_sint_to_fp, with ReadInt2Fpu as the
//          extra rr SchedRead and an "{l}"/"{q}" suffix on the memory mnemonic.
// The 64-bit integer variants add REX_W. CVTSI2SD is the only one not tagged
// SIMD_EXC.
941let isCodeGenOnly = 1 in {
942defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
943                      "cvttss2si", "cvttss2si",
944                      WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC;
945defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
946                      "cvttss2si", "cvttss2si",
947                      WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC;
948defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
949                      "cvttsd2si", "cvttsd2si",
950                      WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC;
951defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
952                      "cvttsd2si", "cvttsd2si",
953                      WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC;
954
955defm CVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
956                     "cvtss2si", "cvtss2si",
957                     WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC;
958defm CVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
959                     "cvtss2si", "cvtss2si",
960                     WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC;
961defm CVTSD2SI : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
962                     "cvtsd2si", "cvtsd2si",
963                     WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC;
964defm CVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
965                     "cvtsd2si", "cvtsd2si",
966                     WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC;
967
968defm CVTSI2SS  : sse12_cvt_s<0x2A, GR32, FR32, any_sint_to_fp, i32mem, loadi32,
969                      "cvtsi2ss", "cvtsi2ss{l}",
970                      WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, SIMD_EXC;
971defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, any_sint_to_fp, i64mem, loadi64,
972                      "cvtsi2ss", "cvtsi2ss{q}",
973                      WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, REX_W, SIMD_EXC;
974defm CVTSI2SD  : sse12_cvt_s<0x2A, GR32, FR64, any_sint_to_fp, i32mem, loadi32,
975                      "cvtsi2sd", "cvtsi2sd{l}",
976                      WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD;
977defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, any_sint_to_fp, i64mem, loadi64,
978                      "cvtsi2sd", "cvtsi2sd{q}",
979                      WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD, REX_W, SIMD_EXC;
980} // isCodeGenOnly = 1
981
// lrint with an i64 result selects the 64-bit conversion forms: CVTSS2SI64
// for f32 sources (UseSSE1) and CVTSD2SI64 for f64 sources (UseSSE2).
982let Predicates = [UseSSE1] in {
983  def : Pat<(i64 (lrint FR32:$src)), (CVTSS2SI64rr FR32:$src)>;
984  def : Pat<(i64 (lrint (loadf32 addr:$src))), (CVTSS2SI64rm addr:$src)>;
985}
986
987let Predicates = [UseSSE2] in {
988  def : Pat<(i64 (lrint FR64:$src)), (CVTSD2SI64rr FR64:$src)>;
989  def : Pat<(i64 (lrint (loadf64 addr:$src))), (CVTSD2SI64rm addr:$src)>;
990}
991
992// Conversion Instructions Intrinsics - Match intrinsics which expect MM
993// and/or XMM operand(s).
994
/// sse12_cvt_sint - Intrinsic-style scalar conversion (rr_Int / rm_Int) that
/// applies OpNode to a SrcVT value and produces DstVT.
///   memop     - memory operand type of the rm_Int form.
///   mem_frags - load fragments matched by the rm_Int pattern.
///   asm       - mnemonic shared by both forms.
///   d         - execution domain applied to both forms.
995multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
996                          ValueType DstVT, ValueType SrcVT, SDNode OpNode,
997                          Operand memop, PatFrags mem_frags, string asm,
998                          X86FoldableSchedWrite sched, Domain d> {
999let ExeDomain = d in {
1000  def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
1001                  !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
1002                  [(set DstRC:$dst, (DstVT (OpNode (SrcVT SrcRC:$src))))]>,
1003               Sched<[sched]>;
1004  def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
1005                  !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
1006                  [(set DstRC:$dst, (DstVT (OpNode (SrcVT (mem_frags addr:$src)))))]>,
1007               Sched<[sched.Folded]>;
1008}
1009}
1010
/// sse12_cvt_sint_3addr - Pattern-less rr_Int / rm_Int forms that take an
/// extra $src1 operand of the destination class plus the value to convert.
///   asm     - mnemonic shared by both forms.
///   mem     - suffix spliced into the rm_Int mnemonic inside braces.
///   Is2Addr - 1 (default) selects the two-operand assembly string, 0 the
///             three-operand one that also prints $src1.
1011multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
1012                    RegisterClass DstRC, X86MemOperand x86memop,
1013                    string asm, string mem, X86FoldableSchedWrite sched,
1014                    Domain d, bit Is2Addr = 1> {
1015let hasSideEffects = 0, ExeDomain = d in {
1016  def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
1017                  !if(Is2Addr,
1018                      !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
1019                      !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
1020                  []>, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
1021  let mayLoad = 1 in
1022  def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst),
1023                  (ins DstRC:$src1, x86memop:$src2),
1024                  !if(Is2Addr,
1025                      asm#"{"#mem#"}\t{$src2, $dst|$dst, $src2}",
1026                      asm#"{"#mem#"}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
1027                  []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
1028}
1029}
1030
// cvtsd2si intrinsic forms: X86cvts2si on a v2f64 source, yielding i32 or
// i64. The AVX records are guarded by UseAVX; all four use MXCSR and may
// raise FP exceptions.
1031let Uses = [MXCSR], mayRaiseFPException = 1 in {
1032let Predicates = [UseAVX] in {
1033defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64,
1034                  X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
1035                  WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
1036defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64,
1037                    X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
1038                    WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_W, VEX_LIG;
1039}
1040defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, X86cvts2si,
1041                 sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I,
1042                 SSEPackedDouble>, XD;
1043defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si,
1044                   sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I,
1045                   SSEPackedDouble>, XD, REX_W;
1046}
1047
// cvtsi2ss / cvtsi2sd *_Int forms on VR128 (opcode 0x2A). The AVX variants
// pass Is2Addr = 0 to get the three-operand assembly syntax; the SSE variants
// keep the two-operand default and tie $src1 to $dst. In both groups the
// GR32 -> sd variant is the only one not tagged SIMD_EXC.
1048let Predicates = [UseAVX] in {
1049defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1050          i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle, 0>,
1051          XS, VEX_4V, VEX_LIG, SIMD_EXC;
1052defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1053          i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle, 0>,
1054          XS, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC;
1055defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1056          i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble, 0>,
1057          XD, VEX_4V, VEX_LIG;
1058defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1059          i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble, 0>,
1060          XD, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC;
1061}
1062let Constraints = "$src1 = $dst" in {
1063  defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1064                        i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle>,
1065                        XS, SIMD_EXC;
1066  defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1067                        i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle>,
1068                        XS, REX_W, SIMD_EXC;
1069  defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
1070                        i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble>,
1071                        XD;
1072  defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
1073                        i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble>,
1074                        XD, REX_W, SIMD_EXC;
1075}
1076
// Assembler aliases, restricted to the "att" variant, for (v)cvtsi2ss and
// (v)cvtsi2sd: the register forms accept an "l"/"q" suffix, and the
// suffix-less memory spellings map to the i32mem *_Int forms.
1077def : InstAlias<"vcvtsi2ss{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1078               (VCVTSI2SSrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
1079def : InstAlias<"vcvtsi2ss{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1080               (VCVTSI642SSrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
1081def : InstAlias<"vcvtsi2sd{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1082               (VCVTSI2SDrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
1083def : InstAlias<"vcvtsi2sd{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1084               (VCVTSI642SDrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
1085
1086def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
1087              (VCVTSI2SSrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
1088def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
1089              (VCVTSI2SDrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
1090
1091def : InstAlias<"cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
1092                (CVTSI2SSrr_Int VR128:$dst, GR32:$src), 0, "att">;
1093def : InstAlias<"cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
1094                (CVTSI642SSrr_Int VR128:$dst, GR64:$src), 0, "att">;
1095def : InstAlias<"cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
1096                (CVTSI2SDrr_Int VR128:$dst, GR32:$src), 0, "att">;
1097def : InstAlias<"cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
1098                (CVTSI642SDrr_Int VR128:$dst, GR64:$src), 0, "att">;
1099
1100def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
1101                (CVTSI2SSrm_Int VR128:$dst, i32mem:$src), 0, "att">;
1102def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
1103                (CVTSI2SDrm_Int VR128:$dst, i32mem:$src), 0, "att">;
1104
1105/// SSE 1 Only
1106
1107// Aliases for intrinsics
// AVX truncating scalar FP -> integer intrinsic forms (opcode 0x2C),
// matching X86cvtts2Int on v4f32 (ss) or v2f64 (sd) sources. All use MXCSR
// and may raise FP exceptions. The sd variants use the WriteCvtSD2I
// scheduling class, consistent with the SSE CVTTSD2SI* intrinsic forms and
// with the codegen-only VCVTTSD2SI* forms.
1108let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1109defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
1110                                ssmem, sse_load_f32, "cvttss2si",
1111                                WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
1112defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
1113                               X86cvtts2Int, ssmem, sse_load_f32,
1114                               "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
1115                               XS, VEX, VEX_LIG, VEX_W;
1116defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
1117                                sdmem, sse_load_f64, "cvttsd2si",
1118                                WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
1119defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
1120                              X86cvtts2Int, sdmem, sse_load_f64,
1121                              "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>,
1122                              XD, VEX, VEX_LIG, VEX_W;
1123}
// SSE truncating scalar FP -> integer intrinsic forms (opcode 0x2C),
// matching X86cvtts2Int on v4f32 (ss) or v2f64 (sd) sources. The i64 variants
// add REX_W. All use MXCSR and may raise FP exceptions.
1124let Uses = [MXCSR], mayRaiseFPException = 1 in {
1125defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
1126                                    ssmem, sse_load_f32, "cvttss2si",
1127                                    WriteCvtSS2I, SSEPackedSingle>, XS;
1128defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
1129                                   X86cvtts2Int, ssmem, sse_load_f32,
1130                                   "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
1131                                   XS, REX_W;
1132defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
1133                                    sdmem, sse_load_f64, "cvttsd2si",
1134                                    WriteCvtSD2I, SSEPackedDouble>, XD;
1135defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
1136                                  X86cvtts2Int, sdmem, sse_load_f64,
1137                                  "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>,
1138                                  XD, REX_W;
1139}
1140
// Assembler aliases, restricted to the "att" variant, that accept "l"/"q"
// suffixed spellings of (v)cvttss2si and (v)cvttsd2si for both the register
// and memory *_Int forms. The AVX aliases come first, then the SSE ones.
1141def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
1142                (VCVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1143def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
1144                (VCVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
1145def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
1146                (VCVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1147def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
1148                (VCVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
1149def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
1150                (VCVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1151def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
1152                (VCVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
1153def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
1154                (VCVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1155def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
1156                (VCVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;
1157
1158def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
1159                (CVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1160def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
1161                (CVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
1162def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
1163                (CVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1164def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
1165                (CVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
1166def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
1167                (CVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1168def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
1169                (CVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
1170def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
1171                (CVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1172def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
1173                (CVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;
1174
// AVX cvtss2si intrinsic forms (opcode 0x2D): X86cvts2si on a v4f32 source,
// yielding i32 or i64 (the latter with VEX_W). Both use MXCSR and may raise
// FP exceptions.
1175let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1176defm VCVTSS2SI   : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
1177                                  ssmem, sse_load_f32, "cvtss2si",
1178                                  WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
1179defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
1180                                  ssmem, sse_load_f32, "cvtss2si",
1181                                  WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_W, VEX_LIG;
1182}
// SSE cvtss2si intrinsic forms (opcode 0x2D, X86cvts2si on v4f32), followed
// by the packed v4i32/v8i32 -> v4f32/v8f32 conversions (opcode 0x5B) built
// from sse12_cvt_p: AVX 128-bit and 256-bit records requiring HasAVX and
// NoVLX, and the SSE record requiring UseSSE2. The SSE record uses the memop
// fragment for its memory form; the AVX records use load.
1183let Uses = [MXCSR], mayRaiseFPException = 1 in {
1184defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
1185                               ssmem, sse_load_f32, "cvtss2si",
1186                               WriteCvtSS2I, SSEPackedSingle>, XS;
1187defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
1188                                 ssmem, sse_load_f32, "cvtss2si",
1189                                 WriteCvtSS2I, SSEPackedSingle>, XS, REX_W;
1190
1191defm VCVTDQ2PS   : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load,
1192                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
1193                               SSEPackedSingle, WriteCvtI2PS>,
1194                               PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
1195defm VCVTDQ2PSY  : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load,
1196                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
1197                               SSEPackedSingle, WriteCvtI2PSY>,
1198                               PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;
1199
1200defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop,
1201                            "cvtdq2ps\t{$src, $dst|$dst, $src}",
1202                            SSEPackedSingle, WriteCvtI2PS>,
1203                            PS, Requires<[UseSSE2]>;
1204}
1205
1206// AVX aliases
// Assembler aliases for the VEX scalar FP -> integer conversions with an
// explicit size suffix: "l" selects the GR32-result *_Int instruction, "q" the
// GR64-result one. Each mnemonic/suffix pair gets a register-source (VR128)
// and a memory-source (ssmem for ss, sdmem for sd) alias.
// NOTE(review): `0, "att"` presumably means print-priority 0 and AT&T variant
// only -- confirm against the InstAlias class definition.
1207def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
1208                (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1209def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
1210                (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
1211def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
1212                (VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1213def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
1214                (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
1215def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
1216                (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1217def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
1218                (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
1219def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
1220                (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1221def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
1222                (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
1223
1224// SSE aliases
// Non-VEX counterparts of the suffixed scalar FP -> integer aliases: "l" maps
// to the GR32-result *_Int instruction and "q" to the GR64-result one, each in
// a register-source (VR128) and a memory-source (ssmem / sdmem) flavour.
// NOTE(review): `0, "att"` presumably means print-priority 0 and AT&T variant
// only -- confirm against the InstAlias class definition.
1225def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
1226                (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1227def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
1228                (CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
1229def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
1230                (CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1231def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
1232                (CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
1233def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
1234                (CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1235def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
1236                (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
1237def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
1238                (CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1239def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
1240                (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
1241
1242/// SSE 2 Only
1243
1244// Convert scalar double to scalar single
// Codegen-only VEX cvtsd2ss on the scalar register classes (opcode 0x5A),
// gated on UseAVX. Both forms have an empty pattern list and take an extra
// FR32 $src1 input in addition to the FR64 / f64mem value being converted.
// The rr form goes through VSDI with the un-prefixed "cvtsd2ss" string while
// the rm form uses plain I with "vcvtsd2ss" plus an explicit XD.
// NOTE(review): presumably VSDI supplies the "v" prefix and XD itself, and
// SIMD_EXC supplies the MXCSR use / FP-exception flags -- confirm.
1245let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX] in {
1246def VCVTSD2SSrr  : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
1247                        (ins FR32:$src1, FR64:$src2),
1248                        "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1249                        VEX_4V, VEX_LIG, VEX_WIG,
1250                        Sched<[WriteCvtSD2SS]>, SIMD_EXC;
// No pattern describes the load, so mayLoad is set explicitly.
1251let mayLoad = 1 in
1252def VCVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst),
1253                     (ins FR32:$src1, f64mem:$src2),
1254                     "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1255                     XD, VEX_4V, VEX_LIG, VEX_WIG,
1256                     Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC;
1257}
1258
// Under UseAVX, select f64 -> f32 rounding (any_fpround) to VCVTSD2SSrr.
// The instruction's first operand ($src1) is supplied as an f32 IMPLICIT_DEF;
// only the FR64 source comes from the matched node.
1259def : Pat<(f32 (any_fpround FR64:$src)),
1260            (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
1261          Requires<[UseAVX]>;
1262
// Codegen-only non-VEX cvtsd2ss on the scalar register classes (opcode 0x5A).
// Unlike the VEX forms these carry their own patterns: rr matches any_fpround
// of an FR64, rm matches any_fpround of a folded loadf64.
// The rm form is explicitly gated on UseSSE2 and OptForSize; the rr form's
// predicate is not spelled out here.
// NOTE(review): presumably SDI provides the XD prefix and the SSE2 predicate
// for the rr form -- confirm against the SDI class.
1263let isCodeGenOnly = 1 in {
1264def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
1265                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
1266                      [(set FR32:$dst, (any_fpround FR64:$src))]>,
1267                      Sched<[WriteCvtSD2SS]>, SIMD_EXC;
1268def CVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
1269                    "cvtsd2ss\t{$src, $dst|$dst, $src}",
1270                    [(set FR32:$dst, (any_fpround (loadf64 addr:$src)))]>,
1271                    XD, Requires<[UseSSE2, OptForSize]>,
1272                    Sched<[WriteCvtSD2SS.Folded]>, SIMD_EXC;
1273}
1274
// Vector-register ("_Int") cvtsd2ss forms (opcode 0x5A, XD). Each matches
// X86frounds with VR128 $src1 as the first operand and the v2f64 register or
// sse_load_f64 memory operand as the value to convert, producing a v4f32.
// All four are marked as using MXCSR and as able to raise an FP exception.
1275let Uses = [MXCSR], mayRaiseFPException = 1 in {
// VEX forms: three-operand syntax, gated on UseAVX.
1276def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
1277                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1278                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1279                       [(set VR128:$dst,
1280                         (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
1281                       XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
1282                       Sched<[WriteCvtSD2SS]>;
1283def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
1284                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
1285                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1286                       [(set VR128:$dst,
1287                         (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
1288                       XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
1289                       Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
// Non-VEX forms: $src1 is tied to $dst, so the assembly string only names
// $src2 and $dst. Gated on UseSSE2.
1290let Constraints = "$src1 = $dst" in {
1291def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
1292                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1293                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
1294                       [(set VR128:$dst,
1295                         (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
1296                       XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;
1297def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
1298                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
1299                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
1300                       [(set VR128:$dst,
1301                         (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
1302                       XD, Requires<[UseSSE2]>,
1303                       Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
1304}
1305}
1306
1307// Convert scalar single to scalar double
1308// SSE2 instructions with XS prefix
// Codegen-only VEX cvtss2sd on the scalar register classes (opcode 0x5A, XS).
// Both forms have an empty pattern list and take an extra FR64 $src1 input in
// addition to the FR32 / f32mem value being converted; selection is done by
// separate Pat records.
// The rr form requires UseAVX; the rm form additionally requires OptForSize.
1309let isCodeGenOnly = 1, hasSideEffects = 0 in {
1310def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
1311                    (ins FR64:$src1, FR32:$src2),
1312                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1313                    XS, VEX_4V, VEX_LIG, VEX_WIG,
1314                    Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>, SIMD_EXC;
// No pattern describes the load, so mayLoad is set explicitly.
1315let mayLoad = 1 in
1316def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
1317                    (ins FR64:$src1, f32mem:$src2),
1318                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1319                    XS, VEX_4V, VEX_LIG, VEX_WIG,
1320                    Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>,
1321                    Requires<[UseAVX, OptForSize]>, SIMD_EXC;
1322} // isCodeGenOnly = 1, hasSideEffects = 0
1323
// Selection patterns for f32 -> f64 extension (any_fpextend) under AVX.
// The register form maps to VCVTSS2SDrr; the folded-load form maps to
// VCVTSS2SDrm and is only enabled with OptForSize. In both, the instruction's
// first operand ($src1) is supplied as an f64 IMPLICIT_DEF.
1324def : Pat<(f64 (any_fpextend FR32:$src)),
1325    (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
1326def : Pat<(any_fpextend (loadf32 addr:$src)),
1327    (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;
1328
// Codegen-only non-VEX cvtss2sd on the scalar register classes (opcode 0x5A,
// XS). rr matches any_fpextend of an FR32 and requires UseSSE2; rm matches
// any_fpextend of a folded loadf32 and additionally requires OptForSize.
// NOTE(review): SIMD_EXC presumably adds the MXCSR use / FP-exception flags --
// confirm against its definition.
1329let isCodeGenOnly = 1 in {
1330def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
1331                   "cvtss2sd\t{$src, $dst|$dst, $src}",
1332                   [(set FR64:$dst, (any_fpextend FR32:$src))]>,
1333                   XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>, SIMD_EXC;
1334def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
1335                   "cvtss2sd\t{$src, $dst|$dst, $src}",
1336                   [(set FR64:$dst, (any_fpextend (loadf32 addr:$src)))]>,
1337                   XS, Requires<[UseSSE2, OptForSize]>,
1338                   Sched<[WriteCvtSS2SD.Folded]>, SIMD_EXC;
1339} // isCodeGenOnly = 1
1340
// Vector-register ("_Int") cvtss2sd forms (opcode 0x5A, XS). All four have an
// empty pattern list, so hasSideEffects = 0 is stated explicitly and the
// memory forms set mayLoad = 1. All are marked as using MXCSR and as able to
// raise an FP exception.
1341let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in {
// VEX forms: three-operand syntax.
// NOTE(review): these are gated on HasAVX, whereas the VCVTSD2SS*_Int
// counterparts in this file use UseAVX -- confirm the difference is intended.
1342def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
1343                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1344                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1345                    []>, XS, VEX_4V, VEX_LIG, VEX_WIG,
1346                    Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;
1347let mayLoad = 1 in
1348def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
1349                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
1350                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1351                    []>, XS, VEX_4V, VEX_LIG, VEX_WIG, Requires<[HasAVX]>,
1352                    Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
// Non-VEX forms: $src1 is tied to $dst, so the assembly string only names
// $src2 and $dst. Gated on UseSSE2.
1353let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
1354def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
1355                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1356                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
1357                    []>, XS, Requires<[UseSSE2]>,
1358                    Sched<[WriteCvtSS2SD]>;
1359let mayLoad = 1 in
1360def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
1361                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
1362                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
1363                    []>, XS, Requires<[UseSSE2]>,
1364                    Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
1365}
1366} // hasSideEffects = 0
1367
1368// Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and
1369// (v)cvtss2sd intrinsic sequences from clang which produce unnecessary
1370// vmovs{s,d} instructions
// AVX (UseAVX) folding patterns. Each one matches an X86Movss / X86Movsd whose
// first operand is $dst and whose second operand is scalar_to_vector of a
// freshly converted scalar, and selects the single VEX *_Int conversion
// instruction with $dst as its first operand.
// Covered conversions: f64 -> f32 round, f32 -> f64 extend, and signed
// i32 / i64 -> f32 / f64 from either a GPR or a folded integer load.
1371let Predicates = [UseAVX] in {
// Element 0 of $src rounded f64 -> f32.
1372def : Pat<(v4f32 (X86Movss
1373                   (v4f32 VR128:$dst),
1374                   (v4f32 (scalar_to_vector
1375                     (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
1376          (VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
1377
// Element 0 of $src extended f32 -> f64.
1378def : Pat<(v2f64 (X86Movsd
1379                   (v2f64 VR128:$dst),
1380                   (v2f64 (scalar_to_vector
1381                     (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
1382          (VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
1383
// Signed integer -> f32: GR64, loaded i64, GR32, loaded i32.
1384def : Pat<(v4f32 (X86Movss
1385                   (v4f32 VR128:$dst),
1386                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
1387          (VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
1388
1389def : Pat<(v4f32 (X86Movss
1390                   (v4f32 VR128:$dst),
1391                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
1392          (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>;
1393
1394def : Pat<(v4f32 (X86Movss
1395                   (v4f32 VR128:$dst),
1396                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
1397          (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
1398
1399def : Pat<(v4f32 (X86Movss
1400                   (v4f32 VR128:$dst),
1401                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
1402          (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>;
1403
// Signed integer -> f64: GR64, loaded i64, GR32, loaded i32.
1404def : Pat<(v2f64 (X86Movsd
1405                   (v2f64 VR128:$dst),
1406                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
1407          (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
1408
1409def : Pat<(v2f64 (X86Movsd
1410                   (v2f64 VR128:$dst),
1411                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
1412          (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>;
1413
1414def : Pat<(v2f64 (X86Movsd
1415                   (v2f64 VR128:$dst),
1416                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
1417          (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
1418
1419def : Pat<(v2f64 (X86Movsd
1420                   (v2f64 VR128:$dst),
1421                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
1422          (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>;
1423} // Predicates = [UseAVX]
1424
// SSE2 (UseSSE2) folding patterns: an X86Movss / X86Movsd of $dst with
// scalar_to_vector of a converted scalar is selected to the single non-VEX
// *_Int conversion instruction. Covers f64 -> f32 round, f32 -> f64 extend,
// and signed i32 / i64 -> f64 from a GPR or a folded integer load.
// The integer -> f32 cases are not in this scope.
1425let Predicates = [UseSSE2] in {
// Element 0 of $src rounded f64 -> f32.
1426def : Pat<(v4f32 (X86Movss
1427                   (v4f32 VR128:$dst),
1428                   (v4f32 (scalar_to_vector
1429                     (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
1430          (CVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
1431
// Element 0 of $src extended f32 -> f64.
1432def : Pat<(v2f64 (X86Movsd
1433                   (v2f64 VR128:$dst),
1434                   (v2f64 (scalar_to_vector
1435                     (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
1436          (CVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
1437
// Signed integer -> f64: GR64, loaded i64, GR32, loaded i32.
1438def : Pat<(v2f64 (X86Movsd
1439                   (v2f64 VR128:$dst),
1440                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
1441          (CVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
1442
1443def : Pat<(v2f64 (X86Movsd
1444                   (v2f64 VR128:$dst),
1445                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
1446          (CVTSI642SDrm_Int VR128:$dst, addr:$src)>;
1447
1448def : Pat<(v2f64 (X86Movsd
1449                   (v2f64 VR128:$dst),
1450                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
1451          (CVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
1452
1453def : Pat<(v2f64 (X86Movsd
1454                   (v2f64 VR128:$dst),
1455                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
1456          (CVTSI2SDrm_Int VR128:$dst, addr:$src)>;
1457} // Predicates = [UseSSE2]
1458
// SSE1 (UseSSE1) folding patterns: an X86Movss of $dst with scalar_to_vector
// of a signed integer -> f32 conversion is selected to the single non-VEX
// cvtsi2ss *_Int instruction. Sources: GR64, loaded i64, GR32, loaded i32.
1459let Predicates = [UseSSE1] in {
1460def : Pat<(v4f32 (X86Movss
1461                   (v4f32 VR128:$dst),
1462                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
1463          (CVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
1464
1465def : Pat<(v4f32 (X86Movss
1466                   (v4f32 VR128:$dst),
1467                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
1468          (CVTSI642SSrm_Int VR128:$dst, addr:$src)>;
1469
1470def : Pat<(v4f32 (X86Movss
1471                   (v4f32 VR128:$dst),
1472                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
1473          (CVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
1474
1475def : Pat<(v4f32 (X86Movss
1476                   (v4f32 VR128:$dst),
1477                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
1478          (CVTSI2SSrm_Int VR128:$dst, addr:$src)>;
1479} // Predicates = [UseSSE1]
1480
// VEX cvtps2dq (opcode 0x5B): packed single -> packed signed dword through
// X86cvtp2Int. 128-bit forms map v4f32 -> v4i32; the Y forms add VEX_L and map
// v8f32 -> v8i32. Register and folded-load (loadv4f32 / loadv8f32) variants.
// Gated on HasAVX and NoVLX.
// NOTE(review): the strings are un-prefixed "cvtps2dq"; presumably VPDI adds
// the "v" prefix -- confirm against the VPDI class.
1481let Predicates = [HasAVX, NoVLX] in {
1482// Convert packed single/double fp to doubleword
1483def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1484                       "cvtps2dq\t{$src, $dst|$dst, $src}",
1485                       [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
1486                       VEX, Sched<[WriteCvtPS2I]>, VEX_WIG, SIMD_EXC;
1487def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1488                       "cvtps2dq\t{$src, $dst|$dst, $src}",
1489                       [(set VR128:$dst,
1490                         (v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>,
1491                       VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG, SIMD_EXC;
1492def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
1493                        "cvtps2dq\t{$src, $dst|$dst, $src}",
1494                        [(set VR256:$dst,
1495                          (v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>,
1496                        VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG, SIMD_EXC;
1497def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
1498                        "cvtps2dq\t{$src, $dst|$dst, $src}",
1499                        [(set VR256:$dst,
1500                          (v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>,
1501                        VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, VEX_WIG, SIMD_EXC;
1502}
// Non-VEX cvtps2dq (opcode 0x5B): v4f32 -> v4i32 through X86cvtp2Int, in a
// register form and a folded-load form that uses memopv4f32.
// NOTE(review): no predicate is spelled out here; presumably PDI supplies the
// SSE2 requirement and the operand-size prefix -- confirm.
1503def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1504                     "cvtps2dq\t{$src, $dst|$dst, $src}",
1505                     [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
1506                     Sched<[WriteCvtPS2I]>, SIMD_EXC;
1507def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1508                     "cvtps2dq\t{$src, $dst|$dst, $src}",
1509                     [(set VR128:$dst,
1510                       (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>,
1511                     Sched<[WriteCvtPS2ILd]>, SIMD_EXC;
1512
1513
1514// Convert Packed Double FP to Packed DW Integers
// VEX cvtpd2dq (opcode 0xE6): packed double -> packed signed dword through
// X86cvtp2Int. The result is always a VR128 v4i32; the source is v2f64 (XMM)
// or, with VEX_L, v4f64 (YMM). Gated on HasAVX and NoVLX and marked as using
// MXCSR and as able to raise an FP exception.
// The memory forms carry an {x} / {y} suffix in the asm string because the
// operand width cannot be read off a memory operand (see the comment below).
1515let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1516// The assembler can recognize rr 256-bit instructions by seeing a ymm
1517// register, but the same isn't true when using memory operands instead.
1518// Provide other assembly rr and rm forms to address this explicitly.
1519def VCVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1520                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
1521                       [(set VR128:$dst,
1522                         (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
1523                       VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
1524
1525// XMM only
1526def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1527                      "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
1528                      [(set VR128:$dst,
1529                        (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
1530                      Sched<[WriteCvtPD2ILd]>, VEX_WIG;
1531
1532// YMM only
1533def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1534                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
1535                       [(set VR128:$dst,
1536                         (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
1537                       VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
1538def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1539                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
1540                       [(set VR128:$dst,
1541                         (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
1542                       VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
1543}
1544
// Let the explicitly suffixed mnemonics "vcvtpd2dqx" / "vcvtpd2dqy" also be
// used with register operands: x maps to the VR128-source form, y to the
// VR256-source form.
// NOTE(review): `0, "att"` presumably means print-priority 0 and AT&T variant
// only -- confirm against the InstAlias class definition.
1545def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
1546                (VCVTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
1547def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
1548                (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
1549
// Non-VEX cvtpd2dq (opcode 0xE6): v2f64 -> v4i32 through X86cvtp2Int. The
// memory form folds a memopv2f64 load; the register form takes a VR128.
// NOTE(review): predicates and prefix presumably come from SDI, and the
// MXCSR / FP-exception flags from SIMD_EXC -- confirm.
1550def CVTPD2DQrm  : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1551                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
1552                      [(set VR128:$dst,
1553                        (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>,
1554                      Sched<[WriteCvtPD2ILd]>, SIMD_EXC;
1555def CVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1556                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
1557                      [(set VR128:$dst,
1558                        (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
1559                      Sched<[WriteCvtPD2I]>, SIMD_EXC;
1560
1561// Convert with truncation packed single/double fp to doubleword
1562// SSE2 packed instructions with XS prefix
// cvttps2dq (opcode 0x5B): truncating packed single -> packed signed dword
// through X86any_cvttp2si. Every instruction in this scope is marked as using
// MXCSR and as able to raise an FP exception.
1563let Uses = [MXCSR], mayRaiseFPException = 1 in {
// VEX forms, gated on HasAVX and NoVLX: 128-bit (v4f32 -> v4i32) and, with
// VEX_L, 256-bit (v8f32 -> v8i32), each with a register and a folded-load
// variant.
// NOTE(review): strings are un-prefixed; presumably VS2SI adds the "v".
1564let Predicates = [HasAVX, NoVLX] in {
1565def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1566                         "cvttps2dq\t{$src, $dst|$dst, $src}",
1567                         [(set VR128:$dst,
1568                           (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
1569                         VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
1570def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1571                         "cvttps2dq\t{$src, $dst|$dst, $src}",
1572                         [(set VR128:$dst,
1573                           (v4i32 (X86any_cvttp2si (loadv4f32 addr:$src))))]>,
1574                         VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
1575def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
1576                          "cvttps2dq\t{$src, $dst|$dst, $src}",
1577                          [(set VR256:$dst,
1578                            (v8i32 (X86any_cvttp2si (v8f32 VR256:$src))))]>,
1579                          VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
1580def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
1581                          "cvttps2dq\t{$src, $dst|$dst, $src}",
1582                          [(set VR256:$dst,
1583                            (v8i32 (X86any_cvttp2si (loadv8f32 addr:$src))))]>,
1584                          VEX, VEX_L,
1585                          Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
1586}
1587
// Non-VEX forms: register source, or a folded memopv4f32 load.
1588def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1589                       "cvttps2dq\t{$src, $dst|$dst, $src}",
1590                       [(set VR128:$dst,
1591                         (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
1592                       Sched<[WriteCvtPS2I]>;
1593def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1594                       "cvttps2dq\t{$src, $dst|$dst, $src}",
1595                       [(set VR128:$dst,
1596                         (v4i32 (X86any_cvttp2si (memopv4f32 addr:$src))))]>,
1597                       Sched<[WriteCvtPS2ILd]>;
1598}
1599
1600// The assembler can recognize rr 256-bit instructions by seeing a ymm
1601// register, but the same isn't true when using memory operands instead.
1602// Provide other assembly rr and rm forms to address this explicitly.
// VEX cvttpd2dq (opcode 0xE6): truncating packed double -> packed signed dword
// through X86any_cvttp2si. The result is always a VR128 v4i32; the source is
// v2f64 (XMM) or, with VEX_L, v4f64 (YMM). The memory forms carry an explicit
// {x} / {y} suffix in the asm string. Gated on HasAVX and NoVLX and marked as
// using MXCSR and as able to raise an FP exception.
// NOTE(review): strings are un-prefixed; presumably VPDI adds the "v".
1603let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1604// XMM only
1605def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1606                        "cvttpd2dq\t{$src, $dst|$dst, $src}",
1607                        [(set VR128:$dst,
1608                          (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
1609                        VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
1610def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1611                        "cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
1612                        [(set VR128:$dst,
1613                          (v4i32 (X86any_cvttp2si (loadv2f64 addr:$src))))]>,
1614                        VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG;
1615
1616// YMM only
1617def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1618                         "cvttpd2dq\t{$src, $dst|$dst, $src}",
1619                         [(set VR128:$dst,
1620                           (v4i32 (X86any_cvttp2si (v4f64 VR256:$src))))]>,
1621                         VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
1622def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1623                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
1624                         [(set VR128:$dst,
1625                           (v4i32 (X86any_cvttp2si (loadv4f64 addr:$src))))]>,
1626                         VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
1627} // Predicates = [HasAVX, NoVLX]
1628
// Let the explicitly suffixed mnemonics "vcvttpd2dqx" / "vcvttpd2dqy" also be
// used with register operands: x maps to the VR128-source form, y to the
// VR256-source form.
// NOTE(review): `0, "att"` presumably means print-priority 0 and AT&T variant
// only -- confirm against the InstAlias class definition.
1629def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
1630                (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
1631def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
1632                (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
1633
// Also select the generic any_fp_to_sint node for v4f64 -> v4i32 to the
// 256-bit-source VCVTTPD2DQ forms (register and folded loadv4f64), in addition
// to the X86any_cvttp2si patterns on the instructions themselves.
1634let Predicates = [HasAVX, NoVLX] in {
1635  def : Pat<(v4i32 (any_fp_to_sint (v4f64 VR256:$src))),
1636            (VCVTTPD2DQYrr VR256:$src)>;
1637  def : Pat<(v4i32 (any_fp_to_sint (loadv4f64 addr:$src))),
1638            (VCVTTPD2DQYrm addr:$src)>;
1639}
1640
// Non-VEX cvttpd2dq (opcode 0xE6): truncating v2f64 -> v4i32 through
// X86any_cvttp2si, with a register form and a folded memopv2f64 form.
// NOTE(review): predicates and prefix presumably come from PDI, and the
// MXCSR / FP-exception flags from SIMD_EXC -- confirm.
1641def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1642                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
1643                      [(set VR128:$dst,
1644                        (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
1645                      Sched<[WriteCvtPD2I]>, SIMD_EXC;
1646def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
1647                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
1648                      [(set VR128:$dst,
1649                        (v4i32 (X86any_cvttp2si (memopv2f64 addr:$src))))]>,
1650                      Sched<[WriteCvtPD2ILd]>, SIMD_EXC;
1651
1652// Convert packed single to packed double
// VEX cvtps2pd (opcode 0x5A, PS): packed single -> packed double.
//  - 128-bit: register form matches X86any_vfpext of a v4f32 and yields v2f64;
//    the memory form takes an f64mem operand and matches extloadv2f32.
//  - 256-bit (VEX_L): register form matches any_fpextend v4f32 -> v4f64; the
//    memory form takes an f128mem operand and matches extloadv4f32.
// Gated on HasAVX and NoVLX and marked as using MXCSR and as able to raise an
// FP exception.
1653let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1654                  // SSE2 instructions without OpSize prefix
1655def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1656                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
1657                    [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
1658                    PS, VEX, Sched<[WriteCvtPS2PD]>, VEX_WIG;
1659def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
1660                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
1661                    [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
1662                    PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, VEX_WIG;
1663def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
1664                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
1665                     [(set VR256:$dst, (v4f64 (any_fpextend (v4f32 VR128:$src))))]>,
1666                     PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, VEX_WIG;
1667def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
1668                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
1669                     [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>,
1670                     PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, VEX_WIG;
1671}
1672
// Non-VEX cvtps2pd (opcode 0x5A, PS), gated on UseSSE2 and marked as using
// MXCSR and as able to raise an FP exception. The register form matches
// X86any_vfpext of a v4f32 and yields v2f64; the memory form takes an f64mem
// operand and matches extloadv2f32.
1673let Predicates = [UseSSE2], Uses = [MXCSR], mayRaiseFPException = 1 in {
1674def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1675                   "cvtps2pd\t{$src, $dst|$dst, $src}",
1676                   [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
1677                   PS, Sched<[WriteCvtPS2PD]>;
1678def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
1679                   "cvtps2pd\t{$src, $dst|$dst, $src}",
1680                   [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
1681                   PS, Sched<[WriteCvtPS2PD.Folded]>;
1682}
1683
1684// Convert Packed DW Integers to Packed Double FP
// VEX cvtdq2pd (opcode 0xE6): packed signed dword -> packed double, gated on
// HasAVX and NoVLX.
//  - 128-bit: v4i32 -> v2f64 through X86any_VSintToFP. The memory form takes an
//    i64mem operand and matches a loadi64 placed into a v2i64 via
//    scalar_to_vector and bitcast to v4i32.
//  - 256-bit (VEX_L): v4i32 -> v4f64 through any_sint_to_fp, from a VR128 or a
//    folded loadv4i32.
// Unlike most conversions in this file, no MXCSR use or FP-exception flag is
// attached in this scope.
1685let Predicates = [HasAVX, NoVLX] in {
// This let has no braces, so it applies to VCVTDQ2PDrm only.
1686let hasSideEffects = 0, mayLoad = 1 in
1687def VCVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
1688                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1689                        [(set VR128:$dst,
1690                          (v2f64 (X86any_VSintToFP
1691                                  (bc_v4i32
1692                                   (v2i64 (scalar_to_vector
1693                                           (loadi64 addr:$src)))))))]>,
1694                        VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG;
1695def VCVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1696                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1697                        [(set VR128:$dst,
1698                          (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
1699                        VEX, Sched<[WriteCvtI2PD]>, VEX_WIG;
1700def VCVTDQ2PDYrm  : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
1701                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1702                         [(set VR256:$dst,
1703                           (v4f64 (any_sint_to_fp (loadv4i32 addr:$src))))]>,
1704                         VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>,
1705                         VEX_WIG;
1706def VCVTDQ2PDYrr  : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
1707                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1708                         [(set VR256:$dst,
1709                           (v4f64 (any_sint_to_fp (v4i32 VR128:$src))))]>,
1710                         VEX, VEX_L, Sched<[WriteCvtI2PDY]>, VEX_WIG;
1711}
1712
// Non-VEX cvtdq2pd (opcode 0xE6): v4i32 -> v2f64 through X86any_VSintToFP.
// The memory form takes an i64mem operand and matches a loadi64 placed into a
// v2i64 via scalar_to_vector and bitcast to v4i32.
// The brace-less let applies to CVTDQ2PDrm only.
// NOTE(review): no predicate is spelled out; presumably S2SI supplies the SSE2
// requirement and the XS prefix -- confirm.
1713let hasSideEffects = 0, mayLoad = 1 in
1714def CVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
1715                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
1716                       [(set VR128:$dst,
1717                         (v2f64 (X86any_VSintToFP
1718                                 (bc_v4i32
1719                                  (v2i64 (scalar_to_vector
1720                                          (loadi64 addr:$src)))))))]>,
1721                       Sched<[WriteCvtI2PDLd]>;
1722def CVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1723                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
1724                       [(set VR128:$dst,
1725                         (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
1726                       Sched<[WriteCvtI2PD]>;
1727
1728// AVX register conversion intrinsics
// Extra load-folding pattern for VCVTDQ2PDrm: the 64-bit source may also
// appear as an X86vzload64 (typed v2i64, bitcast to v4i32) rather than the
// scalar_to_vector(loadi64) shape used in the instruction's own pattern.
1729let Predicates = [HasAVX, NoVLX] in {
1730  def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
1731            (VCVTDQ2PDrm addr:$src)>;
1732} // Predicates = [HasAVX, NoVLX]
1733
1734// SSE2 register conversion intrinsics
// Extra load-folding pattern for CVTDQ2PDrm: the 64-bit source may also appear
// as an X86vzload64 (typed v2i64, bitcast to v4i32) rather than the
// scalar_to_vector(loadi64) shape used in the instruction's own pattern.
1735let Predicates = [UseSSE2] in {
1736  def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
1737            (CVTDQ2PDrm addr:$src)>;
1738} // Predicates = [UseSSE2]
1739
1740// Convert packed double to packed single
1741// The assembler can recognize rr 256-bit instructions by seeing a ymm
1742// register, but the same isn't true when using memory operands instead.
1743// Provide other assembly rr and rm forms to address this explicitly.
// VEX cvtpd2ps (opcode 0x5A): packed double -> packed single through
// X86any_vfpround. The result is always a VR128; the source is v2f64 (XMM) or,
// with VEX_L, a VR256 / loadv4f64 (YMM). The memory forms carry an explicit
// {x} / {y} suffix in the asm string. Gated on HasAVX and NoVLX and marked as
// using MXCSR and as able to raise an FP exception.
// NOTE(review): strings are un-prefixed; presumably VPDI adds the "v".
1744let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1745// XMM only
1746def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1747                       "cvtpd2ps\t{$src, $dst|$dst, $src}",
1748                       [(set VR128:$dst, (X86any_vfpround (v2f64 VR128:$src)))]>,
1749                       VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG;
1750def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1751                       "cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
1752                       [(set VR128:$dst, (X86any_vfpround (loadv2f64 addr:$src)))]>,
1753                       VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG;
1754
// YMM source; the register form's pattern leaves the VR256 operand untyped.
1755def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1756                        "cvtpd2ps\t{$src, $dst|$dst, $src}",
1757                        [(set VR128:$dst, (X86any_vfpround VR256:$src))]>,
1758                        VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG;
1759def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1760                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
1761                        [(set VR128:$dst, (X86any_vfpround (loadv4f64 addr:$src)))]>,
1762                        VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG;
1763} // Predicates = [HasAVX, NoVLX]
1764
// Let the explicitly suffixed mnemonics "vcvtpd2psx" / "vcvtpd2psy" also be
// used with register operands: x maps to the VR128-source form, y to the
// VR256-source form.
// NOTE(review): `0, "att"` presumably means print-priority 0 and AT&T variant
// only -- confirm against the InstAlias class definition.
1765def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
1766                (VCVTPD2PSrr VR128:$dst, VR128:$src), 0, "att">;
1767def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
1768                (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0, "att">;
1769
// Non-VEX cvtpd2ps (opcode 0x5A): v2f64 -> packed single through
// X86any_vfpround, with a register form and a folded memopv2f64 form.
// NOTE(review): predicates and prefix presumably come from PDI, and the
// MXCSR / FP-exception flags from SIMD_EXC -- confirm.
1770def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1771                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
1772                     [(set VR128:$dst, (X86any_vfpround (v2f64 VR128:$src)))]>,
1773                     Sched<[WriteCvtPD2PS]>, SIMD_EXC;
1774def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1775                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
1776                     [(set VR128:$dst, (X86any_vfpround (memopv2f64 addr:$src)))]>,
1777                     Sched<[WriteCvtPD2PS.Folded]>, SIMD_EXC;
1778
1779//===----------------------------------------------------------------------===//
1780// SSE 1 & 2 - Compare Instructions
1781//===----------------------------------------------------------------------===//
1782
1783// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
//
// Emits four records per instantiation, all with opcode 0xC2 and an 8-bit
// immediate condition code ($cc):
//   rr_Int / rm_Int - operate on VR128; $src1 is typed as VT in the pattern.
//   rr / rm         - operate on RC; wrapped in isCodeGenOnly = 1.
// Parameters:
//   RC        - register class of the rr/rm forms.
//   x86memop  - memory operand of rm.
//   memop     - memory operand of rm_Int.
//   OpNode    - compare node, used as (OpNode src1, src2, timm:$cc).
//   VT        - value type applied to $src1 in the *_Int patterns.
//   ld_frag   - load fragment matched by rm.
//   asm       - complete assembly string (mnemonic and operands).
//   sched     - scheduling class; memory forms use sched.Folded plus
//               sched.ReadAfterFold.
//   mem_frags - load fragments matched by rm_Int.
1784multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
1785                            Operand memop, SDNode OpNode, ValueType VT,
1786                            PatFrag ld_frag, string asm,
1787                            X86FoldableSchedWrite sched,
1788                            PatFrags mem_frags> {
  // Register/register form on full VR128 operands.
1789  def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
1790                    (ins VR128:$src1, VR128:$src2, u8imm:$cc), asm,
1791                    [(set VR128:$dst, (OpNode (VT VR128:$src1),
1792                                              VR128:$src2, timm:$cc))]>,
1793           Sched<[sched]>, SIMD_EXC;
  // Register/memory form on VR128; second operand comes from mem_frags.
1794  let mayLoad = 1 in
1795  def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
1796                    (ins VR128:$src1, memop:$src2, u8imm:$cc), asm,
1797                    [(set VR128:$dst, (OpNode (VT VR128:$src1),
1798                                              (mem_frags addr:$src2), timm:$cc))]>,
1799           Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
1800
  // RC-typed forms. Only the register/register form is marked commutable.
1801  let isCodeGenOnly = 1 in {
1802    let isCommutable = 1 in
1803    def rr : SIi8<0xC2, MRMSrcReg,
1804                  (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
1805                  [(set RC:$dst, (OpNode RC:$src1, RC:$src2, timm:$cc))]>,
1806                  Sched<[sched]>, SIMD_EXC;
1807    def rm : SIi8<0xC2, MRMSrcMem,
1808                  (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
1809                  [(set RC:$dst, (OpNode RC:$src1,
1810                                         (ld_frag addr:$src2), timm:$cc))]>,
1811                  Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
1812  }
1813}
1814
// VEX scalar compares: three-operand asm strings ($src1 and $dst are separate)
// built from sse12_cmp_scalar with the X86cmps node. The single-precision
// instantiation uses FR32/v4f32 with the XS prefix; the double-precision one
// uses FR64/v2f64 with the XD prefix.
1815let ExeDomain = SSEPackedSingle in
1816defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
1817                 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1818                 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>,
1819                 XS, VEX_4V, VEX_LIG, VEX_WIG;
1820let ExeDomain = SSEPackedDouble in
1821defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
1822                 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1823                 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
1824                 XD, VEX_4V, VEX_LIG, VEX_WIG;
1825
// Non-VEX scalar compares: $src1 is tied to $dst via the constraint, so the
// asm strings print only $dst, $src2 and $cc. Otherwise the same template
// arguments as the VEX instantiations, without the VEX_* modifiers.
1826let Constraints = "$src1 = $dst" in {
1827  let ExeDomain = SSEPackedSingle in
1828  defm CMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
1829                  "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1830                  SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
1831  let ExeDomain = SSEPackedDouble in
1832  defm CMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
1833                  "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1834                  SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
1835}
1836
1837// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
//
// Emits `rr` and `rm`. Neither has an output operand; the pattern's only
// result is EFLAGS.
// Parameters:
//   opc       - opcode byte.
//   RC        - register class of both register operands.
//   OpNode    - compare node, used as (OpNode (vt src1), src2).
//   vt        - value type applied to $src1 in the pattern.
//   x86memop  - memory operand of rm.
//   ld_frag   - load fragment matched by rm.
//   OpcodeStr - mnemonic; the operand string is appended here.
//   d         - execution domain applied to both records.
//   sched     - scheduling class, defaulting to WriteFComX.
1838multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
1839                         ValueType vt, X86MemOperand x86memop,
1840                         PatFrag ld_frag, string OpcodeStr, Domain d,
1841                         X86FoldableSchedWrite sched = WriteFComX> {
1842  let ExeDomain = d in {
1843  def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
1844                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1845                     [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
1846          Sched<[sched]>, SIMD_EXC;
  // Memory form: $src2 is loaded through ld_frag.
1847  let mayLoad = 1 in
1848  def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
1849                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1850                     [(set EFLAGS, (OpNode (vt RC:$src1),
1851                                           (ld_frag addr:$src2)))]>,
1852          Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
1853}
1854}
1855
1856// sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
//
// Same shape as sse12_ord_cmp, but the records are named rr_Int / rm_Int and
// the memory form takes an `Operand memop` matched through `PatFrags
// mem_frags` instead of an X86MemOperand / single PatFrag pair.
// Parameters:
//   opc       - opcode byte.
//   RC        - register class of both register operands.
//   OpNode    - compare node, used as (OpNode (vt src1), src2).
//   vt        - value type applied to $src1 in the pattern.
//   memop     - memory operand of rm_Int.
//   mem_frags - load fragments matched by rm_Int.
//   OpcodeStr - mnemonic; the operand string is appended here.
//   d         - execution domain applied to both records.
//   sched     - scheduling class, defaulting to WriteFComX.
1857multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
1858                             ValueType vt, Operand memop,
1859                             PatFrags mem_frags, string OpcodeStr,
1860                             Domain d,
1861                             X86FoldableSchedWrite sched = WriteFComX> {
1862let ExeDomain = d in {
1863  def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
1864                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1865                     [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
1866          Sched<[sched]>, SIMD_EXC;
1867let mayLoad = 1 in
1868  def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
1869                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1870                     [(set EFLAGS, (OpNode (vt RC:$src1),
1871                                           (mem_frags addr:$src2)))]>,
1872          Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
1873}
1874}
1875
// All (U)COMISS/(U)COMISD instantiations, each declared as defining EFLAGS.
// Every name is instantiated twice: once via sse12_ord_cmp (FR32/FR64
// operands, yielding rr/rm) and once via sse12_ord_cmp_int (VR128 operands,
// yielding rr_Int/rm_Int, wrapped in isCodeGenOnly = 1).
//   ucomis* : opcode 0x2E; X86any_fcmp for the FR forms, X86ucomi for _Int.
//   comis*  : opcode 0x2F; X86strict_fcmps for the FR forms, X86comi for _Int.
1876let Defs = [EFLAGS] in {
  // VEX-encoded FR32/FR64 forms.
1877  defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
1878                               "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
1879  defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
1880                               "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
1881  defm VCOMISS  : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
1882                               "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
1883  defm VCOMISD  : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
1884                               "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
1885
  // VEX-encoded VR128 (_Int) forms.
1886  let isCodeGenOnly = 1 in {
1887    defm VUCOMISS  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
1888                      sse_load_f32, "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
1889    defm VUCOMISD  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
1890                      sse_load_f64, "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
1891
1892    defm VCOMISS  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
1893                       sse_load_f32, "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
1894    defm VCOMISD  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
1895                       sse_load_f64, "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
1896  }
  // Non-VEX FR32/FR64 forms.
1897  defm UCOMISS  : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
1898                                  "ucomiss", SSEPackedSingle>, PS;
1899  defm UCOMISD  : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
1900                                  "ucomisd", SSEPackedDouble>, PD;
1901  defm COMISS   : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
1902                                  "comiss", SSEPackedSingle>, PS;
1903  defm COMISD   : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
1904                                  "comisd", SSEPackedDouble>, PD;
1905
  // Non-VEX VR128 (_Int) forms.
1906  let isCodeGenOnly = 1 in {
1907    defm UCOMISS  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
1908                            sse_load_f32, "ucomiss", SSEPackedSingle>, PS;
1909    defm UCOMISD  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
1910                            sse_load_f64, "ucomisd", SSEPackedDouble>, PD;
1911
1912    defm COMISS  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
1913                                sse_load_f32, "comiss", SSEPackedSingle>, PS;
1914    defm COMISD  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
1915                                    sse_load_f64, "comisd", SSEPackedDouble>, PD;
1916  }
1917} // Defs = [EFLAGS]
1918
1919// sse12_cmp_packed - sse 1 & 2 compare packed instructions
//
// Emits `rri` (register/register/immediate) and `rmi`
// (register/memory/immediate), both with opcode 0xC2 and both matching
// X86any_cmpp with a target-immediate condition code.
// Parameters:
//   RC       - register class of $dst, $src1 and (for rri) $src2.
//   x86memop - memory operand of rmi.
//   VT       - result value type of the compare pattern.
//   asm      - complete assembly string.
//   sched    - scheduling class; rmi uses sched.Folded + sched.ReadAfterFold.
//   d        - execution domain passed to PIi8.
//   ld_frag  - load fragment matched by rmi.
1920multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
1921                            ValueType VT, string asm,
1922                            X86FoldableSchedWrite sched,
1923                            Domain d, PatFrag ld_frag> {
  // Only the register form is marked commutable.
1924  let isCommutable = 1 in
1925  def rri : PIi8<0xC2, MRMSrcReg,
1926             (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
1927             [(set RC:$dst, (VT (X86any_cmpp RC:$src1, RC:$src2, timm:$cc)))], d>,
1928            Sched<[sched]>, SIMD_EXC;
1929  def rmi : PIi8<0xC2, MRMSrcMem,
1930             (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
1931             [(set RC:$dst,
1932               (VT (X86any_cmpp RC:$src1, (ld_frag addr:$src2), timm:$cc)))], d>,
1933            Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
1934}
1935
// VEX packed compares, 128-bit (VR128/f128mem) and 256-bit (VR256/f256mem,
// with VEX_L). All use three-operand asm strings and the load* fragments.
1936defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
1937               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1938               SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG;
1939defm VCMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
1940               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1941               SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG;
1942defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, v8f32,
1943               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1944               SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, VEX_WIG;
1945defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, v4f64,
1946               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1947               SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, VEX_WIG;
// Non-VEX packed compares: $src1 is tied to $dst, the asm strings omit $src1,
// and the memory forms match memopv4f32 / memopv2f64 rather than load*.
1948let Constraints = "$src1 = $dst" in {
1949  defm CMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
1950                 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1951                 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS;
1952  defm CMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
1953                 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1954                 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD;
1955}
1956
// Leaf predicate on a target immediate: accepts it when its low three bits
// equal 0, 3, 4 or 7. It gates the operand-swapping compare patterns that
// follow, so it must only accept condition codes whose result is unchanged
// when the two compare operands are exchanged.
// NOTE(review): 0/3/4/7 presumably correspond to the EQ, UNORD, NEQ and ORD
// predicates -- confirm against the compare-predicate encoding table.
1957def CommutableCMPCC : PatLeaf<(timm), [{
1958  uint64_t Imm = N->getZExtValue() & 0x7;
1959  return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07);
1960}]>;
1961
1962// Patterns to select compares with loads in first operand.
// Each pattern matches a compare whose FIRST operand is a load and whose
// condition code satisfies CommutableCMPCC, then emits the memory form with
// the operands swapped (register as $src1, address as $src2) and the same
// immediate. This block covers the VEX instructions.
1963let Predicates = [HasAVX] in {
1964  def : Pat<(v4f64 (X86any_cmpp (loadv4f64 addr:$src2), VR256:$src1,
1965                                CommutableCMPCC:$cc)),
1966            (VCMPPDYrmi VR256:$src1, addr:$src2, timm:$cc)>;
1967
1968  def : Pat<(v8f32 (X86any_cmpp (loadv8f32 addr:$src2), VR256:$src1,
1969                                CommutableCMPCC:$cc)),
1970            (VCMPPSYrmi VR256:$src1, addr:$src2, timm:$cc)>;
1971
1972  def : Pat<(v2f64 (X86any_cmpp (loadv2f64 addr:$src2), VR128:$src1,
1973                                CommutableCMPCC:$cc)),
1974            (VCMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;
1975
1976  def : Pat<(v4f32 (X86any_cmpp (loadv4f32 addr:$src2), VR128:$src1,
1977                                CommutableCMPCC:$cc)),
1978            (VCMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;
1979
  // Scalar compares use the X86cmps node and the FR64/FR32 `rm` records.
1980  def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
1981                          CommutableCMPCC:$cc)),
1982            (VCMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
1983
1984  def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
1985                          CommutableCMPCC:$cc)),
1986            (VCMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
1987}
1988
// Non-VEX double-precision counterparts of the load-in-first-operand compare
// patterns: same operand swap, gated by CommutableCMPCC. The packed pattern
// matches memopv2f64 (as CMPPDrmi itself does); the scalar one matches loadf64.
1989let Predicates = [UseSSE2] in {
1990  def : Pat<(v2f64 (X86any_cmpp (memopv2f64 addr:$src2), VR128:$src1,
1991                                CommutableCMPCC:$cc)),
1992            (CMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;
1993
1994  def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
1995                          CommutableCMPCC:$cc)),
1996            (CMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
1997}
1998
// Non-VEX single-precision counterparts of the load-in-first-operand compare
// patterns: same operand swap, gated by CommutableCMPCC. The packed pattern
// matches memopv4f32 (as CMPPSrmi itself does); the scalar one matches loadf32.
1999let Predicates = [UseSSE1] in {
2000  def : Pat<(v4f32 (X86any_cmpp (memopv4f32 addr:$src2), VR128:$src1,
2001                                CommutableCMPCC:$cc)),
2002            (CMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;
2003
2004  def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
2005                          CommutableCMPCC:$cc)),
2006            (CMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
2007}
2008
2009//===----------------------------------------------------------------------===//
2010// SSE 1 & 2 - Shuffle Instructions
2011//===----------------------------------------------------------------------===//
2012
2013/// sse12_shuffle - sse 1 & 2 fp shuffle instructions
///
/// Emits `rmi` and `rri`, both with opcode 0xC6 and both matching X86Shufp
/// with an i8 target immediate ($src3).
/// Parameters:
///   RC           - register class of $dst, $src1 and (for rri) $src2.
///   x86memop     - memory operand of rmi.
///   vt           - result value type of the shuffle pattern.
///   asm          - complete assembly string.
///   mem_frag     - load fragment matched by rmi.
///   sched        - scheduling class; rmi uses sched.Folded +
///                  sched.ReadAfterFold.
///   d            - execution domain passed to PIi8.
///   IsCommutable - value assigned to isCommutable on rri only (default 0).
2014multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
2015                         ValueType vt, string asm, PatFrag mem_frag,
2016                         X86FoldableSchedWrite sched, Domain d,
2017                         bit IsCommutable = 0> {
2018  def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
2019                   (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
2020                   [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
2021                                       (i8 timm:$src3))))], d>,
2022            Sched<[sched.Folded, sched.ReadAfterFold]>;
2023  let isCommutable = IsCommutable in
2024  def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
2025                 (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
2026                 [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
2027                                     (i8 timm:$src3))))], d>,
2028            Sched<[sched]>;
2029}
2030
// VEX shuffles, 128-bit and 256-bit (Y suffix, VEX_L). All four leave
// IsCommutable at its default of 0 and use the load* fragments.
2031let Predicates = [HasAVX, NoVLX] in {
2032  defm VSHUFPS  : sse12_shuffle<VR128, f128mem, v4f32,
2033           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2034           loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>,
2035           PS, VEX_4V, VEX_WIG;
2036  defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
2037           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2038           loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>,
2039           PS, VEX_4V, VEX_L, VEX_WIG;
2040  defm VSHUFPD  : sse12_shuffle<VR128, f128mem, v2f64,
2041           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2042           loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>,
2043           PD, VEX_4V, VEX_WIG;
2044  defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
2045           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2046           loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>,
2047           PD, VEX_4V, VEX_L, VEX_WIG;
2048}
// Non-VEX shuffles with $src1 tied to $dst and memop* load fragments.
// SHUFPD passes IsCommutable = 1 (so SHUFPDrri is marked commutable); SHUFPS
// keeps the default of 0.
// NOTE(review): how a commuted SHUFPD is rewritten is not visible here --
// presumably handled by target commute code; confirm.
2049let Constraints = "$src1 = $dst" in {
2050  defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
2051                    "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
2052                    memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2053  defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
2054                    "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
2055                    memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
2056}
2057
2058//===----------------------------------------------------------------------===//
2059// SSE 1 & 2 - Unpack FP Instructions
2060//===----------------------------------------------------------------------===//
2061
2062/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
///
/// Emits `rr` and `rm`, both matching (OpNode src1, src2) typed as `vt`.
/// Parameters:
///   opc          - opcode byte.
///   OpNode       - unpack node matched by both patterns.
///   vt           - result value type of the pattern.
///   mem_frag     - load fragment matched by rm.
///   RC           - register class of $dst, $src1 and (for rr) $src2.
///   x86memop     - memory operand of rm.
///   asm          - complete assembly string.
///   sched        - scheduling class; rm uses sched.Folded +
///                  sched.ReadAfterFold.
///   d            - execution domain passed to PI.
///   IsCommutable - value assigned to isCommutable on rr only (default 0).
2063multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
2064                                   PatFrag mem_frag, RegisterClass RC,
2065                                   X86MemOperand x86memop, string asm,
2066                                   X86FoldableSchedWrite sched, Domain d,
2067                                   bit IsCommutable = 0> {
2068    let isCommutable = IsCommutable in
2069    def rr : PI<opc, MRMSrcReg,
2070                (outs RC:$dst), (ins RC:$src1, RC:$src2),
2071                asm, [(set RC:$dst,
2072                           (vt (OpNode RC:$src1, RC:$src2)))], d>,
2073                Sched<[sched]>;
2074    def rm : PI<opc, MRMSrcMem,
2075                (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
2076                asm, [(set RC:$dst,
2077                           (vt (OpNode RC:$src1,
2078                                       (mem_frag addr:$src2))))], d>,
2079             Sched<[sched.Folded, sched.ReadAfterFold]>;
2080}
2081
// VEX unpack instructions. Opcode 0x15 pairs with X86Unpckh and 0x14 with
// X86Unpckl. All use the generic `load` fragment. Of these eight
// instantiations only VUNPCKHPD passes IsCommutable = 1.
2082let Predicates = [HasAVX, NoVLX] in {
// 128-bit forms.
2083defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load,
2084      VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2085                     SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
2086defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load,
2087      VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2088                     SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG;
2089defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load,
2090      VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2091                     SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
2092defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load,
2093      VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2094                     SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
2095
// 256-bit forms (VR256/f256mem, VEX_L, YMM scheduling class).
2096defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load,
2097      VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2098                     SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
2099defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load,
2100      VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2101                     SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
2102defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load,
2103      VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2104                     SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
2105defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load,
2106      VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2107                     SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
2108}// Predicates = [HasAVX, NoVLX]
2109
// Non-VEX unpack instructions: $src1 tied to $dst, two-operand asm strings,
// and the generic `memop` fragment for the memory forms. As with the VEX
// set, only UNPCKHPD passes IsCommutable = 1.
2110let Constraints = "$src1 = $dst" in {
2111  defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memop,
2112        VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
2113                       SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2114  defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memop,
2115        VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
2116                       SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
2117  defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop,
2118        VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
2119                       SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2120  defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop,
2121        VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
2122                       SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
2123} // Constraints = "$src1 = $dst"
2124
// Select 256-bit INTEGER unpack nodes onto the floating-point unpack
// instructions: v8i32 maps to the *PSY forms and v4i64 to the *PDY forms,
// each with a register and a folded-load variant.
// NOTE(review): presumably needed because 256-bit integer unpack instructions
// are unavailable when only AVX1 is present -- confirm.
2125let Predicates = [HasAVX1Only] in {
2126  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (loadv8i32 addr:$src2))),
2127            (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
2128  def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
2129            (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
2130  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (loadv8i32 addr:$src2))),
2131            (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
2132  def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
2133            (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
2134
2135  def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
2136            (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
2137  def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
2138            (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
2139  def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
2140            (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
2141  def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
2142            (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
2143}
2144
// Fallback for a v2f64 low-unpack whose second operand is a simple_load:
// emit MOVHPDrm with the same register and address operands.
2145let Predicates = [UseSSE2] in {
2146  // Use MOVHPD if the load isn't aligned enough for UNPCKLPD.
2147  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
2148                              (v2f64 (simple_load addr:$src2)))),
2149            (MOVHPDrm VR128:$src1, addr:$src2)>;
2150}
2151
2152//===----------------------------------------------------------------------===//
2153// SSE 1 & 2 - Extract Floating-Point Sign mask
2154//===----------------------------------------------------------------------===//
2155
2156/// sse12_extr_sign_mask - sse 1 & 2 extract floating-point sign mask
///
/// Emits a single `rr` record with opcode 0x50 that matches X86movmsk on a
/// `vt`-typed value in RC and writes the result to a GR32orGR64 register.
/// Parameters:
///   RC  - register class of the vector source operand.
///   vt  - value type applied to $src in the pattern.
///   asm - mnemonic; the operand string is appended here.
///   d   - execution domain passed to PI.
2157multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
2158                                string asm, Domain d> {
2159  def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
2160              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
2161              [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], d>,
2162              Sched<[WriteFMOVMSK]>;
2163}
2164
// VEX movmskps/movmskpd for 128-bit (VR128) and 256-bit (VR256, VEX_L)
// sources, followed by patterns that route integer-typed X86movmsk nodes of
// matching element width onto the same instructions.
2165let Predicates = [HasAVX] in {
2166  defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
2167                                        SSEPackedSingle>, PS, VEX, VEX_WIG;
2168  defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
2169                                        SSEPackedDouble>, PD, VEX, VEX_WIG;
2170  defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
2171                                         SSEPackedSingle>, PS, VEX, VEX_L, VEX_WIG;
2172  defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
2173                                         SSEPackedDouble>, PD, VEX, VEX_L, VEX_WIG;
2174
2175  // Also support integer VTs to avoid a int->fp bitcast in the DAG.
  // 32-bit-element vectors use the PS forms; 64-bit-element vectors the PD.
2176  def : Pat<(X86movmsk (v4i32 VR128:$src)),
2177            (VMOVMSKPSrr VR128:$src)>;
2178  def : Pat<(X86movmsk (v2i64 VR128:$src)),
2179            (VMOVMSKPDrr VR128:$src)>;
2180  def : Pat<(X86movmsk (v8i32 VR256:$src)),
2181            (VMOVMSKPSYrr VR256:$src)>;
2182  def : Pat<(X86movmsk (v4i64 VR256:$src)),
2183            (VMOVMSKPDYrr VR256:$src)>;
2184}
2185
// Non-VEX movmskps/movmskpd on VR128. No predicate wrapper is applied at this
// point in the file.
2186defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
2187                                     SSEPackedSingle>, PS;
2188defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
2189                                     SSEPackedDouble>, PD;
2190
// Integer-typed X86movmsk on 128-bit vectors, mapped to the non-VEX
// instructions: v4i32 uses MOVMSKPS, v2i64 uses MOVMSKPD.
2191let Predicates = [UseSSE2] in {
2192  // Also support integer VTs to avoid a int->fp bitcast in the DAG.
2193  def : Pat<(X86movmsk (v4i32 VR128:$src)),
2194            (MOVMSKPSrr VR128:$src)>;
2195  def : Pat<(X86movmsk (v2i64 VR128:$src)),
2196            (MOVMSKPDrr VR128:$src)>;
2197}
2198
2199//===---------------------------------------------------------------------===//
2200// SSE2 - Packed Integer Logical Instructions
2201//===---------------------------------------------------------------------===//
2202
2203let ExeDomain = SSEPackedInt in { // SSE integer instructions
2204
2205/// PDI_binop_rm - Simple SSE2 binary operator.
///
/// Emits `rr` and `rm`, both matching (OpNode src1, src2) typed as OpVT.
/// Parameters:
///   opc          - opcode byte.
///   OpcodeStr    - mnemonic; the operand string is appended here.
///   OpNode       - binary node matched by both patterns.
///   OpVT         - result value type of the pattern.
///   RC           - register class of $dst, $src1 and (for rr) $src2.
///   memop_frag   - load fragment matched by rm.
///   x86memop     - memory operand of rm.
///   sched        - scheduling class; rm uses sched.Folded +
///                  sched.ReadAfterFold.
///   IsCommutable - value assigned to isCommutable on rr only.
///   Is2Addr      - selects the asm operand string: two operands
///                  ($src2, $dst) when set, three ($src2, $src1, $dst)
///                  otherwise.
2206multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
2207                        ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
2208                        X86MemOperand x86memop, X86FoldableSchedWrite sched,
2209                        bit IsCommutable, bit Is2Addr> {
2210  let isCommutable = IsCommutable in
2211  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
2212       (ins RC:$src1, RC:$src2),
2213       !if(Is2Addr,
2214           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2215           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
2216       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
2217       Sched<[sched]>;
2218  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
2219       (ins RC:$src1, x86memop:$src2),
2220       !if(Is2Addr,
2221           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2222           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
2223       [(set RC:$dst, (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
2224       Sched<[sched.Folded, sched.ReadAfterFold]>;
2225}
2226} // ExeDomain = SSEPackedInt
2227
// PDI_binop_all - instantiate PDI_binop_rm three times for one operation:
//   V#NAME   - 128-bit, "v"-prefixed mnemonic, three-operand asm
//              (Is2Addr = 0), `load` fragment; predicates [HasAVX, prd].
//   NAME     - 128-bit, two-operand asm (Is2Addr = 1) with $src1 tied to
//              $dst, `memop` fragment; no predicates set here.
//   V#NAME#Y - 256-bit, "v"-prefixed mnemonic, three-operand asm, `load`
//              fragment, VEX_L; predicates [HasAVX2, prd].
// Parameters:
//   opc, OpcodeStr, Opcode - opcode byte, base mnemonic and node, forwarded
//                            to every instantiation.
//   OpVT128 / OpVT256      - pattern value types of the 128/256-bit forms.
//   sched                  - width-indexed scheduling classes (.XMM/.YMM).
//   IsCommutable           - forwarded to every instantiation.
//   prd                    - extra predicate added to the two VEX forms.
2228multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
2229                         ValueType OpVT128, ValueType OpVT256,
2230                         X86SchedWriteWidths sched, bit IsCommutable,
2231                         Predicate prd> {
2232let Predicates = [HasAVX, prd] in
2233  defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
2234                             VR128, load, i128mem, sched.XMM,
2235                             IsCommutable, 0>, VEX_4V, VEX_WIG;
2236
2237let Constraints = "$src1 = $dst" in
2238  defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
2239                           memop, i128mem, sched.XMM, IsCommutable, 1>;
2240
2241let Predicates = [HasAVX2, prd] in
2242  defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
2243                               OpVT256, VR256, load, i256mem, sched.YMM,
2244                               IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG;
2245}
2246
2247// These are ordered here for pattern ordering requirements with the fp versions
// Integer logic ops, typed v2i64 (128-bit) and v4i64 (256-bit), with NoVLX as
// the extra predicate on the VEX forms. PAND/POR/PXOR are commutable; PANDN
// (X86andnp) is not.
2248
2249defm PAND  : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
2250                           SchedWriteVecLogic, 1, NoVLX>;
2251defm POR   : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
2252                           SchedWriteVecLogic, 1, NoVLX>;
2253defm PXOR  : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
2254                           SchedWriteVecLogic, 1, NoVLX>;
2255defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
2256                           SchedWriteVecLogic, 0, NoVLX>;
2257
2258//===----------------------------------------------------------------------===//
2259// SSE 1 & 2 - Logical Instructions
2260//===----------------------------------------------------------------------===//
2261
2262/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
2263///
2264/// There are no patterns here because isel prefers integer versions for SSE2
2265/// and later. There are SSE1 v4f32 patterns later.
///
/// Instantiates sse12_fp_packed_logical_rm six times: VEX 256-bit PS/PD,
/// VEX 128-bit PS/PD, and non-VEX 128-bit PS/PD. Every instantiation passes
/// two empty lists where patterns would go, and the OpNode parameter is not
/// referenced anywhere in this body.
/// Parameters:
///   opc       - opcode byte, shared by all six instantiations.
///   OpcodeStr - base mnemonic; "ps" or "pd" is appended.
///   OpNode    - accepted but unused here.
///   sched     - width-indexed scheduling classes (.XMM/.YMM).
2266multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
2267                                   SDNode OpNode, X86SchedWriteWidths sched> {
  // VEX forms.
  // NOTE(review): the trailing 0 is presumably the Is2Addr argument of
  // sse12_fp_packed_logical_rm (three-operand asm) -- confirm at its definition.
2268  let Predicates = [HasAVX, NoVLX] in {
2269  defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
2270        !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM,
2271        [], [], 0>, PS, VEX_4V, VEX_L, VEX_WIG;
2272
2273  defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
2274        !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM,
2275        [], [], 0>, PD, VEX_4V, VEX_L, VEX_WIG;
2276
2277  defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
2278       !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
2279       [], [], 0>, PS, VEX_4V, VEX_WIG;
2280
2281  defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
2282       !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
2283       [], [], 0>, PD, VEX_4V, VEX_WIG;
2284  }
2285
  // Non-VEX forms: $src1 tied to $dst; the last template argument is left at
  // its default.
2286  let Constraints = "$src1 = $dst" in {
2287    defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
2288         !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
2289         [], []>, PS;
2290
2291    defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
2292         !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
2293         [], []>, PD;
2294  }
2295}
2296
// FP logical instruction families (opcodes 0x54 and, 0x56 or, 0x57 xor,
// 0x55 andn), all scheduled as SchedWriteFLogic. ANDN is instantiated under
// an explicit `isCommutable = 0` override.
// NOTE(review): the override implies sse12_fp_packed_logical_rm marks its
// register form commutable by default -- confirm at its definition.
2297defm AND  : sse12_fp_packed_logical<0x54, "and", and, SchedWriteFLogic>;
2298defm OR   : sse12_fp_packed_logical<0x56, "or", or, SchedWriteFLogic>;
2299defm XOR  : sse12_fp_packed_logical<0x57, "xor", xor, SchedWriteFLogic>;
2300let isCommutable = 0 in
2301  defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp, SchedWriteFLogic>;
2302
// Route 256-bit and/or/xor/andnp on v32i8, v16i16 and v8i32 to the 256-bit
// integer logic instructions (VPANDY, VPORY, VPXORY, VPANDNY). v4i64 is not
// listed because the instruction definitions themselves are instantiated
// with v4i64 as their 256-bit pattern type.
2303let Predicates = [HasAVX2, NoVLX] in {
  // Register/register forms.
2304  def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
2305            (VPANDYrr VR256:$src1, VR256:$src2)>;
2306  def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
2307            (VPANDYrr VR256:$src1, VR256:$src2)>;
2308  def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
2309            (VPANDYrr VR256:$src1, VR256:$src2)>;
2310
2311  def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
2312            (VPORYrr VR256:$src1, VR256:$src2)>;
2313  def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
2314            (VPORYrr VR256:$src1, VR256:$src2)>;
2315  def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
2316            (VPORYrr VR256:$src1, VR256:$src2)>;
2317
2318  def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
2319            (VPXORYrr VR256:$src1, VR256:$src2)>;
2320  def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
2321            (VPXORYrr VR256:$src1, VR256:$src2)>;
2322  def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
2323            (VPXORYrr VR256:$src1, VR256:$src2)>;
2324
2325  def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
2326            (VPANDNYrr VR256:$src1, VR256:$src2)>;
2327  def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
2328            (VPANDNYrr VR256:$src1, VR256:$src2)>;
2329  def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
2330            (VPANDNYrr VR256:$src1, VR256:$src2)>;
2331
  // Register/memory forms: the second operand is a typed 256-bit load; the
  // result type is not written out on these patterns.
2332  def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
2333            (VPANDYrm VR256:$src1, addr:$src2)>;
2334  def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
2335            (VPANDYrm VR256:$src1, addr:$src2)>;
2336  def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
2337            (VPANDYrm VR256:$src1, addr:$src2)>;
2338
2339  def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
2340            (VPORYrm VR256:$src1, addr:$src2)>;
2341  def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
2342            (VPORYrm VR256:$src1, addr:$src2)>;
2343  def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
2344            (VPORYrm VR256:$src1, addr:$src2)>;
2345
2346  def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
2347            (VPXORYrm VR256:$src1, addr:$src2)>;
2348  def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
2349            (VPXORYrm VR256:$src1, addr:$src2)>;
2350  def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
2351            (VPXORYrm VR256:$src1, addr:$src2)>;
2352
2353  def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
2354            (VPANDNYrm VR256:$src1, addr:$src2)>;
2355  def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
2356            (VPANDNYrm VR256:$src1, addr:$src2)>;
2357  def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
2358            (VPANDNYrm VR256:$src1, addr:$src2)>;
2359}
2360
2361// If only AVX1 is supported, we need to handle integer operations with
2362// floating point instructions since the integer versions aren't available.
// Each pattern below selects a 256-bit integer and/or/xor/X86andnp onto the
// VANDPSY/VORPSY/VXORPSY/VANDNPSY instruction of the same operation.
2363let Predicates = [HasAVX1Only] in {
  // Register-register forms, one pattern per integer element type
  // (v32i8, v16i16, v8i32, v4i64).
2364  def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
2365            (VANDPSYrr VR256:$src1, VR256:$src2)>;
2366  def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
2367            (VANDPSYrr VR256:$src1, VR256:$src2)>;
2368  def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
2369            (VANDPSYrr VR256:$src1, VR256:$src2)>;
2370  def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)),
2371            (VANDPSYrr VR256:$src1, VR256:$src2)>;
2372
2373  def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
2374            (VORPSYrr VR256:$src1, VR256:$src2)>;
2375  def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
2376            (VORPSYrr VR256:$src1, VR256:$src2)>;
2377  def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
2378            (VORPSYrr VR256:$src1, VR256:$src2)>;
2379  def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)),
2380            (VORPSYrr VR256:$src1, VR256:$src2)>;
2381
2382  def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
2383            (VXORPSYrr VR256:$src1, VR256:$src2)>;
2384  def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
2385            (VXORPSYrr VR256:$src1, VR256:$src2)>;
2386  def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
2387            (VXORPSYrr VR256:$src1, VR256:$src2)>;
2388  def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)),
2389            (VXORPSYrr VR256:$src1, VR256:$src2)>;
2390
2391  def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
2392            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2393  def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
2394            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2395  def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
2396            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2397  def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)),
2398            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2399
  // Register-memory forms: the second operand is a 256-bit integer load
  // (loadv32i8/loadv16i16/loadv8i32/loadv4i64) folded into the *Yrm
  // instruction.
2400  def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
2401            (VANDPSYrm VR256:$src1, addr:$src2)>;
2402  def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
2403            (VANDPSYrm VR256:$src1, addr:$src2)>;
2404  def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
2405            (VANDPSYrm VR256:$src1, addr:$src2)>;
2406  def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)),
2407            (VANDPSYrm VR256:$src1, addr:$src2)>;
2408
2409  def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
2410            (VORPSYrm VR256:$src1, addr:$src2)>;
2411  def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
2412            (VORPSYrm VR256:$src1, addr:$src2)>;
2413  def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
2414            (VORPSYrm VR256:$src1, addr:$src2)>;
2415  def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)),
2416            (VORPSYrm VR256:$src1, addr:$src2)>;
2417
2418  def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
2419            (VXORPSYrm VR256:$src1, addr:$src2)>;
2420  def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
2421            (VXORPSYrm VR256:$src1, addr:$src2)>;
2422  def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
2423            (VXORPSYrm VR256:$src1, addr:$src2)>;
2424  def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)),
2425            (VXORPSYrm VR256:$src1, addr:$src2)>;
2426
2427  def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
2428            (VANDNPSYrm VR256:$src1, addr:$src2)>;
2429  def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
2430            (VANDNPSYrm VR256:$src1, addr:$src2)>;
2431  def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
2432            (VANDNPSYrm VR256:$src1, addr:$src2)>;
2433  def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)),
2434            (VANDNPSYrm VR256:$src1, addr:$src2)>;
2435}
2436
// 128-bit integer and/or/xor/X86andnp selected onto VPAND/VPOR/VPXOR/VPANDN
// when HasAVX and NoVLX hold.
// NOTE(review): only v16i8/v8i16/v4i32 appear here; v2i64 is presumably
// covered by the instruction definitions themselves - confirm.
2437let Predicates = [HasAVX, NoVLX] in {
  // Register-register forms.
2438  def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
2439            (VPANDrr VR128:$src1, VR128:$src2)>;
2440  def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
2441            (VPANDrr VR128:$src1, VR128:$src2)>;
2442  def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
2443            (VPANDrr VR128:$src1, VR128:$src2)>;
2444
2445  def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
2446            (VPORrr VR128:$src1, VR128:$src2)>;
2447  def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
2448            (VPORrr VR128:$src1, VR128:$src2)>;
2449  def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
2450            (VPORrr VR128:$src1, VR128:$src2)>;
2451
2452  def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
2453            (VPXORrr VR128:$src1, VR128:$src2)>;
2454  def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
2455            (VPXORrr VR128:$src1, VR128:$src2)>;
2456  def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
2457            (VPXORrr VR128:$src1, VR128:$src2)>;
2458
2459  def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
2460            (VPANDNrr VR128:$src1, VR128:$src2)>;
2461  def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
2462            (VPANDNrr VR128:$src1, VR128:$src2)>;
2463  def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
2464            (VPANDNrr VR128:$src1, VR128:$src2)>;
2465
  // Register-memory forms: the second operand is matched through the
  // loadv16i8/loadv8i16/loadv4i32 fragments.
2466  def : Pat<(and VR128:$src1, (loadv16i8 addr:$src2)),
2467            (VPANDrm VR128:$src1, addr:$src2)>;
2468  def : Pat<(and VR128:$src1, (loadv8i16 addr:$src2)),
2469            (VPANDrm VR128:$src1, addr:$src2)>;
2470  def : Pat<(and VR128:$src1, (loadv4i32 addr:$src2)),
2471            (VPANDrm VR128:$src1, addr:$src2)>;
2472
2473  def : Pat<(or VR128:$src1, (loadv16i8 addr:$src2)),
2474            (VPORrm VR128:$src1, addr:$src2)>;
2475  def : Pat<(or VR128:$src1, (loadv8i16 addr:$src2)),
2476            (VPORrm VR128:$src1, addr:$src2)>;
2477  def : Pat<(or VR128:$src1, (loadv4i32 addr:$src2)),
2478            (VPORrm VR128:$src1, addr:$src2)>;
2479
2480  def : Pat<(xor VR128:$src1, (loadv16i8 addr:$src2)),
2481            (VPXORrm VR128:$src1, addr:$src2)>;
2482  def : Pat<(xor VR128:$src1, (loadv8i16 addr:$src2)),
2483            (VPXORrm VR128:$src1, addr:$src2)>;
2484  def : Pat<(xor VR128:$src1, (loadv4i32 addr:$src2)),
2485            (VPXORrm VR128:$src1, addr:$src2)>;
2486
2487  def : Pat<(X86andnp VR128:$src1, (loadv16i8 addr:$src2)),
2488            (VPANDNrm VR128:$src1, addr:$src2)>;
2489  def : Pat<(X86andnp VR128:$src1, (loadv8i16 addr:$src2)),
2490            (VPANDNrm VR128:$src1, addr:$src2)>;
2491  def : Pat<(X86andnp VR128:$src1, (loadv4i32 addr:$src2)),
2492            (VPANDNrm VR128:$src1, addr:$src2)>;
2493}
2494
// 128-bit integer and/or/xor/X86andnp selected onto the non-V-prefixed
// PAND/POR/PXOR/PANDN instructions when UseSSE2 holds.
// NOTE(review): only v16i8/v8i16/v4i32 appear here; v2i64 is presumably
// covered by the instruction definitions themselves - confirm.
2495let Predicates = [UseSSE2] in {
  // Register-register forms.
2496  def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
2497            (PANDrr VR128:$src1, VR128:$src2)>;
2498  def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
2499            (PANDrr VR128:$src1, VR128:$src2)>;
2500  def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
2501            (PANDrr VR128:$src1, VR128:$src2)>;
2502
2503  def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
2504            (PORrr VR128:$src1, VR128:$src2)>;
2505  def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
2506            (PORrr VR128:$src1, VR128:$src2)>;
2507  def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
2508            (PORrr VR128:$src1, VR128:$src2)>;
2509
2510  def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
2511            (PXORrr VR128:$src1, VR128:$src2)>;
2512  def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
2513            (PXORrr VR128:$src1, VR128:$src2)>;
2514  def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
2515            (PXORrr VR128:$src1, VR128:$src2)>;
2516
2517  def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
2518            (PANDNrr VR128:$src1, VR128:$src2)>;
2519  def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
2520            (PANDNrr VR128:$src1, VR128:$src2)>;
2521  def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
2522            (PANDNrr VR128:$src1, VR128:$src2)>;
2523
  // Register-memory forms. These use the memopv* fragments, whereas the
  // [HasAVX, NoVLX] patterns earlier in this file use loadv*.
2524  def : Pat<(and VR128:$src1, (memopv16i8 addr:$src2)),
2525            (PANDrm VR128:$src1, addr:$src2)>;
2526  def : Pat<(and VR128:$src1, (memopv8i16 addr:$src2)),
2527            (PANDrm VR128:$src1, addr:$src2)>;
2528  def : Pat<(and VR128:$src1, (memopv4i32 addr:$src2)),
2529            (PANDrm VR128:$src1, addr:$src2)>;
2530
2531  def : Pat<(or VR128:$src1, (memopv16i8 addr:$src2)),
2532            (PORrm VR128:$src1, addr:$src2)>;
2533  def : Pat<(or VR128:$src1, (memopv8i16 addr:$src2)),
2534            (PORrm VR128:$src1, addr:$src2)>;
2535  def : Pat<(or VR128:$src1, (memopv4i32 addr:$src2)),
2536            (PORrm VR128:$src1, addr:$src2)>;
2537
2538  def : Pat<(xor VR128:$src1, (memopv16i8 addr:$src2)),
2539            (PXORrm VR128:$src1, addr:$src2)>;
2540  def : Pat<(xor VR128:$src1, (memopv8i16 addr:$src2)),
2541            (PXORrm VR128:$src1, addr:$src2)>;
2542  def : Pat<(xor VR128:$src1, (memopv4i32 addr:$src2)),
2543            (PXORrm VR128:$src1, addr:$src2)>;
2544
2545  def : Pat<(X86andnp VR128:$src1, (memopv16i8 addr:$src2)),
2546            (PANDNrm VR128:$src1, addr:$src2)>;
2547  def : Pat<(X86andnp VR128:$src1, (memopv8i16 addr:$src2)),
2548            (PANDNrm VR128:$src1, addr:$src2)>;
2549  def : Pat<(X86andnp VR128:$src1, (memopv4i32 addr:$src2)),
2550            (PANDNrm VR128:$src1, addr:$src2)>;
2551}
2552
2553// Patterns for packed operations when we don't have integer type available.
// X86fand/X86for/X86fxor/X86fandn on v4f32 are selected onto
// ANDPS/ORPS/XORPS/ANDNPS. No Predicates wrapper is visible around these
// definitions here.
// Register-register forms.
2554def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
2555          (ANDPSrr VR128:$src1, VR128:$src2)>;
2556def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)),
2557          (ORPSrr VR128:$src1, VR128:$src2)>;
2558def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)),
2559          (XORPSrr VR128:$src1, VR128:$src2)>;
2560def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)),
2561          (ANDNPSrr VR128:$src1, VR128:$src2)>;
2562
// Register-memory forms: the second operand is matched through memopv4f32.
2563def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)),
2564          (ANDPSrm VR128:$src1, addr:$src2)>;
2565def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)),
2566          (ORPSrm VR128:$src1, addr:$src2)>;
2567def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)),
2568          (XORPSrm VR128:$src1, addr:$src2)>;
2569def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
2570          (ANDNPSrm VR128:$src1, addr:$src2)>;
2571
2572//===----------------------------------------------------------------------===//
2573// SSE 1 & 2 - Arithmetic Instructions
2574//===----------------------------------------------------------------------===//
2575
2576/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
2577/// vector forms.
2578///
2579/// In addition, we also have a special variant of the scalar form here to
2580/// represent the associated intrinsic operation.  This form is unlike the
2581/// plain scalar form, in that it takes an entire vector (instead of a scalar)
2582/// and leaves the top elements unmodified (therefore these cannot be commuted).
2583///
2584/// These three forms can each be reg+reg or reg+mem.
2585///
2586
2587/// FIXME: once all 256-bit intrinsics are matched, clean up and refactor the
2588/// classes below.
/// basic_sse12_fp_binop_p - Packed forms of an SSE 1 & 2 FP binary operation.
///
/// Instantiates sse12_fp_packed six times for one opcode:
///   * V#NAME#PS / V#NAME#PD   - VR128, v4f32 / v2f64, under [HasAVX, NoVLX]
///   * V#NAME#PSY / V#NAME#PDY - VR256, v8f32 / v4f64, same predicates, VEX_L
///   * PS / PD                 - VR128 with $src1 tied to $dst
///
/// Parameters:
///   opc       - opcode byte shared by all six forms.
///   OpcodeStr - mnemonic stem; "ps" or "pd" is appended to it.
///   OpNode    - SDNode forwarded to sse12_fp_packed.
///   sched     - scheduling info; .PS/.PD and .XMM/.YMM are picked per form.
///
/// Everything defined here is marked as using MXCSR and as possibly raising
/// an FP exception.
2589multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
2590                                  SDNode OpNode, X86SchedWriteSizes sched> {
2591let Uses = [MXCSR], mayRaiseFPException = 1 in {
  // V-prefixed forms use the loadv* fragments.
  // NOTE(review): the trailing 0 is presumably Is2Addr = 0 (three-operand
  // asm string), as in sse12_fp_scalar - confirm against sse12_fp_packed.
2592  let Predicates = [HasAVX, NoVLX] in {
2593  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
2594                               VR128, v4f32, f128mem, loadv4f32,
2595                               SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX_4V, VEX_WIG;
2596  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
2597                               VR128, v2f64, f128mem, loadv2f64,
2598                               SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX_4V, VEX_WIG;
2599
2600  defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
2601                        OpNode, VR256, v8f32, f256mem, loadv8f32,
2602                        SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX_4V, VEX_L, VEX_WIG;
2603  defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
2604                        OpNode, VR256, v4f64, f256mem, loadv4f64,
2605                        SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX_4V, VEX_L, VEX_WIG;
2606  }
2607
  // Non-V forms: destination tied to the first source, memop* fragments
  // for the memory operand, and the trailing argument left at its default.
2608  let Constraints = "$src1 = $dst" in {
2609    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
2610                              v4f32, f128mem, memopv4f32, SSEPackedSingle,
2611                              sched.PS.XMM>, PS;
2612    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
2613                              v2f64, f128mem, memopv2f64, SSEPackedDouble,
2614                              sched.PD.XMM>, PD;
2615  }
2616}
2617}
2618
/// basic_sse12_fp_binop_s - Scalar (FR32/FR64) forms of an SSE 1 & 2 FP
/// binary operation.
///
/// Instantiates sse12_fp_scalar four times for one opcode:
///   * V#NAME#SS / V#NAME#SD - Is2Addr = 0, so sse12_fp_scalar uses its
///                             three-operand asm string.
///   * SS / SD               - $src1 tied to $dst, Is2Addr left at its
///                             default of 1 (two-operand asm string).
///
/// Parameters:
///   opc       - opcode byte shared by all forms.
///   OpcodeStr - mnemonic stem; "ss" or "sd" is appended to it.
///   OpNode    - SDNode used in the selection pattern of sse12_fp_scalar.
///   sched     - scheduling info; .PS.Scl / .PD.Scl are used.
///
/// Everything defined here is marked as using MXCSR and as possibly raising
/// an FP exception.
2619multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
2620                                  X86SchedWriteSizes sched> {
2621let Uses = [MXCSR], mayRaiseFPException = 1 in {
2622  defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
2623                         OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>,
2624                         XS, VEX_4V, VEX_LIG, VEX_WIG;
2625  defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
2626                         OpNode, FR64, f64mem, SSEPackedDouble, sched.PD.Scl, 0>,
2627                         XD, VEX_4V, VEX_LIG, VEX_WIG;
2628
2629  let Constraints = "$src1 = $dst" in {
2630    defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
2631                              OpNode, FR32, f32mem, SSEPackedSingle,
2632                              sched.PS.Scl>, XS;
2633    defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
2634                              OpNode, FR64, f64mem, SSEPackedDouble,
2635                              sched.PD.Scl>, XD;
2636  }
2637}
2638}
2639
/// basic_sse12_fp_binop_s_int - Scalar forms that take whole VR128 vectors
/// (v4f32 / v2f64), built from sse12_fp_scalar_int.
///
/// Defines V#NAME#SS / V#NAME#SD (trailing argument 0) and SS / SD with
/// $src1 tied to $dst. Memory operands use ssmem/sdmem with the
/// sse_load_f32/sse_load_f64 fragments.
///
/// Parameters:
///   opc       - opcode byte shared by all forms.
///   OpcodeStr - mnemonic stem; passed as-is and also with "ss"/"sd" appended.
///   OpNode    - SDPatternOperator; instantiations in this file pass either
///               null_frag or an X86-specific node such as X86fmaxs.
///   sched     - scheduling info; .PS.Scl / .PD.Scl are used.
///
/// NOTE(review): the trailing 0 is presumably Is2Addr = 0 as in
/// sse12_fp_scalar - confirm against sse12_fp_scalar_int.
2640multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
2641                                      SDPatternOperator OpNode,
2642                                      X86SchedWriteSizes sched> {
2643let Uses = [MXCSR], mayRaiseFPException = 1 in {
2644  defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
2645                   !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
2646                   SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG;
2647  defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
2648                   !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
2649                   SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG;
2650
2651  let Constraints = "$src1 = $dst" in {
2652    defm SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
2653                   !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
2654                   SSEPackedSingle, sched.PS.Scl>, XS;
2655    defm SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
2656                   !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
2657                   SSEPackedDouble, sched.PD.Scl>, XD;
2658  }
2659}
2660}
2661
2662// Binary Arithmetic instructions
// Each defm combines the packed (_p), scalar (_s) and vector-typed scalar
// (_s_int) multiclasses under one name and opcode:
//   ADD 0x58, MUL 0x59, SUB 0x5C, MIN 0x5D, DIV 0x5E, MAX 0x5F.
// ADD/MUL/SUB/DIV pass null_frag to the _s_int form; MAX/MIN pass
// X86fmaxs/X86fmins there.
2663defm ADD : basic_sse12_fp_binop_p<0x58, "add", any_fadd, SchedWriteFAddSizes>,
2664           basic_sse12_fp_binop_s<0x58, "add", any_fadd, SchedWriteFAddSizes>,
2665           basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SchedWriteFAddSizes>;
2666defm MUL : basic_sse12_fp_binop_p<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
2667           basic_sse12_fp_binop_s<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
2668           basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SchedWriteFMulSizes>;
// SUB, DIV, MAX and MIN are defined with isCommutable forced to 0; SUB
// reuses the FAdd scheduling sizes.
2669let isCommutable = 0 in {
2670  defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
2671             basic_sse12_fp_binop_s<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
2672             basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SchedWriteFAddSizes>;
2673  defm DIV : basic_sse12_fp_binop_p<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
2674             basic_sse12_fp_binop_s<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
2675             basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>;
2676  defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
2677             basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
2678             basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>;
2679  defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
2680             basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
2681             basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>;
2682}
2683
// MAXC/MINC reuse the opcodes (0x5F/0x5D) and mnemonics ("max"/"min") of
// MAX/MIN but match X86fmaxc/X86fminc. They are isCodeGenOnly, have only the
// packed and scalar forms (no _s_int), and are not wrapped in
// isCommutable = 0.
// NOTE(review): presumably these are the commutable min/max variants -
// confirm against the X86fmaxc/X86fminc node definitions.
2684let isCodeGenOnly = 1 in {
2685  defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>,
2686             basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>;
2687  defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SchedWriteFCmpSizes>,
2688             basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SchedWriteFCmpSizes>;
2689}
2690
2691// Patterns used to select SSE scalar fp arithmetic instructions from
2692// either:
2693//
2694// (1) a scalar fp operation followed by a blend
2695//
2696// The effect is that the backend no longer emits unnecessary vector
2697// insert instructions immediately after SSE scalar fp instructions
2698// like addss or mulss.
2699//
2700// For example, given the following code:
2701//   __m128 foo(__m128 A, __m128 B) {
2702//     A[0] += B[0];
2703//     return A;
2704//   }
2705//
2706// Previously we generated:
2707//   addss %xmm0, %xmm1
2708//   movss %xmm1, %xmm0
2709//
2710// We now generate:
2711//   addss %xmm1, %xmm0
2712//
2713// (2) a vector packed single/double fp operation followed by a vector insert
2714//
2715// The effect is that the backend converts the packed fp instruction
2716// followed by a vector insert into a single SSE scalar fp instruction.
2717//
2718// For example, given the following code:
2719//   __m128 foo(__m128 A, __m128 B) {
2720//     __m128 C = A + B;
2721//     return (__m128) {C[0], A[1], A[2], A[3]};
2722//   }
2723//
2724// Previously we generated:
2725//   addps %xmm0, %xmm1
2726//   movss %xmm1, %xmm0
2727//
2728// We now generate:
2729//   addss %xmm1, %xmm0
2730
2731// TODO: Some canonicalization in lowering would simplify the number of
2732// patterns we have to try to match.
/// scalar_math_patterns - Select "extract element 0, apply a scalar op,
/// insert the result back into the same vector" onto the _Int scalar
/// instruction.
///
/// Matched DAG shape:
///   Move(dst, scalar_to_vector(Op(extractelt(dst, 0), src)))
/// where src is either a scalar register (RC) or a load (ld_frag).
///
/// Parameters:
///   Op            - scalar arithmetic node.
///   OpcPrefix     - instruction name stem; "rr_Int"/"rm_Int" are appended,
///                   and "V" is prepended for the UseAVX copies.
///   Move          - node that inserts the scalar result into $dst.
///   VT / EltTy    - vector type and its element type.
///   RC            - scalar register class of the register source.
///   ld_frag       - load fragment used for the memory source.
///   BasePredicate - predicate guarding the non-V patterns.
2733multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
2734                                    ValueType VT, ValueType EltTy,
2735                                    RegisterClass RC, PatFrag ld_frag,
2736                                    Predicate BasePredicate> {
2737  let Predicates = [BasePredicate] in {
2738    // extracted scalar math op with insert via movss/movsd
    // Register source: the scalar is copied into VR128 because the rr_Int
    // instruction is given a VR128 operand.
2739    def : Pat<(VT (Move (VT VR128:$dst),
2740                        (VT (scalar_to_vector
2741                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2742                                 RC:$src))))),
2743              (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst,
2744               (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
    // Memory source: the address is passed straight to the rm_Int form.
2745    def : Pat<(VT (Move (VT VR128:$dst),
2746                        (VT (scalar_to_vector
2747                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2748                                 (ld_frag addr:$src)))))),
2749              (!cast<Instruction>(OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
2750  }
2751
2752  // Repeat for AVX versions of the instructions.
2753  let Predicates = [UseAVX] in {
2754    // extracted scalar math op with insert via movss/movsd
2755    def : Pat<(VT (Move (VT VR128:$dst),
2756                        (VT (scalar_to_vector
2757                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2758                                 RC:$src))))),
2759              (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst,
2760               (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
2761    def : Pat<(VT (Move (VT VR128:$dst),
2762                        (VT (scalar_to_vector
2763                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2764                                 (ld_frag addr:$src)))))),
2765              (!cast<Instruction>("V"#OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
2766  }
2767}
2768
// Instantiate scalar_math_patterns for add/sub/mul/div.
// Single precision: X86Movss on v4f32 with FR32/loadf32, guarded by UseSSE1.
2769defm : scalar_math_patterns<any_fadd, "ADDSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2770defm : scalar_math_patterns<any_fsub, "SUBSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2771defm : scalar_math_patterns<any_fmul, "MULSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2772defm : scalar_math_patterns<any_fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2773
// Double precision: X86Movsd on v2f64 with FR64/loadf64, guarded by UseSSE2.
2774defm : scalar_math_patterns<any_fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2775defm : scalar_math_patterns<any_fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2776defm : scalar_math_patterns<any_fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2777defm : scalar_math_patterns<any_fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2778
2779/// Unop Arithmetic
2780/// In addition, we also have a special variant of the scalar form here to
2781/// represent the associated intrinsic operation.  This form is unlike the
2782/// plain scalar form, in that it takes an entire vector (instead of a
2783/// scalar) and leaves the top elements undefined.
2784///
2785/// There is also a separate variant for the full-vector intrinsic form.
2786
2787/// sse_fp_unop_s - SSE1 unops in scalar form
2788/// For the non-AVX defs, we need $src1 to be tied to $dst because
2789/// the HW instructions are 2 operand / destructive.
///
/// Defines four instructions per instantiation:
///   r / m         - isCodeGenOnly forms on RC with an OpNode selection
///                   pattern; m additionally requires OptForSize.
///   r_Int / m_Int - VR128 forms with empty patterns and $src1 tied to $dst.
///
/// Parameters:
///   opc, OpcodeStr - opcode byte and full mnemonic.
///   RC             - scalar register class for r / m.
///   ScalarVT       - not referenced in this multiclass body.
///   x86memop       - memory operand for m.
///   intmemop       - memory operand for m_Int.
///   OpNode         - node matched by r / m.
///   d              - execution domain.
///   sched          - scheduling info.
///   target         - predicate required by r / m.
2790multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
2791                          ValueType ScalarVT, X86MemOperand x86memop,
2792                          Operand intmemop, SDNode OpNode, Domain d,
2793                          X86FoldableSchedWrite sched, Predicate target> {
2794  let isCodeGenOnly = 1, hasSideEffects = 0 in {
2795  def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
2796              !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
2797            [(set RC:$dst, (OpNode RC:$src1))], d>, Sched<[sched]>,
2798            Requires<[target]>;
2799  let mayLoad = 1 in
2800  def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1),
2801            !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
2802            [(set RC:$dst, (OpNode (load addr:$src1)))], d>,
2803            Sched<[sched.Folded]>,
2804            Requires<[target, OptForSize]>;
2805  }
2806
  // The _Int forms take two inputs but print only $src2 and $dst; $src1 is
  // the tied destination. They carry no patterns of their own.
2807  let hasSideEffects = 0, Constraints = "$src1 = $dst", ExeDomain = d in {
2808  def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
2809                !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
2810                Sched<[sched]>;
2811  let mayLoad = 1 in
2812  def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2),
2813                !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
2814                Sched<[sched.Folded, sched.ReadAfterFold]>;
2815  }
2816
2817}
2818
/// sse_fp_unop_s_intr - Selection patterns that map the intrinsic Intr onto
/// the NAME#r_Int / NAME#m_Int instructions.
///
/// Parameters:
///   RC        - not referenced in this multiclass body.
///   vt        - vector type of the IMPLICIT_DEF passed as the first operand
///               of the memory form.
///   mem_frags - fragments matching the memory operand of Intr.
///   Intr      - intrinsic being selected.
///   target    - predicate guarding both patterns.
///   Suffix    - not referenced in this multiclass body.
2819multiclass sse_fp_unop_s_intr<RegisterClass RC, ValueType vt,
2820                              PatFrags mem_frags, Intrinsic Intr,
2821                              Predicate target, string Suffix> {
2822  let Predicates = [target] in {
2823  // These are unary operations, but they are modeled as having 2 source operands
2824  // because the high elements of the destination are unchanged in SSE.
  // The register form passes $src as both operands.
2825  def : Pat<(Intr VR128:$src),
2826            (!cast<Instruction>(NAME#r_Int) VR128:$src, VR128:$src)>;
2827  }
2828  // We don't want to fold scalar loads into these instructions unless
2829  // optimizing for size. This is because the folded instruction will have a
2830  // partial register update, while the unfolded sequence will not, e.g.
2831  // movss mem, %xmm0
2832  // rcpss %xmm0, %xmm0
2833  // which has a clobber before the rcp, vs.
2834  // rcpss mem, %xmm0
2835  let Predicates = [target, OptForSize] in {
2836    def : Pat<(Intr (mem_frags addr:$src2)),
2837               (!cast<Instruction>(NAME#m_Int)
2838                      (vt (IMPLICIT_DEF)), addr:$src2)>;
2839  }
2840}
2841
/// avx_fp_unop_s_intr - Selection patterns that map the intrinsic Intr onto
/// NAME#r_Int / NAME#m_Int. The patterns have the same shape as those in
/// sse_fp_unop_s_intr, but this multiclass takes no Suffix parameter.
///
/// Parameters:
///   RC        - not referenced in this multiclass body.
///   vt        - vector type of the IMPLICIT_DEF first operand in the
///               memory form.
///   mem_frags - fragments matching the memory operand of Intr.
///   Intr      - intrinsic being selected.
///   target    - predicate guarding both patterns.
2842multiclass avx_fp_unop_s_intr<RegisterClass RC, ValueType vt, PatFrags mem_frags,
2843                              Intrinsic Intr, Predicate target> {
  // Register form: $src is passed as both source operands.
2844  let Predicates = [target] in {
2845   def : Pat<(Intr VR128:$src),
2846             (!cast<Instruction>(NAME#r_Int) VR128:$src,
2847                                 VR128:$src)>;
2848  }
  // Memory form: only selected together with OptForSize.
2849  let Predicates = [target, OptForSize] in {
2850    def : Pat<(Intr (mem_frags addr:$src2)),
2851              (!cast<Instruction>(NAME#m_Int)
2852                    (vt (IMPLICIT_DEF)), addr:$src2)>;
2853  }
2854}
2855
/// avx_fp_unop_s - Three-operand scalar unop forms plus the patterns that
/// select them.
///
/// Defines r / m (isCodeGenOnly, on RC) and r_Int / m_Int (on VR128). All
/// four have empty instruction patterns and no tied-operand constraint;
/// selection of r / m is done by the explicit Pat records at the end, which
/// pass IMPLICIT_DEF as the first source operand.
///
/// Parameters:
///   opc, OpcodeStr - opcode byte and full mnemonic.
///   RC             - scalar register class for r / m.
///   ScalarVT       - scalar type used for the IMPLICIT_DEF and load pattern.
///   x86memop       - memory operand for m.
///   intmemop       - memory operand for m_Int.
///   OpNode         - node matched by the Pat records.
///   d              - execution domain.
///   sched          - scheduling info.
///   target         - predicate guarding the Pat records.
2856multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
2857                          ValueType ScalarVT, X86MemOperand x86memop,
2858                          Operand intmemop, SDNode OpNode, Domain d,
2859                          X86FoldableSchedWrite sched, Predicate target> {
2860  let isCodeGenOnly = 1, hasSideEffects = 0 in {
2861  def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
2862            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2863            [], d>, Sched<[sched]>;
2864  let mayLoad = 1 in
2865  def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
2866             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2867            [], d>, Sched<[sched.Folded, sched.ReadAfterFold]>;
2868  }
2869  let hasSideEffects = 0, ExeDomain = d in {
2870  def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
2871                (ins VR128:$src1, VR128:$src2),
2872             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2873             []>, Sched<[sched]>;
2874  let mayLoad = 1 in
2875  def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
2876                (ins VR128:$src1, intmemop:$src2),
2877             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2878             []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
2879  }
2880
2881  // We don't want to fold scalar loads into these instructions unless
2882  // optimizing for size. This is because the folded instruction will have a
2883  // partial register update, while the unfolded sequence will not, e.g.
2884  // vmovss mem, %xmm0
2885  // vrcpss %xmm0, %xmm0, %xmm0
2886  // which has a clobber before the rcp, vs.
2887  // vrcpss mem, %xmm0, %xmm0
2888  // TODO: In theory, we could fold the load, and avoid the stall caused by
2889  // the partial register store, either in BreakFalseDeps or with smarter RA.
  // Register form: always available under `target`.
2890  let Predicates = [target] in {
2891   def : Pat<(OpNode RC:$src),  (!cast<Instruction>(NAME#r)
2892                                (ScalarVT (IMPLICIT_DEF)), RC:$src)>;
2893  }
  // Load-folding form: only under OptForSize, per the comment above.
2894  let Predicates = [target, OptForSize] in {
2895    def : Pat<(ScalarVT (OpNode (load addr:$src))),
2896              (!cast<Instruction>(NAME#m) (ScalarVT (IMPLICIT_DEF)),
2897            addr:$src)>;
2898  }
2899}
2900
2901/// sse1_fp_unop_p - SSE1 unops in packed form.
///
/// Defines register and memory forms for packed single precision:
///   V#NAME#PSr / PSm   - VR128, "v"-prefixed mnemonic, under `prds`.
///   V#NAME#PSYr / PSYm - VR256 with VEX_L, under `prds`.
///   PSr / PSm          - VR128, defined outside the `prds` block; the
///                        memory form uses memopv4f32 instead of loadv4f32.
///
/// Parameters:
///   opc       - opcode byte shared by all forms.
///   OpcodeStr - mnemonic stem; "ps" (and a "v" prefix where applicable)
///               is added.
///   OpNode    - node matched by every form.
///   sched     - scheduling info; .XMM / .YMM and their .Folded variants.
///   prds      - predicate list applied to the V-prefixed forms only.
2902multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
2903                          X86SchedWriteWidths sched, list<Predicate> prds> {
2904let Predicates = prds in {
2905  def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2906                       !strconcat("v", OpcodeStr,
2907                                  "ps\t{$src, $dst|$dst, $src}"),
2908                       [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
2909                       VEX, Sched<[sched.XMM]>, VEX_WIG;
2910  def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2911                       !strconcat("v", OpcodeStr,
2912                                  "ps\t{$src, $dst|$dst, $src}"),
2913                       [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>,
2914                       VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
2915  def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
2916                        !strconcat("v", OpcodeStr,
2917                                   "ps\t{$src, $dst|$dst, $src}"),
2918                        [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>,
2919                        VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
2920  def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
2921                        !strconcat("v", OpcodeStr,
2922                                   "ps\t{$src, $dst|$dst, $src}"),
2923                        [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))]>,
2924                        VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
2925}
2926
  // Non-V forms; no Predicates are set for them in this multiclass.
2927  def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2928                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
2929                [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
2930                Sched<[sched.XMM]>;
2931  def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2932                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
2933                [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>,
2934                Sched<[sched.XMM.Folded]>;
2935}
2936
2937/// sse2_fp_unop_p - SSE2 unops in vector forms.
///
/// Packed double-precision counterpart of sse1_fp_unop_p:
///   V#NAME#PDr / PDm   - VR128, "v"-prefixed mnemonic, under [HasAVX, NoVLX].
///   V#NAME#PDYr / PDYm - VR256 with VEX_L, same predicates.
///   PDr / PDm          - VR128, defined outside the predicate block; the
///                        memory form uses memopv2f64 instead of loadv2f64.
///
/// Unlike sse1_fp_unop_p, the predicate list is fixed here rather than
/// passed as a parameter.
///
/// Parameters:
///   opc       - opcode byte shared by all forms.
///   OpcodeStr - mnemonic stem; "pd" (and a "v" prefix where applicable)
///               is added.
///   OpNode    - node matched by every form.
///   sched     - scheduling info; .XMM / .YMM and their .Folded variants.
2938multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
2939                          SDNode OpNode, X86SchedWriteWidths sched> {
2940let Predicates = [HasAVX, NoVLX] in {
2941  def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2942                       !strconcat("v", OpcodeStr,
2943                                  "pd\t{$src, $dst|$dst, $src}"),
2944                       [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
2945                       VEX, Sched<[sched.XMM]>, VEX_WIG;
2946  def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2947                       !strconcat("v", OpcodeStr,
2948                                  "pd\t{$src, $dst|$dst, $src}"),
2949                       [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>,
2950                       VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
2951  def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
2952                        !strconcat("v", OpcodeStr,
2953                                   "pd\t{$src, $dst|$dst, $src}"),
2954                        [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>,
2955                        VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
2956  def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
2957                        !strconcat("v", OpcodeStr,
2958                                   "pd\t{$src, $dst|$dst, $src}"),
2959                        [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))]>,
2960                        VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
2961}
2962
  // Non-V forms; no Predicates are set for them in this multiclass.
2963  def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2964                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
2965                [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
2966                Sched<[sched.XMM]>;
2967  def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2968                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
2969                [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>,
2970                Sched<[sched.XMM.Folded]>;
2971}
2972
/// sse1_fp_unop_s_intr - Intrinsic selection patterns for a single-precision
/// scalar unop.
///
/// The intrinsic is looked up by name as "int_x86_sse_" # OpcodeStr # "_ss".
/// SS instantiates sse_fp_unop_s_intr under UseSSE1; V#NAME#SS instantiates
/// avx_fp_unop_s_intr under AVXTarget.
///
/// Parameters:
///   OpcodeStr - mnemonic stem used to build the intrinsic name.
///   AVXTarget - predicate for the V-prefixed patterns.
///   opc, OpNode, sched - not referenced in this multiclass body.
2973multiclass sse1_fp_unop_s_intr<bits<8> opc, string OpcodeStr, SDNode OpNode,
2974                          X86SchedWriteWidths sched, Predicate AVXTarget> {
2975  defm SS        :  sse_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
2976                      !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss),
2977                      UseSSE1, "SS">, XS;
2978  defm V#NAME#SS  : avx_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
2979                      !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss),
2980                      AVXTarget>,
2981                      XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
2982}
2983
/// sse1_fp_unop_s - Scalar single-precision (f32 in FR32) unary operation.
/// Emits an SSE form named `SS` (mnemonic OpcodeStr#ss, predicate UseSSE1)
/// and an AVX form named `V#NAME#SS` (mnemonic "v"#OpcodeStr#ss, predicate
/// AVXTarget). Both use the scalar scheduling class `sched.Scl`.
multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          X86SchedWriteWidths sched, Predicate AVXTarget> {
  defm SS        :  sse_fp_unop_s<opc, OpcodeStr#ss, FR32, f32, f32mem,
                      ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS;
  defm V#NAME#SS  : avx_fp_unop_s<opc, "v"#OpcodeStr#ss, FR32, f32,
                      f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>,
                       XS, VEX_4V, VEX_LIG, VEX_WIG;
}
2992
/// sse2_fp_unop_s - Scalar double-precision (f64 in FR64) unary operation.
/// Emits an SSE2 form named `SD` (mnemonic OpcodeStr#sd, predicate UseSSE2)
/// and an AVX form named `V#NAME#SD` (mnemonic "v"#OpcodeStr#sd, predicate
/// AVXTarget). Both use the scalar scheduling class `sched.Scl`.
multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          X86SchedWriteWidths sched, Predicate AVXTarget> {
  defm SD         : sse_fp_unop_s<opc, OpcodeStr#sd, FR64, f64, f64mem,
                         sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD;
  defm V#NAME#SD  : avx_fp_unop_s<opc, "v"#OpcodeStr#sd, FR64, f64,
                         f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>,
                         XD, VEX_4V, VEX_LIG, VEX_WIG;
}
3001
// Square root.
// Opcode 0x51 for all four families: scalar/packed single precision and
// scalar/packed double precision. All forms select `any_fsqrt`; the whole
// group is tagged SIMD_EXC. The double-precision forms use the separate
// SchedWriteFSqrt64 scheduling class.
defm SQRT  : sse1_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, UseAVX>,
             sse1_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>,
             sse2_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64, UseAVX>,
             sse2_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64>, SIMD_EXC;
3007
// Reciprocal approximations. Note that these typically require refinement
// in order to obtain suitable precision.
// RSQRT uses opcode 0x52 and RCP uses 0x53. Each gets the SDNode-based scalar
// form, the intrinsic-based scalar form and the packed form; only
// single-precision variants are instantiated here. Unlike SQRT above, these
// are not tagged SIMD_EXC.
defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
             sse1_fp_unop_s_intr<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>;
defm RCP   : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
             sse1_fp_unop_s_intr<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
             sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>;
3016
3017// There is no f64 version of the reciprocal approximation instructions.
3018
/// scalar_unary_math_patterns - Selection patterns for a scalar unary math
/// node applied to element 0 of a vector and merged back into another vector.
///
/// The matched DAG is
///   (Move $dst, (scalar_to_vector (OpNode (extractelt $src, 0))))
/// and it is rewritten to the `r_Int` instruction named by OpcPrefix, taking
/// $dst and $src as operands. One pattern is emitted for the legacy encoding
/// under BasePredicate and one for the "V"-prefixed encoding under UseAVX.
multiclass scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix, SDNode Move,
                                      ValueType VT, Predicate BasePredicate> {
  // Legacy SSE instruction.
  let Predicates = [BasePredicate] in
  def : Pat<(VT (Move VT:$dst,
                      (scalar_to_vector (OpNode (extractelt VT:$src, 0))))),
            (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;

  // Same DAG, selected to the AVX ("V"-prefixed) instruction.
  let Predicates = [UseAVX] in
  def : Pat<(VT (Move VT:$dst,
                      (scalar_to_vector (OpNode (extractelt VT:$src, 0))))),
            (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
}
3034
// Scalar square root merged into a vector: SQRTSS on v4f32 via X86Movss
// (SSE1) and SQRTSD on v2f64 via X86Movsd (SSE2).
defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>;
defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>;
3037
/// scalar_unary_math_intr_patterns - Selection patterns for a scalar unary
/// math intrinsic whose result is merged into another vector.
///
/// The matched DAG is (Move $dst, (Intr $src)); it is rewritten to the
/// `r_Int` instruction named by OpcPrefix with operands $dst and $src. One
/// pattern is emitted for the legacy encoding under BasePredicate and one for
/// the "V"-prefixed encoding under HasAVX.
multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix,
                                           SDNode Move, ValueType VT,
                                           Predicate BasePredicate> {
  // Legacy SSE instruction.
  let Predicates = [BasePredicate] in
  def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
            (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;

  // Same DAG, selected to the AVX ("V"-prefixed) instruction.
  let Predicates = [HasAVX] in
  def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
            (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
}
3052
// Intrinsic-based merge patterns for the single-precision reciprocal and
// reciprocal-square-root approximations (v4f32 merged via X86Movss).
defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss,
                                       v4f32, UseSSE1>;
defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss,
                                       v4f32, UseSSE1>;
3057
3058
3059//===----------------------------------------------------------------------===//
3060// SSE 1 & 2 - Non-temporal stores
3061//===----------------------------------------------------------------------===//
3062
3063let AddedComplexity = 400 in { // Prefer non-temporal versions
// VEX-encoded non-temporal vector stores, available under [HasAVX, NoVLX].
// Every record matches `alignednontemporalstore` of one vector type and
// takes the memory destination first, then the source register.
let Predicates = [HasAVX, NoVLX] in {
// 128-bit floating-point stores (opcode 0x2B; PS form for v4f32, PD form
// for v2f64).
let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
                     (ins f128mem:$dst, VR128:$src),
                     "movntps\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v4f32 VR128:$src),
                                               addr:$dst)]>, VEX, VEX_WIG;
def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
                     (ins f128mem:$dst, VR128:$src),
                     "movntpd\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v2f64 VR128:$src),
                                               addr:$dst)]>, VEX, VEX_WIG;
} // SchedRW

// 256-bit floating-point stores: same opcode with VEX_L set.
let SchedRW = [SchedWriteFMoveLSNT.YMM.MR] in {
def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
                     (ins f256mem:$dst, VR256:$src),
                     "movntps\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v8f32 VR256:$src),
                                               addr:$dst)]>, VEX, VEX_L, VEX_WIG;
def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
                     (ins f256mem:$dst, VR256:$src),
                     "movntpd\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v4f64 VR256:$src),
                                               addr:$dst)]>, VEX, VEX_L, VEX_WIG;
} // SchedRW

// Integer-domain stores (opcode 0xE7). Only the 64-bit-element vector types
// are matched here; the other integer element widths are handled by
// separate Pat records.
let ExeDomain = SSEPackedInt in {
def VMOVNTDQmr    : VPDI<0xE7, MRMDestMem, (outs),
                         (ins i128mem:$dst, VR128:$src),
                         "movntdq\t{$src, $dst|$dst, $src}",
                         [(alignednontemporalstore (v2i64 VR128:$src),
                                                   addr:$dst)]>, VEX, VEX_WIG,
                         Sched<[SchedWriteVecMoveLSNT.XMM.MR]>;
def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
                    (ins i256mem:$dst, VR256:$src),
                    "movntdq\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v4i64 VR256:$src),
                                              addr:$dst)]>, VEX, VEX_L, VEX_WIG,
                    Sched<[SchedWriteVecMoveLSNT.YMM.MR]>;
} // ExeDomain
} // Predicates
3106
// Legacy-encoded (non-VEX) 128-bit non-temporal floating-point stores.
// Same opcode (0x2B) and matched node as the VEX forms, without VEX bits.
let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntps\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntpd\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>;
} // SchedRW

// Legacy-encoded 128-bit non-temporal integer store (opcode 0xE7).
// NOTE(review): the memory operand is f128mem here, whereas the VEX
// counterpart VMOVNTDQmr uses i128mem -- confirm whether this is intentional.
let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLSNT.XMM.MR] in
def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntdq\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)]>;
3120
// Non-temporal stores from general-purpose registers (opcode 0xC3). These
// match plain `nontemporalstore` rather than the aligned variant used by the
// vector forms, and both require HasSSE2. The 64-bit form is built on RI
// instead of I.
let SchedRW = [WriteStoreNT] in {
// There is no AVX form for instructions below this point
def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
                 "movnti{l}\t{$src, $dst|$dst, $src}",
                 [(nontemporalstore (i32 GR32:$src), addr:$dst)]>,
               PS, Requires<[HasSSE2]>;
def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
                     "movnti{q}\t{$src, $dst|$dst, $src}",
                     [(nontemporalstore (i64 GR64:$src), addr:$dst)]>,
                  PS, Requires<[HasSSE2]>;
} // SchedRW = [WriteStoreNT]
3132
// Extra selection patterns that route aligned non-temporal stores of the
// remaining integer vector types to the MOVNTDQ family. The instruction
// records themselves only match the 64-bit-element types.
let Predicates = [HasAVX, NoVLX] in {
  // 256-bit integer vectors -> VMOVNTDQYmr.
  def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;

  // 128-bit integer vectors -> VMOVNTDQmr.
  def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
}

// Same 128-bit integer types for the legacy encoding -> MOVNTDQmr.
let Predicates = [UseSSE2] in {
  def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
            (MOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
            (MOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
            (MOVNTDQmr addr:$dst, VR128:$src)>;
}
3157
3158} // AddedComplexity
3159
3160//===----------------------------------------------------------------------===//
3161// SSE 1 & 2 - Prefetch and memory fence
3162//===----------------------------------------------------------------------===//
3163
// Prefetch intrinsic.
// All four variants share opcode 0x18 and differ in the ModRM form
// (MRM0m..MRM3m) and in the third operand of the matched `prefetch` node:
// 3 -> prefetcht0, 2 -> prefetcht1, 1 -> prefetcht2, 0 -> prefetchnta
// (presumably the locality hint -- confirm against the `prefetch` node
// definition). The second operand is matched as any `imm` and the last one
// is fixed to 1.
let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in {
def PREFETCHT0   : I<0x18, MRM1m, (outs), (ins i8mem:$src),
    "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>, TB;
def PREFETCHT1   : I<0x18, MRM2m, (outs), (ins i8mem:$src),
    "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>, TB;
def PREFETCHT2   : I<0x18, MRM3m, (outs), (ins i8mem:$src),
    "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>, TB;
def PREFETCHNTA  : I<0x18, MRM0m, (outs), (ins i8mem:$src),
    "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>, TB;
}
3175
// FIXME: How should flush instruction be modeled?
// For now it is given the WriteLoad scheduling class.
let SchedRW = [WriteLoad] in {
// Flush cache
// Selected from the int_x86_sse2_clflush intrinsic; requires HasSSE2.
def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
               "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
               PS, Requires<[HasSSE2]>;
}

let SchedRW = [WriteNop] in {
// Pause. This "instruction" is encoded as "rep; nop", so even though it
// was introduced with SSE2, it's backward compatible.
// No Requires<> predicate is attached for that reason; it is selected from
// the int_x86_sse2_pause intrinsic.
def PAUSE : I<0x90, RawFrm, (outs), (ins),
              "pause", [(int_x86_sse2_pause)]>, OBXS;
}
3190
// Fence instructions. All three share opcode 0xAE and are distinguished by
// the MRM5X/MRM6X/MRM7X form; each is selected from its own intrinsic and
// has its own feature predicate.
let SchedRW = [WriteFence] in {
// Load, store, and memory fence
// TODO: As with mfence, we may want to ease the availability of sfence/lfence
// to include any 64-bit target.
def SFENCE : I<0xAE, MRM7X, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
               PS, Requires<[HasSSE1]>;
def LFENCE : I<0xAE, MRM5X, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>,
               PS, Requires<[HasSSE2]>;
def MFENCE : I<0xAE, MRM6X, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>,
               PS, Requires<[HasMFence]>;
} // SchedRW

// The target-specific X86MFence node also selects to MFENCE.
def : Pat<(X86MFence), (MFENCE)>;
3204
3205//===----------------------------------------------------------------------===//
3206// SSE 1 & 2 - Load/Store XCSR register
3207//===----------------------------------------------------------------------===//
3208
// LDMXCSR loads the MXCSR control/status register from a 32-bit memory
// operand; STMXCSR stores it to one. VEX-encoded (V-prefixed) and legacy
// forms are provided, each selected from the matching int_x86_sse_* intrinsic.
//
// The register effect is modeled explicitly: the load forms define MXCSR and
// the store forms use it. This lets register dependency tracking see the
// relationship with instructions that declare a use of MXCSR, instead of
// relying on hasSideEffects alone.
let mayLoad=1, hasSideEffects=1, Defs = [MXCSR] in
def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
               "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
               VEX, Sched<[WriteLDMXCSR]>, VEX_WIG;
let mayStore=1, hasSideEffects=1, Uses = [MXCSR] in
def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
               "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
               VEX, Sched<[WriteSTMXCSR]>, VEX_WIG;

let mayLoad=1, hasSideEffects=1, Defs = [MXCSR] in
def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
              "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
              PS, Sched<[WriteLDMXCSR]>;
let mayStore=1, hasSideEffects=1, Uses = [MXCSR] in
def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
              "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
              PS, Sched<[WriteSTMXCSR]>;
3226
3227//===---------------------------------------------------------------------===//
3228// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
3229//===---------------------------------------------------------------------===//
3230
3231let ExeDomain = SSEPackedInt in { // SSE integer instructions
3232
// VEX register-to-register integer moves (opcode 0x6F, MRMSrcReg): aligned
// (PD form) and unaligned (XS form via VSSI), in 128-bit and 256-bit widths.
// No selection pattern is attached to any of them.
let hasSideEffects = 0 in {
def VMOVDQArr  : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
def VMOVDQUrr  : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "movdqu\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                      "movdqu\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
}
3247
// For Disassembler
// Alternate register-to-register encodings using the store-direction opcode
// (0x7F, MRMDestReg). They are isCodeGenOnly with ForceDisassemble set, and
// each names its 0x6F counterpart through FoldGenData.
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def VMOVDQArr_REV  : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                          "movdqa\t{$src, $dst|$dst, $src}", []>,
                          Sched<[SchedWriteVecMoveLS.XMM.RR]>,
                          VEX, VEX_WIG, FoldGenData<"VMOVDQArr">;
def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                          "movdqa\t{$src, $dst|$dst, $src}", []>,
                          Sched<[SchedWriteVecMoveLS.YMM.RR]>,
                          VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQAYrr">;
def VMOVDQUrr_REV  : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                          "movdqu\t{$src, $dst|$dst, $src}", []>,
                          Sched<[SchedWriteVecMoveLS.XMM.RR]>,
                          VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">;
def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                          "movdqu\t{$src, $dst|$dst, $src}", []>,
                          Sched<[SchedWriteVecMoveLS.YMM.RR]>,
                          VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQUYrr">;
}
3267
// VEX integer vector loads (opcode 0x6F, MRMSrcMem), under [HasAVX,NoVLX].
// Only the 128-bit forms carry a pattern (aligned / unaligned v2i64 load);
// the 256-bit forms have an empty pattern list. The unaligned forms are
// written on plain `I` with an explicit "vmovdqu" mnemonic and XS prefix.
let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
    hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
def VMOVDQArm  : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                      "movdqa\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (alignedloadv2i64 addr:$src))]>,
                      Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.YMM.RM]>,
                      VEX, VEX_L, VEX_WIG;
def VMOVDQUrm  : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "vmovdqu\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (loadv2i64 addr:$src))]>,
                   Sched<[SchedWriteVecMoveLS.XMM.RM]>,
                   XS, VEX, VEX_WIG;
def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                   "vmovdqu\t{$src, $dst|$dst, $src}", []>,
                   Sched<[SchedWriteVecMoveLS.YMM.RM]>,
                   XS, VEX, VEX_L, VEX_WIG;
}

// VEX integer vector stores (opcode 0x7F, MRMDestMem), mirroring the loads
// above: patterns on the 128-bit forms only (aligned / unaligned v2i64
// store), empty pattern lists on the 256-bit forms.
let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
def VMOVDQAmr  : VPDI<0x7F, MRMDestMem, (outs),
                      (ins i128mem:$dst, VR128:$src),
                      "movdqa\t{$src, $dst|$dst, $src}",
                      [(alignedstore (v2i64 VR128:$src), addr:$dst)]>,
                      Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_WIG;
def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
                      (ins i256mem:$dst, VR256:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", []>,
                     Sched<[SchedWriteVecMoveLS.YMM.MR]>, VEX, VEX_L, VEX_WIG;
def VMOVDQUmr  : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                   "vmovdqu\t{$src, $dst|$dst, $src}",
                   [(store (v2i64 VR128:$src), addr:$dst)]>,
                   Sched<[SchedWriteVecMoveLS.XMM.MR]>, XS, VEX, VEX_WIG;
def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
                   "vmovdqu\t{$src, $dst|$dst, $src}",[]>,
                   Sched<[SchedWriteVecMoveLS.YMM.MR]>, XS, VEX, VEX_L, VEX_WIG;
}
3307
// Legacy-encoded 128-bit register-to-register integer moves, all sharing the
// XMM register-register scheduling class. MOVDQU is written on plain `I`
// with an XS prefix and an explicit UseSSE2 requirement.
let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in {
let hasSideEffects = 0 in {
def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "movdqa\t{$src, $dst|$dst, $src}", []>;

def MOVDQUrr :   I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "movdqu\t{$src, $dst|$dst, $src}", []>,
                   XS, Requires<[UseSSE2]>;
}

// For Disassembler
// Alternate encodings using opcode 0x7F / MRMDestReg; each names its 0x6F
// counterpart through FoldGenData.
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                       "movdqa\t{$src, $dst|$dst, $src}", []>,
                       FoldGenData<"MOVDQArr">;

def MOVDQUrr_REV :   I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                       "movdqu\t{$src, $dst|$dst, $src}", []>,
                       XS, Requires<[UseSSE2]>, FoldGenData<"MOVDQUrr">;
}
} // SchedRW
3329
// Legacy-encoded 128-bit integer vector loads (opcode 0x6F). The selection
// patterns are commented out, so these records currently carry an empty
// pattern list.
let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
    hasSideEffects = 0, SchedRW = [SchedWriteVecMoveLS.XMM.RM] in {
def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "movdqa\t{$src, $dst|$dst, $src}",
                   [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>;
def MOVDQUrm :   I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "movdqu\t{$src, $dst|$dst, $src}",
                   [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>,
                 XS, Requires<[UseSSE2]>;
}

// Legacy-encoded 128-bit integer vector stores (opcode 0x7F); likewise the
// patterns are commented out.
let mayStore = 1, hasSideEffects = 0,
    SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                   "movdqa\t{$src, $dst|$dst, $src}",
                   [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>;
def MOVDQUmr :   I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                   "movdqu\t{$src, $dst|$dst, $src}",
                   [/*(store (v2i64 VR128:$src), addr:$dst)*/]>,
                 XS, Requires<[UseSSE2]>;
}
3351
3352} // ExeDomain = SSEPackedInt
3353
// Reversed version with ".s" suffix for GAS compatibility.
// Each ".s" mnemonic maps to the corresponding _REV (opcode 0x7F) record.
// The trailing 0 is presumably the alias emit/print flag -- confirm against
// the InstAlias class.
def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
                (VMOVDQArr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
                (VMOVDQAYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
                (VMOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
                (VMOVDQUYrr_REV VR256:$dst, VR256:$src), 0>;

// Reversed version with ".s" suffix for GAS compatibility.
// Legacy-encoded counterparts of the aliases above.
def : InstAlias<"movdqa.s\t{$src, $dst|$dst, $src}",
                (MOVDQArr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}",
                (MOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
3369
// The VMOVDQ* load/store records only match v2i64. These patterns extend
// selection to the v4i32, v8i16 and v16i8 128-bit types: aligned accesses go
// to the VMOVDQA forms and unaligned accesses to the VMOVDQU forms.
let Predicates = [HasAVX, NoVLX] in {
  // Additional patterns for other integer sizes.
  // Loads.
  def : Pat<(alignedloadv4i32 addr:$src),
            (VMOVDQArm addr:$src)>;
  def : Pat<(alignedloadv8i16 addr:$src),
            (VMOVDQArm addr:$src)>;
  def : Pat<(alignedloadv16i8 addr:$src),
            (VMOVDQArm addr:$src)>;
  def : Pat<(loadv4i32 addr:$src),
            (VMOVDQUrm addr:$src)>;
  def : Pat<(loadv8i16 addr:$src),
            (VMOVDQUrm addr:$src)>;
  def : Pat<(loadv16i8 addr:$src),
            (VMOVDQUrm addr:$src)>;

  // Stores.
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (VMOVDQAmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (VMOVDQAmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (VMOVDQAmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (VMOVDQUmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (VMOVDQUmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (VMOVDQUmr addr:$dst, VR128:$src)>;
}
3398
3399//===---------------------------------------------------------------------===//
3400// SSE2 - Packed Integer Arithmetic Instructions
3401//===---------------------------------------------------------------------===//
3402
let ExeDomain = SSEPackedInt in { // SSE integer instructions

/// PDI_binop_rm2 - SSE2 binary operator whose result type (DstVT) differs
/// from its source type (SrcVT).
///
/// Produces a register-register record `rr` (marked commutable) and a
/// register-memory record `rm` whose second operand is loaded through
/// memop_frag. Is2Addr picks the two-operand assembly string when set and
/// the three-operand string otherwise.
multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType DstVT, ValueType SrcVT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let isCommutable = 1 in {
    def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
                 !if(Is2Addr,
                     OpcodeStr#"\t{$src2, $dst|$dst, $src2}",
                     OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set RC:$dst,
                       (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
             Sched<[sched]>;
  }

  // Folded-load form: uses the folded scheduling class and its
  // read-after-fold entry.
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
               (ins RC:$src1, x86memop:$src2),
               !if(Is2Addr,
                   OpcodeStr#"\t{$src2, $dst|$dst, $src2}",
                   OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set RC:$dst,
                     (DstVT (OpNode (SrcVT RC:$src1),
                                    (memop_frag addr:$src2))))]>,
           Sched<[sched.Folded, sched.ReadAfterFold]>;
}
} // ExeDomain = SSEPackedInt
3428
// Packed integer arithmetic built from PDI_binop_all (defined elsewhere in
// this file). Arguments, in order: opcode byte, mnemonic, SDNode, 128-bit
// vector type, 256-bit vector type, scheduling class, a 0/1 flag, and a
// predicate. The flag is 1 for the add/mul/min/max/avg entries and 0 for the
// subtract entries, so it is presumably IsCommutable -- confirm against
// PDI_binop_all. Byte and word element operations use NoVLX_Or_NoBWI;
// dword/qword ones use NoVLX.

// Addition: wrapping, signed-saturating, unsigned-saturating.
defm PADDB   : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDW   : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDD   : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
                             SchedWriteVecALU, 1, NoVLX>;
defm PADDQ   : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
                             SchedWriteVecALU, 1, NoVLX>;
defm PADDSB  : PDI_binop_all<0xEC, "paddsb", saddsat, v16i8, v32i8,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDSW  : PDI_binop_all<0xED, "paddsw", saddsat, v8i16, v16i16,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDUSB : PDI_binop_all<0xDC, "paddusb", uaddsat, v16i8, v32i8,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDUSW : PDI_binop_all<0xDD, "paddusw", uaddsat, v8i16, v16i16,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
// 16-bit multiplies: low half, unsigned high half, signed high half.
defm PMULLW  : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
                             SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
                             SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
defm PMULHW  : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
                             SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
// Subtraction: wrapping, signed-saturating, unsigned-saturating.
defm PSUBB   : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBW   : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBD   : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
                             SchedWriteVecALU, 0, NoVLX>;
defm PSUBQ   : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
                             SchedWriteVecALU, 0, NoVLX>;
defm PSUBSB  : PDI_binop_all<0xE8, "psubsb", ssubsat, v16i8, v32i8,
                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBSW  : PDI_binop_all<0xE9, "psubsw", ssubsat, v8i16, v16i16,
                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", usubsat, v16i8, v32i8,
                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", usubsat, v8i16, v16i16,
                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
// Min/max: unsigned bytes and signed words.
defm PMINUB  : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMINSW  : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMAXUB  : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMAXSW  : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
// Averages (X86avg) and the X86pmuludq multiply.
defm PAVGB   : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PAVGW   : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64,
                             SchedWriteVecIMul, 1, NoVLX>;
3481
// PMADDWD (opcode 0xF5): X86vpmaddwd with v8i16 sources and a v4i32 result
// (v16i16 -> v8i32 for the 256-bit form). The VEX forms pass Is2Addr = 0 and
// fold memory operands with `load`; the legacy form ties $src1 to $dst and
// folds with `memop`.
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
                              load, i128mem, SchedWriteVecIMul.XMM, 0>,
                              VEX_4V, VEX_WIG;

let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
                               VR256, load, i256mem, SchedWriteVecIMul.YMM,
                               0>, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
                             memop, i128mem, SchedWriteVecIMul.XMM>;

// PSADBW (opcode 0xF6): X86psadbw with v16i8 sources and a v2i64 result
// (v32i8 -> v4i64 for the 256-bit form). Same VEX/legacy split as above.
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
                             load, i128mem, SchedWritePSADBW.XMM, 0>,
                             VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
                             load, i256mem, SchedWritePSADBW.YMM, 0>,
                             VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in
defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
                            memop, i128mem, SchedWritePSADBW.XMM>;
3506
3507//===---------------------------------------------------------------------===//
3508// SSE2 - Packed Integer Logical Instructions
3509//===---------------------------------------------------------------------===//
3510
/// PDI_binop_rmi - Binary operator with register, memory and immediate forms
/// for the second operand.
///
///   rr - second operand in a VR128 register (opcode `opc`, node OpNode)
///   rm - second operand loaded from i128mem via ld_frag (opcode `opc`)
///   ri - second operand is an 8-bit immediate (opcode `opc2`, format
///        ImmForm, node OpNode2, scheduled with schedImm)
///
/// The register and memory second operand is always 128 bits wide, whatever
/// RC is. Is2Addr picks the two-operand assembly string when set and the
/// three-operand string otherwise.
multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
                         string OpcodeStr, SDNode OpNode,
                         SDNode OpNode2, RegisterClass RC,
                         X86FoldableSchedWrite sched,
                         X86FoldableSchedWrite schedImm,
                         ValueType DstVT, ValueType SrcVT,
                         PatFrag ld_frag, bit Is2Addr = 1> {
  // src2 is always 128-bit
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, VR128:$src2),
               !if(Is2Addr,
                   OpcodeStr#"\t{$src2, $dst|$dst, $src2}",
                   OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set RC:$dst,
                     (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))]>,
           Sched<[sched]>;

  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, i128mem:$src2),
               !if(Is2Addr,
                   OpcodeStr#"\t{$src2, $dst|$dst, $src2}",
                   OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set RC:$dst,
                     (DstVT (OpNode RC:$src1,
                                    (SrcVT (ld_frag addr:$src2)))))]>,
           Sched<[sched.Folded, sched.ReadAfterFold]>;

  def ri : PDIi8<opc2, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
                 !if(Is2Addr,
                     OpcodeStr#"\t{$src2, $dst|$dst, $src2}",
                     OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set RC:$dst,
                       (DstVT (OpNode2 RC:$src1, (i8 timm:$src2))))]>,
           Sched<[schedImm]>;
}
3542
/// PDI_binop_rmi_all - Instantiates PDI_binop_rmi three times:
///   V#NAME   - 128-bit VEX form, predicates [HasAVX, prd]
///   V#NAME#Y - 256-bit VEX form, predicates [HasAVX2, prd]
///   NAME     - 128-bit legacy form with $src1 tied to $dst
/// The VEX forms prefix the mnemonic with "v", use the three-operand
/// assembly string (Is2Addr = 0) and fold memory operands with `load`; the
/// legacy form folds with `memop`. SrcVT is shared by all three.
multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
                             string OpcodeStr, SDNode OpNode,
                             SDNode OpNode2, ValueType DstVT128,
                             ValueType DstVT256, ValueType SrcVT,
                             X86SchedWriteWidths sched,
                             X86SchedWriteWidths schedImm, Predicate prd> {
  let Predicates = [HasAVX, prd] in
  defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, "v"#OpcodeStr,
                              OpNode, OpNode2, VR128,
                              sched.XMM, schedImm.XMM,
                              DstVT128, SrcVT, load, 0>,
                VEX_4V, VEX_WIG;

  let Predicates = [HasAVX2, prd] in
  defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, "v"#OpcodeStr,
                                OpNode, OpNode2, VR256,
                                sched.YMM, schedImm.YMM,
                                DstVT256, SrcVT, load, 0>,
                  VEX_4V, VEX_L, VEX_WIG;

  let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr,
                            OpNode, OpNode2, VR128,
                            sched.XMM, schedImm.XMM,
                            DstVT128, SrcVT, memop>;
}
3563
/// PDI_binop_ri - Binary operator that only has a register + 8-bit immediate
/// form. Emits a single `ri` record that applies OpNode to $src1 and the
/// immediate, producing a value of type VT. Is2Addr picks the two-operand
/// assembly string when set and the three-operand string otherwise.
multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
                        SDNode OpNode, RegisterClass RC, ValueType VT,
                        X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
                 !if(Is2Addr,
                     OpcodeStr#"\t{$src2, $dst|$dst, $src2}",
                     OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set RC:$dst, (VT (OpNode RC:$src1, (i8 timm:$src2))))]>,
           Sched<[sched]>;
}
3574
/// PDI_binop_ri_all - Instantiates PDI_binop_ri three times for one mnemonic:
///   V#NAME   - VR128 / v16i8 under [HasAVX, NoVLX_Or_NoBWI], "v"-prefixed
///              mnemonic, Is2Addr = 0, tagged VEX_4V and VEX_WIG.
///   V#NAME#Y - VR256 / v32i8 under [HasAVX2, NoVLX_Or_NoBWI], plus VEX_L.
///   NAME     - VR128 / v16i8 with "$src1 = $dst", unprefixed mnemonic and
///              the default Is2Addr = 1.
/// The value types are fixed to byte vectors here rather than parameterized.
3575multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr,
3576                            SDNode OpNode, X86SchedWriteWidths sched> {
3577let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
3578  defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
3579                             VR128, v16i8, sched.XMM, 0>, VEX_4V, VEX_WIG;
3580let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
3581  defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
3582                               VR256, v32i8, sched.YMM, 0>,
3583                               VEX_4V, VEX_L, VEX_WIG;
3584let Constraints = "$src1 = $dst" in
3585  defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8,
3586                           sched.XMM>;
3587}
3588
// Packed shift instantiations, all in the SSEPackedInt execution domain.
// For each PDI_binop_rmi_all defm the arguments are, in order: opcode,
// immediate-form opcode, immediate-form ModRM format (MRM6r for psll*,
// MRM2r for psrl*, MRM4r for psra*), mnemonic, the variable-count SDNode
// and its immediate counterpart, the 128-bit and 256-bit result types, the
// source type, two scheduling classes and the gating predicate.
// Word-element shifts are gated on NoVLX_Or_NoBWI; dword/qword ones on NoVLX.
3589let ExeDomain = SSEPackedInt in {
  // Logical left shifts (X86vshl / X86vshli).
3590  defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
3591                                 v8i16, v16i16, v8i16, SchedWriteVecShift,
3592                                 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
3593  defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
3594                                 v4i32, v8i32, v4i32, SchedWriteVecShift,
3595                                 SchedWriteVecShiftImm, NoVLX>;
3596  defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
3597                                 v2i64, v4i64, v2i64, SchedWriteVecShift,
3598                                 SchedWriteVecShiftImm, NoVLX>;
3599
  // Logical right shifts (X86vsrl / X86vsrli).
3600  defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
3601                                 v8i16, v16i16, v8i16, SchedWriteVecShift,
3602                                 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
3603  defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
3604                                 v4i32, v8i32, v4i32, SchedWriteVecShift,
3605                                 SchedWriteVecShiftImm, NoVLX>;
3606  defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
3607                                 v2i64, v4i64, v2i64, SchedWriteVecShift,
3608                                 SchedWriteVecShiftImm, NoVLX>;
3609
  // Arithmetic right shifts (X86vsra / X86vsrai); only word and dword
  // variants are instantiated here.
3610  defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
3611                                 v8i16, v16i16, v8i16, SchedWriteVecShift,
3612                                 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
3613  defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
3614                                 v4i32, v8i32, v4i32, SchedWriteVecShift,
3615                                 SchedWriteVecShiftImm, NoVLX>;
3616
  // Immediate-only forms. Both share opcode 0x73 and are distinguished by
  // the ModRM format (MRM7r vs MRM3r); they use the shuffle scheduling class.
3617  defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq,
3618                                 SchedWriteShuffle>;
3619  defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq,
3620                                 SchedWriteShuffle>;
3621} // ExeDomain = SSEPackedInt
3622
3623//===---------------------------------------------------------------------===//
3624// SSE2 - Packed Integer Comparison Instructions
3625//===---------------------------------------------------------------------===//
3626
// Packed integer compares for byte, word and dword elements, each listing
// its 128-bit and 256-bit vector types. All use SchedWriteVecALU and
// TruePredicate. The 1/0 argument differs between the equality and the
// greater-than groups; presumably it is the commutable flag of
// PDI_binop_all (defined outside this chunk) -- TODO confirm.
3627defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
3628                             SchedWriteVecALU, 1, TruePredicate>;
3629defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
3630                             SchedWriteVecALU, 1, TruePredicate>;
3631defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
3632                             SchedWriteVecALU, 1, TruePredicate>;
3633defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
3634                             SchedWriteVecALU, 0, TruePredicate>;
3635defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
3636                             SchedWriteVecALU, 0, TruePredicate>;
3637defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
3638                             SchedWriteVecALU, 0, TruePredicate>;
3639
3640//===---------------------------------------------------------------------===//
3641// SSE2 - Packed Integer Shuffle Instructions
3642//===---------------------------------------------------------------------===//
3643
3644let ExeDomain = SSEPackedInt in {
/// sse2_pshuffle - Defines six records for a one-source shuffle with an 8-bit
/// immediate control, all using opcode 0x70:
///   V#NAME#ri / V#NAME#mi   - VR128, under [HasAVX, prd], VEX + VEX_WIG.
///   V#NAME#Yri / V#NAME#Ymi - VR256, under [HasAVX2, prd], VEX + VEX_L +
///                             VEX_WIG.
///   ri / mi                 - VR128, under [UseSSE2].
/// The VEX forms prepend "v" to OpcodeStr and fold memory with `load`; the
/// SSE form folds memory with `memop`. No PD/XS/XD prefix class is attached
/// inside the multiclass. vt128/vt256 are the result types, `sched` supplies
/// the .XMM/.YMM scheduling entries (".Folded" for the memory forms).
3645multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
3646                         SDNode OpNode, X86SchedWriteWidths sched,
3647                         Predicate prd> {
// 128-bit VEX-encoded forms.
3648let Predicates = [HasAVX, prd] in {
3649  def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
3650                      (ins VR128:$src1, u8imm:$src2),
3651                      !strconcat("v", OpcodeStr,
3652                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3653                      [(set VR128:$dst,
3654                        (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
3655                      VEX, Sched<[sched.XMM]>, VEX_WIG;
3656  def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
3657                      (ins i128mem:$src1, u8imm:$src2),
3658                      !strconcat("v", OpcodeStr,
3659                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3660                     [(set VR128:$dst,
3661                       (vt128 (OpNode (load addr:$src1),
3662                        (i8 timm:$src2))))]>, VEX,
3663                  Sched<[sched.XMM.Folded]>, VEX_WIG;
3664}
3665
// 256-bit VEX-encoded forms.
3666let Predicates = [HasAVX2, prd] in {
3667  def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
3668                       (ins VR256:$src1, u8imm:$src2),
3669                       !strconcat("v", OpcodeStr,
3670                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3671                       [(set VR256:$dst,
3672                         (vt256 (OpNode VR256:$src1, (i8 timm:$src2))))]>,
3673                       VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
3674  def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
3675                       (ins i256mem:$src1, u8imm:$src2),
3676                       !strconcat("v", OpcodeStr,
3677                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3678                      [(set VR256:$dst,
3679                        (vt256 (OpNode (load addr:$src1),
3680                         (i8 timm:$src2))))]>, VEX, VEX_L,
3681                   Sched<[sched.YMM.Folded]>, VEX_WIG;
3682}
3683
// Non-VEX forms. Unlike the binop multiclasses there is no "$src1 = $dst"
// constraint here: the destination is a separate operand.
3684let Predicates = [UseSSE2] in {
3685  def ri : Ii8<0x70, MRMSrcReg,
3686               (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
3687               !strconcat(OpcodeStr,
3688                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3689               [(set VR128:$dst,
3690                 (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
3691               Sched<[sched.XMM]>;
3692  def mi : Ii8<0x70, MRMSrcMem,
3693               (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
3694               !strconcat(OpcodeStr,
3695                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3696               [(set VR128:$dst,
3697                 (vt128 (OpNode (memop addr:$src1),
3698                        (i8 timm:$src2))))]>,
3699               Sched<[sched.XMM.Folded]>;
3700}
3701}
3702} // ExeDomain = SSEPackedInt
3703
// Shuffle instantiations. The three share the sse2_pshuffle records and
// differ in SDNode, element types, AVX predicate and the trailing prefix
// class (PD, XS, XD) applied to every record the defm produces.
3704defm PSHUFD  : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd,
3705                             SchedWriteShuffle, NoVLX>, PD;
3706defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw,
3707                             SchedWriteShuffle, NoVLX_Or_NoBWI>, XS;
3708defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw,
3709                             SchedWriteShuffle, NoVLX_Or_NoBWI>, XD;
3710
3711//===---------------------------------------------------------------------===//
3712// Packed Integer Pack Instructions (SSE & AVX)
3713//===---------------------------------------------------------------------===//
3714
3715let ExeDomain = SSEPackedInt in {
/// sse2_pack - Defines `rr` and `rm` PDI records for a two-source pack
/// operation. Both sources are RC registers (the second may be a folded
/// x86memop load via ld_frag); $src1 is typed as ArgVT and the result as
/// OutVT. Is2Addr selects the two-operand asm string; 0 selects the
/// three-operand one that also prints $src1.
3716multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
3717                     ValueType ArgVT, SDNode OpNode, RegisterClass RC,
3718                     X86MemOperand x86memop, X86FoldableSchedWrite sched,
3719                     PatFrag ld_frag, bit Is2Addr = 1> {
3720  def rr : PDI<opc, MRMSrcReg,
3721               (outs RC:$dst), (ins RC:$src1, RC:$src2),
3722               !if(Is2Addr,
3723                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3724                   !strconcat(OpcodeStr,
3725                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3726               [(set RC:$dst,
3727                     (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
3728               Sched<[sched]>;
3729  def rm : PDI<opc, MRMSrcMem,
3730               (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
3731               !if(Is2Addr,
3732                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3733                   !strconcat(OpcodeStr,
3734                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3735               [(set RC:$dst,
3736                     (OutVT (OpNode (ArgVT RC:$src1),
3737                                    (ld_frag addr:$src2))))]>,
3738               Sched<[sched.Folded, sched.ReadAfterFold]>;
3739}
3740
/// sse4_pack - Same parameters, operands, asm strings and patterns as
/// sse2_pack; the only difference is that the records derive from SS48I
/// instead of PDI.
3741multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
3742                     ValueType ArgVT, SDNode OpNode, RegisterClass RC,
3743                     X86MemOperand x86memop, X86FoldableSchedWrite sched,
3744                     PatFrag ld_frag, bit Is2Addr = 1> {
3745  def rr : SS48I<opc, MRMSrcReg,
3746                 (outs RC:$dst), (ins RC:$src1, RC:$src2),
3747                 !if(Is2Addr,
3748                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3749                     !strconcat(OpcodeStr,
3750                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3751                 [(set RC:$dst,
3752                       (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
3753                 Sched<[sched]>;
3754  def rm : SS48I<opc, MRMSrcMem,
3755                 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
3756                 !if(Is2Addr,
3757                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3758                     !strconcat(OpcodeStr,
3759                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3760                 [(set RC:$dst,
3761                       (OutVT (OpNode (ArgVT RC:$src1),
3762                                      (ld_frag addr:$src2))))]>,
3763                 Sched<[sched.Folded, sched.ReadAfterFold]>;
3764}
3765
// 128-bit VEX-encoded packs: three-operand asm (Is2Addr = 0), `load`
// fragment. The sse4_pack-based VPACKUSDW is tagged VEX_4V only, without
// the VEX_WIG that the sse2_pack-based records carry.
3766let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
3767  defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128,
3768                             i128mem, SchedWriteShuffle.XMM, load, 0>,
3769                             VEX_4V, VEX_WIG;
3770  defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128,
3771                             i128mem, SchedWriteShuffle.XMM, load, 0>,
3772                             VEX_4V, VEX_WIG;
3773
3774  defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128,
3775                             i128mem, SchedWriteShuffle.XMM, load, 0>,
3776                             VEX_4V, VEX_WIG;
3777  defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128,
3778                             i128mem, SchedWriteShuffle.XMM, load, 0>,
3779                             VEX_4V;
3780}
3781
// 256-bit VEX-encoded packs: same opcodes and mnemonics as above, with
// VR256 / i256mem, doubled element counts and VEX_L.
3782let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
3783  defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256,
3784                              i256mem, SchedWriteShuffle.YMM, load, 0>,
3785                              VEX_4V, VEX_L, VEX_WIG;
3786  defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256,
3787                              i256mem, SchedWriteShuffle.YMM, load, 0>,
3788                              VEX_4V, VEX_L, VEX_WIG;
3789
3790  defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256,
3791                              i256mem, SchedWriteShuffle.YMM, load, 0>,
3792                              VEX_4V, VEX_L, VEX_WIG;
3793  defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256,
3794                              i256mem, SchedWriteShuffle.YMM, load, 0>,
3795                              VEX_4V, VEX_L;
3796}
3797
// Non-VEX packs: destination tied to $src1, `memop` fragment, default
// two-operand asm. No explicit Predicates are set in this `let`.
3798let Constraints = "$src1 = $dst" in {
3799  defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128,
3800                            i128mem, SchedWriteShuffle.XMM, memop>;
3801  defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128,
3802                            i128mem, SchedWriteShuffle.XMM, memop>;
3803
3804  defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128,
3805                            i128mem, SchedWriteShuffle.XMM, memop>;
3806
3807  defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128,
3808                            i128mem, SchedWriteShuffle.XMM, memop>;
3809}
3810} // ExeDomain = SSEPackedInt
3811
3812//===---------------------------------------------------------------------===//
3813// SSE2 - Packed Integer Unpack Instructions
3814//===---------------------------------------------------------------------===//
3815
3816let ExeDomain = SSEPackedInt in {
/// sse2_unpack - Defines `rr` and `rm` PDI records for a two-source unpack
/// (interleave) node. Both sources and the result use register class RC and
/// value type vt; the `rm` form folds the second source from x86memop via
/// ld_frag. Is2Addr selects the two-operand asm string; 0 selects the
/// three-operand one that also prints $src1.
3817multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
3818                       SDNode OpNode, RegisterClass RC, X86MemOperand x86memop,
3819                       X86FoldableSchedWrite sched, PatFrag ld_frag,
3820                       bit Is2Addr = 1> {
3821  def rr : PDI<opc, MRMSrcReg,
3822      (outs RC:$dst), (ins RC:$src1, RC:$src2),
3823      !if(Is2Addr,
3824          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
3825          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3826      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
3827      Sched<[sched]>;
3828  def rm : PDI<opc, MRMSrcMem,
3829      (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
3830      !if(Is2Addr,
3831          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
3832          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3833      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
3834      Sched<[sched.Folded, sched.ReadAfterFold]>;
3835}
3836
// 128-bit VEX-encoded byte/word unpacks (gated on NoVLX_Or_NoBWI).
3837let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
3838  defm VPUNPCKLBW  : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128,
3839                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
3840                                 VEX_4V, VEX_WIG;
3841  defm VPUNPCKLWD  : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128,
3842                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
3843                                 VEX_4V, VEX_WIG;
3844  defm VPUNPCKHBW  : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128,
3845                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
3846                                 VEX_4V, VEX_WIG;
3847  defm VPUNPCKHWD  : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128,
3848                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
3849                                 VEX_4V, VEX_WIG;
3850}
3851
// 128-bit VEX-encoded dword/qword unpacks (gated on NoVLX only).
3852let Predicates = [HasAVX, NoVLX] in {
3853  defm VPUNPCKLDQ  : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128,
3854                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
3855                                 VEX_4V, VEX_WIG;
3856  defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128,
3857                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
3858                                 VEX_4V, VEX_WIG;
3859  defm VPUNPCKHDQ  : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128,
3860                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
3861                                 VEX_4V, VEX_WIG;
3862  defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128,
3863                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
3864                                 VEX_4V, VEX_WIG;
3865}
3866
// 256-bit VEX-encoded byte/word unpacks: same opcodes, VR256 / i256mem,
// plus VEX_L.
3867let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
3868  defm VPUNPCKLBWY  : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256,
3869                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
3870                                  VEX_4V, VEX_L, VEX_WIG;
3871  defm VPUNPCKLWDY  : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256,
3872                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
3873                                  VEX_4V, VEX_L, VEX_WIG;
3874  defm VPUNPCKHBWY  : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256,
3875                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
3876                                  VEX_4V, VEX_L, VEX_WIG;
3877  defm VPUNPCKHWDY  : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256,
3878                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
3879                                  VEX_4V, VEX_L, VEX_WIG;
3880}
3881
// 256-bit VEX-encoded dword/qword unpacks.
3882let Predicates = [HasAVX2, NoVLX] in {
3883  defm VPUNPCKLDQY  : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256,
3884                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
3885                                  VEX_4V, VEX_L, VEX_WIG;
3886  defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256,
3887                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
3888                                  VEX_4V, VEX_L, VEX_WIG;
3889  defm VPUNPCKHDQY  : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256,
3890                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
3891                                  VEX_4V, VEX_L, VEX_WIG;
3892  defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256,
3893                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
3894                                  VEX_4V, VEX_L, VEX_WIG;
3895}
3896
// Non-VEX unpacks: destination tied to $src1, `memop` fragment, default
// two-operand asm. No explicit Predicates are set in this `let`.
3897let Constraints = "$src1 = $dst" in {
3898  defm PUNPCKLBW  : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128,
3899                                i128mem, SchedWriteShuffle.XMM, memop>;
3900  defm PUNPCKLWD  : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128,
3901                                i128mem, SchedWriteShuffle.XMM, memop>;
3902  defm PUNPCKLDQ  : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128,
3903                                i128mem, SchedWriteShuffle.XMM, memop>;
3904  defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128,
3905                                i128mem, SchedWriteShuffle.XMM, memop>;
3906
3907  defm PUNPCKHBW  : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128,
3908                                i128mem, SchedWriteShuffle.XMM, memop>;
3909  defm PUNPCKHWD  : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128,
3910                                i128mem, SchedWriteShuffle.XMM, memop>;
3911  defm PUNPCKHDQ  : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128,
3912                                i128mem, SchedWriteShuffle.XMM, memop>;
3913  defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128,
3914                                i128mem, SchedWriteShuffle.XMM, memop>;
3915}
3916} // ExeDomain = SSEPackedInt
3917
3918//===---------------------------------------------------------------------===//
3919// SSE2 - Packed Integer Extract and Insert
3920//===---------------------------------------------------------------------===//
3921
3922let ExeDomain = SSEPackedInt in {
/// sse2_pinsrw - Defines `rr` and `rm` records (opcode 0xC4) matching
/// X86pinsrw: a value from a GR32orGR64 register, or an extending 16-bit
/// load from i16mem, is inserted into VR128:$src1 at the position given by
/// the 8-bit immediate $src3.
/// Unlike the other multiclasses in this file the mnemonic is not a
/// parameter: Is2Addr picks between the literal "pinsrw" (two-address) and
/// "vpinsrw" (three-address) asm strings.
/// NOTE(review): these patterns match `imm:$src3`, whereas the shift and
/// shuffle patterns in this file match `(i8 timm:...)` -- confirm this is
/// intended.
3923multiclass sse2_pinsrw<bit Is2Addr = 1> {
3924  def rr : Ii8<0xC4, MRMSrcReg,
3925       (outs VR128:$dst), (ins VR128:$src1,
3926        GR32orGR64:$src2, u8imm:$src3),
3927       !if(Is2Addr,
3928           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
3929           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
3930       [(set VR128:$dst,
3931         (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
3932       Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
3933  def rm : Ii8<0xC4, MRMSrcMem,
3934                      (outs VR128:$dst), (ins VR128:$src1,
3935                       i16mem:$src2, u8imm:$src3),
3936       !if(Is2Addr,
3937           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
3938           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
3939       [(set VR128:$dst,
3940         (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
3941                    imm:$src3))]>,
3942       Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
3943}
3944
3945// Extract
// Both records use opcode 0xC5 and match X86pextrw on a v8i16 source,
// writing a GR32orGR64 destination. The brace-less `let Predicates` below
// applies only to VPEXTRWrr; PEXTRWrr sets no Predicates here.
3946let Predicates = [HasAVX, NoBWI] in
3947def VPEXTRWrr : Ii8<0xC5, MRMSrcReg,
3948                    (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
3949                    "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
3950                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
3951                                            imm:$src2))]>,
3952                PD, VEX, VEX_WIG, Sched<[WriteVecExtract]>;
3953def PEXTRWrr : PDIi8<0xC5, MRMSrcReg,
3954                    (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
3955                    "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
3956                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
3957                                            imm:$src2))]>,
3958               Sched<[WriteVecExtract]>;
3959
3960// Insert
// VEX form: three-address asm (Is2Addr = 0), tagged PD, VEX_4V, VEX_WIG.
3961let Predicates = [HasAVX, NoBWI] in
3962defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V, VEX_WIG;
3963
// SSE2 form: two-address asm with the destination tied to $src1.
3964let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
3965defm PINSRW : sse2_pinsrw, PD;
3966
3967} // ExeDomain = SSEPackedInt
3968
3969//===---------------------------------------------------------------------===//
3970// SSE2 - Packed Mask Creation
3971//===---------------------------------------------------------------------===//
3972
// Byte-mask extraction: each record (opcode 0xD7) matches X86movmsk on a
// byte vector and writes the result to a GR32orGR64 register.
// NOTE(review): the two VPDI records pass "pmovmskb" without a "v" prefix;
// presumably the VPDI class prepends it -- confirm in X86InstrFormats.td.
3973let ExeDomain = SSEPackedInt in {
3974
// 128-bit VEX form (v16i8 source). No Predicates are set explicitly here.
3975def VPMOVMSKBrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
3976           (ins VR128:$src),
3977           "pmovmskb\t{$src, $dst|$dst, $src}",
3978           [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
3979           Sched<[WriteVecMOVMSK]>, VEX, VEX_WIG;
3980
// 256-bit VEX form (v32i8 source), gated on HasAVX2 and tagged VEX_L.
3981let Predicates = [HasAVX2] in {
3982def VPMOVMSKBYrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
3983           (ins VR256:$src),
3984           "pmovmskb\t{$src, $dst|$dst, $src}",
3985           [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>,
3986           Sched<[WriteVecMOVMSKY]>, VEX, VEX_L, VEX_WIG;
3987}
3988
// Non-VEX form (v16i8 source).
3989def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
3990           "pmovmskb\t{$src, $dst|$dst, $src}",
3991           [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
3992           Sched<[WriteVecMOVMSK]>;
3993
3994} // ExeDomain = SSEPackedInt
3995
3996//===---------------------------------------------------------------------===//
3997// SSE2 - Conditional Store
3998//===---------------------------------------------------------------------===//
3999
// Masked store records (opcode 0xF7) matching int_x86_sse2_maskmov_dqu.
// The instruction has no explicit outputs or memory operand: the third
// intrinsic argument is the fixed register EDI or RDI, which is also listed
// in `Uses`. Four records cover {VEX, non-VEX} x {32-bit, 64-bit mode}; the
// mode predicate decides whether EDI or RDI is used.
4000let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
// VEX forms (HasAVX).
4001let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in
4002def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
4003           (ins VR128:$src, VR128:$mask),
4004           "maskmovdqu\t{$mask, $src|$src, $mask}",
4005           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
4006           VEX, VEX_WIG;
4007let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
4008def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
4009           (ins VR128:$src, VR128:$mask),
4010           "maskmovdqu\t{$mask, $src|$src, $mask}",
4011           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>,
4012           VEX, VEX_WIG;
4013
// Non-VEX forms (UseSSE2).
4014let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in
4015def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
4016           "maskmovdqu\t{$mask, $src|$src, $mask}",
4017           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>;
4018let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
4019def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
4020           "maskmovdqu\t{$mask, $src|$src, $mask}",
4021           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;
4022
4023} // ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR]
4024
4025//===---------------------------------------------------------------------===//
4026// SSE2 - Move Doubleword/Quadword
4027//===---------------------------------------------------------------------===//
4028
4029//===---------------------------------------------------------------------===//
4030// Move Int Doubleword to Packed Double Int
4031//
// Moves from a general-purpose register or memory into a vector register
// (opcode 0x6E). The VEX records come first, followed by their non-VEX
// twins; each pair has identical operands and patterns.
//   *DI2PDIrr / rm : GR32 or i32 load -> v4i32 via scalar_to_vector ("movd").
//   *64toPQIrr     : GR64 -> v2i64 via scalar_to_vector ("movq").
//   *64toPQIrm     : i64mem form with an empty pattern; marked
//                    isCodeGenOnly + ForceDisassemble, with mayLoad and
//                    hasSideEffects set by hand since no pattern implies them.
//   *64toSDrr      : GR64 -> FR64 bitconvert, isCodeGenOnly.
4032let ExeDomain = SSEPackedInt in {
4033def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
4034                        "movd\t{$src, $dst|$dst, $src}",
4035                        [(set VR128:$dst,
4036                          (v4i32 (scalar_to_vector GR32:$src)))]>,
4037                          VEX, Sched<[WriteVecMoveFromGpr]>;
4038def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
4039                        "movd\t{$src, $dst|$dst, $src}",
4040                        [(set VR128:$dst,
4041                          (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
4042                        VEX, Sched<[WriteVecLoad]>;
4043def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
4044                          "movq\t{$src, $dst|$dst, $src}",
4045                          [(set VR128:$dst,
4046                            (v2i64 (scalar_to_vector GR64:$src)))]>,
4047                          VEX, Sched<[WriteVecMoveFromGpr]>;
4048let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
4049def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4050                          "movq\t{$src, $dst|$dst, $src}", []>,
4051                          VEX, Sched<[WriteVecLoad]>;
4052let isCodeGenOnly = 1 in
4053def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
4054                         "movq\t{$src, $dst|$dst, $src}",
4055                         [(set FR64:$dst, (bitconvert GR64:$src))]>,
4056                         VEX, Sched<[WriteVecMoveFromGpr]>;
4057
// Non-VEX counterparts of the records above.
4058def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
4059                      "movd\t{$src, $dst|$dst, $src}",
4060                      [(set VR128:$dst,
4061                        (v4i32 (scalar_to_vector GR32:$src)))]>,
4062                      Sched<[WriteVecMoveFromGpr]>;
4063def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
4064                      "movd\t{$src, $dst|$dst, $src}",
4065                      [(set VR128:$dst,
4066                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
4067                      Sched<[WriteVecLoad]>;
4068def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
4069                        "movq\t{$src, $dst|$dst, $src}",
4070                        [(set VR128:$dst,
4071                          (v2i64 (scalar_to_vector GR64:$src)))]>,
4072                        Sched<[WriteVecMoveFromGpr]>;
4073let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
4074def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4075                        "movq\t{$src, $dst|$dst, $src}", []>,
4076                        Sched<[WriteVecLoad]>;
4077let isCodeGenOnly = 1 in
4078def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
4079                       "movq\t{$src, $dst|$dst, $src}",
4080                       [(set FR64:$dst, (bitconvert GR64:$src))]>,
4081                       Sched<[WriteVecMoveFromGpr]>;
4082} // ExeDomain = SSEPackedInt
4083
4084//===---------------------------------------------------------------------===//
4085// Move Int Doubleword to Single Scalar
4086//
// GR32 -> FR32 bitcast selected as "movd" (opcode 0x6E). Both records are
// isCodeGenOnly; the VEX and non-VEX versions differ only in base class
// (VS2I vs S2I) and the VEX tag.
4087let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
4088  def VMOVDI2SSrr  : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
4089                        "movd\t{$src, $dst|$dst, $src}",
4090                        [(set FR32:$dst, (bitconvert GR32:$src))]>,
4091                        VEX, Sched<[WriteVecMoveFromGpr]>;
4092
4093  def MOVDI2SSrr  : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
4094                        "movd\t{$src, $dst|$dst, $src}",
4095                        [(set FR32:$dst, (bitconvert GR32:$src))]>,
4096                        Sched<[WriteVecMoveFromGpr]>;
4097
4098} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4099
4100//===---------------------------------------------------------------------===//
4101// Move Packed Doubleword Int first element to Doubleword Int (reg or mem)
4102//
// "movd" from a vector register (opcode 0x7E, MRMDest* formats): element 0
// of the v4i32 source is extracted and either written to a GR32 register
// (rr) or stored to an i32mem location (mr). VEX records first, then the
// non-VEX twins with identical operands and patterns.
4103let ExeDomain = SSEPackedInt in {
4104def VMOVPDI2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
4105                         "movd\t{$src, $dst|$dst, $src}",
4106                         [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
4107                                          (iPTR 0)))]>, VEX,
4108                         Sched<[WriteVecMoveToGpr]>;
4109def VMOVPDI2DImr  : VS2I<0x7E, MRMDestMem, (outs),
4110                         (ins i32mem:$dst, VR128:$src),
4111                         "movd\t{$src, $dst|$dst, $src}",
4112                         [(store (i32 (extractelt (v4i32 VR128:$src),
4113                                       (iPTR 0))), addr:$dst)]>,
4114                         VEX, Sched<[WriteVecStore]>;
4115def MOVPDI2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
4116                       "movd\t{$src, $dst|$dst, $src}",
4117                       [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
4118                                        (iPTR 0)))]>,
4119                   Sched<[WriteVecMoveToGpr]>;
4120def MOVPDI2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
4121                       "movd\t{$src, $dst|$dst, $src}",
4122                       [(store (i32 (extractelt (v4i32 VR128:$src),
4123                                     (iPTR 0))), addr:$dst)]>,
4124                       Sched<[WriteVecStore]>;
4125} // ExeDomain = SSEPackedInt
4126
4127//===---------------------------------------------------------------------===//
4128// Move Packed Quadword Int first element to Quadword Int
4129//
// "movq" from a vector register (opcode 0x7E, MRMDest* formats).
//   *PQIto64rr : element 0 of the v2i64 source -> GR64; both records share
//                the WriteVecMoveToGpr scheduling class via SchedRW.
//   *PQIto64mr : i64mem destination with an empty pattern; marked
//                isCodeGenOnly + ForceDisassemble, with mayStore and
//                hasSideEffects set by hand since no pattern implies them.
4130let ExeDomain = SSEPackedInt in {
4131let SchedRW = [WriteVecMoveToGpr] in {
4132def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
4133                          "movq\t{$src, $dst|$dst, $src}",
4134                          [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
4135                                                        (iPTR 0)))]>,
4136                      VEX;
4137
4138def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
4139                        "movq\t{$src, $dst|$dst, $src}",
4140                        [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
4141                                                         (iPTR 0)))]>;
4142} //SchedRW
4143
4144let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
4145def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs),
4146                          (ins i64mem:$dst, VR128:$src),
4147                          "movq\t{$src, $dst|$dst, $src}", []>,
4148                          VEX, Sched<[WriteVecStore]>;
4149let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
4150def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4151                        "movq\t{$src, $dst|$dst, $src}", []>,
4152                        Sched<[WriteVecStore]>;
4153} // ExeDomain = SSEPackedInt
4154
4155//===---------------------------------------------------------------------===//
4156// Bitcast FR64 <-> GR64
4157//
// Codegen-only "movq" records (opcode 0x7E) whose pattern is a bitconvert from
// an FR64 source to a GR64 destination. Only the FR64 -> GR64 direction is
// defined in this block.
4158let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
4159  def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
4160                           "movq\t{$src, $dst|$dst, $src}",
4161                           [(set GR64:$dst, (bitconvert FR64:$src))]>,
4162                           VEX, Sched<[WriteVecMoveToGpr]>;
4163
4164  def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
4165                         "movq\t{$src, $dst|$dst, $src}",
4166                         [(set GR64:$dst, (bitconvert FR64:$src))]>,
4167                         Sched<[WriteVecMoveToGpr]>;
4168} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4169
4170//===---------------------------------------------------------------------===//
4171// Move Scalar Single to Doubleword Int
4172//
// Codegen-only "movd" records (opcode 0x7E) whose pattern is a bitconvert from
// an FR32 source to a GR32 destination, in VEX and non-VEX variants.
4173let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
4174  def VMOVSS2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
4175                        "movd\t{$src, $dst|$dst, $src}",
4176                        [(set GR32:$dst, (bitconvert FR32:$src))]>,
4177                        VEX, Sched<[WriteVecMoveToGpr]>;
4178  def MOVSS2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
4179                        "movd\t{$src, $dst|$dst, $src}",
4180                        [(set GR32:$dst, (bitconvert FR32:$src))]>,
4181                        Sched<[WriteVecMoveToGpr]>;
4182} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4183
// UseAVX selection patterns: X86vzmovl of a scalar_to_vector GPR and 32-bit
// X86vzload32 loads are mapped onto the VEX move records named below.
4184let Predicates = [UseAVX] in {
4185  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
4186            (VMOVDI2PDIrr GR32:$src)>;
4187
4188  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
4189            (VMOV64toPQIrr GR64:$src)>;
4190
4191  // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
4192  // These instructions also write zeros in the high part of a 256-bit register.
4193  def : Pat<(v4i32 (X86vzload32 addr:$src)),
4194            (VMOVDI2PDIrm addr:$src)>;
  // The v8i32 result reuses the 128-bit load and wraps it in SUBREG_TO_REG on
  // sub_xmm.
4195  def : Pat<(v8i32 (X86vzload32 addr:$src)),
4196            (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
4197}
4198
// UseSSE2 counterparts of the UseAVX patterns: the same X86vzmovl and
// X86vzload32 shapes are mapped onto the non-VEX move records.
4199let Predicates = [UseSSE2] in {
4200  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
4201            (MOVDI2PDIrr GR32:$src)>;
4202
4203  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
4204            (MOV64toPQIrr GR64:$src)>;
4205  def : Pat<(v4i32 (X86vzload32 addr:$src)),
4206            (MOVDI2PDIrm addr:$src)>;
4207}
4208
4209// Before the MC layer of LLVM existed, clang emitted "movd" assembly instead of
4210// "movq" due to a MacOS parsing limitation. In order to parse that old
4211// assembly, we add these aliases.
// "movd" spelled with a 64-bit GPR operand is accepted as an alias of the
// GR64 <-> VR128 "movq" records, in both directions.
// NOTE(review): the trailing 0 presumably keeps the alias from being used when
// printing -- confirm against the InstAlias class definition.
4212def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
4213                (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
4214def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
4215                (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
4216// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
4217def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
4218                (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
4219def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
4220                (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;
4221
4222//===---------------------------------------------------------------------===//
4223// SSE2 - Move Quadword
4224//===---------------------------------------------------------------------===//
4225
4226//===---------------------------------------------------------------------===//
4227// Move Quadword Int to Packed Quadword Int
4228//
4229
// 64-bit load forms of movq/vmovq (opcode 0x7E, XS prefix, MRMSrcMem): the
// pattern is a scalar_to_vector of an i64 load, typed v2i64. The VEX record
// requires UseAVX; the non-VEX record requires UseSSE2.
4230let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in {
4231def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4232                    "vmovq\t{$src, $dst|$dst, $src}",
4233                    [(set VR128:$dst,
4234                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
4235                    VEX, Requires<[UseAVX]>, VEX_WIG;
4236def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4237                    "movq\t{$src, $dst|$dst, $src}",
4238                    [(set VR128:$dst,
4239                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
4240                    XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
4241} // ExeDomain, SchedRW
4242
4243//===---------------------------------------------------------------------===//
4244// Move Packed Quadword Int to Quadword Int
4245//
// 64-bit store forms of movq (opcode 0xD6, MRMDestMem): the pattern stores
// element 0 of the v2i64 source register as an i64 to the memory operand $dst.
4246let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in {
4247def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4248                        "movq\t{$src, $dst|$dst, $src}",
4249                        [(store (i64 (extractelt (v2i64 VR128:$src),
4250                                      (iPTR 0))), addr:$dst)]>,
4251                        VEX, VEX_WIG;
4252def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4253                      "movq\t{$src, $dst|$dst, $src}",
4254                      [(store (i64 (extractelt (v2i64 VR128:$src),
4255                                    (iPTR 0))), addr:$dst)]>;
4256} // ExeDomain, SchedRW
4257
4258// For disassembler only
// Register-to-register form of opcode 0xD6 (MRMDestReg). Both records have an
// empty pattern list and are isCodeGenOnly with ForceDisassemble = 1.
4259let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
4260    SchedRW = [SchedWriteVecLogic.XMM] in {
4261def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
4262                     "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_WIG;
4263def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
4264                      "movq\t{$src, $dst|$dst, $src}", []>;
4265}
4266
// The ".s"-suffixed mnemonics give assembly a way to name the 0xD6
// register-to-register records explicitly.
4267def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
4268                (VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
4269def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
4270                (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
4271
// UseAVX patterns: 64-bit zero-extending loads select VMOVQI2PQIrm (wrapped in
// SUBREG_TO_REG on sub_xmm for the v4i64 result), and X86vextractstore64
// selects the 64-bit store record.
4272let Predicates = [UseAVX] in {
4273  def : Pat<(v2i64 (X86vzload64 addr:$src)),
4274            (VMOVQI2PQIrm addr:$src)>;
4275  def : Pat<(v4i64 (X86vzload64 addr:$src)),
4276            (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
4277
4278  def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
4279            (VMOVPQI2QImr addr:$dst, VR128:$src)>;
4280}
4281
// UseSSE2 patterns: the 64-bit zero-extending load and X86vextractstore64
// select the non-VEX movq load and store records.
4282let Predicates = [UseSSE2] in {
4283  def : Pat<(v2i64 (X86vzload64 addr:$src)), (MOVQI2PQIrm addr:$src)>;
4284
4285  def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
4286            (MOVPQI2QImr addr:$dst, VR128:$src)>;
4287}
4288
4289//===---------------------------------------------------------------------===//
4290// Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
4291// IA32 document. movq xmm1, xmm2 does clear the high bits.
4292//
// Register-to-register movq/vmovq (opcode 0x7E, XS prefix, MRMSrcReg) selected
// for X86vzmovl on v2i64. The VEX record requires UseAVX; the non-VEX record
// requires UseSSE2.
4293let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
4294def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4295                        "vmovq\t{$src, $dst|$dst, $src}",
4296                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
4297                         XS, VEX, Requires<[UseAVX]>, VEX_WIG;
4298def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4299                        "movq\t{$src, $dst|$dst, $src}",
4300                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
4301                        XS, Requires<[UseSSE2]>;
4302} // ExeDomain, SchedRW
4303
// The v2f64 flavour of X86vzmovl reuses the integer movq records, once under
// UseAVX and once under UseSSE2.
4304let Predicates = [UseAVX] in {
4305  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
4306            (VMOVZPQILo2PQIrr VR128:$src)>;
4307}
4308let Predicates = [UseSSE2] in {
4309  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
4310            (MOVZPQILo2PQIrr VR128:$src)>;
4311}
4312
// 256-bit X86vzmovl (v4f64 and v4i64): take the sub_xmm subregister of the
// source, apply VMOVZPQILo2PQIrr to it, and wrap the result back into a
// 256-bit value with SUBREG_TO_REG.
4313let Predicates = [UseAVX] in {
4314  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
4315            (SUBREG_TO_REG (i32 0),
4316             (v2f64 (VMOVZPQILo2PQIrr
4317                     (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
4318             sub_xmm)>;
4319  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
4320            (SUBREG_TO_REG (i32 0),
4321             (v2i64 (VMOVZPQILo2PQIrr
4322                     (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
4323             sub_xmm)>;
4324}
4325
4326//===---------------------------------------------------------------------===//
4327// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
4328//===---------------------------------------------------------------------===//
4329
/// sse3_replicate_sfp - Unary single-FP replicate op with a register (rr) and
/// a memory (rm) form.
///   op        - opcode byte passed to S3SI.
///   OpNode    - SDNode applied to the source in both patterns.
///   OpcodeStr - mnemonic prepended to the operand string.
///   vt        - value type used to type the rr pattern result.
///   RC        - register class of $dst (and of $src in the rr form).
///   mem_frag  - load fragment applied to addr:$src in the rm form.
///   x86memop  - memory operand type of $src in the rm form.
///   sched     - scheduling class; the rm form uses sched.Folded.
4330multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
4331                              ValueType vt, RegisterClass RC, PatFrag mem_frag,
4332                              X86MemOperand x86memop, X86FoldableSchedWrite sched> {
4333def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
4334                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4335                      [(set RC:$dst, (vt (OpNode RC:$src)))]>,
4336                      Sched<[sched]>;
4337def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
4338                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4339                      [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>,
4340                      Sched<[sched.Folded]>;
4341}
4342
// Instantiations of sse3_replicate_sfp: opcode 0x16 is MOVSHDUP, 0x12 is
// MOVSLDUP. The VEX forms (128-bit and, with VEX_L, 256-bit) are gated on
// [HasAVX, NoVLX] and use the plain load fragments.
4343let Predicates = [HasAVX, NoVLX] in {
4344  defm VMOVSHDUP  : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
4345                                       v4f32, VR128, loadv4f32, f128mem,
4346                                       SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
4347  defm VMOVSLDUP  : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
4348                                       v4f32, VR128, loadv4f32, f128mem,
4349                                       SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
4350  defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
4351                                       v8f32, VR256, loadv8f32, f256mem,
4352                                       SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
4353  defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
4354                                       v8f32, VR256, loadv8f32, f256mem,
4355                                       SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
4356}
// The non-VEX forms use the memopv4f32 fragment for the memory operand.
4357defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
4358                                   memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
4359defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
4360                                   memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
4361
// Integer-typed (v4i32 / v8i32) X86Movshdup and X86Movsldup nodes select the
// same VEX records as the FP-typed ones, in register and load forms.
4362let Predicates = [HasAVX, NoVLX] in {
4363  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
4364            (VMOVSHDUPrr VR128:$src)>;
4365  def : Pat<(v4i32 (X86Movshdup (load addr:$src))),
4366            (VMOVSHDUPrm addr:$src)>;
4367  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
4368            (VMOVSLDUPrr VR128:$src)>;
4369  def : Pat<(v4i32 (X86Movsldup (load addr:$src))),
4370            (VMOVSLDUPrm addr:$src)>;
4371  def : Pat<(v8i32 (X86Movshdup VR256:$src)),
4372            (VMOVSHDUPYrr VR256:$src)>;
4373  def : Pat<(v8i32 (X86Movshdup (load addr:$src))),
4374            (VMOVSHDUPYrm addr:$src)>;
4375  def : Pat<(v8i32 (X86Movsldup VR256:$src)),
4376            (VMOVSLDUPYrr VR256:$src)>;
4377  def : Pat<(v8i32 (X86Movsldup (load addr:$src))),
4378            (VMOVSLDUPYrm addr:$src)>;
4379}
4380
// UseSSE3 counterparts for v4i32: the memory forms match through memop rather
// than load.
4381let Predicates = [UseSSE3] in {
4382  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
4383            (MOVSHDUPrr VR128:$src)>;
4384  def : Pat<(v4i32 (X86Movshdup (memop addr:$src))),
4385            (MOVSHDUPrm addr:$src)>;
4386  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
4387            (MOVSLDUPrr VR128:$src)>;
4388  def : Pat<(v4i32 (X86Movsldup (memop addr:$src))),
4389            (MOVSLDUPrm addr:$src)>;
4390}
4391
4392//===---------------------------------------------------------------------===//
4393// SSE3 - Replicate Double FP - MOVDDUP
4394//===---------------------------------------------------------------------===//
4395
/// sse3_replicate_dfp - 128-bit X86Movddup (opcode 0x12) in register and
/// memory forms.
///   OpcodeStr - mnemonic prepended to the operand string.
///   sched     - scheduling widths; rr uses sched.XMM and rm uses
///               sched.XMM.Folded.
/// The rm form takes an f64mem operand: its pattern is X86Movddup of a
/// scalar_to_vector of an f64 load.
4396multiclass sse3_replicate_dfp<string OpcodeStr, X86SchedWriteWidths sched> {
4397def rr  : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4398                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4399                    [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))]>,
4400                    Sched<[sched.XMM]>;
4401def rm  : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
4402                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4403                    [(set VR128:$dst,
4404                      (v2f64 (X86Movddup
4405                              (scalar_to_vector (loadf64 addr:$src)))))]>,
4406                    Sched<[sched.XMM.Folded]>;
4407}
4408
4409// FIXME: Merge with above classes when there are patterns for the ymm version
/// sse3_replicate_dfp_y - 256-bit X86Movddup (opcode 0x12) in register and
/// memory forms. Unlike the 128-bit rm form, the rm pattern here matches a
/// full v4f64 load from an f256mem operand. rr uses sched.YMM and rm uses
/// sched.YMM.Folded.
4410multiclass sse3_replicate_dfp_y<string OpcodeStr, X86SchedWriteWidths sched> {
4411def rr  : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
4412                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4413                    [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
4414                    Sched<[sched.YMM]>;
4415def rm  : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
4416                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4417                    [(set VR256:$dst,
4418                      (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
4419                    Sched<[sched.YMM.Folded]>;
4420}
4421
// MOVDDUP instantiations: the VEX 128-bit and 256-bit (VEX_L) forms are gated
// on [HasAVX, NoVLX]; the non-VEX form is defined outside that scope.
4422let Predicates = [HasAVX, NoVLX] in {
4423  defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup", SchedWriteFShuffle>,
4424                                      VEX, VEX_WIG;
4425  defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup", SchedWriteFShuffle>,
4426                                        VEX, VEX_L, VEX_WIG;
4427}
4428
4429defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;
4430
4431
// X86Movddup of a 64-bit zero-extending load folds into the memory form of
// VMOVDDUP. The predicate list comes solely from the enclosing `let`, so the
// pattern stays gated on NoVLX like the VMOVDDUP records it selects; the former
// trailing Requires<[HasAVX]> restated only part of that list and was dropped.
4432let Predicates = [HasAVX, NoVLX] in {
4433  def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
4434            (VMOVDDUPrm addr:$src)>;
4435}
4436
// UseSSE3 counterpart: X86Movddup of a 64-bit zero-extending load selects the
// memory form of MOVDDUP.
4437let Predicates = [UseSSE3] in {
4438  def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
4439            (MOVDDUPrm addr:$src)>;
4440}
4441
4442//===---------------------------------------------------------------------===//
4443// SSE3 - Move Unaligned Integer
4444//===---------------------------------------------------------------------===//
4445
// VEX lddqu loads (opcode 0xF0, MRMSrcMem), selected directly from the
// int_x86_sse3_ldu_dq (128-bit) and int_x86_avx_ldu_dq_256 (256-bit)
// intrinsics.
4446let Predicates = [HasAVX] in {
4447  def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
4448                      "vlddqu\t{$src, $dst|$dst, $src}",
4449                      [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
4450                      Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
4451  def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
4452                       "vlddqu\t{$src, $dst|$dst, $src}",
4453                       [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
4454                       Sched<[SchedWriteVecMoveLS.YMM.RM]>, VEX, VEX_L, VEX_WIG;
4455} // Predicates
4456
// Non-VEX lddqu load (opcode 0xF0, MRMSrcMem), selected from the
// int_x86_sse3_ldu_dq intrinsic.
4457def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
4458                   "lddqu\t{$src, $dst|$dst, $src}",
4459                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
4460                   Sched<[SchedWriteVecMoveLS.XMM.RM]>;
4461
4462//===---------------------------------------------------------------------===//
4463// SSE3 - Arithmetic
4464//===---------------------------------------------------------------------===//
4465
/// sse3_addsub - X86Addsub (opcode 0xD0) in register (rr) and memory (rm)
/// forms.
///   OpcodeStr - mnemonic prepended to the operand string.
///   vt        - value type of the result.
///   RC        - register class of $dst, $src1 and (in rr) $src2.
///   x86memop  - memory operand type of $src2 in the rm form.
///   sched     - scheduling class; rm uses sched.Folded and
///               sched.ReadAfterFold.
///   ld_frag   - load fragment applied to addr:$src2 in the rm form.
///   Is2Addr   - when set (the default) the asm string prints two operands
///               ($src2, $dst); otherwise it prints three.
/// Both forms are marked as using MXCSR and as able to raise FP exceptions.
4466multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC,
4467                       X86MemOperand x86memop, X86FoldableSchedWrite sched,
4468                       PatFrag ld_frag, bit Is2Addr = 1> {
4469let Uses = [MXCSR], mayRaiseFPException = 1 in {
4470  def rr : I<0xD0, MRMSrcReg,
4471       (outs RC:$dst), (ins RC:$src1, RC:$src2),
4472       !if(Is2Addr,
4473           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4474           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4475       [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))]>,
4476       Sched<[sched]>;
4477  def rm : I<0xD0, MRMSrcMem,
4478       (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4479       !if(Is2Addr,
4480           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4481           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4482       [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>,
4483       Sched<[sched.Folded, sched.ReadAfterFold]>;
4484}
4485}
4486
// VEX ADDSUB instantiations (Is2Addr = 0, i.e. three-operand asm strings).
// The PS forms carry the XD prefix and the PD forms the PD prefix; the
// 256-bit forms add VEX_L.
4487let Predicates = [HasAVX] in {
4488  let ExeDomain = SSEPackedSingle in {
4489    defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem,
4490                                 SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>,
4491                                 XD, VEX_4V, VEX_WIG;
4492    defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem,
4493                                  SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>,
4494                                  XD, VEX_4V, VEX_L, VEX_WIG;
4495  }
4496  let ExeDomain = SSEPackedDouble in {
4497    defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem,
4498                                 SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>,
4499                                 PD, VEX_4V, VEX_WIG;
4500    defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem,
4501                                  SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>,
4502                                  PD, VEX_4V, VEX_L, VEX_WIG;
4503  }
4504}
// Non-VEX ADDSUB instantiations: two-operand form with $src1 tied to $dst,
// gated on UseSSE3, using the memop fragments for the memory operand.
4505let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
4506  let ExeDomain = SSEPackedSingle in
4507  defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem,
4508                              SchedWriteFAddSizes.PS.XMM, memopv4f32>, XD;
4509  let ExeDomain = SSEPackedDouble in
4510  defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem,
4511                              SchedWriteFAddSizes.PD.XMM, memopv2f64>, PD;
4512}
4513
4514//===---------------------------------------------------------------------===//
4515// SSE3 Instructions
4516//===---------------------------------------------------------------------===//
4517
4518// Horizontal ops
/// S3D_Int - Binary FP op built on the S3DI instruction class, in register
/// (rr) and memory (rm) forms.
///   o         - opcode byte.
///   OpcodeStr - mnemonic prepended to the operand string.
///   vt        - value type of the result.
///   RC        - register class of $dst, $src1 and (in rr) $src2.
///   x86memop  - memory operand type of $src2 in the rm form.
///   OpNode    - SDNode applied to the two sources.
///   sched     - scheduling class; rm uses sched.Folded and
///               sched.ReadAfterFold.
///   ld_frag   - load fragment applied to addr:$src2 in the rm form.
///   Is2Addr   - selects the two-operand (default) or three-operand asm
///               string.
/// Both forms are marked as using MXCSR and as able to raise FP exceptions.
4519multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
4520                   X86MemOperand x86memop, SDNode OpNode,
4521                   X86FoldableSchedWrite sched, PatFrag ld_frag,
4522                   bit Is2Addr = 1> {
4523let Uses = [MXCSR], mayRaiseFPException = 1 in {
4524  def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
4525       !if(Is2Addr,
4526         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4527         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4528      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
4529      Sched<[sched]>;
4530
4531  def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4532       !if(Is2Addr,
4533         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4534         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4535      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
4536      Sched<[sched.Folded, sched.ReadAfterFold]>;
4537}
4538}
/// S3_Int - Same shape and parameters as S3D_Int, but the records derive from
/// the S3I instruction class instead of S3DI. Both forms are marked as using
/// MXCSR and as able to raise FP exceptions; rm uses sched.Folded and
/// sched.ReadAfterFold.
4539multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
4540                  X86MemOperand x86memop, SDNode OpNode,
4541                  X86FoldableSchedWrite sched, PatFrag ld_frag,
4542                  bit Is2Addr = 1> {
4543let Uses = [MXCSR], mayRaiseFPException = 1 in {
4544  def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
4545       !if(Is2Addr,
4546         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4547         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4548      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
4549        Sched<[sched]>;
4550
4551  def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4552       !if(Is2Addr,
4553         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4554         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4555      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
4556        Sched<[sched.Folded, sched.ReadAfterFold]>;
4557}
4558}
4559
// VEX horizontal add/sub instantiations: opcode 0x7C is HADD, 0x7D is HSUB.
// The PS forms use S3D_Int and the PD forms use S3_Int; all pass
// Is2Addr = 0, and the 256-bit forms add VEX_L and use WriteFHAddY.
4560let Predicates = [HasAVX] in {
4561  let ExeDomain = SSEPackedSingle in {
4562    defm VHADDPS  : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
4563                            X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
4564    defm VHSUBPS  : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
4565                            X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
4566    defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
4567                            X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
4568    defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
4569                            X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
4570  }
4571  let ExeDomain = SSEPackedDouble in {
4572    defm VHADDPD  : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem,
4573                           X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
4574    defm VHSUBPD  : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem,
4575                           X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
4576    defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem,
4577                           X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
4578    defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem,
4579                           X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
4580  }
4581}
4582
// Non-VEX horizontal add/sub instantiations: two-operand form with $src1 tied
// to $dst, using the memop fragments for the memory operand.
4583let Constraints = "$src1 = $dst" in {
4584  let ExeDomain = SSEPackedSingle in {
4585    defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
4586                          WriteFHAdd, memopv4f32>;
4587    defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
4588                          WriteFHAdd, memopv4f32>;
4589  }
4590  let ExeDomain = SSEPackedDouble in {
4591    defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
4592                         WriteFHAdd, memopv2f64>;
4593    defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
4594                         WriteFHAdd, memopv2f64>;
4595  }
4596}
4597
4598//===---------------------------------------------------------------------===//
4599// SSSE3 - Packed Absolute Instructions
4600//===---------------------------------------------------------------------===//
4601
4602/// SS3I_unop_rm - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
///   opc       - opcode byte passed to SS38I.
///   OpcodeStr - mnemonic prepended to the operand string.
///   vt        - value type of the 128-bit result.
///   OpNode    - SDNode applied to the source.
///   sched     - scheduling widths; rr uses sched.XMM and rm uses
///               sched.XMM.Folded.
///   ld_frag   - load fragment applied to addr:$src in the rm form.
4603multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
4604                        SDNode OpNode, X86SchedWriteWidths sched, PatFrag ld_frag> {
4605  def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
4606                 (ins VR128:$src),
4607                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4608                 [(set VR128:$dst, (vt (OpNode VR128:$src)))]>,
4609                 Sched<[sched.XMM]>;
4610
4611  def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
4612                 (ins i128mem:$src),
4613                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4614                 [(set VR128:$dst,
4615                   (vt (OpNode (ld_frag addr:$src))))]>,
4616                 Sched<[sched.XMM.Folded]>;
4617}
4618
4619/// SS3I_unop_rm_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
/// 256-bit variant: defines Yrr and Yrr's memory counterpart Yrm on VR256 /
/// i256mem. There is no load-fragment parameter; the Yrm pattern always uses
/// `load`. Yrr uses sched.YMM and Yrm uses sched.YMM.Folded.
4620multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
4621                          SDNode OpNode, X86SchedWriteWidths sched> {
4622  def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
4623                  (ins VR256:$src),
4624                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4625                  [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
4626                  Sched<[sched.YMM]>;
4627
4628  def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
4629                  (ins i256mem:$src),
4630                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4631                  [(set VR256:$dst,
4632                    (vt (OpNode (load addr:$src))))]>,
4633                  Sched<[sched.YMM.Folded]>;
4634}
4635
// VEX PABS instantiations (opcodes 0x1C/0x1D/0x1E for B/W/D), all selecting
// the `abs` node. The byte and word forms are gated on NoVLX_Or_NoBWI and the
// dword forms on NoVLX; the 256-bit forms need HasAVX2 and, coming from
// SS3I_unop_rm_y, add the Yrr/Yrm records under the same defm prefix.
4636let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
4637  defm VPABSB  : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU,
4638                              load>, VEX, VEX_WIG;
4639  defm VPABSW  : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU,
4640                              load>, VEX, VEX_WIG;
4641}
4642let Predicates = [HasAVX, NoVLX] in {
4643  defm VPABSD  : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU,
4644                              load>, VEX, VEX_WIG;
4645}
4646let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4647  defm VPABSB  : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>,
4648                                VEX, VEX_L, VEX_WIG;
4649  defm VPABSW  : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SchedWriteVecALU>,
4650                                VEX, VEX_L, VEX_WIG;
4651}
4652let Predicates = [HasAVX2, NoVLX] in {
4653  defm VPABSD  : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SchedWriteVecALU>,
4654                                VEX, VEX_L, VEX_WIG;
4655}
4656
// Non-VEX PABS instantiations; the memory forms match through memop.
4657defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU,
4658                          memop>;
4659defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU,
4660                          memop>;
4661defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU,
4662                          memop>;
4663
4664//===---------------------------------------------------------------------===//
4665// SSSE3 - Packed Binary Operator Instructions
4666//===---------------------------------------------------------------------===//
4667
4668/// SS3I_binop_rm - Simple SSSE3 bin op
///   opc        - opcode byte passed to SS38I.
///   OpcodeStr  - mnemonic prepended to the operand string.
///   OpNode     - SDNode applied to the two sources.
///   DstVT      - value type of the result.
///   OpVT       - value type applied to $src1 in the patterns.
///   RC         - register class of $dst, $src1 and (in rr) $src2.
///   memop_frag - load fragment applied to addr:$src2 in the rm form.
///   x86memop   - memory operand type of $src2 in the rm form.
///   sched      - scheduling class; rm uses sched.Folded and
///                sched.ReadAfterFold.
///   Is2Addr    - selects the two-operand (default) or three-operand asm
///                string.
/// Only the rr form is wrapped in `let isCommutable = 1`.
4669multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
4670                         ValueType DstVT, ValueType OpVT, RegisterClass RC,
4671                         PatFrag memop_frag, X86MemOperand x86memop,
4672                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
4673  let isCommutable = 1 in
4674  def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
4675       (ins RC:$src1, RC:$src2),
4676       !if(Is2Addr,
4677         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4678         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4679       [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))]>,
4680       Sched<[sched]>;
4681  def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
4682       (ins RC:$src1, x86memop:$src2),
4683       !if(Is2Addr,
4684         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4685         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4686       [(set RC:$dst,
4687         (DstVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))]>,
4688       Sched<[sched.Folded, sched.ReadAfterFold]>;
4689}
4690
4691/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
/// 128-bit only (VR128 / i128mem); the patterns call the intrinsic IntId128
/// directly instead of an SDNode.
///   ld_frag - load fragment applied to addr:$src2 in the rm form.
///   Is2Addr - selects the two-operand (default) or three-operand asm string.
/// Only the rr form is wrapped in `let isCommutable = 1`.
4692multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
4693                             Intrinsic IntId128, X86FoldableSchedWrite sched,
4694                             PatFrag ld_frag, bit Is2Addr = 1> {
4695  let isCommutable = 1 in
4696  def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
4697       (ins VR128:$src1, VR128:$src2),
4698       !if(Is2Addr,
4699         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4700         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4701       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
4702       Sched<[sched]>;
4703  def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
4704       (ins VR128:$src1, i128mem:$src2),
4705       !if(Is2Addr,
4706         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4707         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4708       [(set VR128:$dst,
4709         (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
4710       Sched<[sched.Folded, sched.ReadAfterFold]>;
4711}
4712
/// SS3I_binop_rm_int_y - 256-bit intrinsic-based SSSE3-style bin op (VR256 /
/// i256mem), defining Yrr and Yrm. There is no Is2Addr parameter: the asm
/// string is always the three-operand form, and the Yrm pattern always uses
/// `load`. Only the Yrr form is wrapped in `let isCommutable = 1`.
4713multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
4714                               Intrinsic IntId256,
4715                               X86FoldableSchedWrite sched> {
4716  let isCommutable = 1 in
4717  def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
4718       (ins VR256:$src1, VR256:$src2),
4719       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4720       [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
4721       Sched<[sched]>;
4722  def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
4723       (ins VR256:$src1, i256mem:$src2),
4724       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4725       [(set VR256:$dst,
4726         (IntId256 VR256:$src1, (load addr:$src2)))]>,
4727       Sched<[sched.Folded, sched.ReadAfterFold]>;
4728}
4729
// 128-bit VEX forms gated on [HasAVX, NoVLX_Or_NoBWI]. VPSHUFB and VPMADDUBSW
// sit inside `let isCommutable = 0`; VPMULHRSW is outside it and so is not
// covered by that override.
4730let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
4731let isCommutable = 0 in {
4732  defm VPSHUFB    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
4733                                  VR128, load, i128mem,
4734                                  SchedWriteVarShuffle.XMM, 0>, VEX_4V, VEX_WIG;
4735  defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
4736                                  v16i8, VR128, load, i128mem,
4737                                  SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
4738}
4739defm VPMULHRSW    : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
4740                                  VR128, load, i128mem,
4741                                  SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
4742}
4743
// 128-bit VEX horizontal add/sub and sign ops, gated on HasAVX. Everything
// here sits inside `let isCommutable = 0`. The PHADD/PHSUB W and D forms go
// through SS3I_binop_rm with SDNodes; PSIGN and the saturating PHADDSW /
// PHSUBSW go through SS3I_binop_rm_int with intrinsics. Every record is tagged
// VEX_4V, VEX_WIG; VPHSUBD previously lacked VEX_WIG, unlike all its siblings.
4744let ImmT = NoImm, Predicates = [HasAVX] in {
4745let isCommutable = 0 in {
4746  defm VPHADDW    : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
4747                                  load, i128mem,
4748                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4749  defm VPHADDD    : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
4750                                  load, i128mem,
4751                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4752  defm VPHSUBW    : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
4753                                  load, i128mem,
4754                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4755  defm VPHSUBD    : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
4756                                  load, i128mem,
4757                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4758  defm VPSIGNB    : SS3I_binop_rm_int<0x08, "vpsignb",
4759                                      int_x86_ssse3_psign_b_128,
4760                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
4761  defm VPSIGNW    : SS3I_binop_rm_int<0x09, "vpsignw",
4762                                      int_x86_ssse3_psign_w_128,
4763                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
4764  defm VPSIGND    : SS3I_binop_rm_int<0x0A, "vpsignd",
4765                                      int_x86_ssse3_psign_d_128,
4766                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
4767  defm VPHADDSW   : SS3I_binop_rm_int<0x03, "vphaddsw",
4768                                      int_x86_ssse3_phadd_sw_128,
4769                                      SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
4770  defm VPHSUBSW   : SS3I_binop_rm_int<0x07, "vphsubsw",
4771                                      int_x86_ssse3_phsub_sw_128,
4772                                      SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
4773}
4774}
4775
// VEX-encoded 256-bit (VEX_L) byte shuffle and multiply forms. None of these
// instructions carries an immediate operand.
let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  // Explicitly marked non-commutable.
  let isCommutable = 0 in {
    defm VPSHUFBY
        : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8, VR256, load,
                        i256mem, SchedWriteVarShuffle.YMM, 0>,
          VEX_4V, VEX_L, VEX_WIG;
    defm VPMADDUBSWY
        : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16, v32i8,
                        VR256, load, i256mem, SchedWriteVecIMul.YMM, 0>,
          VEX_4V, VEX_L, VEX_WIG;
  }
  // Declared outside the isCommutable = 0 scope above.
  defm VPMULHRSWY
      : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16, VR256,
                      load, i256mem, SchedWriteVecIMul.YMM, 0>,
        VEX_4V, VEX_L, VEX_WIG;
}
4789
// VEX-encoded 256-bit (VEX_L) horizontal add/subtract and sign forms. None of
// these instructions carries an immediate operand, and all are explicitly
// marked non-commutable.
let ImmT = NoImm, Predicates = [HasAVX2] in {
  let isCommutable = 0 in {
    // Pattern-based horizontal add/subtract (X86hadd / X86hsub nodes).
    defm VPHADDWY
        : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16, VR256, load,
                        i256mem, SchedWritePHAdd.YMM, 0>,
          VEX_4V, VEX_L, VEX_WIG;
    defm VPHADDDY
        : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256, load,
                        i256mem, SchedWritePHAdd.YMM, 0>,
          VEX_4V, VEX_L, VEX_WIG;
    defm VPHSUBWY
        : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16, VR256, load,
                        i256mem, SchedWritePHAdd.YMM, 0>,
          VEX_4V, VEX_L, VEX_WIG;
    // VEX.W is ignored for this opcode exactly as for its siblings, so it is
    // tagged VEX_WIG like every other definition in this group.
    defm VPHSUBDY
        : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256, load,
                        i256mem, SchedWritePHAdd.YMM, 0>,
          VEX_4V, VEX_L, VEX_WIG;

    // Intrinsic-based forms.
    defm VPSIGNB
        : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
                              SchedWriteVecALU.YMM>,
          VEX_4V, VEX_L, VEX_WIG;
    defm VPSIGNW
        : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
                              SchedWriteVecALU.YMM>,
          VEX_4V, VEX_L, VEX_WIG;
    defm VPSIGND
        : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
                              SchedWriteVecALU.YMM>,
          VEX_4V, VEX_L, VEX_WIG;
    defm VPHADDSW
        : SS3I_binop_rm_int_y<0x03, "vphaddsw", int_x86_avx2_phadd_sw,
                              SchedWritePHAdd.YMM>,
          VEX_4V, VEX_L, VEX_WIG;
    defm VPHSUBSW
        : SS3I_binop_rm_int_y<0x07, "vphsubsw", int_x86_avx2_phsub_sw,
                              SchedWritePHAdd.YMM>,
          VEX_4V, VEX_L, VEX_WIG;
  }
}
4818
// Legacy-encoded SSSE3 forms. None of these have i8 immediate fields, and the
// destination register is tied to the first source ($src1 = $dst).
let ImmT = NoImm, Constraints = "$src1 = $dst" in {
  // Explicitly marked non-commutable.
  let isCommutable = 0 in {
    defm PHADDW
        : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128, memop,
                        i128mem, SchedWritePHAdd.XMM>;
    defm PHADDD
        : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128, memop,
                        i128mem, SchedWritePHAdd.XMM>;
    defm PHSUBW
        : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128, memop,
                        i128mem, SchedWritePHAdd.XMM>;
    defm PHSUBD
        : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128, memop,
                        i128mem, SchedWritePHAdd.XMM>;
    defm PSIGNB
        : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
                            SchedWriteVecALU.XMM, memop>;
    defm PSIGNW
        : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
                            SchedWriteVecALU.XMM, memop>;
    defm PSIGND
        : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
                            SchedWriteVecALU.XMM, memop>;
    defm PSHUFB
        : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128, memop,
                        i128mem, SchedWriteVarShuffle.XMM>;
    defm PHADDSW
        : SS3I_binop_rm_int<0x03, "phaddsw", int_x86_ssse3_phadd_sw_128,
                            SchedWritePHAdd.XMM, memop>;
    defm PHSUBSW
        : SS3I_binop_rm_int<0x07, "phsubsw", int_x86_ssse3_phsub_sw_128,
                            SchedWritePHAdd.XMM, memop>;
    defm PMADDUBSW
        : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16, v16i8, VR128,
                        memop, i128mem, SchedWriteVecIMul.XMM>;
  }
  // Declared outside the isCommutable = 0 scope above.
  defm PMULHRSW
      : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16, VR128, memop,
                      i128mem, SchedWriteVecIMul.XMM>;
}
4851
4852//===---------------------------------------------------------------------===//
4853// SSSE3 - Packed Align Instruction Patterns
4854//===---------------------------------------------------------------------===//
4855
/// ssse3_palignr - Register (rri) and memory (rmi) forms of the packed align
/// instruction, opcode 0x0F, with an 8-bit immediate in $src3. When Is2Addr
/// is set the assembly string omits $src1; otherwise $src1 is printed too.
multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  // Both sources in registers.
  let hasSideEffects = 0 in
  def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst),
      (ins RC:$src1, RC:$src2, u8imm:$src3),
      !if(Is2Addr,
          asm#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
          asm#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      [(set RC:$dst,
            (VT (X86PAlignr RC:$src1, RC:$src2, (i8 timm:$src3))))]>,
      Sched<[sched]>;

  // Second source loaded from memory through memop_frag.
  let hasSideEffects = 0, mayLoad = 1 in
  def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst),
      (ins RC:$src1, x86memop:$src2, u8imm:$src3),
      !if(Is2Addr,
          asm#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
          asm#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      [(set RC:$dst,
            (VT (X86PAlignr RC:$src1, (memop_frag addr:$src2),
                            (i8 timm:$src3))))]>,
      Sched<[sched.Folded, sched.ReadAfterFold]>;
}
4881
// 128-bit VEX form (three-operand assembly syntax).
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
  defm VPALIGNR
      : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem,
                      SchedWriteShuffle.XMM, 0>,
        VEX_4V, VEX_WIG;

// 256-bit VEX form (three-operand assembly syntax).
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
  defm VPALIGNRY
      : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem,
                      SchedWriteShuffle.YMM, 0>,
        VEX_4V, VEX_L, VEX_WIG;

// Legacy SSSE3 form: destination tied to the first source.
let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
  defm PALIGNR
      : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem,
                      SchedWriteShuffle.XMM>;
4891
4892//===---------------------------------------------------------------------===//
4893// SSSE3 - Thread synchronization
4894//===---------------------------------------------------------------------===//
4895
// MONITOR and MWAIT have empty explicit operand lists; the registers they
// read are declared through Uses.
let SchedRW = [WriteSystem] in {
  // Outside 64-bit mode.
  let Uses = [EAX, ECX, EDX] in
  def MONITOR32rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>, TB,
                     Requires<[HasSSE3, Not64BitMode]>;

  // 64-bit mode: RAX is used in place of EAX.
  let Uses = [RAX, ECX, EDX] in
  def MONITOR64rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>, TB,
                     Requires<[HasSSE3, In64BitMode]>;

  // Selected from the int_x86_sse3_mwait intrinsic on ECX and EAX.
  let Uses = [ECX, EAX] in
  def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",
                  [(int_x86_sse3_mwait ECX, EAX)]>, TB,
                Requires<[HasSSE3]>;
} // SchedRW
4908
// Assembler aliases that accept mwait with its registers spelled out.
def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>,
      Requires<[Not64BitMode]>;
def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>,
      Requires<[In64BitMode]>;

// Assembler aliases that accept monitor with its registers spelled out.
def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITOR32rrr)>,
      Requires<[Not64BitMode]>;
def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITOR64rrr)>,
      Requires<[In64BitMode]>;
4916
4917//===----------------------------------------------------------------------===//
4918// SSE4.1 - Packed Move with Sign/Zero Extend
4919// NOTE: Any Extend is promoted to Zero Extend in X86ISelDAGToDAG.cpp
4920//===----------------------------------------------------------------------===//
4921
/// SS41I_pmovx_rrrm - Register (rr) and memory (rm) source forms of a packed
/// move-with-extend instruction. Neither definition carries a selection
/// pattern of its own.
multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
                            RegisterClass OutRC, RegisterClass InRC,
                            X86FoldableSchedWrite sched> {
  // Source in an InRC register.
  def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
                 OpcodeStr#"\t{$src, $dst|$dst, $src}", []>,
           Sched<[sched]>;

  // Source in memory.
  def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
                 OpcodeStr#"\t{$src, $dst|$dst, $src}", []>,
           Sched<[sched.Folded]>;
}
4933
/// SS41I_pmovx_rm_all - Emit one pmovx opcode in three flavours: the legacy
/// form (NAME), the 128-bit VEX form (V#NAME) and the 256-bit VEX form
/// (V#NAME#Y). prd is added to the predicate list of both VEX flavours.
multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
                              X86MemOperand MemOp, X86MemOperand MemYOp,
                              Predicate prd> {
  // Legacy encoding, 128-bit destination.
  defm NAME
      : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128,
                         SchedWriteShuffle.XMM>;

  // VEX encoding, 128-bit destination; mnemonic gains a "v" prefix.
  let Predicates = [HasAVX, prd] in
    defm V#NAME
        : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp, VR128,
                           VR128, SchedWriteShuffle.XMM>,
          VEX, VEX_WIG;

  // VEX encoding, 256-bit destination fed by a VR128 source or MemYOp memory.
  let Predicates = [HasAVX2, prd] in
    defm V#NAME#Y
        : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp, VR256,
                           VR128, WriteShuffle256>,
          VEX, VEX_L, VEX_WIG;
}
4948
/// SS41I_pmovx_rm - Emit both the sign-extending (PMOVSX#NAME, opcode opc) and
/// zero-extending (PMOVZX#NAME, opcode opc + 0x10) instruction families for
/// one element-size pairing.
multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
                          X86MemOperand MemYOp, Predicate prd> {
  defm PMOVSX#NAME
      : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr), MemOp, MemYOp,
                           prd>;
  defm PMOVZX#NAME
      : SS41I_pmovx_rm_all<!add(opc, 0x10), !strconcat("pmovzx", OpcodeStr),
                           MemOp, MemYOp, prd>;
}
4957
// Element doubling: 64-bit memory source for the 128-bit forms, 128-bit for
// the 256-bit forms.
defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>;
defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>;

// Element quadrupling: 32-bit / 64-bit memory sources.
defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>;
defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;

// Byte to quadword: 16-bit / 32-bit memory sources.
defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;
4966
// AVX2 Patterns
//
// Selection patterns for the 256-bit pmovx instructions named
// OpcPrefix#<size>Y{rr,rm}. ExtOp is used where the pattern reads the whole
// source vector and InVecOp where it reads a 128-bit source wider than the
// elements consumed; ExtTy ("s" or "z") picks the matching extending-load
// fragment.
multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy,
                                     SDNode ExtOp, SDNode InVecOp> {
  // Register-Register patterns
  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
    def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
              (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
  }
  let Predicates = [HasAVX2, NoVLX] in {
    def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))),
              (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
    def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))),
              (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;

    def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
              (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
    def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))),
              (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;

    def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
              (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
  }

  // Simple Register-Memory patterns
  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
    def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
              (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;

    def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
              (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  }

  let Predicates = [HasAVX2, NoVLX] in {
    def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
              (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
    def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
              (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;

    def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
              (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
    def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
              (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;

    def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
              (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;

    // AVX2 Register-Memory patterns
    def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
              (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;

    def : Pat<(v8i32 (InVecOp (bc_v16i8
                 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
              (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
    def : Pat<(v8i32 (InVecOp (bc_v16i8
                 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
              (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
    def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
              (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;

    def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
              (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;

    def : Pat<(v4i64 (InVecOp (bc_v16i8
                 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
              (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
    def : Pat<(v4i64 (InVecOp (bc_v16i8 (v2i64 (X86vzload32 addr:$src))))),
              (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;

    def : Pat<(v4i64 (InVecOp (bc_v8i16
                 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
              (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
    def : Pat<(v4i64 (InVecOp (bc_v8i16
                 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
              (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
    def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
              (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  }
}
5042
// Instantiate the 256-bit patterns for the sign- and zero-extending families.
defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>;
defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>;
5045
// SSE4.1/AVX patterns.
//
// Selection patterns for the 128-bit pmovx instructions named
// OpcPrefix#<size>{rr,rm}. ExtOp is the extension node to match and ExtTy
// ("s" or "z") picks the matching extending-load fragment.
multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
                                SDNode ExtOp> {
  // Register sources.
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
    def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
              (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
    def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
              (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
    def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
              (!cast<I>(OpcPrefix#BQrr) VR128:$src)>;

    def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
              (!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
    def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
              (!cast<I>(OpcPrefix#WQrr) VR128:$src)>;

    def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
              (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;
  }

  // Extending vector loads.
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
    def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
              (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
    def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
              (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
    def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
              (!cast<I>(OpcPrefix#BQrm) addr:$src)>;

    def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
              (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
    def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
              (!cast<I>(OpcPrefix#WQrm) addr:$src)>;

    def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
              (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  }

  // Extension applied to an explicitly loaded (scalar or full-vector) source.
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
    def : Pat<(v8i16 (ExtOp (bc_v16i8
                 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
              (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
    def : Pat<(v8i16 (ExtOp (bc_v16i8
                 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
              (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
    def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
              (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
    def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))),
              (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
    def : Pat<(v4i32 (ExtOp (bc_v16i8
                 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
              (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
    def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
              (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
    def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))),
              (!cast<I>(OpcPrefix#BDrm) addr:$src)>;

    def : Pat<(v2i64 (ExtOp (bc_v16i8
                 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
              (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
    def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))),
              (!cast<I>(OpcPrefix#BQrm) addr:$src)>;

    def : Pat<(v4i32 (ExtOp (bc_v8i16
                 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
              (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
    def : Pat<(v4i32 (ExtOp (bc_v8i16
                 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
              (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
    def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
              (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
    def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))),
              (!cast<I>(OpcPrefix#WDrm) addr:$src)>;

    def : Pat<(v2i64 (ExtOp (bc_v8i16
                 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
              (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
    def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))),
              (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
    def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))),
              (!cast<I>(OpcPrefix#WQrm) addr:$src)>;

    def : Pat<(v2i64 (ExtOp (bc_v4i32
                 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
              (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
    def : Pat<(v2i64 (ExtOp (bc_v4i32
                 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
              (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
    def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
              (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
    def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))),
              (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  }
}
5134
// VEX-encoded instantiations.
defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec>;
defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec>;

// Legacy-encoded instantiations, wrapped in a UseSSE41 predicate scope.
let Predicates = [UseSSE41] in {
  defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec>;
  defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec>;
}
5142
5143//===----------------------------------------------------------------------===//
5144// SSE4.1 - Extract Instructions
5145//===----------------------------------------------------------------------===//
5146
/// SS41I_extract8 - SSE 4.1 extract of the byte element selected by immediate
/// $src2, either into a GR32orGR64 register (rr) or truncated to i8 and
/// stored to 8-bit memory (mr).
multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
      (ins VR128:$src1, u8imm:$src2),
      OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set GR32orGR64:$dst,
            (X86pextrb (v16i8 VR128:$src1), imm:$src2))]>,
      Sched<[WriteVecExtract]>;

  let hasSideEffects = 0, mayStore = 1 in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
      (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
      OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))),
              addr:$dst)]>,
      Sched<[WriteVecExtractSt]>;
}
5164
// VEX-encoded form.
let Predicates = [HasAVX, NoBWI] in
  defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX, VEX_WIG;

// Legacy-encoded form.
defm PEXTRB : SS41I_extract8<0x14, "pextrb">;
5169
5170
/// SS41I_extract16 - SSE 4.1 extract of the 16-bit element selected by
/// immediate $src2 to a memory destination (mr). rr_REV is the
/// register-destination encoding of the same opcode: it has no pattern, is
/// code-gen-only with ForceDisassemble set, and is linked to NAME#rr through
/// FoldGenData.
multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
      (ins VR128:$src1, u8imm:$src2),
      OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
      Sched<[WriteVecExtract]>, FoldGenData<NAME#rr>;

  let hasSideEffects = 0, mayStore = 1 in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
      (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
      OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), imm:$src2))),
              addr:$dst)]>,
      Sched<[WriteVecExtractSt]>;
}
5188
// VEX-encoded form.
let Predicates = [HasAVX, NoBWI] in
  defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX, VEX_WIG;

// Legacy-encoded form.
defm PEXTRW : SS41I_extract16<0x15, "pextrw">;
5193
5194
/// SS41I_extract32 - SSE 4.1 extract of the 32-bit element selected by
/// immediate $src2, to a GR32 register (rr) or to 32-bit memory (mr).
multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
      (ins VR128:$src1, u8imm:$src2),
      OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set GR32:$dst, (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
      Sched<[WriteVecExtract]>;

  def mr : SS4AIi8<opc, MRMDestMem, (outs),
      (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
      OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(store (extractelt (v4i32 VR128:$src1), imm:$src2), addr:$dst)]>,
      Sched<[WriteVecExtractSt]>;
}
5211
// VEX-encoded form.
let Predicates = [HasAVX, NoDQI] in
  defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;

// Legacy-encoded form.
defm PEXTRD : SS41I_extract32<0x16, "pextrd">;
5216
/// SS41I_extract64 - SSE 4.1 extract of the 64-bit element selected by
/// immediate $src2, to a GR64 register (rr) or to 64-bit memory (mr).
multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
      (ins VR128:$src1, u8imm:$src2),
      OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set GR64:$dst, (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
      Sched<[WriteVecExtract]>;

  def mr : SS4AIi8<opc, MRMDestMem, (outs),
      (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
      OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(store (extractelt (v2i64 VR128:$src1), imm:$src2), addr:$dst)]>,
      Sched<[WriteVecExtractSt]>;
}
5233
// VEX-encoded form; VEX_W distinguishes it from VPEXTRD, which shares 0x16.
let Predicates = [HasAVX, NoDQI] in
  defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;

// Legacy-encoded form; REX_W distinguishes it from PEXTRD, which shares 0x16.
defm PEXTRQ : SS41I_extract64<0x16, "pextrq">, REX_W;
5238
/// SS41I_extractf32 - SSE 4.1 extract of a 32-bit fp element, matched as an
/// integer element of the v4f32 source bitcast to v4i32. The result goes to a
/// GR32orGR64 register (rr) or to 32-bit fp memory (mr).
multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
      (ins VR128:$src1, u8imm:$src2),
      OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set GR32orGR64:$dst,
            (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
      Sched<[WriteVecExtract]>;

  def mr : SS4AIi8<opc, MRMDestMem, (outs),
      (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
      OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
              addr:$dst)]>,
      Sched<[WriteVecExtractSt]>;
}
5256
// Both encodings are placed in the packed-single execution domain.
let ExeDomain = SSEPackedSingle in {
  // VEX-encoded form.
  let Predicates = [UseAVX] in
    defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, VEX_WIG;

  // Legacy-encoded form.
  defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">;
}
5262
5263//===----------------------------------------------------------------------===//
5264// SSE4.1 - Insert Instructions
5265//===----------------------------------------------------------------------===//
5266
/// SS41I_insert8 - SSE 4.1 insert of a byte into the VR128 element selected by
/// immediate $src3. The byte comes from a GR32orGR64 register (rr) or from an
/// extending 8-bit load (rm). When Is2Addr is set the assembly string omits
/// $src1.
multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
      !if(Is2Addr,
          asm#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
          asm#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      [(set VR128:$dst,
            (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;

  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
      !if(Is2Addr,
          asm#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
          asm#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      [(set VR128:$dst,
            (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), imm:$src3))]>,
      Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}
5287
// VEX-encoded form (three-operand assembly syntax).
let Predicates = [HasAVX, NoBWI] in
  defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V, VEX_WIG;

// Legacy-encoded form: destination tied to the first source.
let Constraints = "$src1 = $dst" in
  defm PINSRB : SS41I_insert8<0x20, "pinsrb">;
5292
/// SS41I_insert32 - SSE 4.1 insert of a 32-bit value into the v4i32 element
/// selected by immediate $src3. The value comes from a GR32 register (rr) or
/// from a 32-bit load (rm). When Is2Addr is set the assembly string omits
/// $src1.
multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32:$src2, u8imm:$src3),
      !if(Is2Addr,
          asm#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
          asm#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      [(set VR128:$dst,
            (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;

  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
      !if(Is2Addr,
          asm#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
          asm#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      [(set VR128:$dst,
            (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2),
                              imm:$src3)))]>,
      Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}
5313
// VEX-encoded form (three-operand assembly syntax).
let Predicates = [HasAVX, NoDQI] in
  defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;

// Legacy-encoded form: destination tied to the first source.
let Constraints = "$src1 = $dst" in
  defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
5318
/// SS41I_insert64 - SSE 4.1 insert of a 64-bit value into the v2i64 element
/// selected by immediate $src3. The value comes from a GR64 register (rr) or
/// from a 64-bit load (rm). When Is2Addr is set the assembly string omits
/// $src1.
multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR64:$src2, u8imm:$src3),
      !if(Is2Addr,
          asm#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
          asm#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      [(set VR128:$dst,
            (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;

  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
      !if(Is2Addr,
          asm#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
          asm#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      [(set VR128:$dst,
            (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2),
                              imm:$src3)))]>,
      Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}
5339
// VEX-encoded form; VEX_W distinguishes it from VPINSRD, which shares 0x22.
let Predicates = [HasAVX, NoDQI] in
  defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;

// Legacy-encoded form; REX_W distinguishes it from PINSRD, which shares 0x22.
let Constraints = "$src1 = $dst" in
  defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
5344
// insertps has a few different modes. The first two, defined here, are
// optimized inserts that won't zero arbitrary elements in the destination
// vector. The next one matches the intrinsic and could zero arbitrary
// elements in the target vector.
//
// rr takes the inserted value from a VR128 register and is flagged
// commutable; rm takes it from a scalar f32 load. When Is2Addr is set the
// assembly string omits $src1.
multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, VR128:$src2, u8imm:$src3),
      !if(Is2Addr,
          asm#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
          asm#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      [(set VR128:$dst,
            (X86insertps VR128:$src1, VR128:$src2, timm:$src3))]>,
      Sched<[SchedWriteFShuffle.XMM]>;

  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, f32mem:$src2, u8imm:$src3),
      !if(Is2Addr,
          asm#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
          asm#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      [(set VR128:$dst,
            (X86insertps VR128:$src1,
                         (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
                         timm:$src3))]>,
      Sched<[SchedWriteFShuffle.XMM.Folded,
             SchedWriteFShuffle.XMM.ReadAfterFold]>;
}
5372
// Instantiate SS41I_insertf32 with opcode 0x21 in the SSEPackedSingle domain:
// VINSERTPS (Is2Addr = 0, gated on UseAVX, VEX_4V/VEX_WIG) and INSERTPS
// (Is2Addr = 1, $src1 tied to $dst).
5373let ExeDomain = SSEPackedSingle in {
5374  let Predicates = [UseAVX] in
5375    defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>,
5376                     VEX_4V, VEX_WIG;
5377  let Constraints = "$src1 = $dst" in
5378    defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>;
5379}
5380
5381//===----------------------------------------------------------------------===//
5382// SSE4.1 - Round Instructions
5383//===----------------------------------------------------------------------===//
5384
// sse41_fp_unop_p - packed unary operation taking an immediate.
// Defines a register form (r) and a memory form (m); both match
// (VT (OpNode <source>, timm:$src2)) and write RC:$dst.
//   opc       - forwarded as the first argument of SS4AIi8.
//   OpcodeStr - mnemonic; the operand list is appended with !strconcat.
//   x86memop  - memory operand class of the memory form's $src1.
//   RC, VT    - register class and value type of the result.
//   mem_frag  - load fragment applied to addr:$src1 in the memory form.
//   OpNode    - SDNode matched by both patterns.
//   sched     - scheduling class; the memory form uses sched.Folded.
5385multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
5386                           X86MemOperand x86memop, RegisterClass RC,
5387                           ValueType VT, PatFrag mem_frag, SDNode OpNode,
5388                           X86FoldableSchedWrite sched> {
5389  // Uses = [MXCSR] and mayRaiseFPException = 1 apply to both forms below.
5390  // Vector intrinsic operation, reg
5391let Uses = [MXCSR], mayRaiseFPException = 1 in {
5392  def r : SS4AIi8<opc, MRMSrcReg,
5393                  (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
5394                  !strconcat(OpcodeStr,
5395                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5396                  [(set RC:$dst, (VT (OpNode RC:$src1, timm:$src2)))]>,
5397                  Sched<[sched]>;
5398
5399  // Vector intrinsic operation, mem
5400  def m : SS4AIi8<opc, MRMSrcMem,
5401                  (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
5402                  !strconcat(OpcodeStr,
5403                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5404                  [(set RC:$dst,
5405                        (VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>,
5406                  Sched<[sched.Folded]>;
5407} // Uses = [MXCSR], mayRaiseFPException = 1
5408}
5409
// avx_fp_unop_rm - scalar SS/SD records on FR32/FR64 with a three-operand asm
// string ($src1, $src2, immediate $src3).
// Defines SSr/SSm (opcss, "ss" suffix, f32mem) and SDr/SDm (opcsd, "sd" suffix,
// f64mem). All four are isCodeGenOnly with hasSideEffects = 0 and carry no
// ISel pattern ([]); the memory forms are additionally marked mayLoad.
//   opcss, opcsd - forwarded as the first SS4AIi8 argument of the SS / SD forms.
//   OpcodeStr    - mnemonic stem; "ss"/"sd" and the operands are appended.
//   sched        - scheduling class; memory forms use Folded + ReadAfterFold.
5410multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
5411                          string OpcodeStr, X86FoldableSchedWrite sched> {
5412let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
5413  def SSr : SS4AIi8<opcss, MRMSrcReg,
5414        (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
5415        !strconcat(OpcodeStr,
5416            "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5417      []>, Sched<[sched]>;
5418
5419  let mayLoad = 1 in
5420  def SSm : SS4AIi8<opcss, MRMSrcMem,
5421        (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
5422        !strconcat(OpcodeStr,
5423             "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5424        []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5425} // ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1
5426
5427let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
5428  def SDr : SS4AIi8<opcsd, MRMSrcReg,
5429        (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
5430        !strconcat(OpcodeStr,
5431              "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5432        []>, Sched<[sched]>;
5433
5434  let mayLoad = 1 in
5435  def SDm : SS4AIi8<opcsd, MRMSrcMem,
5436        (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
5437        !strconcat(OpcodeStr,
5438             "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
5439        []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5440} // ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1
5441}
5442
// sse41_fp_unop_s - scalar SS/SD records on FR32/FR64 with a two-operand asm
// string (source $src1, immediate $src2).
// Defines SSr/SSm (opcss, "ss" suffix, f32mem) and SDr/SDm (opcsd, "sd" suffix,
// f64mem). Every record uses MXCSR, may raise an FP exception, is
// isCodeGenOnly with hasSideEffects = 0, and carries no ISel pattern ([]);
// the memory forms are additionally marked mayLoad.
//   opcss, opcsd - forwarded as the first SS4AIi8 argument of the SS / SD forms.
//   OpcodeStr    - mnemonic stem; "ss"/"sd" and the operands are appended.
//   sched        - scheduling class; memory forms use Folded + ReadAfterFold.
5443multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
5444                           string OpcodeStr, X86FoldableSchedWrite sched> {
5445let Uses = [MXCSR], mayRaiseFPException = 1 in {
5446let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
5447  def SSr : SS4AIi8<opcss, MRMSrcReg,
5448                    (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
5449                    !strconcat(OpcodeStr,
5450                               "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5451                    []>, Sched<[sched]>;
5452
5453  let mayLoad = 1 in
5454  def SSm : SS4AIi8<opcss, MRMSrcMem,
5455                    (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
5456                    !strconcat(OpcodeStr,
5457                               "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5458                    []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5459} // ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1
5460
5461let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
5462  def SDr : SS4AIi8<opcsd, MRMSrcReg,
5463                    (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
5464                    !strconcat(OpcodeStr,
5465                               "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5466                    []>, Sched<[sched]>;
5467
5468  let mayLoad = 1 in
5469  def SDm : SS4AIi8<opcsd, MRMSrcMem,
5470                    (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
5471                    !strconcat(OpcodeStr,
5472                               "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5473                    []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
5474} // ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1
5475} // Uses = [MXCSR], mayRaiseFPException = 1
5476}
5477
// sse41_fp_binop_s - scalar SS/SD "_Int" records operating on VR128.
// Defines SSr_Int/SSm_Int (opcss, "ss" suffix) and SDr_Int/SDm_Int (opcsd,
// "sd" suffix). Each pattern matches OpNode applied to VR128:$src1, a second
// source (VR128:$src2, or sse_load_f32 / sse_load_f64 of addr:$src2) and the
// target-constant immediate $src3. Every record uses MXCSR and may raise an
// FP exception.
//   opcss, opcsd - forwarded as the first SS4AIi8 argument of the SS / SD forms.
//   OpcodeStr    - mnemonic stem; "ss"/"sd" and the operands are appended.
//   sched        - scheduling class; memory forms use Folded + ReadAfterFold.
//   VT32, VT64   - result types applied in the register-form patterns.
//   OpNode       - SDNode matched by all four patterns.
//   Is2Addr      - when set, the asm string omits $src1; defaults to 1.
5478multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
5479                            string OpcodeStr, X86FoldableSchedWrite sched,
5480                            ValueType VT32, ValueType VT64,
5481                            SDNode OpNode, bit Is2Addr = 1> {
5482let Uses = [MXCSR], mayRaiseFPException = 1 in {
5483let ExeDomain = SSEPackedSingle in {
5484  def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
5485        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
5486        !if(Is2Addr,
5487            !strconcat(OpcodeStr,
5488                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5489            !strconcat(OpcodeStr,
5490                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5491        [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
5492        Sched<[sched]>;
5493
5494  def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
5495        (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
5496        !if(Is2Addr,
5497            !strconcat(OpcodeStr,
5498                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5499            !strconcat(OpcodeStr,
5500                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5501        [(set VR128:$dst,
5502             (OpNode VR128:$src1, (sse_load_f32 addr:$src2), timm:$src3))]>,
5503        Sched<[sched.Folded, sched.ReadAfterFold]>;
5504} // ExeDomain = SSEPackedSingle
5505
5506let ExeDomain = SSEPackedDouble in {
5507  def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
5508        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
5509        !if(Is2Addr,
5510            !strconcat(OpcodeStr,
5511                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5512            !strconcat(OpcodeStr,
5513                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5514        [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
5515        Sched<[sched]>;
5516
5517  def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
5518        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
5519        !if(Is2Addr,
5520            !strconcat(OpcodeStr,
5521                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5522            !strconcat(OpcodeStr,
5523                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5524        [(set VR128:$dst,
5525              (OpNode VR128:$src1, (sse_load_f64 addr:$src2), timm:$src3))]>,
5526        Sched<[sched.Folded, sched.ReadAfterFold]>;
5527} // ExeDomain = SSEPackedDouble
5528} // Uses = [MXCSR], mayRaiseFPException = 1
5529}
5530
5531// FP round - roundss, roundps, roundsd, roundpd
//
// VEX forms. The packed records (128-bit and 256-bit, gated on
// [HasAVX, NoVLX]) instantiate sse41_fp_unop_p with X86any_VRndScale and the
// plain load fragments. The scalar VROUND records (gated on UseAVX) come from
// two multiclasses under the same defm name: sse41_fp_binop_s (VR128 forms
// matching X86RndScales, Is2Addr = 0) and avx_fp_unop_rm (FR32/FR64 forms
// without patterns).
5532let Predicates = [HasAVX, NoVLX] in {
5533  let ExeDomain = SSEPackedSingle, Uses = [MXCSR], mayRaiseFPException = 1 in {
5534    // Intrinsic form
5535    defm VROUNDPS  : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32,
5536                                     loadv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>,
5537                                   VEX, VEX_WIG;
5538    defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32,
5539                                     loadv8f32, X86any_VRndScale, SchedWriteFRnd.YMM>,
5540                                   VEX, VEX_L, VEX_WIG;
5541  }
5542
5543  let ExeDomain = SSEPackedDouble, Uses = [MXCSR], mayRaiseFPException = 1 in {
5544    defm VROUNDPD  : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64,
5545                                     loadv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>,
5546                                   VEX, VEX_WIG;
5547    defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64,
5548                                     loadv4f64, X86any_VRndScale, SchedWriteFRnd.YMM>,
5549                                   VEX, VEX_L, VEX_WIG;
5550  }
5551}
5552let Predicates = [UseAVX] in {
5553  defm VROUND  : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
5554                                  v4f32, v2f64, X86RndScales, 0>,
5555                                  VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC;
5556  defm VROUND  : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>,
5557                                VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC;
5558}
5559
// Select scalar X86any_VRndScale on FR32/FR64 to VROUNDSSr/VROUNDSDr. The
// first source operand of the instruction is filled with IMPLICIT_DEF; the
// value being rounded is passed as the second source.
5560let Predicates = [UseAVX] in {
5561  def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
5562            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>;
5563  def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
5564            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>;
5565}
5566
// The load-folding variants (VROUNDSSm/VROUNDSDm) are only enabled when
// OptForSize also holds.
5567let Predicates = [UseAVX, OptForSize] in {
5568  def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
5569            (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
5570  def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
5571            (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
5572}
5573
// Non-VEX forms. ROUNDPS/ROUNDPD instantiate sse41_fp_unop_p with the memop
// fragments. The scalar ROUND records come from two multiclasses under the
// same defm name: sse41_fp_unop_s (FR32/FR64 forms without patterns) and
// sse41_fp_binop_s (VR128 forms matching X86RndScales, default Is2Addr, with
// $src1 tied to $dst).
5574let ExeDomain = SSEPackedSingle in
5575defm ROUNDPS  : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32,
5576                                memopv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>;
5577let ExeDomain = SSEPackedDouble in
5578defm ROUNDPD  : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64,
5579                                memopv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>;
5580
5581defm ROUND  : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>;
5582
5583let Constraints = "$src1 = $dst" in
5584defm ROUND  : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
5585                               v4f32, v2f64, X86RndScales>;
5586
// Select scalar X86any_VRndScale on FR32/FR64 to ROUNDSSr/ROUNDSDr.
5587let Predicates = [UseSSE41] in {
5588  def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
5589            (ROUNDSSr FR32:$src1, timm:$src2)>;
5590  def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
5591            (ROUNDSDr FR64:$src1, timm:$src2)>;
5592}
5593
// The load-folding variants (ROUNDSSm/ROUNDSDm) are only enabled when
// OptForSize also holds.
5594let Predicates = [UseSSE41, OptForSize] in {
5595  def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
5596            (ROUNDSSm addr:$src1, timm:$src2)>;
5597  def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
5598            (ROUNDSDm addr:$src1, timm:$src2)>;
5599}
5600
5601//===----------------------------------------------------------------------===//
5602// SSE4.1 - Packed Bit Test
5603//===----------------------------------------------------------------------===//
5604
5605// ptest instruction: X86ISelLowering lowers to this, primarily from
5606// the Intel intrinsic that corresponds to it.
//
// VEX forms for 128-bit (VR128) and 256-bit (VR256, VEX_L) operands. The
// records have no register outputs; each pattern sets EFLAGS from X86ptest.
5607let Defs = [EFLAGS], Predicates = [HasAVX] in {
5608def VPTESTrr  : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
5609                "vptest\t{$src2, $src1|$src1, $src2}",
5610                [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
5611                Sched<[SchedWriteVecTest.XMM]>, VEX, VEX_WIG;
5612def VPTESTrm  : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
5613                "vptest\t{$src2, $src1|$src1, $src2}",
5614                [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
5615                Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>,
5616                VEX, VEX_WIG;
5617
5618def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
5619                "vptest\t{$src2, $src1|$src1, $src2}",
5620                [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
5621                Sched<[SchedWriteVecTest.YMM]>, VEX, VEX_L, VEX_WIG;
5622def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
5623                "vptest\t{$src2, $src1|$src1, $src2}",
5624                [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
5625                Sched<[SchedWriteVecTest.YMM.Folded, SchedWriteVecTest.YMM.ReadAfterFold]>,
5626                VEX, VEX_L, VEX_WIG;
5627}
5628
// Non-VEX ptest, register and memory forms. No register outputs; the pattern
// sets EFLAGS from X86ptest. The memory form uses the memopv2i64 fragment
// (the VEX form above uses loadv2i64).
5629let Defs = [EFLAGS] in {
5630def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
5631              "ptest\t{$src2, $src1|$src1, $src2}",
5632              [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
5633              Sched<[SchedWriteVecTest.XMM]>;
5634def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
5635              "ptest\t{$src2, $src1|$src1, $src2}",
5636              [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
5637              Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>;
5638}
5639
5640// The bit test instructions below are AVX only
//
// avx_bittest defines a register form (rr) and a memory form (rm), both VEX.
// Neither has a register output; each pattern sets EFLAGS from X86testp.
//   opc       - forwarded as the first argument of SS48I.
//   OpcodeStr - mnemonic; the operand list is appended with !strconcat.
//   RC        - register class of both sources (and of $src1 in rm).
//   x86memop  - memory operand class of $src2 in rm.
//   mem_frag  - load fragment applied to addr:$src2 in rm.
//   vt        - value type applied to RC:$src2 in the rr pattern.
//   sched     - scheduling class; rm uses Folded + ReadAfterFold.
5641multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
5642                       X86MemOperand x86memop, PatFrag mem_frag, ValueType vt,
5643                       X86FoldableSchedWrite sched> {
5644  def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
5645            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
5646            [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
5647            Sched<[sched]>, VEX;
5648  def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
5649            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
5650            [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
5651            Sched<[sched.Folded, sched.ReadAfterFold]>, VEX;
5652}
5653
// Instantiate avx_bittest for vtestps (opcode 0x0E, SSEPackedSingle) and
// vtestpd (opcode 0x0F, SSEPackedDouble), each in a 128-bit (VR128) and a
// 256-bit (VR256, VEX_L) variant. All define EFLAGS and require HasAVX.
5654let Defs = [EFLAGS], Predicates = [HasAVX] in {
5655let ExeDomain = SSEPackedSingle in {
5656defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32,
5657                            SchedWriteFTest.XMM>;
5658defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32,
5659                            SchedWriteFTest.YMM>, VEX_L;
5660}
5661let ExeDomain = SSEPackedDouble in {
5662defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64,
5663                            SchedWriteFTest.XMM>;
5664defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64,
5665                            SchedWriteFTest.YMM>, VEX_L;
5666}
5667}
5668
5669//===----------------------------------------------------------------------===//
5670// SSE4.1 - Misc Instructions
5671//===----------------------------------------------------------------------===//
5672
// popcnt for 16-, 32- and 64-bit general-purpose registers, each with a
// register (rr) and a memory (rm) source. Every pattern matches ctpop and
// lists EFLAGS as an implicit result. All forms use opcode 0xB8 with the XS
// modifier; the 16/32-bit forms add OpSize16/OpSize32 and the 64-bit forms are
// built from RI instead of I. All require HasPOPCNT.
5673let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
5674  def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
5675                     "popcnt{w}\t{$src, $dst|$dst, $src}",
5676                     [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
5677                     Sched<[WritePOPCNT]>, OpSize16, XS;
5678  def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
5679                     "popcnt{w}\t{$src, $dst|$dst, $src}",
5680                     [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
5681                      (implicit EFLAGS)]>,
5682                      Sched<[WritePOPCNT.Folded]>, OpSize16, XS;
5683
5684  def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
5685                     "popcnt{l}\t{$src, $dst|$dst, $src}",
5686                     [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>,
5687                     Sched<[WritePOPCNT]>, OpSize32, XS;
5688
5689  def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
5690                     "popcnt{l}\t{$src, $dst|$dst, $src}",
5691                     [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
5692                      (implicit EFLAGS)]>,
5693                      Sched<[WritePOPCNT.Folded]>, OpSize32, XS;
5694
5695  def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
5696                      "popcnt{q}\t{$src, $dst|$dst, $src}",
5697                      [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>,
5698                      Sched<[WritePOPCNT]>, XS;
5699  def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
5700                      "popcnt{q}\t{$src, $dst|$dst, $src}",
5701                      [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
5702                       (implicit EFLAGS)]>,
5703                       Sched<[WritePOPCNT.Folded]>, XS;
5704}
5705
5706// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
// Defines a register form (rr) and a memory form (rm) on VR128.
//   opc       - forwarded as the first argument of SS48I.
//   OpcodeStr - mnemonic; the operand list is appended with !strconcat.
//   OpNode    - SDNode matched by both patterns, with a v8i16 result.
//   ld_frag   - load fragment applied to addr:$src in the memory form.
//   Sched     - scheduling class; the memory form uses Sched.Folded.
5707multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
5708                                 SDNode OpNode, PatFrag ld_frag,
5709                                 X86FoldableSchedWrite Sched> {
5710  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
5711                 (ins VR128:$src),
5712                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5713                 [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>,
5714                 Sched<[Sched]>;
5715  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
5716                  (ins i128mem:$src),
5717                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
5718                  [(set VR128:$dst,
5719                    (v8i16 (OpNode (ld_frag addr:$src))))]>,
5720                 Sched<[Sched.Folded]>;
5721}
5722
5723// PHMIN has the same profile as PSAD, thus we use the same scheduling
5724// model, although the naming is misleading.
// NOTE(review): both records below are instantiated with WritePHMINPOS, so the
// remark above about reusing the PSAD scheduling model may be stale - confirm.
//
// Instantiate SS41I_unop_rm_int_v16 with opcode 0x41 and X86phminpos: the VEX
// form (HasAVX, load fragment) and the non-VEX form (memop fragment).
5725let Predicates = [HasAVX] in
5726defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw",
5727                                         X86phminpos, load,
5728                                         WritePHMINPOS>, VEX, VEX_WIG;
5729defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
5730                                         X86phminpos, memop,
5731                                         WritePHMINPOS>;
5732
5733/// SS48I_binop_rm - Simple SSE41 binary operator.
/// Defines a register form (rr, marked isCommutable) and a memory form (rm);
/// both match (OpVT (OpNode RC:$src1, <second source>)).
///   opc        - forwarded as the first argument of SS48I.
///   OpcodeStr  - mnemonic; the operand list is appended with !strconcat.
///   OpNode     - SDNode matched by both patterns.
///   OpVT       - value type of the result.
///   RC         - register class of $dst, $src1 and the register $src2.
///   memop_frag - load fragment applied to addr:$src2 in the memory form.
///   x86memop   - memory operand class of $src2 in the memory form.
///   sched      - scheduling class; rm uses Folded + ReadAfterFold.
///   Is2Addr    - when set, the asm string omits $src1; defaults to 1.
5734multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
5735                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
5736                          X86MemOperand x86memop, X86FoldableSchedWrite sched,
5737                          bit Is2Addr = 1> {
5738  let isCommutable = 1 in
5739  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
5740       (ins RC:$src1, RC:$src2),
5741       !if(Is2Addr,
5742           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5743           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5744       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
5745       Sched<[sched]>;
5746  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
5747       (ins RC:$src1, x86memop:$src2),
5748       !if(Is2Addr,
5749           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
5750           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
5751       [(set RC:$dst,
5752         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
5753       Sched<[sched.Folded, sched.ReadAfterFold]>;
5754}
5755
// 128-bit VEX instantiations of SS48I_binop_rm (VR128, i128mem, load fragment,
// Is2Addr = 0). The v4i32 min/max forms and vpmuldq are gated on
// [HasAVX, NoVLX]; the v16i8/v8i16 min/max forms on [HasAVX, NoVLX_Or_NoBWI].
5756let Predicates = [HasAVX, NoVLX] in {
5757  defm VPMINSD   : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
5758                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5759                                  VEX_4V, VEX_WIG;
5760  defm VPMINUD   : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
5761                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5762                                  VEX_4V, VEX_WIG;
5763  defm VPMAXSD   : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
5764                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5765                                  VEX_4V, VEX_WIG;
5766  defm VPMAXUD   : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
5767                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5768                                  VEX_4V, VEX_WIG;
5769  defm VPMULDQ   : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128,
5770                                  load, i128mem, SchedWriteVecIMul.XMM, 0>,
5771                                  VEX_4V, VEX_WIG;
5772}
5773let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5774  defm VPMINSB   : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
5775                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5776                                  VEX_4V, VEX_WIG;
5777  defm VPMINUW   : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
5778                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5779                                  VEX_4V, VEX_WIG;
5780  defm VPMAXSB   : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
5781                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5782                                  VEX_4V, VEX_WIG;
5783  defm VPMAXUW   : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
5784                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
5785                                  VEX_4V, VEX_WIG;
5786}
5787
// 256-bit VEX instantiations of SS48I_binop_rm (VR256, i256mem, load fragment,
// Is2Addr = 0, VEX_L). The v8i32 min/max forms and vpmuldq are gated on
// [HasAVX2, NoVLX]; the v32i8/v16i16 min/max forms on
// [HasAVX2, NoVLX_Or_NoBWI].
5788let Predicates = [HasAVX2, NoVLX] in {
5789  defm VPMINSDY  : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
5790                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5791                                  VEX_4V, VEX_L, VEX_WIG;
5792  defm VPMINUDY  : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
5793                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5794                                  VEX_4V, VEX_L, VEX_WIG;
5795  defm VPMAXSDY  : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
5796                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5797                                  VEX_4V, VEX_L, VEX_WIG;
5798  defm VPMAXUDY  : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
5799                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5800                                  VEX_4V, VEX_L, VEX_WIG;
5801  defm VPMULDQY  : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256,
5802                                  load, i256mem, SchedWriteVecIMul.YMM, 0>,
5803                                  VEX_4V, VEX_L, VEX_WIG;
5804}
5805let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
5806  defm VPMINSBY  : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
5807                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5808                                  VEX_4V, VEX_L, VEX_WIG;
5809  defm VPMINUWY  : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
5810                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5811                                  VEX_4V, VEX_L, VEX_WIG;
5812  defm VPMAXSBY  : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
5813                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5814                                  VEX_4V, VEX_L, VEX_WIG;
5815  defm VPMAXUWY  : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
5816                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5817                                  VEX_4V, VEX_L, VEX_WIG;
5818}
5819
// Non-VEX instantiations of SS48I_binop_rm (VR128, i128mem, memop fragment,
// Is2Addr = 1). $src1 is tied to $dst for every record in this block.
5820let Constraints = "$src1 = $dst" in {
5821  defm PMINSB   : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
5822                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5823  defm PMINSD   : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
5824                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5825  defm PMINUD   : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
5826                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5827  defm PMINUW   : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
5828                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5829  defm PMAXSB   : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
5830                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5831  defm PMAXSD   : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
5832                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5833  defm PMAXUD   : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
5834                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5835  defm PMAXUW   : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
5836                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
5837  defm PMULDQ   : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128,
5838                                 memop, i128mem, SchedWriteVecIMul.XMM, 1>;
5839}
5840
// VEX instantiations of SS48I_binop_rm for vpmulld (opcode 0x40, mul) and
// vpcmpeqq (opcode 0x29, X86pcmpeq), in 128-bit and 256-bit (VEX_L) variants.
// The vpmulld forms additionally require NoVLX; the vpcmpeqq forms are gated
// on HasAVX / HasAVX2 alone.
5841let Predicates = [HasAVX, NoVLX] in
5842  defm VPMULLD  : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
5843                                 load, i128mem, SchedWritePMULLD.XMM, 0>,
5844                                 VEX_4V, VEX_WIG;
5845let Predicates = [HasAVX] in
5846  defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
5847                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
5848                                 VEX_4V, VEX_WIG;
5849
5850let Predicates = [HasAVX2, NoVLX] in
5851  defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
5852                                  load, i256mem, SchedWritePMULLD.YMM, 0>,
5853                                  VEX_4V, VEX_L, VEX_WIG;
5854let Predicates = [HasAVX2] in
5855  defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
5856                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
5857                                  VEX_4V, VEX_L, VEX_WIG;
5858
// Non-VEX pmulld and pcmpeqq: SS48I_binop_rm with the memop fragment,
// Is2Addr = 1, and $src1 tied to $dst.
5859let Constraints = "$src1 = $dst" in {
5860  defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
5861                                memop, i128mem, SchedWritePMULLD.XMM, 1>;
5862  defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
5863                                memop, i128mem, SchedWriteVecALU.XMM, 1>;
5864}
5865
5866/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
/// Intrinsic-based variant: both patterns match the intrinsic IntId directly,
/// without a result value-type cast. Defines a register form (rri, marked
/// isCommutable here) and a memory form (rmi).
///   opc        - forwarded as the first argument of SS4AIi8.
///   OpcodeStr  - mnemonic; the operand list is appended with !strconcat.
///   IntId      - intrinsic matched by both patterns.
///   RC         - register class of $dst, $src1 and the register $src2.
///   memop_frag - load fragment applied to addr:$src2 in rmi.
///   x86memop   - memory operand class of $src2 in rmi.
///   Is2Addr    - when set, the asm string omits $src1 (no default value).
///   sched      - scheduling class; rmi uses Folded + ReadAfterFold.
5867multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
5868                 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
5869                 X86MemOperand x86memop, bit Is2Addr,
5870                 X86FoldableSchedWrite sched> {
5871  let isCommutable = 1 in
5872  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
5873        (ins RC:$src1, RC:$src2, u8imm:$src3),
5874        !if(Is2Addr,
5875            !strconcat(OpcodeStr,
5876                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5877            !strconcat(OpcodeStr,
5878                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5879        [(set RC:$dst, (IntId RC:$src1, RC:$src2, timm:$src3))]>,
5880        Sched<[sched]>;
5881  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
5882        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
5883        !if(Is2Addr,
5884            !strconcat(OpcodeStr,
5885                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5886            !strconcat(OpcodeStr,
5887                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5888        [(set RC:$dst,
5889          (IntId RC:$src1, (memop_frag addr:$src2), timm:$src3))]>,
5890        Sched<[sched.Folded, sched.ReadAfterFold]>;
5891}
5892
5893/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
/// SDNode-based variant: both patterns match (OpVT (OpNode ...)). Defines a
/// register form (rri, marked isCommutable here) and a memory form (rmi).
///   opc        - forwarded as the first argument of SS4AIi8.
///   OpcodeStr  - mnemonic; the operand list is appended with !strconcat.
///   OpNode     - SDNode matched by both patterns.
///   OpVT       - value type of the result.
///   RC         - register class of $dst, $src1 and the register $src2.
///   memop_frag - load fragment applied to addr:$src2 in rmi.
///   x86memop   - memory operand class of $src2 in rmi.
///   Is2Addr    - when set, the asm string omits $src1 (no default value).
///   sched      - scheduling class; rmi uses Folded + ReadAfterFold.
5894multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
5895                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
5896                           X86MemOperand x86memop, bit Is2Addr,
5897                           X86FoldableSchedWrite sched> {
5898  let isCommutable = 1 in
5899  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
5900        (ins RC:$src1, RC:$src2, u8imm:$src3),
5901        !if(Is2Addr,
5902            !strconcat(OpcodeStr,
5903                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5904            !strconcat(OpcodeStr,
5905                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5906        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
5907        Sched<[sched]>;
5908  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
5909        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
5910        !if(Is2Addr,
5911            !strconcat(OpcodeStr,
5912                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5913            !strconcat(OpcodeStr,
5914                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5915        [(set RC:$dst,
5916          (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
5917        Sched<[sched.Folded, sched.ReadAfterFold]>;
5918}
5919
// Immediate transform: keep the low 2 bits of the immediate and invert them
// (XOR with 0x03), returning the result as an i8 immediate.
5920def BlendCommuteImm2 : SDNodeXForm<timm, [{
5921  uint8_t Imm = N->getZExtValue() & 0x03;
5922  return getI8Imm(Imm ^ 0x03, SDLoc(N));
5923}]>;
5924
// Immediate transform: keep the low 4 bits of the immediate and invert them
// (XOR with 0x0f), returning the result as an i8 immediate.
5925def BlendCommuteImm4 : SDNodeXForm<timm, [{
5926  uint8_t Imm = N->getZExtValue() & 0x0f;
5927  return getI8Imm(Imm ^ 0x0f, SDLoc(N));
5928}]>;
5929
// Immediate transform: keep the low 8 bits of the immediate and invert them
// (XOR with 0xff), returning the result as an i8 immediate.
5930def BlendCommuteImm8 : SDNodeXForm<timm, [{
5931  uint8_t Imm = N->getZExtValue() & 0xff;
5932  return getI8Imm(Imm ^ 0xff, SDLoc(N));
5933}]>;
5934
5935// Turn a 4-bit blendi immediate to 8-bit for use with pblendw.
// Each set bit i (i = 0..3) of the input becomes the bit pair 2i and 2i+1 of
// the output, e.g. 0b0101 -> 0b00110011.
5936def BlendScaleImm4 : SDNodeXForm<timm, [{
5937  uint8_t Imm = N->getZExtValue();
5938  uint8_t NewImm = 0;
5939  for (unsigned i = 0; i != 4; ++i) {
5940    if (Imm & (1 << i))
5941      NewImm |= 0x3 << (i * 2);
5942  }
5943  return getI8Imm(NewImm, SDLoc(N));
5944}]>;
5945
5946// Turn a 2-bit blendi immediate to 8-bit for use with pblendw.
// Each set bit i (i = 0..1) of the input becomes the nibble at bits 4i..4i+3
// of the output, e.g. 0b10 -> 0b11110000.
5947def BlendScaleImm2 : SDNodeXForm<timm, [{
5948  uint8_t Imm = N->getZExtValue();
5949  uint8_t NewImm = 0;
5950  for (unsigned i = 0; i != 2; ++i) {
5951    if (Imm & (1 << i))
5952      NewImm |= 0xf << (i * 4);
5953  }
5954  return getI8Imm(NewImm, SDLoc(N));
5955}]>;
5956
5957// Turn a 2-bit blendi immediate to 4-bit for use with pblendd.
// Each set bit i (i = 0..1) of the input becomes the bit pair 2i and 2i+1 of
// the output, e.g. 0b10 -> 0b1100.
5958def BlendScaleImm2to4 : SDNodeXForm<timm, [{
5959  uint8_t Imm = N->getZExtValue();
5960  uint8_t NewImm = 0;
5961  for (unsigned i = 0; i != 2; ++i) {
5962    if (Imm & (1 << i))
5963      NewImm |= 0x3 << (i * 2);
5964  }
5965  return getI8Imm(NewImm, SDLoc(N));
5966}]>;
5967
5968// Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it.
// Each set bit i (i = 0..3) of the input becomes the bit pair 2i and 2i+1;
// the 8-bit result is then inverted (XOR with 0xff).
5969def BlendScaleCommuteImm4 : SDNodeXForm<timm, [{
5970  uint8_t Imm = N->getZExtValue();
5971  uint8_t NewImm = 0;
5972  for (unsigned i = 0; i != 4; ++i) {
5973    if (Imm & (1 << i))
5974      NewImm |= 0x3 << (i * 2);
5975  }
5976  return getI8Imm(NewImm ^ 0xff, SDLoc(N));
5977}]>;
5978
5979// Turn a 2-bit blendi immediate to 8-bit for use with pblendw and invert it.
// Each set bit i (i = 0..1) of the input becomes the nibble at bits 4i..4i+3;
// the 8-bit result is then inverted (XOR with 0xff).
5980def BlendScaleCommuteImm2 : SDNodeXForm<timm, [{
5981  uint8_t Imm = N->getZExtValue();
5982  uint8_t NewImm = 0;
5983  for (unsigned i = 0; i != 2; ++i) {
5984    if (Imm & (1 << i))
5985      NewImm |= 0xf << (i * 4);
5986  }
5987  return getI8Imm(NewImm ^ 0xff, SDLoc(N));
5988}]>;
5989
5990// Turn a 2-bit blendi immediate to 4-bit for use with pblendd and invert it.
// Each set bit i (i = 0..1) of the input becomes the bit pair 2i and 2i+1;
// the 4-bit result is then inverted (XOR with 0xf).
5991def BlendScaleCommuteImm2to4 : SDNodeXForm<timm, [{
5992  uint8_t Imm = N->getZExtValue();
5993  uint8_t NewImm = 0;
5994  for (unsigned i = 0; i != 2; ++i) {
5995    if (Imm & (1 << i))
5996      NewImm |= 0x3 << (i * 2);
5997  }
5998  return getI8Imm(NewImm ^ 0xf, SDLoc(N));
5999}]>;
6000
// VEX-encoded (AVX) forms of MPSADBW and the DPPS/DPPD dot products, all
// instantiated from SS41I_binop_rmi_int with the intrinsic as the operation.
6001let Predicates = [HasAVX] in {
  // MPSADBW is explicitly marked non-commutable.
6002  let isCommutable = 0 in {
6003    defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
6004                                        VR128, load, i128mem, 0,
6005                                        SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG;
6006  }
6007
// The dot-product instructions are marked as reading MXCSR and as able to
// raise floating-point exceptions.
6008let Uses = [MXCSR], mayRaiseFPException = 1 in {
6009  let ExeDomain = SSEPackedSingle in
6010  defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
6011                                   VR128, load, f128mem, 0,
6012                                   SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG;
6013  let ExeDomain = SSEPackedDouble in
6014  defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
6015                                   VR128, load, f128mem, 0,
6016                                   SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG;
  // NOTE(review): the 256-bit form passes i256mem whereas the 128-bit FP
  // forms above pass f128mem -- confirm whether f256mem was intended.
6017  let ExeDomain = SSEPackedSingle in
6018  defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
6019                                    VR256, load, i256mem, 0,
6020                                    SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG;
6021} // Uses = [MXCSR], mayRaiseFPException = 1
6022} // Predicates = [HasAVX]
6023
// 256-bit VEX-encoded MPSADBW (VEX_L), available with AVX2 and explicitly
// marked non-commutable.
6024let Predicates = [HasAVX2] in {
6025  let isCommutable = 0 in {
6026  defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
6027                                  VR256, load, i256mem, 0,
6028                                  SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, VEX_WIG;
6029  }
6030} // Predicates = [HasAVX2]
6031
// Non-VEX SSE4.1 forms of MPSADBW/DPPS/DPPD. The destination register is tied
// to the first source via the Constraints string, and the memory forms use
// the `memop` fragment instead of `load`.
6032let Constraints = "$src1 = $dst" in {
6033  let isCommutable = 0 in {
6034  defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
6035                                     VR128, memop, i128mem, 1,
6036                                     SchedWriteMPSAD.XMM>;
6037  }
6038
  // The dot products additionally carry SIMD_EXC (defined elsewhere;
  // presumably the FP-exception/MXCSR marking -- confirm).
6039  let ExeDomain = SSEPackedSingle in
6040  defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
6041                                  VR128, memop, f128mem, 1,
6042                                  SchedWriteDPPS.XMM>, SIMD_EXC;
6043  let ExeDomain = SSEPackedDouble in
6044  defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
6045                                  VR128, memop, f128mem, 1,
6046                                  SchedWriteDPPD.XMM>, SIMD_EXC;
6047} // Constraints = "$src1 = $dst"
6048
6049/// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate
///
/// Defines a register/register form (rri), a register/memory form (rmi) and
/// an anonymous pattern that folds a load appearing as the FIRST blend source
/// by swapping the operands and rewriting the immediate.
///   opc, OpcodeStr - opcode byte and assembly mnemonic.
///   OpNode, OpVT   - DAG node matched and the vector type of the result.
///   RC             - register class of $dst/$src1/$src2 (register forms).
///   memop_frag     - load fragment used to match the memory operand.
///   x86memop       - memory operand type of $src2 in the rmi form.
///   Is2Addr        - when set, uses the two-operand asm string and ties
///                    $src1 to $dst; otherwise the three-operand string.
///   d              - execution domain applied to both instructions.
///   sched          - scheduling class (Folded/ReadAfterFold for rmi).
///   commuteXForm   - transform applied to the immediate by the commuted-load
///                    pattern.
6050multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
6051                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
6052                           X86MemOperand x86memop, bit Is2Addr, Domain d,
6053                           X86FoldableSchedWrite sched, SDNodeXForm commuteXForm> {
6054let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
  // Only the register/register form is flagged commutable.
6055  let isCommutable = 1 in
6056  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
6057        (ins RC:$src1, RC:$src2, u8imm:$src3),
6058        !if(Is2Addr,
6059            !strconcat(OpcodeStr,
6060                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6061            !strconcat(OpcodeStr,
6062                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6063        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
6064        Sched<[sched]>;
6065  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
6066        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
6067        !if(Is2Addr,
6068            !strconcat(OpcodeStr,
6069                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6070            !strconcat(OpcodeStr,
6071                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6072        [(set RC:$dst,
6073          (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
6074        Sched<[sched.Folded, sched.ReadAfterFold]>;
6075} // ExeDomain = d, Constraints
6076
6077  // Pattern to commute if load is in first source.
  // The load still becomes the memory operand of the rmi form; the register
  // becomes $src1 and the immediate is passed through commuteXForm.
6078  def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, timm:$src3)),
6079            (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
6080                                            (commuteXForm timm:$src3))>;
6081}
6082
// VEX-encoded immediate blends (three-operand form: Is2Addr = 0). Each
// instantiation supplies the commute transform used when a load is folded
// from the first source; the transforms themselves are defined elsewhere.
6083let Predicates = [HasAVX] in {
6084  defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
6085                                  VR128, load, f128mem, 0, SSEPackedSingle,
6086                                  SchedWriteFBlend.XMM, BlendCommuteImm4>,
6087                                  VEX_4V, VEX_WIG;
6088  defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
6089                                   VR256, load, f256mem, 0, SSEPackedSingle,
6090                                   SchedWriteFBlend.YMM, BlendCommuteImm8>,
6091                                   VEX_4V, VEX_L, VEX_WIG;
6092  defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
6093                                  VR128, load, f128mem, 0, SSEPackedDouble,
6094                                  SchedWriteFBlend.XMM, BlendCommuteImm2>,
6095                                  VEX_4V, VEX_WIG;
6096  defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
6097                                   VR256, load, f256mem, 0, SSEPackedDouble,
6098                                   SchedWriteFBlend.YMM, BlendCommuteImm4>,
6099                                   VEX_4V, VEX_L, VEX_WIG;
6100  defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
6101                                  VR128, load, i128mem, 0, SSEPackedInt,
6102                                  SchedWriteBlend.XMM, BlendCommuteImm8>,
6103                                  VEX_4V, VEX_WIG;
6104} // Predicates = [HasAVX]
6105
// 256-bit VPBLENDW (v16i16, VEX_L) requires AVX2. It still uses the 8-bit
// commute transform (BlendCommuteImm8).
6106let Predicates = [HasAVX2] in {
6107  defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
6108                                   VR256, load, i256mem, 0, SSEPackedInt,
6109                                   SchedWriteBlend.YMM, BlendCommuteImm8>,
6110                                   VEX_4V, VEX_L, VEX_WIG;
6111} // Predicates = [HasAVX2]
6112
6113// Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw.
6114// ExecutionDomainFixPass will cleanup domains later on.
// Each group has three patterns: reg/reg, load in the second source, and load
// in the first source (operands swapped, immediate commuted).
6115let Predicates = [HasAVX1Only] in {
// 256-bit v4i64: same element count as v4f64, so the immediate is passed
// through unchanged to VBLENDPDY.
6116def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
6117          (VBLENDPDYrri VR256:$src1, VR256:$src2, timm:$src3)>;
6118def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
6119          (VBLENDPDYrmi VR256:$src1, addr:$src2, timm:$src3)>;
6120def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
6121          (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 timm:$src3))>;
6122
6123// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6124// it from becoming movsd via commuting under optsize.
// The 2-bit mask is widened to pblendw's 8-bit mask by BlendScaleImm2.
6125def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
6126          (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
6127def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
6128          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
6129def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
6130          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;
6131
// 256-bit v8i32: same element count as v8f32, immediate passed unchanged to
// VBLENDPSY.
6132def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), timm:$src3),
6133          (VBLENDPSYrri VR256:$src1, VR256:$src2, timm:$src3)>;
6134def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), timm:$src3),
6135          (VBLENDPSYrmi VR256:$src1, addr:$src2, timm:$src3)>;
6136def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, timm:$src3),
6137          (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 timm:$src3))>;
6138
6139// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6140// it from becoming movss via commuting under optsize.
// The 4-bit mask is widened to pblendw's 8-bit mask by BlendScaleImm4.
6141def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
6142          (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
6143def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), timm:$src3),
6144          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
6145def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, timm:$src3),
6146          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
6147} // Predicates = [HasAVX1Only]
6148
// Non-VEX SSE4.1 immediate blends: two-operand form (Is2Addr = 1, so $src1 is
// tied to $dst) with `memop` as the load fragment.
6149defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
6150                               VR128, memop, f128mem, 1, SSEPackedSingle,
6151                               SchedWriteFBlend.XMM, BlendCommuteImm4>;
6152defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64,
6153                               VR128, memop, f128mem, 1, SSEPackedDouble,
6154                               SchedWriteFBlend.XMM, BlendCommuteImm2>;
6155defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
6156                               VR128, memop, i128mem, 1, SSEPackedInt,
6157                               SchedWriteBlend.XMM, BlendCommuteImm8>;
6158
6159let Predicates = [UseSSE41] in {
6160// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6161// it from becoming movsd/movss via commuting under optsize.
// v2i64: the 2-bit mask is widened to 8 bits by BlendScaleImm2; the
// load-in-first-source pattern also inverts it (BlendScaleCommuteImm2).
6162def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
6163          (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
6164def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), timm:$src3),
6165          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
6166def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, timm:$src3),
6167          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;
6168
// v4i32: the 4-bit mask is widened to 8 bits by BlendScaleImm4; the
// load-in-first-source pattern also inverts it (BlendScaleCommuteImm4).
6169def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
6170          (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
6171def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), timm:$src3),
6172          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
6173def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, timm:$src3),
6174          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
6175} // Predicates = [UseSSE41]
6176
6177// For insertion into the zero index (low half) of a 256-bit vector, it is
6178// more efficient to generate a blend with immediate instead of an insert*128.
// The 128-bit value is first widened to 256 bits with INSERT_SUBREG on an
// IMPLICIT_DEF, so only its low half is meaningful.
6179let Predicates = [HasAVX] in {
// Register forms: the widened value is the second blend source; the
// immediate has the mask bits of the low half set (0x3 = 2 x f64,
// 0xf = 4 x f32).
6180def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)),
6181          (VBLENDPDYrri VR256:$src1,
6182                        (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
6183                                       VR128:$src2, sub_xmm), 0x3)>;
6184def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
6185          (VBLENDPSYrri VR256:$src1,
6186                        (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
6187                                       VR128:$src2, sub_xmm), 0xf)>;
6188
// Load forms: the loaded 256-bit vector is the memory (second) source and the
// widened value is the first, so the immediate instead sets the mask bits of
// the high half (0xc, 0xf0).
6189def : Pat<(insert_subvector (loadv4f64 addr:$src2), (v2f64 VR128:$src1), (iPTR 0)),
6190          (VBLENDPDYrmi (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
6191                                       VR128:$src1, sub_xmm), addr:$src2, 0xc)>;
6192def : Pat<(insert_subvector (loadv8f32 addr:$src2), (v4f32 VR128:$src1), (iPTR 0)),
6193          (VBLENDPSYrmi (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
6194                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
6195} // Predicates = [HasAVX]
6196
6197/// SS41I_quaternary_avx - AVX SSE 4.1 with 4 operands
///
/// Defines a register form (rr) and a memory form (rm) taking three sources.
///   opc, OpcodeStr - opcode byte and assembly mnemonic.
///   RC             - register class of $dst, $src1 and $src3 ($src2 in rr).
///   x86memop       - memory operand type of $src2 in the rm form.
///   VT             - vector type of the matched node (rr pattern).
///   mem_frag       - load fragment used to match the memory operand.
///   OpNode         - DAG node matched; note it is matched with the operands
///                    in reverse order: (OpNode $src3, $src2, $src1).
///   sched          - scheduling class.
6198multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC,
6199                                X86MemOperand x86memop, ValueType VT,
6200                                PatFrag mem_frag, SDNode OpNode,
6201                                X86FoldableSchedWrite sched> {
6202  def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
6203                  (ins RC:$src1, RC:$src2, RC:$src3),
6204                  !strconcat(OpcodeStr,
6205                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
6206                  [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))],
6207                  SSEPackedInt>, TAPD, VEX_4V,
6208                Sched<[sched]>;
6209
6210  def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
6211                  (ins RC:$src1, x86memop:$src2, RC:$src3),
6212                  !strconcat(OpcodeStr,
6213                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
6214                  [(set RC:$dst,
6215                        (OpNode RC:$src3, (mem_frag addr:$src2),
6216                                RC:$src1))], SSEPackedInt>, TAPD, VEX_4V,
6217                Sched<[sched.Folded, sched.ReadAfterFold,
6218                       // x86memop:$src2
                       // (five entries -- presumably one per memory
                       // sub-operand; confirm against X86MemOperand)
6219                       ReadDefault, ReadDefault, ReadDefault, ReadDefault,
6220                       ReadDefault,
6221                       // RC:$src3
6222                       sched.ReadAfterFold]>;
6223}
6224
// VEX-encoded variable blends matched from X86Blendv. The multiclass defaults
// to SSEPackedInt; the FP forms override the execution domain here.
6225let Predicates = [HasAVX] in {
6226let ExeDomain = SSEPackedDouble in {
6227defm VBLENDVPD  : SS41I_quaternary_avx<0x4B, "vblendvpd", VR128, f128mem,
6228                                       v2f64, loadv2f64, X86Blendv,
6229                                       SchedWriteFVarBlend.XMM>;
6230defm VBLENDVPDY : SS41I_quaternary_avx<0x4B, "vblendvpd", VR256, f256mem,
6231                                       v4f64, loadv4f64, X86Blendv,
6232                                       SchedWriteFVarBlend.YMM>, VEX_L;
6233} // ExeDomain = SSEPackedDouble
6234let ExeDomain = SSEPackedSingle in {
6235defm VBLENDVPS  : SS41I_quaternary_avx<0x4A, "vblendvps", VR128, f128mem,
6236                                       v4f32, loadv4f32, X86Blendv,
6237                                       SchedWriteFVarBlend.XMM>;
6238defm VBLENDVPSY : SS41I_quaternary_avx<0x4A, "vblendvps", VR256, f256mem,
6239                                       v8f32, loadv8f32, X86Blendv,
6240                                       SchedWriteFVarBlend.YMM>, VEX_L;
6241} // ExeDomain = SSEPackedSingle
6242defm VPBLENDVB  : SS41I_quaternary_avx<0x4C, "vpblendvb", VR128, i128mem,
6243                                       v16i8, loadv16i8, X86Blendv,
6244                                       SchedWriteVarBlend.XMM>;
6245} // Predicates = [HasAVX]
6246
// 256-bit byte variable blend (v32i8, VEX_L) requires AVX2.
6247let Predicates = [HasAVX2] in {
6248defm VPBLENDVBY : SS41I_quaternary_avx<0x4C, "vpblendvb", VR256, i256mem,
6249                                       v32i8, loadv32i8, X86Blendv,
6250                                       SchedWriteVarBlend.YMM>, VEX_L;
6251} // Predicates = [HasAVX2]
6252
// Select 32/64-bit integer element X86Blendv nodes using the FP variable
// blend instructions of the same element width. The instruction operands are
// emitted in reverse order relative to the node: (mask, src1, src2) becomes
// (src2, src1, mask).
6253let Predicates = [HasAVX] in {
6254  def : Pat<(v4i32 (X86Blendv (v4i32 VR128:$mask), (v4i32 VR128:$src1),
6255                              (v4i32 VR128:$src2))),
6256            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
6257  def : Pat<(v2i64 (X86Blendv (v2i64 VR128:$mask), (v2i64 VR128:$src1),
6258                              (v2i64 VR128:$src2))),
6259            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
6260  def : Pat<(v8i32 (X86Blendv (v8i32 VR256:$mask), (v8i32 VR256:$src1),
6261                              (v8i32 VR256:$src2))),
6262            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
6263  def : Pat<(v4i64 (X86Blendv (v4i64 VR256:$mask), (v4i64 VR256:$src1),
6264                              (v4i64 VR256:$src2))),
6265            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
6266} // Predicates = [HasAVX]
6267
6268// Prefer a movss or movsd over a blendps when optimizing for size. These were
6269// changed to use blends because blends have better throughput on sandybridge
6270// and haswell, but movs[s/d] are 1-2 byte shorter instructions.
// The blend-based patterns below therefore apply only under OptForSpeed.
6271let Predicates = [HasAVX, OptForSpeed] in {
  // Zero-extending move of the low element: blend the source into an all-zero
  // register (V_SET0). The integer form uses VPBLENDW, where one 32-bit
  // element spans two 16-bit mask bits (immediate 3).
6272  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
6273            (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
6274  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
6275            (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
6276
  // Movss as a blend with immediate 1. When the load is the first operand the
  // operands are swapped and the 4-bit mask inverted (0xe).
6277  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
6278            (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
6279  def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
6280            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
6281  def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
6282            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
6283
  // Movsd as a blend with immediate 1; the commuted-load form inverts the
  // 2-bit mask (2).
6284  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
6285            (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
6286  def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
6287            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
6288  def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
6289            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
6290
6291  // Move low f32 and clear high bits.
  // Done on the low xmm half; the 128-bit result is placed back into a
  // 256-bit register with SUBREG_TO_REG.
6292  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
6293            (SUBREG_TO_REG (i32 0),
6294             (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
6295                          (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
6296                          (i8 1))), sub_xmm)>;
6297  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
6298            (SUBREG_TO_REG (i32 0),
6299             (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
6300                          (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
6301                          (i8 3))), sub_xmm)>;
6302} // Predicates = [HasAVX, OptForSpeed]
6303
6304// Prefer a movss or movsd over a blendps when optimizing for size. These were
6305// changed to use blends because blends have better throughput on sandybridge
6306// and haswell, but movs[s/d] are 1-2 byte shorter instructions.
// The blend-based patterns below therefore apply only under OptForSpeed.
6307let Predicates = [UseSSE41, OptForSpeed] in {
6308  // With SSE41 we can use blends for these patterns.
  // Zero-extending move of the low element: blend into an all-zero register.
6309  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
6310            (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
6311  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
6312            (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
6313
  // Movss as a blend with immediate 1; the commuted-load form swaps operands
  // and inverts the 4-bit mask (0xe).
6314  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
6315            (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
6316  def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))),
6317            (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
6318  def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
6319            (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
6320
  // Movsd as a blend with immediate 1; the commuted-load form inverts the
  // 2-bit mask (2).
6321  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
6322            (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
6323  def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))),
6324            (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
6325  def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
6326            (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
6327} // Predicates = [UseSSE41, OptForSpeed]
6328
6329
6330/// SS41I_ternary - SSE 4.1 ternary operator
///
/// Two-operand 128-bit instructions whose third input is the implicit XMM0
/// register (listed in Uses and hard-coded in the asm string); $src1 is tied
/// to $dst. The node is matched as (OpNode XMM0, $src2, $src1).
///   opc, OpcodeStr - opcode byte and assembly mnemonic.
///   VT             - vector type of the matched node (rr0 pattern).
///   mem_frag       - load fragment used to match the memory operand.
///   x86memop       - memory operand type of $src2 in the rm0 form.
///   OpNode         - DAG node matched.
///   sched          - scheduling class (Folded/ReadAfterFold for rm0).
6331let Uses = [XMM0], Constraints = "$src1 = $dst" in {
6332  multiclass SS41I_ternary<bits<8> opc, string OpcodeStr, ValueType VT,
6333                           PatFrag mem_frag, X86MemOperand x86memop,
6334                           SDNode OpNode, X86FoldableSchedWrite sched> {
6335    def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
6336                    (ins VR128:$src1, VR128:$src2),
6337                    !strconcat(OpcodeStr,
6338                     "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6339                    [(set VR128:$dst,
6340                      (VT (OpNode XMM0, VR128:$src2, VR128:$src1)))]>,
6341                    Sched<[sched]>;
6342
6343    def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
6344                    (ins VR128:$src1, x86memop:$src2),
6345                    !strconcat(OpcodeStr,
6346                     "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6347                    [(set VR128:$dst,
6348                      (OpNode XMM0, (mem_frag addr:$src2), VR128:$src1))]>,
6349                    Sched<[sched.Folded, sched.ReadAfterFold]>;
6350  }
6351} // Uses = [XMM0], Constraints = "$src1 = $dst"
6352
// Non-VEX SSE4.1 variable blends (implicit XMM0 operand), matched from
// X86Blendv. PBLENDVB is instantiated without an ExeDomain override.
6353let ExeDomain = SSEPackedDouble in
6354defm BLENDVPD : SS41I_ternary<0x15, "blendvpd", v2f64, memopv2f64, f128mem,
6355                              X86Blendv, SchedWriteFVarBlend.XMM>;
6356let ExeDomain = SSEPackedSingle in
6357defm BLENDVPS : SS41I_ternary<0x14, "blendvps", v4f32, memopv4f32, f128mem,
6358                              X86Blendv, SchedWriteFVarBlend.XMM>;
6359defm PBLENDVB : SS41I_ternary<0x10, "pblendvb", v16i8, memopv16i8, i128mem,
6360                              X86Blendv, SchedWriteVarBlend.XMM>;
6361
6362// Aliases with the implicit xmm0 argument
// These accept the two-operand spelling (xmm0 omitted) for the rr0/rm0
// instructions. The trailing 0 is the InstAlias emit argument -- presumably
// making the alias parse-only; confirm against the InstAlias definition.
6363def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
6364                (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>;
6365def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
6366                (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>;
6367def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
6368                (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>;
6369def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
6370                (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>;
6371def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
6372                (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>;
6373def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
6374                (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>;
6375
// Select 32/64-bit integer element X86Blendv nodes whose mask is XMM0 using
// the FP variable blends of the same element width. The instruction takes
// the sources in swapped order ($src2, $src1).
6376let Predicates = [UseSSE41] in {
6377  def : Pat<(v4i32 (X86Blendv (v4i32 XMM0), (v4i32 VR128:$src1),
6378                              (v4i32 VR128:$src2))),
6379            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
6380  def : Pat<(v2i64 (X86Blendv (v2i64 XMM0), (v2i64 VR128:$src1),
6381                              (v2i64 VR128:$src2))),
6382            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
6383} // Predicates = [UseSSE41]
6384
// Non-temporal aligned vector loads (MOVNTDQA). The instructions carry no
// pattern of their own; the anonymous patterns below map
// alignednontemporalload of every listed vector type onto them, with
// AddedComplexity = 400 applied to the whole group.
6385let AddedComplexity = 400 in { // Prefer non-temporal versions
6386
// Instruction definitions: VEX 128-bit, VEX 256-bit (VEX_L) and non-VEX.
6387let Predicates = [HasAVX, NoVLX] in
6388def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
6389                        "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
6390                        Sched<[SchedWriteVecMoveLSNT.XMM.RM]>, VEX, VEX_WIG;
6391let Predicates = [HasAVX2, NoVLX] in
6392def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
6393                         "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
6394                         Sched<[SchedWriteVecMoveLSNT.YMM.RM]>, VEX, VEX_L, VEX_WIG;
6395def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
6396                       "movntdqa\t{$src, $dst|$dst, $src}", []>,
6397                       Sched<[SchedWriteVecMoveLSNT.XMM.RM]>;
6398
// 256-bit vector types -> VMOVNTDQAYrm.
6399let Predicates = [HasAVX2, NoVLX] in {
6400  def : Pat<(v8f32 (alignednontemporalload addr:$src)),
6401            (VMOVNTDQAYrm addr:$src)>;
6402  def : Pat<(v4f64 (alignednontemporalload addr:$src)),
6403            (VMOVNTDQAYrm addr:$src)>;
6404  def : Pat<(v4i64 (alignednontemporalload addr:$src)),
6405            (VMOVNTDQAYrm addr:$src)>;
6406  def : Pat<(v8i32 (alignednontemporalload addr:$src)),
6407            (VMOVNTDQAYrm addr:$src)>;
6408  def : Pat<(v16i16 (alignednontemporalload addr:$src)),
6409            (VMOVNTDQAYrm addr:$src)>;
6410  def : Pat<(v32i8 (alignednontemporalload addr:$src)),
6411            (VMOVNTDQAYrm addr:$src)>;
6412}
6413
// 128-bit vector types -> VMOVNTDQArm (VEX encoding).
6414let Predicates = [HasAVX, NoVLX] in {
6415  def : Pat<(v4f32 (alignednontemporalload addr:$src)),
6416            (VMOVNTDQArm addr:$src)>;
6417  def : Pat<(v2f64 (alignednontemporalload addr:$src)),
6418            (VMOVNTDQArm addr:$src)>;
6419  def : Pat<(v2i64 (alignednontemporalload addr:$src)),
6420            (VMOVNTDQArm addr:$src)>;
6421  def : Pat<(v4i32 (alignednontemporalload addr:$src)),
6422            (VMOVNTDQArm addr:$src)>;
6423  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
6424            (VMOVNTDQArm addr:$src)>;
6425  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
6426            (VMOVNTDQArm addr:$src)>;
6427}
6428
// 128-bit vector types -> MOVNTDQArm (non-VEX encoding).
6429let Predicates = [UseSSE41] in {
6430  def : Pat<(v4f32 (alignednontemporalload addr:$src)),
6431            (MOVNTDQArm addr:$src)>;
6432  def : Pat<(v2f64 (alignednontemporalload addr:$src)),
6433            (MOVNTDQArm addr:$src)>;
6434  def : Pat<(v2i64 (alignednontemporalload addr:$src)),
6435            (MOVNTDQArm addr:$src)>;
6436  def : Pat<(v4i32 (alignednontemporalload addr:$src)),
6437            (MOVNTDQArm addr:$src)>;
6438  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
6439            (MOVNTDQArm addr:$src)>;
6440  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
6441            (MOVNTDQArm addr:$src)>;
6442}
6443
6444} // AddedComplexity
6445
6446//===----------------------------------------------------------------------===//
6447// SSE4.2 - Compare Instructions
6448//===----------------------------------------------------------------------===//
6449
6450/// SS42I_binop_rm - Simple SSE 4.2 binary operator
///
/// Defines a register/register form (rr) and a register/memory form (rm).
///   opc, OpcodeStr - opcode byte and assembly mnemonic.
///   OpNode, OpVT   - DAG node matched and the vector type of the result.
///   RC             - register class of $dst/$src1 (and $src2 in rr).
///   memop_frag     - load fragment used to match the memory operand.
///   x86memop       - memory operand type of $src2 in the rm form.
///   sched          - scheduling class (Folded/ReadAfterFold for rm).
///   Is2Addr        - selects the two-operand (default) or three-operand asm
///                    string. Unlike SS41I_blend_rmi it does not set the
///                    "$src1 = $dst" constraint itself.
6451multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
6452                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
6453                          X86MemOperand x86memop, X86FoldableSchedWrite sched,
6454                          bit Is2Addr = 1> {
6455  def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
6456       (ins RC:$src1, RC:$src2),
6457       !if(Is2Addr,
6458           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
6459           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
6460       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
6461       Sched<[sched]>;
6462  def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
6463       (ins RC:$src1, x86memop:$src2),
6464       !if(Is2Addr,
6465           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
6466           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
6467       [(set RC:$dst,
6468         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
6469       Sched<[sched.Folded, sched.ReadAfterFold]>;
6470}
6471
// 64-bit element signed greater-than compare (X86pcmpgt on v2i64/v4i64).
// VEX 128-bit form: three-operand asm (Is2Addr = 0).
6472let Predicates = [HasAVX] in
6473  defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
6474                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
6475                                 VEX_4V, VEX_WIG;
6476
// VEX 256-bit form (VEX_L), requires AVX2.
6477let Predicates = [HasAVX2] in
6478  defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
6479                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
6480                                  VEX_4V, VEX_L, VEX_WIG;
6481
// Non-VEX form: default two-operand asm, with $src1 tied to $dst here.
6482let Constraints = "$src1 = $dst" in
6483  defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
6484                                memop, i128mem, SchedWriteVecALU.XMM>;
6485
6486//===----------------------------------------------------------------------===//
6487// SSE4.2 - String/text Processing Instructions
6488//===----------------------------------------------------------------------===//
6489
// PCMPISTRM register and memory forms. There are no explicit outputs and the
// pattern list is empty, so mayLoad is set explicitly on the memory form.
//   asm - assembly mnemonic.
6490multiclass pcmpistrm_SS42AI<string asm> {
6491  def rr : SS42AI<0x62, MRMSrcReg, (outs),
6492    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6493    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6494    []>, Sched<[WritePCmpIStrM]>;
6495  let mayLoad = 1 in
6496  def rm :SS42AI<0x62, MRMSrcMem, (outs),
6497    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6498    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6499    []>, Sched<[WritePCmpIStrM.Folded, WritePCmpIStrM.ReadAfterFold]>;
6500}
6501
// PCMPISTRM instantiations: results are written implicitly to XMM0 and
// EFLAGS. Only the VEX form is gated on HasAVX.
6502let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
6503  let Predicates = [HasAVX] in
6504  defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
6505  defm PCMPISTRM  : pcmpistrm_SS42AI<"pcmpistrm"> ;
6506}
6507
// PCMPESTRM register and memory forms. There are no explicit outputs and the
// pattern list is empty, so mayLoad is set explicitly on the memory form.
// NOTE(review): operand names skip $src2/$src4 -- presumably those slots
// correspond to the implicit EAX/EDX inputs listed in Uses where this
// multiclass is instantiated; confirm.
//   asm - assembly mnemonic.
6508multiclass SS42AI_pcmpestrm<string asm> {
6509  def rr : SS42AI<0x60, MRMSrcReg, (outs),
6510    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
6511    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6512    []>, Sched<[WritePCmpEStrM]>;
6513  let mayLoad = 1 in
6514  def rm : SS42AI<0x60, MRMSrcMem, (outs),
6515    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
6516    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6517    []>, Sched<[WritePCmpEStrM.Folded, WritePCmpEStrM.ReadAfterFold]>;
6518}
6519
// PCMPESTRM instantiations: results are written implicitly to XMM0 and
// EFLAGS; EAX and EDX are implicit inputs. Only the VEX form is gated on
// HasAVX.
6520let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
6521  let Predicates = [HasAVX] in
6522  defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
6523  defm PCMPESTRM :  SS42AI_pcmpestrm<"pcmpestrm">;
6524}
6525
// PCMPISTRI register and memory forms. There are no explicit outputs and the
// pattern list is empty, so mayLoad is set explicitly on the memory form.
//   asm - assembly mnemonic.
6526multiclass SS42AI_pcmpistri<string asm> {
6527  def rr : SS42AI<0x63, MRMSrcReg, (outs),
6528    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6529    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6530    []>, Sched<[WritePCmpIStrI]>;
6531  let mayLoad = 1 in
6532  def rm : SS42AI<0x63, MRMSrcMem, (outs),
6533    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6534    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6535    []>, Sched<[WritePCmpIStrI.Folded, WritePCmpIStrI.ReadAfterFold]>;
6536}
6537
// PCMPISTRI instantiations: results are written implicitly to ECX and
// EFLAGS. Only the VEX form is gated on HasAVX.
6538let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
6539  let Predicates = [HasAVX] in
6540  defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
6541  defm PCMPISTRI  : SS42AI_pcmpistri<"pcmpistri">;
6542}
6543
// PCMPESTRI register and memory forms. There are no explicit outputs and the
// pattern list is empty, so mayLoad is set explicitly on the memory form.
// NOTE(review): operand names skip $src2/$src4 -- presumably those slots
// correspond to the implicit EAX/EDX inputs listed in Uses where this
// multiclass is instantiated; confirm.
//   asm - assembly mnemonic.
6544multiclass SS42AI_pcmpestri<string asm> {
6545  def rr : SS42AI<0x61, MRMSrcReg, (outs),
6546    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
6547    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6548    []>, Sched<[WritePCmpEStrI]>;
6549  let mayLoad = 1 in
6550  def rm : SS42AI<0x61, MRMSrcMem, (outs),
6551    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
6552    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6553    []>, Sched<[WritePCmpEStrI.Folded, WritePCmpEStrI.ReadAfterFold]>;
6554}
6555
// PCMPESTRI instantiations: results are written implicitly to ECX and
// EFLAGS; EAX and EDX are implicit inputs. Only the VEX form is gated on
// HasAVX.
6556let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
6557  let Predicates = [HasAVX] in
6558  defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
6559  defm PCMPESTRI  : SS42AI_pcmpestri<"pcmpestri">;
6560}
6561
6562//===----------------------------------------------------------------------===//
6563// SSE4.2 - CRC Instructions
6564//===----------------------------------------------------------------------===//
6565
6566// No CRC instructions have AVX equivalents
6567
6568// crc intrinsic instruction
6569// This set of instructions is only rm; the only difference is the size
6570// of r and m.
// Register-source form: accumulates RCIn:$src2 into RCOut:$src1 and matches
// (Int $src1, $src2).
//   opc, asm - opcode byte and assembly mnemonic.
//   RCOut    - register class of $dst and $src1.
//   RCIn     - register class of $src2.
//   Int      - operator used in the selection pattern.
6571class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
6572                   RegisterClass RCIn, SDPatternOperator Int> :
6573  SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
6574         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
6575         [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))]>,
6576         Sched<[WriteCRC32]>;
6577
// Memory-source form of the CRC32 instruction: matches
// (Int $src1, (load $src2)).
//   opc, asm - opcode byte and assembly mnemonic.
//   RCOut    - register class of $dst and $src1.
//   x86memop - memory operand type of $src2.
//   Int      - operator used in the selection pattern.
6578class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
6579                   X86MemOperand x86memop, SDPatternOperator Int> :
6580  SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
6581         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
6582         [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))]>,
6583         Sched<[WriteCRC32.Folded, WriteCRC32.ReadAfterFold]>;
6584
// CRC32 instruction definitions, named CRC32r<dst width>{r|m}<src width>.
// The accumulator $src1 is tied to $dst. Opcode 0xF0 is used for the 8-bit
// source forms and 0xF1 for the wider ones, distinguished by OpSize16,
// OpSize32 or REX_W.
6585let Constraints = "$src1 = $dst" in {
6586  def CRC32r32m8  : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
6587                                 int_x86_sse42_crc32_32_8>;
6588  def CRC32r32r8  : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
6589                                 int_x86_sse42_crc32_32_8>;
6590  def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
6591                                 int_x86_sse42_crc32_32_16>, OpSize16;
6592  def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
6593                                 int_x86_sse42_crc32_32_16>, OpSize16;
6594  def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
6595                                 int_x86_sse42_crc32_32_32>, OpSize32;
6596  def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
6597                                 int_x86_sse42_crc32_32_32>, OpSize32;
6598  def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
6599                                 int_x86_sse42_crc32_64_64>, REX_W;
6600  def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
6601                                 int_x86_sse42_crc32_64_64>, REX_W;
  // The 64-bit destination / 8-bit source forms use null_frag instead of an
  // intrinsic, so hasSideEffects and mayLoad are given explicitly.
6602  let hasSideEffects = 0 in {
6603    let mayLoad = 1 in
6604    def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
6605                                   null_frag>, REX_W;
6606    def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
6607                                   null_frag>, REX_W;
6608  }
6609} // Constraints = "$src1 = $dst"
6610
6611//===----------------------------------------------------------------------===//
6612// SHA-NI Instructions
6613//===----------------------------------------------------------------------===//
6614
6615// FIXME: Is there a better scheduler class for SHA than WriteVecIMul?
// SHAI_binop - two-operand SHA instruction template producing an `rr`
// (MRMSrcReg) and an `rm` (MRMSrcMem, i128mem via memop) definition.
//   Opc       - opcode byte.
//   OpcodeStr - mnemonic prepended to the operand string.
//   IntId     - intrinsic matched by the selection pattern.
//   sched     - foldable scheduling write; `rm` uses its Folded and
//               ReadAfterFold members.
//   UsesXMM0  - when set, the asm string spells out xmm0 as an extra operand
//               and the pattern passes XMM0 as a third intrinsic argument.
6616multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
6617                      X86FoldableSchedWrite sched, bit UsesXMM0 = 0> {
6618  def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
6619             (ins VR128:$src1, VR128:$src2),
6620             !if(UsesXMM0,
6621                 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6622                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
6623             [!if(UsesXMM0,
6624                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
6625                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>,
6626             T8PS, Sched<[sched]>;
6627
6628  def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
6629             (ins VR128:$src1, i128mem:$src2),
6630             !if(UsesXMM0,
6631                 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6632                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
6633             [!if(UsesXMM0,
6634                  (set VR128:$dst, (IntId VR128:$src1,
6635                    (memop addr:$src2), XMM0)),
6636                  (set VR128:$dst, (IntId VR128:$src1,
6637                    (memop addr:$src2))))]>, T8PS,
6638             Sched<[sched.Folded, sched.ReadAfterFold]>;
6639}
6640
// Concrete SHA instructions, all gated on HasSHA with $src1 tied to $dst.
// SHA1RNDS4 carries an 8-bit immediate and is written out directly (Ii8, TAPS);
// the remaining instructions are instantiated from SHAI_binop (T8PS).
// Note that opcode 0xCC appears twice: SHA1RNDS4 (TAPS) and SHA256MSG1 (T8PS).
6641let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
6642  def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
6643                         (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6644                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6645                         [(set VR128:$dst,
6646                           (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
6647                            (i8 timm:$src3)))]>, TAPS,
6648                         Sched<[SchedWriteVecIMul.XMM]>;
6649  def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
6650                         (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6651                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6652                         [(set VR128:$dst,
6653                           (int_x86_sha1rnds4 VR128:$src1,
6654                            (memop addr:$src2),
6655                            (i8 timm:$src3)))]>, TAPS,
6656                         Sched<[SchedWriteVecIMul.XMM.Folded,
6657                                SchedWriteVecIMul.XMM.ReadAfterFold]>;
6658
6659  defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte,
6660                              SchedWriteVecIMul.XMM>;
6661  defm SHA1MSG1  : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1,
6662                              SchedWriteVecIMul.XMM>;
6663  defm SHA1MSG2  : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2,
6664                              SchedWriteVecIMul.XMM>;
6665
  // SHA256RNDS2 is the only user of UsesXMM0 = 1; XMM0 is also declared as an
  // implicit use of the instruction.
6666  let Uses=[XMM0] in
6667  defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2,
6668                                SchedWriteVecIMul.XMM, 1>;
6669
6670  defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1,
6671                               SchedWriteVecIMul.XMM>;
6672  defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2,
6673                               SchedWriteVecIMul.XMM>;
6674}
6675
6676// Aliases accepting sha256rnds2 written without the %xmm0 operand.
// The canonical SHA256RNDS2rr/rm asm strings spell xmm0 out explicitly; these
// two-operand spellings map onto the same instructions. The trailing 0 is the
// last InstAlias argument (presumably the emit/print priority -- confirm).
6677def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
6678                (SHA256RNDS2rr VR128:$dst, VR128:$src2), 0>;
6679def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
6680                (SHA256RNDS2rm VR128:$dst, i128mem:$src2), 0>;
6681
6682//===----------------------------------------------------------------------===//
6683// AES-NI Instructions
6684//===----------------------------------------------------------------------===//
6685
// AESI_binop_rm_int - AES round instruction template producing `rr` and `rm`.
//   opc       - opcode byte.
//   OpcodeStr - mnemonic.
//   IntId     - intrinsic matched by both patterns.
//   ld_frag   - load fragment wrapped around the memory operand in `rm`.
//   Is2Addr   - selects the two-operand asm string instead of the
//               three-operand one.
//   RC, MemOp - register class and memory operand type (default VR128/i128mem).
// The asm string is installed through `let AsmString`, which is why both defs
// pass an empty string to AES8I.
6686multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
6687                             Intrinsic IntId, PatFrag ld_frag,
6688                             bit Is2Addr = 0, RegisterClass RC = VR128,
6689                             X86MemOperand MemOp = i128mem> {
6690  let AsmString = OpcodeStr#
6691                  !if(Is2Addr, "\t{$src2, $dst|$dst, $src2}",
6692                               "\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
6693    def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst),
6694                   (ins RC:$src1, RC:$src2), "",
6695                   [(set RC:$dst, (IntId RC:$src1, RC:$src2))]>,
6696                   Sched<[WriteAESDecEnc]>;
6697    def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst),
6698                   (ins RC:$src1, MemOp:$src2), "",
6699                   [(set RC:$dst, (IntId RC:$src1, (ld_frag addr:$src2)))]>,
6700                   Sched<[WriteAESDecEnc.Folded, WriteAESDecEnc.ReadAfterFold]>;
6701  }
6702}
6703
6704// Perform One Round of an AES Encryption/Decryption Flow
// 128-bit VEX-encoded forms: three-operand syntax (Is2Addr left at 0), `load`
// as the memory fragment, opcodes 0xDC-0xDF.
6705let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in {
6706  defm VAESENC          : AESI_binop_rm_int<0xDC, "vaesenc",
6707                         int_x86_aesni_aesenc, load>, VEX_4V, VEX_WIG;
6708  defm VAESENCLAST      : AESI_binop_rm_int<0xDD, "vaesenclast",
6709                         int_x86_aesni_aesenclast, load>, VEX_4V, VEX_WIG;
6710  defm VAESDEC          : AESI_binop_rm_int<0xDE, "vaesdec",
6711                         int_x86_aesni_aesdec, load>, VEX_4V, VEX_WIG;
6712  defm VAESDECLAST      : AESI_binop_rm_int<0xDF, "vaesdeclast",
6713                         int_x86_aesni_aesdeclast, load>, VEX_4V, VEX_WIG;
6714}
6715
// 256-bit VEX-encoded AES round instructions (VR256 / i256mem, VEX_L), using
// the *_256 intrinsics and the same opcodes and mnemonics as the 128-bit forms.
6716let Predicates = [NoVLX, HasVAES] in {
6717  defm VAESENCY         : AESI_binop_rm_int<0xDC, "vaesenc",
6718                         int_x86_aesni_aesenc_256, load, 0, VR256,
6719                         i256mem>, VEX_4V, VEX_L, VEX_WIG;
6720  defm VAESENCLASTY     : AESI_binop_rm_int<0xDD, "vaesenclast",
6721                         int_x86_aesni_aesenclast_256, load, 0, VR256,
6722                         i256mem>, VEX_4V, VEX_L, VEX_WIG;
6723  defm VAESDECY         : AESI_binop_rm_int<0xDE, "vaesdec",
6724                         int_x86_aesni_aesdec_256, load, 0, VR256,
6725                         i256mem>, VEX_4V, VEX_L, VEX_WIG;
6726  defm VAESDECLASTY     : AESI_binop_rm_int<0xDF, "vaesdeclast",
6727                         int_x86_aesni_aesdeclast_256, load, 0, VR256,
6728                         i256mem>, VEX_4V, VEX_L, VEX_WIG;
6729}
6730
// Legacy (non-VEX) AES round instructions: two-operand syntax (Is2Addr = 1)
// with $src1 tied to $dst, and `memop` rather than `load` for the memory form.
6731let Constraints = "$src1 = $dst" in {
6732  defm AESENC          : AESI_binop_rm_int<0xDC, "aesenc",
6733                         int_x86_aesni_aesenc, memop, 1>;
6734  defm AESENCLAST      : AESI_binop_rm_int<0xDD, "aesenclast",
6735                         int_x86_aesni_aesenclast, memop, 1>;
6736  defm AESDEC          : AESI_binop_rm_int<0xDE, "aesdec",
6737                         int_x86_aesni_aesdec, memop, 1>;
6738  defm AESDECLAST      : AESI_binop_rm_int<0xDF, "aesdeclast",
6739                         int_x86_aesni_aesdeclast, memop, 1>;
6740}
6741
6742// Perform the AES InvMixColumn Transformation
// Single-source instruction (opcode 0xDB) with register and memory forms.
// The VEX forms are gated on [HasAVX, HasAES] and load through `load`; the
// legacy forms below carry no explicit Predicates here and use `memop`.
// The memory forms schedule with WriteAESIMC.Folded only, as there is no
// register operand to read after the fold.
6743let Predicates = [HasAVX, HasAES] in {
6744  def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
6745      (ins VR128:$src1),
6746      "vaesimc\t{$src1, $dst|$dst, $src1}",
6747      [(set VR128:$dst,
6748        (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
6749      VEX, VEX_WIG;
6750  def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
6751      (ins i128mem:$src1),
6752      "vaesimc\t{$src1, $dst|$dst, $src1}",
6753      [(set VR128:$dst, (int_x86_aesni_aesimc (load addr:$src1)))]>,
6754      Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG;
6755}
6756def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
6757  (ins VR128:$src1),
6758  "aesimc\t{$src1, $dst|$dst, $src1}",
6759  [(set VR128:$dst,
6760    (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>;
6761def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
6762  (ins i128mem:$src1),
6763  "aesimc\t{$src1, $dst|$dst, $src1}",
6764  [(set VR128:$dst, (int_x86_aesni_aesimc (memop addr:$src1)))]>,
6765  Sched<[WriteAESIMC.Folded]>;
6766
6767// AES Round Key Generation Assist
// One vector source plus an 8-bit immediate (opcode 0xDF, AESAI base class).
// The immediate is matched as `timm`. VEX forms are gated on
// [HasAVX, HasAES] and use `load`; the legacy forms below use `memop`.
6768let Predicates = [HasAVX, HasAES] in {
6769  def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
6770      (ins VR128:$src1, u8imm:$src2),
6771      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6772      [(set VR128:$dst,
6773        (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
6774      Sched<[WriteAESKeyGen]>, VEX, VEX_WIG;
6775  def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
6776      (ins i128mem:$src1, u8imm:$src2),
6777      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6778      [(set VR128:$dst,
6779        (int_x86_aesni_aeskeygenassist (load addr:$src1), timm:$src2))]>,
6780      Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG;
6781}
6782def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
6783  (ins VR128:$src1, u8imm:$src2),
6784  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6785  [(set VR128:$dst,
6786    (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
6787  Sched<[WriteAESKeyGen]>;
6788def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
6789  (ins i128mem:$src1, u8imm:$src2),
6790  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6791  [(set VR128:$dst,
6792    (int_x86_aesni_aeskeygenassist (memop addr:$src1), timm:$src2))]>,
6793  Sched<[WriteAESKeyGen.Folded]>;
6794
6795//===----------------------------------------------------------------------===//
6796// PCLMUL Instructions
6797//===----------------------------------------------------------------------===//
6798
6799// Immediate transform to help with commuting.
// Swaps the high and low nibbles of the 8-bit immediate: the value is
// truncated to uint8_t, and (Imm >> 4) | (Imm << 4) is truncated again before
// being re-emitted as an i8 immediate. Used by the patterns that fold a load
// appearing in the first source by exchanging the two sources.
6800def PCLMULCommuteImm : SDNodeXForm<timm, [{
6801  uint8_t Imm = N->getZExtValue();
6802  return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N));
6803}]>;
6804
6805// SSE carry-less Multiplication instructions
// Legacy two-operand PCLMULQDQ (opcode 0x44) for targets with PCLMUL but no
// AVX. $src1 is tied to $dst; only the register form is marked commutable
// (the brace-less `let isCommutable = 1 in` covers PCLMULQDQrr alone).
6806let Predicates = [NoAVX, HasPCLMUL] in {
6807  let Constraints = "$src1 = $dst" in {
6808    let isCommutable = 1 in
6809    def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
6810              (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6811              "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6812              [(set VR128:$dst,
6813                (int_x86_pclmulqdq VR128:$src1, VR128:$src2, timm:$src3))]>,
6814                Sched<[WriteCLMul]>;
6815
6816    def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
6817              (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6818              "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6819              [(set VR128:$dst,
6820                 (int_x86_pclmulqdq VR128:$src1, (memop addr:$src2),
6821                  timm:$src3))]>,
6822              Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
6823  } // Constraints = "$src1 = $dst"
6824
  // A load in the first intrinsic operand is folded by swapping the sources
  // and rewriting the immediate with PCLMULCommuteImm.
6825  def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1,
6826                                (i8 timm:$src3)),
6827            (PCLMULQDQrm VR128:$src1, addr:$src2,
6828                          (PCLMULCommuteImm timm:$src3))>;
6829} // Predicates = [NoAVX, HasPCLMUL]
6830
6831// SSE aliases
// Generates the four mnemonics "pclmul" # HI # LO # "dq" (HI, LO in
// {"hq","lq"}) for both the register and memory forms. The implied immediate
// sets bit 4 when LO is "hq" and bit 0 when HI is "hq".
6832foreach HI = ["hq","lq"] in
6833foreach LO = ["hq","lq"] in {
6834  def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
6835                  (PCLMULQDQrr VR128:$dst, VR128:$src,
6836                   !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
6837  def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
6838                  (PCLMULQDQrm VR128:$dst, i128mem:$src,
6839                   !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
6840}
6841
6842// AVX carry-less Multiplication instructions
// vpclmulqdq - three-operand VPCLMULQDQ template (opcode 0x44).
//   RC     - vector register class of destination and sources.
//   MemOp  - memory operand type of the `rm` form.
//   LdFrag - load fragment wrapped around the memory operand.
//   IntId  - intrinsic matched by the patterns.
// Only `rr` is marked commutable (brace-less `let`).
6843multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
6844                      PatFrag LdFrag, Intrinsic IntId> {
6845  let isCommutable = 1 in
6846  def rr : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst),
6847            (ins RC:$src1, RC:$src2, u8imm:$src3),
6848            "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
6849            [(set RC:$dst,
6850              (IntId RC:$src1, RC:$src2, timm:$src3))]>,
6851            Sched<[WriteCLMul]>;
6852
6853  def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst),
6854            (ins RC:$src1, MemOp:$src2, u8imm:$src3),
6855            "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
6856            [(set RC:$dst,
6857               (IntId RC:$src1, (LdFrag addr:$src2), timm:$src3))]>,
6858            Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
6859
6860  // We can commute a load in the first operand by swapping the sources and
6861  // rotating the immediate.
  // NAME is the defm prefix, so NAME#"rm" names the memory form defined above.
6862  def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 timm:$src3)),
6863            (!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2,
6864                                           (PCLMULCommuteImm timm:$src3))>;
6865}
6866
// 128-bit VEX form: AVX + PCLMUL, used unless both VLX and VPCLMULQDQ are
// available (NoVLX_Or_NoVPCLMULQDQ).
6867let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in
6868defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, load,
6869                             int_x86_pclmulqdq>, VEX_4V, VEX_WIG;
6870
// 256-bit VEX form (VEX_L): requires VPCLMULQDQ without VLX.
6871let Predicates = [NoVLX, HasVPCLMULQDQ] in
6872defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load,
6873                              int_x86_pclmulqdq_256>, VEX_4V, VEX_L, VEX_WIG;
6874
// vpclmulqdq_aliases_impl - emits one "vpclmul"#Hi#Lo#"dq" alias pair
// (register and memory form) for the instruction family named InstStr.
//   InstStr - instruction name prefix; "rr"/"rm" is appended and !cast to
//             Instruction.
//   RC      - register class of $dst, $src1 and the register $src2.
//   MemOp   - memory operand type of the memory-form $src2.
//   Hi, Lo  - "hq" or "lq"; the implied immediate sets bit 4 when Lo is "hq"
//             and bit 0 when Hi is "hq".
6875multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC,
6876                                   X86MemOperand MemOp, string Hi, string Lo> {
6877  def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6878                  (!cast<Instruction>(InstStr # "rr") RC:$dst, RC:$src1, RC:$src2,
6879                        !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
6880  def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6881                  (!cast<Instruction>(InstStr # "rm") RC:$dst, RC:$src1, MemOp:$src2,
6882                        !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
6883}
6884
// vpclmulqdq_aliases - instantiates vpclmulqdq_aliases_impl for all four
// (Hi, Lo) combinations of "hq" and "lq".
6885multiclass vpclmulqdq_aliases<string InstStr, RegisterClass RC,
6886                              X86MemOperand MemOp> {
6887  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "hq">;
6888  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "lq">;
6889  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "hq">;
6890  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "lq">;
6891}
6892
6893// AVX aliases
// One alias set per VEX instruction family defined above (128- and 256-bit).
6894defm : vpclmulqdq_aliases<"VPCLMULQDQ", VR128, i128mem>;
6895defm : vpclmulqdq_aliases<"VPCLMULQDQY", VR256, i256mem>;
6896
6897//===----------------------------------------------------------------------===//
6898// SSE4A Instructions
6899//===----------------------------------------------------------------------===//
6900
// SSE4A: EXTRQ/INSERTQ (immediate and register-mask forms) and the scalar
// non-temporal stores MOVNTSS/MOVNTSD, all gated on HasSSE4A.
6901let Predicates = [HasSSE4A] in {
6902
6903let ExeDomain = SSEPackedInt in {
// All four bit-field instructions update their first source in place
// ($src tied to $dst). Opcode 0x78 is the immediate form and 0x79 the
// register-mask form; PD selects EXTRQ* and XD selects INSERTQ*.
6904let Constraints = "$src = $dst" in {
6905def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
6906                 (ins VR128:$src, u8imm:$len, u8imm:$idx),
6907                 "extrq\t{$idx, $len, $src|$src, $len, $idx}",
6908                 [(set VR128:$dst, (X86extrqi VR128:$src, timm:$len,
6909                                    timm:$idx))]>,
6910                 PD, Sched<[SchedWriteVecALU.XMM]>;
6911def EXTRQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
6912              (ins VR128:$src, VR128:$mask),
6913              "extrq\t{$mask, $src|$src, $mask}",
6914              [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
6915                                 VR128:$mask))]>,
6916              PD, Sched<[SchedWriteVecALU.XMM]>;
6917
6918def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
6919                   (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
6920                   "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
6921                   [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
6922                                      timm:$len, timm:$idx))]>,
6923                   XD, Sched<[SchedWriteVecALU.XMM]>;
6924def INSERTQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
6925                 (ins VR128:$src, VR128:$mask),
6926                 "insertq\t{$mask, $src|$src, $mask}",
6927                 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
6928                                    VR128:$mask))]>,
6929                 XD, Sched<[SchedWriteVecALU.XMM]>;
6930}
6931} // ExeDomain = SSEPackedInt
6932
6933// Non-temporal (unaligned) scalar stores.
6934let AddedComplexity = 400 in { // Prefer non-temporal versions
// The store instructions carry no pattern of their own, so mayStore and
// hasSideEffects are set explicitly; selection goes through the two Pat
// records below, which first copy the scalar FR32/FR64 value into VR128.
6935let hasSideEffects = 0, mayStore = 1, SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in {
6936def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
6937                "movntss\t{$src, $dst|$dst, $src}", []>, XS;
6938
6939def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
6940                "movntsd\t{$src, $dst|$dst, $src}", []>, XD;
6941} // SchedRW
6942
6943def : Pat<(nontemporalstore FR32:$src, addr:$dst),
6944          (MOVNTSS addr:$dst, (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
6945
6946def : Pat<(nontemporalstore FR64:$src, addr:$dst),
6947          (MOVNTSD addr:$dst, (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
6948
6949} // AddedComplexity
6950} // HasSSE4A
6951
6952//===----------------------------------------------------------------------===//
6953// AVX Instructions
6954//===----------------------------------------------------------------------===//
6955
6956//===----------------------------------------------------------------------===//
6957// VBROADCAST - Load from memory and broadcast to all elements of the
6958//              destination operand
6959//
// avx_broadcast_rm - memory-source broadcast template (MRMSrcMem, VEX).
//   opc        - opcode byte.
//   OpcodeStr  - mnemonic.
//   RC         - destination register class.
//   x86memop   - memory operand type of $src.
//   VT         - result value type of the pattern.
//   bcast_frag - fragment applied to the address to form the broadcast load.
//   Sched      - scheduling write. The parameter shares its name with the
//                Sched<> class it is passed to in Sched<[Sched]>.
6960class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
6961                           X86MemOperand x86memop, ValueType VT,
6962                           PatFrag bcast_frag, SchedWrite Sched> :
6963  AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
6964        !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
6965        [(set RC:$dst, (VT (bcast_frag addr:$src)))]>,
6966        Sched<[Sched]>, VEX;
6967
6968// AVX2 adds register forms
// avx2_broadcast_rr - register-source broadcast template (MRMSrcReg, VEX).
// The source is always a VR128 register.
//   opc       - opcode byte.
//   OpcodeStr - mnemonic.
//   RC        - destination register class.
//   ResVT     - result value type of the X86VBroadcast node.
//   OpVT      - value type the VR128 source is viewed as.
//   Sched     - scheduling write (same name as the Sched<> class it feeds).
6969class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
6970                        ValueType ResVT, ValueType OpVT, SchedWrite Sched> :
6971  AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
6972         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
6973         [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
6974         Sched<[Sched]>, VEX;
6975
// Scalar broadcast instructions. Memory forms need AVX, register forms AVX2;
// both sets are disabled when VLX is available (NoVLX). There is no 128-bit
// vbroadcastsd definition here, only the 256-bit one.
// NOTE(review): the 256-bit memory forms also use
// SchedWriteFShuffle.XMM.Folded -- confirm that the XMM class is intended.
6976let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in {
6977  def VBROADCASTSSrm  : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
6978                                         f32mem, v4f32, X86VBroadcastld32,
6979                                         SchedWriteFShuffle.XMM.Folded>;
6980  def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
6981                                         f32mem, v8f32, X86VBroadcastld32,
6982                                         SchedWriteFShuffle.XMM.Folded>, VEX_L;
6983}
6984let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in
6985def VBROADCASTSDYrm  : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
6986                                        v4f64, X86VBroadcastld64,
6987                                        SchedWriteFShuffle.XMM.Folded>, VEX_L;
6988
6989let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in {
6990  def VBROADCASTSSrr  : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
6991                                          v4f32, v4f32, SchedWriteFShuffle.XMM>;
6992  def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
6993                                          v8f32, v4f32, WriteFShuffle256>, VEX_L;
6994}
6995let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in
6996def VBROADCASTSDYrr  : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
6997                                         v4f64, v2f64, WriteFShuffle256>, VEX_L;
6998
6999//===----------------------------------------------------------------------===//
7000// VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both
7001//                  halves of a 256-bit vector.
7002//
// Integer 128-bit broadcast (AVX2). It has no pattern, so mayLoad and
// hasSideEffects are stated explicitly.
7003let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in
7004def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
7005                           (ins i128mem:$src),
7006                           "vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
7007                           Sched<[WriteShuffleLd]>, VEX, VEX_L;
7008
// Floating-point 128-bit broadcast (AVX). It is likewise pattern-less;
// selection is done by the X86SubVBroadcast Pat records that follow.
7009let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX],
7010    ExeDomain = SSEPackedSingle in
7011def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
7012                           (ins f128mem:$src),
7013                           "vbroadcastf128\t{$src, $dst|$dst, $src}", []>,
7014                           Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L;
7015
// Select VBROADCASTF128 for a sub-vector broadcast of a loaded 128-bit
// floating-point vector into a 256-bit result.
7016let Predicates = [HasAVX, NoVLX] in {
7017def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
7018          (VBROADCASTF128 addr:$src)>;
7019def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))),
7020          (VBROADCASTF128 addr:$src)>;
7021}
7022
7023// NOTE: We're using FP instructions here, but execution domain fixing can
7024// convert to integer when profitable.
// The same instruction is selected for every integer element type.
7025let Predicates = [HasAVX, NoVLX] in {
7026def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
7027          (VBROADCASTF128 addr:$src)>;
7028def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))),
7029          (VBROADCASTF128 addr:$src)>;
7030def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))),
7031          (VBROADCASTF128 addr:$src)>;
7032def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
7033          (VBROADCASTF128 addr:$src)>;
7034}
7035
7036//===----------------------------------------------------------------------===//
7037// VINSERTF128 - Insert packed floating-point values
7038//
// VINSERTF128rr / VINSERTF128rm (opcode 0x18): 256-bit first source, 128-bit
// second source (register or f128mem) and an 8-bit immediate. Both carry an
// empty pattern, so hasSideEffects / mayLoad are set by hand; selection is
// done by the vinsert_lowering patterns, which name "VINSERTF128".
7039let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
7040def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
7041          (ins VR256:$src1, VR128:$src2, u8imm:$src3),
7042          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7043          []>, Sched<[WriteFShuffle256]>, VEX_4V, VEX_L;
7044let mayLoad = 1 in
7045def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
7046          (ins VR256:$src1, f128mem:$src2, u8imm:$src3),
7047          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7048          []>, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
7049}
7050
7051// To create a 256-bit all ones value, we should produce VCMPTRUEPS
7052// with YMM register containing zero.
7053// FIXME: Avoid producing vxorps to clear the fake inputs.
// Applies only under HasAVX1Only. Both compare inputs are AVX_SET0 and the
// immediate 0xf is the predicate that the comment above calls "TRUE".
7054let Predicates = [HasAVX1Only] in {
7055def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>;
7056}
7057
// vinsert_lowering - selection patterns mapping a 128-bit subvector insert
// into a 256-bit vector onto the instruction family named by InstrStr.
//   InstrStr   - instruction name prefix; "rr" / "rm" is appended and the
//                result is !cast to Instruction.
//   From       - value type of the inserted 128-bit vector.
//   To         - value type of the 256-bit vector being inserted into.
//   memop_frag - load fragment used for the memory-source pattern.
// The instruction immediate is derived from the matched insert node ($ins)
// via INSERT_get_vinsert128_imm.
// The pasted suffixes are written as quoted string literals, as in the other
// name pastes in this file (e.g. NAME#"rm"), so they can never resolve to a
// record or variable that happens to be called `rr` or `rm`.
7058multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To,
7059                            PatFrag memop_frag> {
7060  def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2),
7061                                   (iPTR imm)),
7062            (!cast<Instruction>(InstrStr#"rr") VR256:$src1, VR128:$src2,
7063                                       (INSERT_get_vinsert128_imm VR256:$ins))>;
7064  def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
7065                                    (From (memop_frag addr:$src2)),
7066                                    (iPTR imm)),
7067            (!cast<Instruction>(InstrStr#"rm") VR256:$src1, addr:$src2,
7068                                       (INSERT_get_vinsert128_imm VR256:$ins))>;
7069}
7070
// Floating-point element types use VINSERTF128 whenever AVX is available
// without VLX.
7071let Predicates = [HasAVX, NoVLX] in {
7072  defm : vinsert_lowering<"VINSERTF128", v4f32, v8f32, loadv4f32>;
7073  defm : vinsert_lowering<"VINSERTF128", v2f64, v4f64, loadv2f64>;
7074}
7075
// Integer element types use VINSERTF128 only under HasAVX1Only.
7076let Predicates = [HasAVX1Only] in {
7077  defm : vinsert_lowering<"VINSERTF128", v2i64, v4i64,  loadv2i64>;
7078  defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32,  loadv4i32>;
7079  defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv8i16>;
7080  defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8,  loadv16i8>;
7081}
7082
7083//===----------------------------------------------------------------------===//
7084// VEXTRACTF128 - Extract packed floating-point values
7085//
// VEXTRACTF128rr / VEXTRACTF128mr (opcode 0x19): the destination is encoded
// in the r/m position (MRMDestReg / MRMDestMem), the source is a 256-bit
// register plus an 8-bit immediate. Both carry an empty pattern, so
// hasSideEffects / mayStore are set by hand; selection is done by the
// vextract_lowering patterns, which name "VEXTRACTF128".
7086let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
7087def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
7088          (ins VR256:$src1, u8imm:$src2),
7089          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7090          []>, Sched<[WriteFShuffle256]>, VEX, VEX_L;
7091let mayStore = 1 in
7092def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
7093          (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
7094          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7095          []>, Sched<[WriteFStoreX]>, VEX, VEX_L;
7096}
7097
// vextract_lowering - selection patterns mapping a 128-bit subvector extract
// from a 256-bit vector onto the instruction family named by InstrStr.
//   InstrStr - instruction name prefix; "rr" / "mr" is appended and the
//              result is !cast to Instruction.
//   From     - value type of the 256-bit source vector.
//   To       - value type of the extracted 128-bit vector.
// The first pattern yields the extract in a register; the second folds a
// store of the extracted value into the memory-destination form. The
// immediate comes from the matched extract node ($ext) via
// EXTRACT_get_vextract128_imm.
// The pasted suffixes are written as quoted string literals, as in the other
// name pastes in this file (e.g. NAME#"rm"), so they can never resolve to a
// record or variable that happens to be called `rr` or `mr`.
7098multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
7099  def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
7100            (To (!cast<Instruction>(InstrStr#"rr")
7101                                    (From VR256:$src1),
7102                                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
7103  def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1),
7104                                                 (iPTR imm))), addr:$dst),
7105            (!cast<Instruction>(InstrStr#"mr") addr:$dst, VR256:$src1,
7106             (EXTRACT_get_vextract128_imm VR128:$ext))>;
7107}
7108
7109// AVX1 patterns
// Floating-point element types use VEXTRACTF128 whenever AVX is available
// without VLX.
7110let Predicates = [HasAVX, NoVLX] in {
7111  defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>;
7112  defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>;
7113}
7114
// Integer element types use VEXTRACTF128 only under HasAVX1Only.
7115let Predicates = [HasAVX1Only] in {
7116  defm : vextract_lowering<"VEXTRACTF128", v4i64,  v2i64>;
7117  defm : vextract_lowering<"VEXTRACTF128", v8i32,  v4i32>;
7118  defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
7119  defm : vextract_lowering<"VEXTRACTF128", v32i8,  v16i8>;
7120}
7121
7122//===----------------------------------------------------------------------===//
7123// VMASKMOV - Conditional SIMD Packed Loads and Stores
7124//
// avx_movmask_rm - masked load/store template producing four definitions:
// `rm` / `Yrm` (128- / 256-bit loads) and `mr` / `Ymr` (128- / 256-bit stores).
//   opc_rm, opc_mr    - opcodes of the load and store forms.
//   OpcodeStr         - mnemonic shared by all four forms.
//   IntLd, IntLd256   - load intrinsics, matched as (Int addr, $src1).
//   IntSt, IntSt256   - store intrinsics, matched as (Int addr, $src1, $src2).
//   schedX, schedY    - mask-move scheduling records for the 128- / 256-bit
//                       forms; loads use .RM and stores use .MR.
// $src1 is presumably the mask operand in every form -- confirm against the
// intrinsic definitions.
7125multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
7126                          Intrinsic IntLd, Intrinsic IntLd256,
7127                          Intrinsic IntSt, Intrinsic IntSt256,
7128                          X86SchedWriteMaskMove schedX,
7129                          X86SchedWriteMaskMove schedY> {
7130  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
7131             (ins VR128:$src1, f128mem:$src2),
7132             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7133             [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
7134             VEX_4V, Sched<[schedX.RM]>;
7135  def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
7136             (ins VR256:$src1, f256mem:$src2),
7137             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7138             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
7139             VEX_4V, VEX_L, Sched<[schedY.RM]>;
7140  def mr  : AVX8I<opc_mr, MRMDestMem, (outs),
7141             (ins f128mem:$dst, VR128:$src1, VR128:$src2),
7142             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7143             [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>,
7144             VEX_4V, Sched<[schedX.MR]>;
7145  def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
7146             (ins f256mem:$dst, VR256:$src1, VR256:$src2),
7147             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7148             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
7149             VEX_4V, VEX_L, Sched<[schedY.MR]>;
7150}
7151
// Single-precision masked moves: load opcode 0x2C, store opcode 0x2E.
7152let ExeDomain = SSEPackedSingle in
7153defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
7154                                 int_x86_avx_maskload_ps,
7155                                 int_x86_avx_maskload_ps_256,
7156                                 int_x86_avx_maskstore_ps,
7157                                 int_x86_avx_maskstore_ps_256,
7158                                 WriteFMaskMove32, WriteFMaskMove32Y>;
// Double-precision masked moves: load opcode 0x2D, store opcode 0x2F.
7159let ExeDomain = SSEPackedDouble in
7160defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
7161                                 int_x86_avx_maskload_pd,
7162                                 int_x86_avx_maskload_pd_256,
7163                                 int_x86_avx_maskstore_pd,
7164                                 int_x86_avx_maskstore_pd_256,
7165                                 WriteFMaskMove64, WriteFMaskMove64Y>;
7166
7167//===----------------------------------------------------------------------===//
7168// VPERMIL - Permute Single and Double Floating-Point Values
7169//
7170
// avx_permil - VPERMILPS/VPERMILPD template producing four definitions, all
// gated on [HasAVX, NoVLX]:
//   rr / rm - variable permute: the control is a vector register or a loaded
//             vector ($src2), matched as X86VPermilpv.
//   ri / mi - immediate permute: the control is an 8-bit immediate ($src2),
//             matched as X86VPermilpi; the data source is a register (ri) or
//             memory (mi).
// Parameters:
//   opc_rm, opc_rmi - opcodes of the variable and immediate forms.
//   OpcodeStr       - mnemonic shared by all four forms.
//   RC              - vector register class.
//   x86memop_f      - memory operand type of the data source in `mi`.
//   x86memop_i      - memory operand type of the control vector in `rm`.
//   f_vt, i_vt      - value types of the data and of the control vector.
//   sched           - scheduling write of the immediate forms.
//   varsched        - scheduling write of the variable forms.
7171multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
7172                      RegisterClass RC, X86MemOperand x86memop_f,
7173                      X86MemOperand x86memop_i,
7174                      ValueType f_vt, ValueType i_vt,
7175                      X86FoldableSchedWrite sched,
7176                      X86FoldableSchedWrite varsched> {
7177  let Predicates = [HasAVX, NoVLX] in {
7178    def rr  : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
7179               (ins RC:$src1, RC:$src2),
7180               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7181               [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V,
7182               Sched<[varsched]>;
    // Both scheduling entries of the folded variable form come from varsched,
    // keeping the write and its read-after-fold paired as in every other
    // folded definition in this file.
7183    def rm  : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
7184               (ins RC:$src1, x86memop_i:$src2),
7185               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7186               [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
7187                              (i_vt (load addr:$src2)))))]>, VEX_4V,
7188               Sched<[varsched.Folded, varsched.ReadAfterFold]>;
7189
7190    def ri  : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
7191             (ins RC:$src1, u8imm:$src2),
7192             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7193             [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 timm:$src2))))]>, VEX,
7194             Sched<[sched]>;
    // The only register-class operand is the destination, so there is no
    // ReadAfterFold entry here.
7195    def mi  : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
7196             (ins x86memop_f:$src1, u8imm:$src2),
7197             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7198             [(set RC:$dst,
7199               (f_vt (X86VPermilpi (load addr:$src1), (i8 timm:$src2))))]>, VEX,
7200             Sched<[sched.Folded]>;
7201  } // Predicates = [HasAVX, NoVLX]
7202}
7203
// VPERMILPS / VPERMILPSY: single-precision instantiations of avx_permil.
// The XMM form uses VR128 with the v4f32/v4i32 value types; the YMM form uses
// VR256 with v8f32/v8i32 and additionally sets VEX_L. Both share the same
// opcode pair and the "vpermilps" mnemonic.
7204let ExeDomain = SSEPackedSingle in {
7205  defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
7206                               v4f32, v4i32, SchedWriteFShuffle.XMM,
7207                               SchedWriteFVarShuffle.XMM>;
7208  defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
7209                               v8f32, v8i32, SchedWriteFShuffle.YMM,
7210                               SchedWriteFVarShuffle.YMM>, VEX_L;
7211}
// VPERMILPD / VPERMILPDY: double-precision instantiations of avx_permil.
// The XMM form uses VR128 with the v2f64/v2i64 value types; the YMM form uses
// VR256 with v4f64/v4i64 and additionally sets VEX_L. Both share the same
// opcode pair and the "vpermilpd" mnemonic.
7212let ExeDomain = SSEPackedDouble in {
7213  defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
7214                               v2f64, v2i64, SchedWriteFShuffle.XMM,
7215                               SchedWriteFVarShuffle.XMM>;
7216  defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
7217                               v4f64, v4i64, SchedWriteFShuffle.YMM,
7218                               SchedWriteFVarShuffle.YMM>, VEX_L;
7219}
7220
7221//===----------------------------------------------------------------------===//
7222// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
7223//
7224
// vperm2f128 instruction records (opcode 0x06, VEX_4V + VEX_L, VR256 only).
//   VPERM2F128rr - both sources are YMM registers; flagged isCommutable.
//   VPERM2F128rm - second source is a folded loadv4f64 from f256mem.
// Both select X86VPerm2x128 with the i8 target immediate $src3; only the rr
// pattern carries an explicit v4f64 result type.
7225let ExeDomain = SSEPackedSingle in {
7226let isCommutable = 1 in
7227def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
7228          (ins VR256:$src1, VR256:$src2, u8imm:$src3),
7229          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7230          [(set VR256:$dst, (v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
7231                              (i8 timm:$src3))))]>, VEX_4V, VEX_L,
7232          Sched<[WriteFShuffle256]>;
7233def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
7234          (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
7235          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7236          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4f64 addr:$src2),
7237                             (i8 timm:$src3)))]>, VEX_4V, VEX_L,
7238          Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
7239}
7240
7241// Immediate transform to help with commuting.
// XORs the immediate with 0x22 (i.e. toggles bits 1 and 5) and returns the
// result as an i8 immediate. It is applied by the X86VPerm2x128 patterns that
// swap the two sources so that a load can be folded as the memory operand.
// NOTE(review): bits 1 and 5 are presumably the source-select bits of the two
// 128-bit lane selectors - confirm against the VPERM2F128/VPERM2I128 encoding.
7242def Perm2XCommuteImm : SDNodeXForm<timm, [{
7243  return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N));
7244}]>;
7245
// Commuted-load selection for v4f64 X86VPerm2x128: when the load is the first
// node operand, emit VPERM2F128rm with the register as $src1 and the load as
// the memory operand, rewriting the immediate through Perm2XCommuteImm.
7246let Predicates = [HasAVX] in {
7247// Pattern with load in other operand.
7248def : Pat<(v4f64 (X86VPerm2x128 (loadv4f64 addr:$src2),
7249                                VR256:$src1, (i8 timm:$imm))),
7250          (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
7251}
7252
// Under HasAVX1Only, v4i64 X86VPerm2x128 nodes are selected onto the
// VPERM2F128 instructions: register/register, register/load, and the
// load-first form (operands swapped, immediate passed through
// Perm2XCommuteImm).
7253let Predicates = [HasAVX1Only] in {
7254def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))),
7255          (VPERM2F128rr VR256:$src1, VR256:$src2, timm:$imm)>;
7256def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
7257                  (loadv4i64 addr:$src2), (i8 timm:$imm))),
7258          (VPERM2F128rm VR256:$src1, addr:$src2, timm:$imm)>;
7259// Pattern with load in other operand.
7260def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
7261                                VR256:$src1, (i8 timm:$imm))),
7262          (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
7263}
7264
7265//===----------------------------------------------------------------------===//
7266// VZERO - Zero YMM registers
7267// Note: These instructions do not affect the YMM16-YMM31.
7268//
7269
// VZEROALL and VZEROUPPER share opcode 0x77 (RawFrm, PS, VEX, VEX_WIG) and
// differ only in VEX_L, which is set on VZEROALL. Both take no explicit
// operands, are selected from their AVX intrinsics, require HasAVX, use the
// WriteSystem scheduling class, and are declared as defining YMM0-YMM15.
7270let SchedRW = [WriteSystem] in {
7271let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
7272            YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
7273  // Zero All YMM registers
7274  def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
7275                  [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L,
7276                  Requires<[HasAVX]>, VEX_WIG;
7277
7278  // Zero Upper bits of YMM registers
7279  def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
7280                     [(int_x86_avx_vzeroupper)]>, PS, VEX,
7281                     Requires<[HasAVX]>, VEX_WIG;
7282} // Defs
7283} // SchedRW
7284
7285//===----------------------------------------------------------------------===//
7286// Half precision conversion instructions
7287//
7288
/// f16c_ph2ps - vcvtph2ps register and memory forms (opcode 0x13, T8PD, VEX).
///   RC       - destination register class; the register source is always
///              VR128.
///   x86memop - memory operand type of the load form.
///   sched    - scheduling class; the load form uses sched.Folded.
/// The rm form has an empty pattern list, so hasSideEffects/mayLoad are set
/// explicitly on it.
7289multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop,
7290                      X86FoldableSchedWrite sched> {
7291  def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
7292             "vcvtph2ps\t{$src, $dst|$dst, $src}",
7293             [(set RC:$dst, (X86any_cvtph2ps VR128:$src))]>,
7294             T8PD, VEX, Sched<[sched]>;
7295  let hasSideEffects = 0, mayLoad = 1 in
7296  def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
7297             "vcvtph2ps\t{$src, $dst|$dst, $src}",
7298             []>, T8PD, VEX, Sched<[sched.Folded]>;
7299}
7300
/// f16c_ps2ph - vcvtps2ph register and store forms (opcode 0x1D, TAPD, VEX).
///   RC       - source register class; the register destination is always
///              VR128.
///   x86memop - memory operand type of the store form.
///   RR / MR  - scheduling classes of the register and store forms.
/// Both forms take an i32u8imm immediate $src2. The mr form has an empty
/// pattern list, so hasSideEffects/mayStore are set explicitly on it.
7301multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
7302                      SchedWrite RR, SchedWrite MR> {
7303  def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
7304               (ins RC:$src1, i32u8imm:$src2),
7305               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7306               [(set VR128:$dst, (X86any_cvtps2ph RC:$src1, timm:$src2))]>,
7307               TAPD, VEX, Sched<[RR]>;
7308  let hasSideEffects = 0, mayStore = 1 in
7309  def mr : Ii8<0x1D, MRMDestMem, (outs),
7310               (ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
7311               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
7312               TAPD, VEX, Sched<[MR]>;
7313}
7314
// F16C instantiations and the selection patterns for their memory forms,
// all under [HasF16C, NoVLX]. The XMM variants use f64mem and the YMM
// variants use f128mem plus VEX_L; every defm also mixes in SIMD_EXC.
7315let Predicates = [HasF16C, NoVLX] in {
7316  defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>, SIMD_EXC;
7317  defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L, SIMD_EXC;
7318  defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH,
7319                               WriteCvtPS2PHSt>, SIMD_EXC;
7320  defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY,
7321                               WriteCvtPS2PHYSt>, VEX_L, SIMD_EXC;
7322
7323  // Pattern match vcvtph2ps of a scalar i64 load.
  // Two spellings of the 64-bit load are accepted (X86vzload64 and
  // scalar_to_vector of loadi64); the v8f32 form folds a full loadv8i16.
7324  def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
7325            (VCVTPH2PSrm addr:$src)>;
7326  def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16
7327              (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
7328            (VCVTPH2PSrm addr:$src)>;
7329  def : Pat<(v8f32 (X86any_cvtph2ps (loadv8i16 addr:$src))),
7330            (VCVTPH2PSYrm addr:$src)>;
7331
  // Fold a store of the conversion result into the mr forms: for the VR128
  // source only element 0 of the result viewed as f64 or i64 is stored; for
  // the VR256 source the whole v8i16 result is stored.
7332  def : Pat<(store (f64 (extractelt
7333                         (bc_v2f64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))),
7334                         (iPTR 0))), addr:$dst),
7335            (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
7336  def : Pat<(store (i64 (extractelt
7337                         (bc_v2i64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))),
7338                         (iPTR 0))), addr:$dst),
7339            (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
7340  def : Pat<(store (v8i16 (X86any_cvtps2ph VR256:$src1, timm:$src2)), addr:$dst),
7341            (VCVTPS2PHYmr addr:$dst, VR256:$src1, timm:$src2)>;
7342}
7343
7344//===----------------------------------------------------------------------===//
7345// AVX2 Instructions
7346//===----------------------------------------------------------------------===//
7347
7348/// AVX2_blend_rmi - AVX2 blend with 8-bit immediate
///   opc, OpcodeStr - opcode byte and mnemonic.
///   OpNode, OpVT   - DAG node and result value type matched by the patterns.
///   sched          - scheduling class (rmi uses Folded + ReadAfterFold).
///   RC, x86memop   - register class and memory operand type.
///   commuteXForm   - immediate transform applied when the sources are
///                    swapped to fold a load that appears as the first source.
/// Defines rri (flagged isCommutable), rmi, and an anonymous pattern that
/// maps the load-first form onto rmi.
7349multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
7350                          ValueType OpVT, X86FoldableSchedWrite sched,
7351                          RegisterClass RC,
7352                          X86MemOperand x86memop, SDNodeXForm commuteXForm> {
7353  let isCommutable = 1 in
7354  def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
7355        (ins RC:$src1, RC:$src2, u8imm:$src3),
7356        !strconcat(OpcodeStr,
7357            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
7358        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
7359        Sched<[sched]>, VEX_4V;
7360  def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
7361        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
7362        !strconcat(OpcodeStr,
7363            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
7364        [(set RC:$dst,
7365          (OpVT (OpNode RC:$src1, (load addr:$src2), timm:$src3)))]>,
7366        Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V;
7367
7368  // Pattern to commute if load is in first source.
7369  def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, timm:$src3)),
7370            (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
7371                                            (commuteXForm timm:$src3))>;
7372}
7373
// VPBLENDD (v4i32, VR128) and VPBLENDDY (v8i32, VR256, VEX_L) instantiate
// AVX2_blend_rmi on X86Blendi. The anonymous patterns below additionally map
// X86Blendi on 64-bit-element vectors (v4i64, v2i64) onto the same
// instructions, passing the immediate through BlendScaleImm4 /
// BlendScaleImm2to4, or the *Commute* variant when the load is the first
// source and the operands are swapped.
7374let Predicates = [HasAVX2] in {
7375defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
7376                               SchedWriteBlend.XMM, VR128, i128mem,
7377                               BlendCommuteImm4>;
7378defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
7379                                SchedWriteBlend.YMM, VR256, i256mem,
7380                                BlendCommuteImm8>, VEX_L;
7381
7382def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
7383          (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 timm:$src3))>;
7384def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
7385          (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
7386def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
7387          (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
7388
7389def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
7390          (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 timm:$src3))>;
7391def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
7392          (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 timm:$src3))>;
7393def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
7394          (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 timm:$src3))>;
7395}
7396
7397// For insertion into the zero index (low half) of a 256-bit vector, it is
7398// more efficient to generate a blend with immediate instead of an insert*128.
7399// NOTE: We're using FP instructions here, but execution domain fixing should
7400// take care of using integer instructions when profitable.
// insert_subvector at index 0 for the four integer 256-bit types. In every
// pattern the 128-bit value is widened into an IMPLICIT_DEF v8i32 via
// INSERT_SUBREG/sub_xmm and then blended with VBLENDPSY.
//   Register form: the widened value is the second blend source, mask 0xf.
//   Load form:     the widened value is the first blend source and the
//                  256-bit load is the memory operand, mask 0xf0.
7401let Predicates = [HasAVX] in {
7402def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
7403          (VBLENDPSYrri VR256:$src1,
7404                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7405                                       VR128:$src2, sub_xmm), 0xf)>;
7406def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
7407          (VBLENDPSYrri VR256:$src1,
7408                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7409                                       VR128:$src2, sub_xmm), 0xf)>;
7410def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
7411          (VBLENDPSYrri VR256:$src1,
7412                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7413                                       VR128:$src2, sub_xmm), 0xf)>;
7414def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
7415          (VBLENDPSYrri VR256:$src1,
7416                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7417                                       VR128:$src2, sub_xmm), 0xf)>;
7418
7419def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1), (iPTR 0)),
7420          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7421                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7422def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0)),
7423          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7424                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7425def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)),
7426          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7427                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7428def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)),
7429          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7430                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7431}
7432
7433//===----------------------------------------------------------------------===//
7434// VPBROADCAST - Load from memory and broadcast to all elements of the
7435//               destination operand
7436//
/// avx2_broadcast - element broadcast into a 128-bit (rr/rm) or 256-bit
/// (Yrr/Yrm, VEX_L) destination.
///   opc, OpcodeStr   - opcode byte and mnemonic.
///   x86memop         - memory operand type of the load forms.
///   bcast_frag       - fragment matched by the load forms.
///   OpVT128, OpVT256 - result value types of the XMM and YMM forms.
///   prd              - extra predicate combined with HasAVX2.
/// The register source is always VR128, including for the YMM forms; the
/// trailing pattern handles a VR256 source by extracting its sub_xmm part.
/// NOTE(review): Yrm is scheduled with SchedWriteShuffle.XMM.Folded while Yrr
/// uses WriteShuffle256 - confirm this asymmetry is intended.
7437multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
7438                          X86MemOperand x86memop, PatFrag bcast_frag,
7439                          ValueType OpVT128, ValueType OpVT256, Predicate prd> {
7440  let Predicates = [HasAVX2, prd] in {
7441    def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
7442                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7443                  [(set VR128:$dst,
7444                   (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
7445                  Sched<[SchedWriteShuffle.XMM]>, VEX;
7446    def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
7447                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7448                  [(set VR128:$dst,
7449                   (OpVT128 (bcast_frag addr:$src)))]>,
7450                  Sched<[SchedWriteShuffle.XMM.Folded]>, VEX;
7451    def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
7452                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7453                   [(set VR256:$dst,
7454                    (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>,
7455                   Sched<[WriteShuffle256]>, VEX, VEX_L;
7456    def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
7457                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7458                   [(set VR256:$dst,
7459                    (OpVT256 (bcast_frag addr:$src)))]>,
7460                   Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L;
7461
7462    // Provide aliases for broadcast from the same register class that
7463    // automatically does the extract.
7464    def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))),
7465              (!cast<Instruction>(NAME#"Yrr")
7466                  (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>;
7467  }
7468}
7469
// Byte/word/dword/qword instantiations of avx2_broadcast. The byte and word
// forms are additionally predicated on NoVLX_Or_NoBWI, the dword and qword
// forms on NoVLX.
7470defm VPBROADCASTB  : avx2_broadcast<0x78, "vpbroadcastb", i8mem, X86VBroadcastld8,
7471                                    v16i8, v32i8, NoVLX_Or_NoBWI>;
7472defm VPBROADCASTW  : avx2_broadcast<0x79, "vpbroadcastw", i16mem, X86VBroadcastld16,
7473                                    v8i16, v16i16, NoVLX_Or_NoBWI>;
7474defm VPBROADCASTD  : avx2_broadcast<0x58, "vpbroadcastd", i32mem, X86VBroadcastld32,
7475                                    v4i32, v8i32, NoVLX>;
7476defm VPBROADCASTQ  : avx2_broadcast<0x59, "vpbroadcastq", i64mem, X86VBroadcastld64,
7477                                    v2i64, v4i64, NoVLX>;
7478
// Broadcast of a scalar FP register (FR32/FR64): the scalar is first moved
// into VR128 with COPY_TO_REGCLASS and then fed to the register form of
// VBROADCASTSS / VBROADCASTSD.
7479let Predicates = [HasAVX2, NoVLX] in {
7480  // Provide fallback in case the load node that is used in the patterns above
7481  // is used by additional users, which prevents the pattern selection.
7482    def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
7483              (VBROADCASTSSrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
7484    def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
7485              (VBROADCASTSSYrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
7486    def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
7487              (VBROADCASTSDYrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
7488}
7489
// Broadcast of an 8- or 16-bit GPR: the value is placed in the low
// sub-register of an IMPLICIT_DEF i32 (INSERT_SUBREG with sub_8bit /
// sub_16bit), moved to a vector register with VMOVDI2PDIrr, and then
// broadcast with VPBROADCASTB / VPBROADCASTW (XMM or YMM destination).
7490let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
7491  def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
7492        (VPBROADCASTBrr (VMOVDI2PDIrr
7493                         (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7494                                             GR8:$src, sub_8bit))))>;
7495  def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
7496        (VPBROADCASTBYrr (VMOVDI2PDIrr
7497                          (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7498                                              GR8:$src, sub_8bit))))>;
7499
7500  def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
7501        (VPBROADCASTWrr (VMOVDI2PDIrr
7502                         (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7503                                             GR16:$src, sub_16bit))))>;
7504  def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
7505        (VPBROADCASTWYrr (VMOVDI2PDIrr
7506                          (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7507                                              GR16:$src, sub_16bit))))>;
7508}
// Broadcast of a 32- or 64-bit GPR: move the GPR into a vector register
// (VMOVDI2PDIrr for GR32, VMOV64toPQIrr for GR64) and broadcast it with
// VPBROADCASTD / VPBROADCASTQ (XMM or YMM destination).
7509let Predicates = [HasAVX2, NoVLX] in {
7510  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
7511            (VPBROADCASTDrr (VMOVDI2PDIrr GR32:$src))>;
7512  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
7513            (VPBROADCASTDYrr (VMOVDI2PDIrr GR32:$src))>;
7514  def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
7515            (VPBROADCASTQrr (VMOV64toPQIrr GR64:$src))>;
7516  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
7517            (VPBROADCASTQYrr (VMOV64toPQIrr GR64:$src))>;
7518}
7519
7520// AVX1 broadcast patterns
// Under HasAVX1Only, integer-typed broadcast loads are selected onto the
// memory forms of VBROADCASTSS / VBROADCASTSD.
7521let Predicates = [HasAVX1Only] in {
7522def : Pat<(v8i32 (X86VBroadcastld32 addr:$src)),
7523          (VBROADCASTSSYrm addr:$src)>;
7524def : Pat<(v4i64 (X86VBroadcastld64 addr:$src)),
7525          (VBROADCASTSDYrm addr:$src)>;
7526def : Pat<(v4i32 (X86VBroadcastld32 addr:$src)),
7527          (VBROADCASTSSrm addr:$src)>;
7528}
7529
7530  // Provide fallback in case the load node that is used in the patterns above
7531  // is used by additional users, which prevents the pattern selection.
// v2f64 broadcasts are selected onto VMOVDDUP: from a scalar f64 register
// (moved into VR128 first), from a 64-bit broadcast load (VMOVDDUPrm), and
// from a v2f64 register.
7532let Predicates = [HasAVX, NoVLX] in {
7533  // 128bit broadcasts:
7534  def : Pat<(v2f64 (X86VBroadcast f64:$src)),
7535            (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
7536  def : Pat<(v2f64 (X86VBroadcastld64 addr:$src)),
7537            (VMOVDDUPrm addr:$src)>;
7538
7539  def : Pat<(v2f64 (X86VBroadcast v2f64:$src)),
7540            (VMOVDDUPrr VR128:$src)>;
7541}
7542
// HasAVX1Only expansions of register broadcasts.
// 128-bit results use a single shuffle of the scalar moved into a vector
// register (VPERMILPSri/VPSHUFDri with immediate 0 for 32-bit elements,
// VPSHUFDri with 0x44 for 64-bit integer elements).
// 256-bit results build that same 128-bit splat, place it in the sub_xmm
// part of an IMPLICIT_DEF YMM value, and insert the same splat expression
// again with VINSERTF128rr at index 1.
7543let Predicates = [HasAVX1Only] in {
7544  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
7545            (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>;
7546  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
7547            (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
7548              (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm),
7549              (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>;
7550  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
7551            (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
7552              (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm),
7553              (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>;
7554
7555  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
7556            (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)>;
7557  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
7558            (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7559              (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), sub_xmm),
7560              (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), 1)>;
7561  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
7562            (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
7563              (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), sub_xmm),
7564              (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), 1)>;
7565
7566  def : Pat<(v2i64 (X86VBroadcast i64:$src)),
7567            (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)>;
7568  def : Pat<(v2i64 (X86VBroadcastld64 addr:$src)),
7569            (VMOVDDUPrm addr:$src)>;
7570}
7571
7572//===----------------------------------------------------------------------===//
7573// VPERM - Permute instructions
7574//
7575
/// avx2_perm - 256-bit two-source permute selected from X86VPermv
/// (VEX_4V + VEX_L, under [HasAVX2, NoVLX]).
///   opc, OpcodeStr - opcode byte and mnemonic.
///   OpVT           - result value type of the patterns.
///   Sched          - scheduling class (Yrm uses Folded + ReadAfterFold).
///   memOp          - memory operand type of the Yrm form.
/// Yrr takes both sources in VR256; Yrm folds a load as the second source.
7576multiclass avx2_perm<bits<8> opc, string OpcodeStr,
7577                     ValueType OpVT, X86FoldableSchedWrite Sched,
7578                     X86MemOperand memOp> {
7579  let Predicates = [HasAVX2, NoVLX] in {
7580    def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
7581                     (ins VR256:$src1, VR256:$src2),
7582                     !strconcat(OpcodeStr,
7583                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7584                     [(set VR256:$dst,
7585                       (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
7586                     Sched<[Sched]>, VEX_4V, VEX_L;
7587    def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
7588                     (ins VR256:$src1, memOp:$src2),
7589                     !strconcat(OpcodeStr,
7590                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7591                     [(set VR256:$dst,
7592                       (OpVT (X86VPermv VR256:$src1,
7593                              (load addr:$src2))))]>,
7594                     Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX_4V, VEX_L;
7595  }
7596}
7597
// avx2_perm instantiations: VPERMD (v8i32, i256mem) and VPERMPS (v8f32,
// f256mem, SSEPackedSingle execution domain).
7598defm VPERMD : avx2_perm<0x36, "vpermd", v8i32, WriteVarShuffle256, i256mem>;
7599let ExeDomain = SSEPackedSingle in
7600defm VPERMPS : avx2_perm<0x16, "vpermps", v8f32, WriteFVarShuffle256, f256mem>;
7601
/// avx2_perm_imm - 256-bit single-source permute with an 8-bit immediate,
/// selected from X86VPermi (VEX + VEX_L, under [HasAVX2, NoVLX]).
///   opc, OpcodeStr - opcode byte and mnemonic.
///   mem_frag       - load fragment matched by the Ymi form.
///   OpVT           - result value type of the patterns.
///   Sched          - scheduling class.
///   memOp          - memory operand type of the Ymi form.
/// NOTE(review): Ymi has no register source besides the folded load yet
/// lists Sched.ReadAfterFold, whereas the avx_permil mi form uses only
/// sched.Folded - confirm which is intended.
7602multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
7603                         ValueType OpVT, X86FoldableSchedWrite Sched,
7604                         X86MemOperand memOp> {
7605  let Predicates = [HasAVX2, NoVLX] in {
7606    def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
7607                       (ins VR256:$src1, u8imm:$src2),
7608                       !strconcat(OpcodeStr,
7609                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7610                       [(set VR256:$dst,
7611                         (OpVT (X86VPermi VR256:$src1, (i8 timm:$src2))))]>,
7612                       Sched<[Sched]>, VEX, VEX_L;
7613    def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
7614                       (ins memOp:$src1, u8imm:$src2),
7615                       !strconcat(OpcodeStr,
7616                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7617                       [(set VR256:$dst,
7618                         (OpVT (X86VPermi (mem_frag addr:$src1),
7619                                (i8 timm:$src2))))]>,
7620                       Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VEX_L;
7621  }
7622}
7623
// avx2_perm_imm instantiations, both with VEX_W: VPERMQ (v4i64, loadv4i64,
// i256mem) and VPERMPD (v4f64, loadv4f64, f256mem, SSEPackedDouble domain).
7624defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
7625                            WriteShuffle256, i256mem>, VEX_W;
7626let ExeDomain = SSEPackedDouble in
7627defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
7628                             WriteFShuffle256, f256mem>, VEX_W;
7629
7630//===----------------------------------------------------------------------===//
7631// VPERM2I128 - Permute Integer Values in 128-bit chunks
7632//
// vperm2i128 instruction records (opcode 0x46, VEX_4V + VEX_L, VR256 only).
//   VPERM2I128rr - both sources are YMM registers; flagged isCommutable.
//   VPERM2I128rm - second source is a folded loadv4i64.
// Both select X86VPerm2x128 with the i8 target immediate $src3; only the rr
// pattern carries an explicit v4i64 result type.
// NOTE(review): the rm form declares its memory operand as f256mem although
// the pattern loads v4i64 - confirm whether i256mem was intended.
7633let isCommutable = 1 in
7634def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
7635          (ins VR256:$src1, VR256:$src2, u8imm:$src3),
7636          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7637          [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
7638                            (i8 timm:$src3))))]>, Sched<[WriteShuffle256]>,
7639          VEX_4V, VEX_L;
7640def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
7641          (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
7642          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7643          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
7644                             (i8 timm:$src3)))]>,
7645          Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
7646
// Commuted-load selection for v4i64 X86VPerm2x128 under HasAVX2: when the
// load is the first node operand, emit VPERM2I128rm with the operands swapped
// and the immediate rewritten by Perm2XCommuteImm.
7647let Predicates = [HasAVX2] in
7648def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
7649                                VR256:$src1, (i8 timm:$imm))),
7650          (VPERM2I128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
7651
7652
7653//===----------------------------------------------------------------------===//
7654// VINSERTI128 - Insert packed integer values
7655//
// vinserti128 instruction records (opcode 0x38, VEX_4V + VEX_L): insert a
// 128-bit source (VR128 register or i128mem) into VR256:$src1 under control
// of the 8-bit immediate $src3. Both records have empty pattern lists, so
// hasSideEffects = 0 (and mayLoad = 1 for rm) are given explicitly; selection
// is provided by the vinsert_lowering instantiations for "VINSERTI128".
7656let hasSideEffects = 0 in {
7657def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
7658          (ins VR256:$src1, VR128:$src2, u8imm:$src3),
7659          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7660          []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
7661let mayLoad = 1 in
7662def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
7663          (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
7664          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7665          []>, Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
7666}
7667
// Instantiate vinsert_lowering for "VINSERTI128" once per integer element
// width, pairing each 128-bit type with its 256-bit type and 128-bit load
// fragment.
7668let Predicates = [HasAVX2, NoVLX] in {
7669  defm : vinsert_lowering<"VINSERTI128", v2i64, v4i64,  loadv2i64>;
7670  defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32,  loadv4i32>;
7671  defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv8i16>;
7672  defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8,  loadv16i8>;
7673}
7674
7675//===----------------------------------------------------------------------===//
7676// VEXTRACTI128 - Extract packed integer values
7677//
// vextracti128 instruction records (opcode 0x39, VEX + VEX_L): extract a
// 128-bit part of VR256:$src1, chosen by the 8-bit immediate $src2, into a
// VR128 register (rr) or to i128mem (mr). Both have empty pattern lists;
// only the mr form sets hasSideEffects = 0 / mayStore = 1 explicitly.
// Selection is provided by the vextract_lowering instantiations for
// "VEXTRACTI128".
7678def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
7679          (ins VR256:$src1, u8imm:$src2),
7680          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
7681          Sched<[WriteShuffle256]>, VEX, VEX_L;
7682let hasSideEffects = 0, mayStore = 1 in
7683def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
7684          (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
7685          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
7686          Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_L;
7687
// Instantiate vextract_lowering for "VEXTRACTI128" once per integer element
// width, pairing each 256-bit type with its 128-bit type.
7688let Predicates = [HasAVX2, NoVLX] in {
7689  defm : vextract_lowering<"VEXTRACTI128", v4i64,  v2i64>;
7690  defm : vextract_lowering<"VEXTRACTI128", v8i32,  v4i32>;
7691  defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
7692  defm : vextract_lowering<"VEXTRACTI128", v32i8,  v16i8>;
7693}
7694
7695//===----------------------------------------------------------------------===//
7696// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
7697//
/// avx2_pmovmask - masked integer load/store instruction records.
///   OpcodeStr          - mnemonic.
///   IntLd128/IntLd256  - intrinsics matched by the load forms (rm, Yrm).
///   IntSt128/IntSt256  - intrinsics matched by the store forms (mr, Ymr).
///   schedX / schedY    - XMM / YMM scheduling info; .RM is used for loads
///                        and .MR for stores.
/// Loads use opcode 0x8c, stores use opcode 0x8e; the Y forms add VEX_L.
/// In every form $src1 is the register that maskmov_lowering binds to the
/// mask; for stores $src2 is the data register.
7698multiclass avx2_pmovmask<string OpcodeStr,
7699                         Intrinsic IntLd128, Intrinsic IntLd256,
7700                         Intrinsic IntSt128, Intrinsic IntSt256,
7701                         X86SchedWriteMaskMove schedX,
7702                         X86SchedWriteMaskMove schedY> {
7703  def rm  : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
7704             (ins VR128:$src1, i128mem:$src2),
7705             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7706             [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>,
7707             VEX_4V, Sched<[schedX.RM]>;
7708  def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
7709             (ins VR256:$src1, i256mem:$src2),
7710             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7711             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
7712             VEX_4V, VEX_L, Sched<[schedY.RM]>;
7713  def mr  : AVX28I<0x8e, MRMDestMem, (outs),
7714             (ins i128mem:$dst, VR128:$src1, VR128:$src2),
7715             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7716             [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>,
7717             VEX_4V, Sched<[schedX.MR]>;
7718  def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
7719             (ins i256mem:$dst, VR256:$src1, VR256:$src2),
7720             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7721             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
7722             VEX_4V, VEX_L, Sched<[schedY.MR]>;
7723}
7724
// avx2_pmovmask instantiations for 32-bit (VPMASKMOVD) and 64-bit
// (VPMASKMOVQ) elements; the 64-bit variant additionally sets VEX_W.
7725defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
7726                                int_x86_avx2_maskload_d,
7727                                int_x86_avx2_maskload_d_256,
7728                                int_x86_avx2_maskstore_d,
7729                                int_x86_avx2_maskstore_d_256,
7730                                WriteVecMaskMove32, WriteVecMaskMove32Y>;
7731defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
7732                                int_x86_avx2_maskload_q,
7733                                int_x86_avx2_maskload_q_256,
7734                                int_x86_avx2_maskstore_q,
7735                                int_x86_avx2_maskstore_q_256,
7736                                WriteVecMaskMove64, WriteVecMaskMove64Y>, VEX_W;
7737
/// maskmov_lowering - select generic masked_store / masked_load nodes onto
/// the "mr" / "rm" records of the instruction family named by InstrStr.
///   InstrStr - instruction name prefix; "mr" or "rm" is appended.
///   RC       - register class of both the data and the mask.
///   VT       - data value type.
///   MaskVT   - mask value type.
/// Masked loads are matched only when the pass-through operand is undef or
/// an all-zeros vector.
7738multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
7739                          ValueType MaskVT> {
7740    // masked store
7741    def: Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)),
7742             (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
7743    // masked load
7744    def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)),
7745             (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
7746    def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask),
7747                              (VT immAllZerosV))),
7748             (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
7749}
// FP masked load/store: f32/f64 vectors use VMASKMOVPS / VMASKMOVPD (and the
// Y variants for VR256), with an integer mask type of matching element width.
7750let Predicates = [HasAVX] in {
7751  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32>;
7752  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64>;
7753  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32>;
7754  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64>;
7755}
// Under HasAVX1Only, integer-typed masked loads/stores are mapped onto the
// FP VMASKMOVPS / VMASKMOVPD instructions of matching element width.
7756let Predicates = [HasAVX1Only] in {
7757  // load/store i32/i64 not supported use ps/pd version
7758  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32>;
7759  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64>;
7760  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32>;
7761  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64>;
7762}
// Under HasAVX2, integer-typed masked loads/stores use VPMASKMOVD /
// VPMASKMOVQ (and the Y variants for VR256).
7763let Predicates = [HasAVX2] in {
7764  defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32>;
7765  defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64>;
7766  defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32>;
7767  defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64>;
7768}
7769
7770//===----------------------------------------------------------------------===//
7771// SubVector Broadcasts
7772// Provide fallback in case the load node that is used in the patterns above
7773// is used by additional users, which prevents the pattern selection.
7774
// FP X86SubVBroadcast of a 128-bit register: place $src in the sub_xmm part
// of an IMPLICIT_DEF 256-bit value, then insert $src again with
// VINSERTF128rr at index 1.
7775let Predicates = [HasAVX, NoVLX] in {
7776def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128:$src))),
7777          (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
7778                         (v2f64 VR128:$src), 1)>;
7779def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128:$src))),
7780          (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
7781                         (v4f32 VR128:$src), 1)>;
7782}
7783
7784// NOTE: We're using FP instructions here, but execution domain fixing can
7785// convert to integer when profitable.
// Integer X86SubVBroadcast of a 128-bit register, one pattern per element
// width: place $src in the sub_xmm part of an IMPLICIT_DEF 256-bit value,
// then insert $src again with VINSERTF128rr at index 1.
7786let Predicates = [HasAVX, NoVLX] in {
7787def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
7788          (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
7789                         (v2i64 VR128:$src), 1)>;
7790def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))),
7791          (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
7792                         (v4i32 VR128:$src), 1)>;
7793def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))),
7794          (VINSERTF128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
7795                         (v8i16 VR128:$src), 1)>;
7796def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
7797          (VINSERTF128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
7798                         (v16i8 VR128:$src), 1)>;
7799}
7800
7801//===----------------------------------------------------------------------===//
7802// Variable Bit Shifts
7803//
// Defines the four forms of one AVX2 variable-shift instruction:
//   rr  / rm  - VR128 operands; second source is a register / a 128-bit load
//   Yrr / Yrm - VR256 operands (VEX_L); second source is a register / a
//               256-bit load
// Each form selects (OpNode $src1, $src2), typed as vt128 or vt256. The memory
// forms use the folded scheduling class together with its ReadAfterFold read.
multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType vt128, ValueType vt256> {
  // 128-bit, register source.
  def rr : AVX28I<opc, MRMSrcReg,
                  (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                  OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  [(set VR128:$dst,
                        (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
           VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>;

  // 128-bit, memory source.
  def rm : AVX28I<opc, MRMSrcMem,
                  (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
                  OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  [(set VR128:$dst,
                        (vt128 (OpNode VR128:$src1,
                                       (vt128 (load addr:$src2)))))]>,
           VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded,
                          SchedWriteVarVecShift.XMM.ReadAfterFold]>;

  // 256-bit, register source.
  def Yrr : AVX28I<opc, MRMSrcReg,
                   (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
                   OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                   [(set VR256:$dst,
                         (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
            VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>;

  // 256-bit, memory source.
  def Yrm : AVX28I<opc, MRMSrcMem,
                   (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
                   OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                   [(set VR256:$dst,
                         (vt256 (OpNode VR256:$src1,
                                        (vt256 (load addr:$src2)))))]>,
            VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded,
                                  SchedWriteVarVecShift.YMM.ReadAfterFold]>;
}
7835
// AVX2 variable shifts, instantiated only when AVX2 is available without VLX.
// The D and Q variants of a shift share one opcode; the Q variant adds VEX_W.
// Only a D variant is instantiated here for the X86vsrav shift.
let Predicates = [HasAVX2, NoVLX] in {
  // X86vshlv
  defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", X86vshlv, v4i32, v8i32>;
  defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", X86vshlv, v2i64, v4i64>,
                 VEX_W;

  // X86vsrlv
  defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", X86vsrlv, v4i32, v8i32>;
  defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", X86vsrlv, v2i64, v4i64>,
                 VEX_W;

  // X86vsrav
  defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", X86vsrav, v4i32, v8i32>;
}
7843
7844//===----------------------------------------------------------------------===//
7845// VGATHER - GATHER Operations
7846
// FIXME: Improve scheduling of gather instructions.
//
// Defines the two forms of one AVX2 gather instruction:
//   rm  - VR128 destination and mask, memory operand memop128
//   Yrm - RC256 destination and mask, memory operand memop256, VEX_L
// Both forms have a second output ($mask_wb) next to $dst, take $src1 and
// $mask as register inputs, and are scheduled as WriteLoad. Their pattern
// lists are empty, so the value-type parameters VTx, VTy, MTx and MTy are not
// referenced anywhere in this body.
multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx,
                       ValueType VTy, RegisterClass RC256,
                       X86MemOperand memop128, X86MemOperand memop256,
                       ValueType MTx = VTx, ValueType MTy = VTy> {
  let mayLoad = 1, hasSideEffects = 0 in {
    def rm : AVX28I<opc, MRMSrcMem4VOp3,
                    (outs VR128:$dst, VR128:$mask_wb),
                    (ins VR128:$src1, memop128:$src2, VR128:$mask),
                    OpcodeStr#"\t{$mask, $src2, $dst|$dst, $src2, $mask}",
                    []>,
             VEX, Sched<[WriteLoad]>;

    def Yrm : AVX28I<opc, MRMSrcMem4VOp3,
                     (outs RC256:$dst, RC256:$mask_wb),
                     (ins RC256:$src1, memop256:$src2, RC256:$mask),
                     OpcodeStr#"\t{$mask, $src2, $dst|$dst, $src2, $mask}",
                     []>,
              VEX, VEX_L, Sched<[WriteLoad]>;
  }
}
7865
// AVX2 gather instructions. Every definition below shares the same operand
// constraints: $dst and $mask_wb are early-clobber outputs, $src1 is tied to
// $dst and $mask is tied to $mask_wb.
let Predicates = [HasAVX2], mayLoad = 1, hasSideEffects = 0,
    Constraints =
      "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
    in {
  // Integer gathers.
  defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64,
                                VR256, vx128mem, vx256mem>, VEX_W;
  defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64,
                                VR256, vx128mem, vy256mem>, VEX_W;
  defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32,
                                VR256, vx128mem, vy256mem>;
  // The Yrm form of this one is instantiated with VR128 as its register class.
  defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32,
                                VR128, vx64mem, vy128mem>;

  // Packed-double gathers.
  let ExeDomain = SSEPackedDouble in {
    defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64,
                                  VR256, vx128mem, vx256mem, v2i64, v4i64>,
                      VEX_W;
    defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64,
                                  VR256, vx128mem, vy256mem, v2i64, v4i64>,
                      VEX_W;
  }

  // Packed-single gathers.
  let ExeDomain = SSEPackedSingle in {
    defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32,
                                  VR256, vx128mem, vy256mem, v4i32, v8i32>;
    // The Yrm form of this one is instantiated with VR128 as its register
    // class.
    defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32,
                                  VR128, vx64mem, vy128mem, v4i32, v4i32>;
  }
}
7894
7895//===----------------------------------------------------------------------===//
7896// GFNI instructions
7897//===----------------------------------------------------------------------===//
7898
// Defines the register (rr) and memory (rm) forms of a GF2P8MULB-style
// instruction at opcode 0xCF, selecting X86GF2P8mulb on operands of type OpVT.
//   RC        - register class of $dst, $src1 and the register $src2
//   MemOpFrag - fragment used to match the loaded $src2 in the rm form
//   X86MemOp  - memory operand class of $src2 in the rm form
//   Is2Addr   - when set, the asm string omits $src1 (two-operand syntax);
//               otherwise the three-operand syntax is used
// The asm string is supplied through AsmString, which is why both defs pass
// an empty string to PDI.
// NOTE(review): both forms use the XMM scheduling class whatever RC is; this
// multiclass is also instantiated with VR256 in this file - confirm that is
// intended.
multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
                        RegisterClass RC, PatFrag MemOpFrag,
                        X86MemOperand X86MemOp, bit Is2Addr = 0> {
  let ExeDomain = SSEPackedInt,
      AsmString =
        !if(Is2Addr,
            OpcodeStr#"\t{$src2, $dst|$dst, $src2}",
            OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
    // Only the register-register form is marked commutable.
    let isCommutable = 1 in {
      def rr : PDI<0xCF, MRMSrcReg,
                   (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
                   [(set RC:$dst,
                         (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
               Sched<[SchedWriteVecALU.XMM]>, T8PD;
    }

    def rm : PDI<0xCF, MRMSrcMem,
                 (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
                 [(set RC:$dst,
                       (OpVT (X86GF2P8mulb RC:$src1,
                                           (MemOpFrag addr:$src2))))]>,
             Sched<[SchedWriteVecALU.XMM.Folded,
                    SchedWriteVecALU.XMM.ReadAfterFold]>,
             T8PD;
  }
}
7917
// Defines the register (rri) and memory (rmi) forms of a GF2P8AFFINE-style
// instruction with an 8-bit immediate. Both select
// (OpNode $src1, $src2, timm:$src3) with result type OpVT.
//   RC        - register class of $dst, $src1 and the register $src2
//   MemOpFrag - fragment used to match the loaded $src2 in the rmi form
//   X86MemOp  - memory operand class of $src2 in the rmi form
//   Is2Addr   - when set, the asm string omits $src1 (two-operand syntax);
//               otherwise the three-operand syntax is used
// The asm string is supplied through AsmString, which is why both defs pass
// an empty string to Ii8.
// NOTE(review): both forms use the XMM scheduling class whatever RC is; this
// multiclass is also instantiated with VR256 in this file - confirm that is
// intended.
multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
                           SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag,
                           X86MemOperand X86MemOp, bit Is2Addr = 0> {
  let AsmString =
        !if(Is2Addr,
            OpStr#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
            OpStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")
      in {
    def rri : Ii8<Op, MRMSrcReg,
                  (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$src3), "",
                  [(set RC:$dst,
                        (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))],
                  SSEPackedInt>,
              Sched<[SchedWriteVecALU.XMM]>;

    def rmi : Ii8<Op, MRMSrcMem,
                  (outs RC:$dst),
                  (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
                  [(set RC:$dst,
                        (OpVT (OpNode RC:$src1,
                                      (MemOpFrag addr:$src2),
                                      timm:$src3)))],
                  SSEPackedInt>,
              Sched<[SchedWriteVecALU.XMM.Folded,
                     SchedWriteVecALU.XMM.ReadAfterFold]>;
  }
}
7936
// Instantiates GF2P8AFFINE_rmi three times for one affine operation:
//   NAME    - v16i8 on VR128 with $src1 tied to $dst and two-operand asm
//             syntax, under [HasGFNI, UseSSE2]
//   VNAME   - v16i8 on VR128, "v"-prefixed mnemonic, VEX_4V + VEX_W
//   VNAMEY  - v32i8 on VR256, "v"-prefixed mnemonic, VEX_4V + VEX_L + VEX_W
// The two V-prefixed variants are under [HasGFNI, HasAVX, NoVLX_Or_NoBWI].
multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
  let Predicates = [HasGFNI, UseSSE2], Constraints = "$src1 = $dst" in {
    defm NAME : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode, VR128,
                                load, i128mem, 1>;
  }

  let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
    defm V#NAME : GF2P8AFFINE_rmi<Op, "v"#OpStr, v16i8, OpNode, VR128,
                                  load, i128mem>,
                  VEX_4V, VEX_W;
    defm V#NAME#Y : GF2P8AFFINE_rmi<Op, "v"#OpStr, v32i8, OpNode, VR256,
                                    load, i256mem>,
                    VEX_4V, VEX_L, VEX_W;
  }
}
7949
// GF2P8MULB
// Non-VEX form: $src1 is tied to $dst, two-operand asm syntax, and the memory
// operand is matched with memop.
let Predicates = [HasGFNI, UseSSE2], Constraints = "$src1 = $dst" in {
  defm GF2P8MULB : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop, i128mem, 1>;
}

// VEX forms: three-operand asm syntax, memory operand matched with load. The
// Y variant operates on VR256 and adds VEX_L.
let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
  defm VGF2P8MULB  : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load, i128mem>,
                     VEX_4V;
  defm VGF2P8MULBY : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load, i256mem>,
                     VEX_4V, VEX_L;
}
// GF2P8AFFINEINVQB, GF2P8AFFINEQB
// Both are expanded through GF2P8AFFINE_common with the TAPD prefix class and
// are explicitly marked non-commutable.
let isCommutable = 0 in {
  defm GF2P8AFFINEINVQB
      : GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb", X86GF2P8affineinvqb>,
        TAPD;
  defm GF2P8AFFINEQB
      : GF2P8AFFINE_common<0xCE, "gf2p8affineqb", X86GF2P8affineqb>,
        TAPD;
}
7968
7969