// xref: /freebsd/contrib/llvm-project/llvm/lib/Target/X86/X86InstrSSE.td (revision 62ff619dcc3540659a319be71c9a489f1659e14a)
//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file describes the X86 SSE instruction set, defining the instructions,
// and properties of the instructions which are needed for code generation,
// machine code emission, and analysis.
//
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// SSE 1 & 2 Instructions Classes
//===----------------------------------------------------------------------===//

/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
///
/// Defines two codegen-only records per instantiation:
///   rr - register/register form (MRMSrcReg), marked commutable.
///   rm - register/memory form (MRMSrcMem); its pattern folds a plain `load`
///        of $src2 into OpNode.
/// Is2Addr selects the assembly string: the two-operand spelling
/// ("$src2, $dst") when set, otherwise the three-operand spelling that also
/// prints $src1.
multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
                           RegisterClass RC, X86MemOperand x86memop,
                           Domain d, X86FoldableSchedWrite sched,
                           bit Is2Addr = 1> {
let isCodeGenOnly = 1 in {
  let isCommutable = 1 in {
    def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], d>,
       Sched<[sched]>;
  }
  // The memory form is not commutable: only $src2 can be the folded load.
  def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}

/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
///
/// Defines the `_Int` variants, both with hasSideEffects = 0:
///   rr_Int - register/register form; the result of OpNode is typed as VT.
///   rm_Int - register/memory form (mayLoad = 1); the memory operand is
///            matched through the caller-supplied mem_frags fragment.
/// Is2Addr selects between the two-operand and three-operand asm strings.
multiclass sse12_fp_scalar_int<bits<8> opc,
                               SDPatternOperator OpNode, RegisterClass RC,
                               ValueType VT, string asm, Operand memopr,
                               PatFrags mem_frags, Domain d,
                               X86FoldableSchedWrite sched, bit Is2Addr = 1> {
let hasSideEffects = 0 in {
  def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], d>,
       Sched<[sched]>;
  let mayLoad = 1 in
  def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (VT (OpNode RC:$src1, (mem_frags addr:$src2))))], d>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}

/// sse12_fp_packed - SSE 1 & 2 packed instructions class
///
/// Defines a commutable register/register record (rr) and a register/memory
/// record (rm, mayLoad = 1) for a packed operation.  The rr pattern types the
/// result as `vt`; the rm pattern matches the memory operand through mem_frag.
/// Is2Addr selects between the two-operand and three-operand asm strings.
multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
                           RegisterClass RC, ValueType vt,
                           X86MemOperand x86memop, PatFrag mem_frag,
                           Domain d, X86FoldableSchedWrite sched,
                           bit Is2Addr = 1> {
  let isCommutable = 1 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>,
       Sched<[sched]>;
  let mayLoad = 1 in
    def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
          d>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}

/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
///
/// Same rr/rm shape as the other packed classes, but the selection patterns
/// are passed in by the caller (pat_rr / pat_rm) instead of being built from
/// an OpNode.  Both records set hasSideEffects = 0; rr is commutable and rm
/// is marked mayLoad.
multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
                                      string OpcodeStr, X86MemOperand x86memop,
                                      X86FoldableSchedWrite sched,
                                      list<dag> pat_rr, list<dag> pat_rm,
                                      bit Is2Addr = 1> {
  let isCommutable = 1, hasSideEffects = 0 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       pat_rr, d>,
       Sched<[sched]>;
  let hasSideEffects = 0, mayLoad = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       pat_rm, d>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}


// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
// This is expanded by ExpandPostRAPseudos.
// Each pseudo takes no inputs and matches the floating-point zero immediate
// of its width (fp32imm0 / fp64imm0 / fp128imm0).  All three are restricted
// to NoAVX512.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
  def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
                   [(set FR64:$dst, fp64imm0)]>, Requires<[HasSSE2, NoAVX512]>;
  def FsFLD0F128 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                     [(set VR128:$dst, fp128imm0)]>, Requires<[HasSSE1, NoAVX512]>;
}

//===----------------------------------------------------------------------===//
// AVX & SSE - Zero/One Vectors
//===----------------------------------------------------------------------===//

// Alias instruction that maps zero vector to pxor / xorp* for sse.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
// swizzled by ExecutionDomainFix to pxor.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
// The pseudo itself only carries the v4f32 pattern; other 128-bit element
// types need separate Pat records that map onto it.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
               [(set VR128:$dst, (v4f32 immAllZerosV))]>;
}

// Select V_SET0 for an all-zeros vector of these 128-bit element types as
// well.  Uses the same NoAVX512 predicate as V_SET0.
let Predicates = [NoAVX512] in {
def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
}


// The same as done above but for AVX.  The 256-bit AVX1 ISA doesn't support PI,
// and doesn't need it because on sandy bridge the register is set to zero
// at the rename stage without using any execution unit, so SET0PSY
// and SET0PDY can be used for vector int instructions without penalty.
// The pseudo itself only carries the v8i32 pattern; other 256-bit element
// types need separate Pat records that map onto it.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                 [(set VR256:$dst, (v8i32 immAllZerosV))]>;
}

// Select AVX_SET0 for an all-zeros vector of these 256-bit element types as
// well.  Uses the same NoAVX512 predicate as AVX_SET0.
let Predicates = [NoAVX512] in {
def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
def : Pat<(v8f32 immAllZerosV), (AVX_SET0)>;
def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
}

// All-ones vector pseudos.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-ones value if folding it would be beneficial.
//   V_SETALLONES    - 128-bit (VR128), no extra predicate in this block.
//   AVX1_SETALLONES - 256-bit, only under [HasAVX1Only, OptForMinSize].
//   AVX2_SETALLONES - 256-bit, under [HasAVX2].
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
  let Predicates = [HasAVX1Only, OptForMinSize] in {
  def AVX1_SETALLONES: I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
  }
  let Predicates = [HasAVX2] in
  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move FP Scalar Instructions
//
// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
// register copies because it's a partial register update; Register-to-register
// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
// that the insert be implementable in terms of a copy, and as just mentioned,
// we don't use movss/movsd for copies.
//===----------------------------------------------------------------------===//

/// sse12_move_rr - register/register scalar move records.
///
/// Defines:
///   rr     - opcode 0x10, MRMSrcReg, commutable, selected for
///            (OpNode $src1, $src2) typed as `vt`.
///   rr_REV - opcode 0x11, MRMDestReg, no pattern; codegen-only and forced
///            into the disassembler tables.  Linked to the rr record named
///            Name#rr through FoldGenData.
/// The asm string is base_opc followed by asm_opr.
multiclass sse12_move_rr<SDNode OpNode, ValueType vt, string base_opc,
                         string asm_opr, Domain d, string Name> {
  let isCommutable = 1 in
  def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src1, VR128:$src2),
              !strconcat(base_opc, asm_opr),
              [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], d>,
              Sched<[SchedWriteFShuffle.XMM]>;

  // For the disassembler
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                  (ins VR128:$src1, VR128:$src2),
                  !strconcat(base_opc, asm_opr), []>,
                  Sched<[SchedWriteFShuffle.XMM]>, FoldGenData<Name#rr>;
}

/// sse12_move - scalar move records for both the VEX ("V"-prefixed) and the
/// legacy SSE encodings.
///
/// For each instantiation this produces:
///   V#NAME rr/rr_REV - three-operand register forms, under
///                      [UseAVX, OptForSize].
///   V#NAME#mr        - VEX store of RC:$src to memory (opcode 0x11).
///   NAME rr/rr_REV   - two-operand register forms with $src1 tied to $dst,
///                      under [pred, NoSSE41_Or_OptForSize].
///   NAME#mr          - legacy store of RC:$src to memory (opcode 0x11).
/// plus ".s"-suffixed assembler aliases that map onto the rr_REV records.
multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
                      X86MemOperand x86memop, string OpcodeStr,
                      Domain d, string Name, Predicate pred> {
  // AVX
  let Predicates = [UseAVX, OptForSize] in
  defm V#NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d,
                              "V"#Name>,
                              VEX_4V, VEX_LIG, VEX_WIG;

  def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(store RC:$src, addr:$dst)], d>,
                     VEX, VEX_LIG, Sched<[WriteFStore]>, VEX_WIG;
  // SSE1 & 2
  let Constraints = "$src1 = $dst" in {
    let Predicates = [pred, NoSSE41_Or_OptForSize] in
    defm NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
                              "\t{$src2, $dst|$dst, $src2}", d, Name>;
  }

  def NAME#mr   : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(store RC:$src, addr:$dst)], d>,
                     Sched<[WriteFStore]>;

  // ".s" spellings select the reversed (rr_REV) encodings.  The trailing 0
  // is the third InstAlias argument.
  def : InstAlias<"v"#OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  (!cast<Instruction>("V"#NAME#"rr_REV")
                   VR128:$dst, VR128:$src1, VR128:$src2), 0>;
  def : InstAlias<OpcodeStr#".s\t{$src2, $dst|$dst, $src2}",
                  (!cast<Instruction>(NAME#"rr_REV")
                   VR128:$dst, VR128:$src2), 0>;
}

// Loading from memory automatically zeroing upper bits.
//
/// sse12_move_rm - scalar load records (opcode 0x10, MRMSrcMem).
///
///   V#NAME#rm / NAME#rm         - load into VR128; pattern is
///                                 (vt (vzloadfrag addr)).
///   V#NAME#rm_alt / NAME#rm_alt - codegen-only variants that load into RC
///                                 and match (mem_pat addr) instead.
/// The "V"-prefixed records additionally carry VEX, VEX_LIG and VEX_WIG.
multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
                         PatFrag mem_pat, PatFrag vzloadfrag, string OpcodeStr,
                         Domain d> {
  def V#NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
                     VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
  def NAME#rm   : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
                     Sched<[WriteFLoad]>;

  // _alt version uses FR32/FR64 register class.
  let isCodeGenOnly = 1 in {
  def V#NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                         [(set RC:$dst, (mem_pat addr:$src))], d>,
                         VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
  def NAME#rm_alt   : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                         [(set RC:$dst, (mem_pat addr:$src))], d>,
                         Sched<[WriteFLoad]>;
  }
}

// Instantiate the scalar moves.  The first pair (sse12_move) provides the
// register/register forms, the stores and the ".s" aliases; the second pair
// (sse12_move_rm) provides the loads, marked foldable and rematerializable.
defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
                        SSEPackedSingle, "MOVSS", UseSSE1>, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
                        SSEPackedDouble, "MOVSD", UseSSE2>, XD;

let canFoldAsLoad = 1, isReMaterializable = 1 in {
  defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss",
                             SSEPackedSingle>, XS;
  defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, X86vzload64, "movsd",
                             SSEPackedDouble>, XD;
}

// Patterns
// With AVX, a scalar_to_vector of a scalar load selects the VEX load
// directly.
let Predicates = [UseAVX] in {
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (VMOVSSrm addr:$src)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (VMOVSDrm addr:$src)>;

  // Represent the same patterns above but in the form they appear for
  // 256-bit types
  def : Pat<(v8f32 (X86vzload32 addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
  def : Pat<(v4f64 (X86vzload64 addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
}

// X86vzmovl lowering under [UseAVX, OptForSize]: VMOVSSrr with a V_SET0
// first operand and the source vector as the second.
let Predicates = [UseAVX, OptForSize] in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVSS to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VMOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VMOVSSrr (v4i32 (V_SET0)), VR128:$src)>;

  // Move low f32 and clear high bits.
  // The 256-bit forms operate on the sub_xmm half and wrap the result back
  // up with SUBREG_TO_REG.
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4f32 (VMOVSSrr (v4f32 (V_SET0)),
              (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))), sub_xmm)>;
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4i32 (VMOVSSrr (v4i32 (V_SET0)),
              (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
}

// Legacy-SSE counterparts of the AVX scalar-move patterns.
let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
// Move scalar to XMM zero-extended, zeroing a VR128 then do a
// MOVSS to the lower bits.
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
          (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
          (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
}

// A scalar_to_vector of a scalar load selects the scalar load instruction.
let Predicates = [UseSSE2] in
def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
          (MOVSDrm addr:$src)>;

let Predicates = [UseSSE1] in
def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
          (MOVSSrm addr:$src)>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
//===----------------------------------------------------------------------===//

/// sse12_mov_packed - packed move records.
///
///   rr - register/register move with no selection pattern; marked
///        isMoveReg with hasSideEffects = 0.
///   rm - load selected through ld_frag; foldable and rematerializable.
/// Scheduling comes from the RR / RM members of `sched`.
multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
                            X86MemOperand x86memop, PatFrag ld_frag,
                            string asm, Domain d,
                            X86SchedWriteMoveLS sched> {
let hasSideEffects = 0, isMoveReg = 1 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
           Sched<[sched.RR]>;
let canFoldAsLoad = 1, isReMaterializable = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                   [(set RC:$dst, (ld_frag addr:$src))], d>,
           Sched<[sched.RM]>;
}

// VEX-encoded packed moves (rr + rm) for 128-bit (VR128) and 256-bit (VR256,
// "Y" suffix, VEX_L) registers.  The "movap*" records use opcode 0x28 with
// the alignedload* fragments; the "movup*" records use opcode 0x10 with the
// plain load* fragments.
let Predicates = [HasAVX, NoVLX] in {
defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                                PS, VEX, VEX_WIG;
defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                                PD, VEX, VEX_WIG;
defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                                PS, VEX, VEX_WIG;
defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                                PD, VEX, VEX_WIG;

defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps",
                                 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
                                 PS, VEX, VEX_L, VEX_WIG;
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd",
                                 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
                                 PD, VEX, VEX_L, VEX_WIG;
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups",
                                 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
                                 PS, VEX, VEX_L, VEX_WIG;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
                                 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
                                 PD, VEX, VEX_L, VEX_WIG;
}

// Legacy-encoded packed moves (rr + rm).  The single-precision records are
// gated on UseSSE1, the double-precision records on UseSSE2.
let Predicates = [UseSSE1] in {
defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
                               SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                               PS;
defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
                               SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                               PS;
}
let Predicates = [UseSSE2] in {
defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
                               SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                               PD;
defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
                               SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                               PD;
}

// VEX-encoded packed stores.  The "movap*" records use opcode 0x29 and are
// selected for alignedstore; the "movup*" records use opcode 0x11 and are
// selected for plain store.  The 128-bit and 256-bit groups differ in
// register class, memory operand, scheduling class and VEX_L.
let Predicates = [HasAVX, NoVLX]  in {
let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)]>,
                   VEX, VEX_WIG;
def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)]>,
                   VEX, VEX_WIG;
def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)]>,
                   VEX, VEX_WIG;
def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)]>,
                   VEX, VEX_WIG;
} // SchedRW

let SchedRW = [SchedWriteFMoveLS.YMM.MR] in {
def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v8f32 VR256:$src), addr:$dst)]>,
                   VEX, VEX_L, VEX_WIG;
def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f64 VR256:$src), addr:$dst)]>,
                   VEX, VEX_L, VEX_WIG;
def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v8f32 VR256:$src), addr:$dst)]>,
                   VEX, VEX_L, VEX_WIG;
def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v4f64 VR256:$src), addr:$dst)]>,
                   VEX, VEX_L, VEX_WIG;
} // SchedRW
} // Predicate

// For disassembler
// Reversed (MRMDestReg) encodings of the VEX register/register packed moves.
// They carry no selection pattern, and each is linked to its forward record
// through FoldGenData.  Note that this group sets no Predicates.
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    isMoveReg = 1 in {
let SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
  def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
                          (ins VR128:$src),
                          "movaps\t{$src, $dst|$dst, $src}", []>,
                          VEX, VEX_WIG, FoldGenData<"VMOVAPSrr">;
  def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movapd\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVAPDrr">;
  def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movups\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVUPSrr">;
  def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movupd\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVUPDrr">;
} // SchedRW

let SchedRW = [SchedWriteFMoveLS.YMM.RR] in {
  def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movaps\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPSYrr">;
  def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movapd\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPDYrr">;
  def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movups\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPSYrr">;
  def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movupd\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPDYrr">;
} // SchedRW
} // isCodeGenOnly, ForceDisassemble, hasSideEffects, isMoveReg

// Reversed version with ".s" suffix for GAS compatibility.
// Each alias maps the ".s" mnemonic onto the matching *_REV record; the
// 128-bit and 256-bit variants share a mnemonic and differ only in the
// register class of the operands.
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                (VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                (VMOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                (VMOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                (VMOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                (VMOVAPSYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                (VMOVAPDYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                (VMOVUPSYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                (VMOVUPDYrr_REV VR256:$dst, VR256:$src), 0>;

// Legacy-encoded packed stores.  The "movap*" records use opcode 0x29 and
// are selected for alignedstore; the "movup*" records use opcode 0x11 and
// are selected for plain store.
let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)]>;
def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)]>;
} // SchedRW

// For disassembler
// Reversed (MRMDestReg) encodings of the legacy register/register packed
// moves.  They carry no selection pattern, and each is linked to its forward
// record through FoldGenData.
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movaps\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVAPSrr">;
  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movapd\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVAPDrr">;
  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movups\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVUPSrr">;
  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movupd\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVUPDrr">;
}

// Reversed version with ".s" suffix for GAS compatibility.
// Each alias maps the ".s" mnemonic onto the matching legacy *_REV record.
def : InstAlias<"movaps.s\t{$src, $dst|$dst, $src}",
                (MOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movapd.s\t{$src, $dst|$dst, $src}",
                (MOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movups.s\t{$src, $dst|$dst, $src}",
                (MOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movupd.s\t{$src, $dst|$dst, $src}",
                (MOVUPDrr_REV VR128:$dst, VR128:$src), 0>;

// 256-bit integer vector loads and stores are selected onto the
// single-precision FP move records (VMOVAPSY* for the aligned fragments,
// VMOVUPSY* otherwise).
let Predicates = [HasAVX, NoVLX] in {
  // 256-bit load/store need to use floating point load/store in case we don't
  // have AVX2. Execution domain fixing will convert to integer if AVX2 is
  // available and changing the domain is beneficial.
  def : Pat<(alignedloadv4i64 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv8i32 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv16i16 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv32i8 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(loadv4i64 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv8i32 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv16i16 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv32i8 addr:$src),
            (VMOVUPSYrm addr:$src)>;

  def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v16i16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v32i8 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
}

// Use movaps / movups for SSE integer load / store (one byte shorter).
// The instructions selected below are then converted to MOVDQA/MOVDQU
// during the SSE domain pass.
// Aligned fragments map to MOVAPS*, the plain load/store fragments to MOVUPS*.
let Predicates = [UseSSE1] in {
  def : Pat<(alignedloadv2i64 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv4i32 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv8i16 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv16i8 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv4i32 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv8i16 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv16i8 addr:$src),
            (MOVUPSrm addr:$src)>;

  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low packed FP Instructions
//===----------------------------------------------------------------------===//

/// sse12_mov_hilo_packed_base - memory-source records shared by the
/// high/low packed moves.
///
///   PSrm - single-precision form ("s" appended to base_opc); no pattern,
///          so it is explicitly marked mayLoad with hasSideEffects = 0.
///   PDrm - double-precision form ("d" appended to base_opc); selected for
///          pdnode applied to $src1 and a scalar_to_vector of an f64 load.
/// Both take a VR128 $src1 and an f64mem $src2.
multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDPatternOperator pdnode,
                                      string base_opc, string asm_opr> {
  // No pattern as they need be special cased between high and low.
  let hasSideEffects = 0, mayLoad = 1 in
  def PSrm : PI<opc, MRMSrcMem,
                (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                !strconcat(base_opc, "s", asm_opr),
                [], SSEPackedSingle>, PS,
                Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;

  def PDrm : PI<opc, MRMSrcMem,
         (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
         !strconcat(base_opc, "d", asm_opr),
     [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
                              (scalar_to_vector (loadf64 addr:$src2)))))],
              SSEPackedDouble>, PD,
     Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
}

/// sse12_mov_hilo_packed - Instantiates sse12_mov_hilo_packed_base twice:
/// once as the V-prefixed three-operand AVX variant and once as the
/// two-operand variant whose $src1 is tied to $dst.
multiclass sse12_mov_hilo_packed<bits<8>opc, SDPatternOperator pdnode,
                                 string base_opc> {
  // AVX variant: separate $src1 and $dst operands in the assembly string.
  let Predicates = [UseAVX] in {
    defm V#NAME : sse12_mov_hilo_packed_base<
                      opc, pdnode, base_opc,
                      "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
                  VEX_4V, VEX_WIG;
  }

  // Two-operand variant: the destination doubles as the first source.
  let Constraints = "$src1 = $dst" in {
    defm NAME : sse12_mov_hilo_packed_base<
                    opc, pdnode, base_opc,
                    "\t{$src2, $dst|$dst, $src2}">;
  }
}
648
// Load forms of MOVLPS/MOVLPD (and their AVX counterparts).
defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;

// Store forms of MOVLPS/MOVLPD; every def here is scheduled as WriteFStore.
let SchedRW = [WriteFStore] in {
  let Predicates = [UseAVX] in {
    // The PS store has no pattern, so mayStore is stated explicitly.
    let hasSideEffects = 0, mayStore = 1 in {
      def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs),
                           (ins f64mem:$dst, VR128:$src),
                           "movlps\t{$src, $dst|$dst, $src}",
                           []>,
                      VEX, VEX_WIG;
    }
    // The PD store matches a store of element 0 of a v2f64 value.
    def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs),
                         (ins f64mem:$dst, VR128:$src),
                         "movlpd\t{$src, $dst|$dst, $src}",
                         [(store (f64 (extractelt (v2f64 VR128:$src),
                                                  (iPTR 0))),
                                 addr:$dst)]>,
                    VEX, VEX_WIG;
  } // UseAVX

  // The PS store has no pattern, so mayStore is stated explicitly.
  let hasSideEffects = 0, mayStore = 1 in {
    def MOVLPSmr : PSI<0x13, MRMDestMem, (outs),
                       (ins f64mem:$dst, VR128:$src),
                       "movlps\t{$src, $dst|$dst, $src}",
                       []>;
  }
  // The PD store matches a store of element 0 of a v2f64 value.
  def MOVLPDmr : PDI<0x13, MRMDestMem, (outs),
                     (ins f64mem:$dst, VR128:$src),
                     "movlpd\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt (v2f64 VR128:$src),
                                              (iPTR 0))),
                             addr:$dst)]>;
} // SchedRW
673
let Predicates = [UseSSE1] in {
  // These shufp patterns help select MOVLPS on SSE1-only targets; with SSE2
  // a movsd or blend is produced instead of shufp. Only 64 bits are loaded,
  // so the load does not have to be aligned.
  def : Pat<(X86Shufp (v4f32 (simple_load addr:$src2)),
                      VR128:$src1,
                      (i8 -28)),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)),
                      VR128:$src1,
                      (i8 -28)),
            (MOVLPSrm VR128:$src1, addr:$src2)>;

  // A zero-extending 64-bit load becomes a MOVLPS into a zeroed register.
  def : Pat<(v4f32 (X86vzload64 addr:$src)),
            (MOVLPSrm (v4f32 (V_SET0)), addr:$src)>;

  // A 64-bit extract-and-store of a v4f32 value becomes the MOVLPS store.
  def : Pat<(X86vextractstore64 (v4f32 VR128:$src), addr:$dst),
            (MOVLPSmr addr:$dst, VR128:$src)>;
}
689
690//===----------------------------------------------------------------------===//
691// SSE 1 & 2 - Move Hi packed FP Instructions
692//===----------------------------------------------------------------------===//
693
// Load forms of MOVHPS/MOVHPD (and their AVX counterparts).
defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">;

// Store forms of MOVHPS/MOVHPD; every def here is scheduled as WriteFStore.
let SchedRW = [WriteFStore] in {
  // Extracting element 1 of a v2f64 is always custom lowered to an unpack of
  // high to low followed by an extract of element 0, so the non-store
  // version isn't too horrible. The PD store patterns match that shape.
  let Predicates = [UseAVX] in {
    // The PS store has no pattern, so mayStore is stated explicitly.
    let hasSideEffects = 0, mayStore = 1 in {
      def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs),
                           (ins f64mem:$dst, VR128:$src),
                           "movhps\t{$src, $dst|$dst, $src}",
                           []>,
                      VEX, VEX_WIG;
    }
    def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs),
                         (ins f64mem:$dst, VR128:$src),
                         "movhpd\t{$src, $dst|$dst, $src}",
                         [(store (f64 (extractelt
                                        (v2f64 (X86Unpckh VR128:$src,
                                                          VR128:$src)),
                                        (iPTR 0))),
                                 addr:$dst)]>,
                    VEX, VEX_WIG;
  } // UseAVX

  // The PS store has no pattern, so mayStore is stated explicitly.
  let hasSideEffects = 0, mayStore = 1 in {
    def MOVHPSmr : PSI<0x17, MRMDestMem, (outs),
                       (ins f64mem:$dst, VR128:$src),
                       "movhps\t{$src, $dst|$dst, $src}",
                       []>;
  }
  def MOVHPDmr : PDI<0x17, MRMDestMem, (outs),
                     (ins f64mem:$dst, VR128:$src),
                     "movhpd\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt
                                    (v2f64 (X86Unpckh VR128:$src,
                                                      VR128:$src)),
                                    (iPTR 0))),
                             addr:$dst)]>;
} // SchedRW
720
let Predicates = [UseAVX] in {
  // MOVHPD: unpack-low with a zero-extending 64-bit load folds into the
  // load form.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;

  // MOVHPD: storing element 0 of a VPERMILPI with immediate 1 selects the
  // store form.
  def : Pat<(store (f64 (extractelt (v2f64 (X86VPermilpi VR128:$src,
                                                         (i8 1))),
                                    (iPTR 0))),
                   addr:$dst),
            (VMOVHPDmr addr:$dst, VR128:$src)>;

  // MOVLPD: movsd with a zero-extending 64-bit load folds into the load
  // form.
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;
}
735
let Predicates = [UseSSE1] in {
  // These patterns help select MOVHPS on SSE1-only targets: an X86Movlhps
  // whose second operand comes from memory folds into the MOVHPS load form.
  // Only 64 bits are loaded, so the load does not have to be aligned.
  def : Pat<(X86Movlhps VR128:$src1,
                        (v4f32 (simple_load addr:$src2))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Movlhps VR128:$src1,
                        (v4f32 (X86vzload64 addr:$src2))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;

  // A 64-bit extract-and-store of X86Movhlps(src, src) becomes the MOVHPS
  // store.
  def : Pat<(X86vextractstore64
              (v4f32 (X86Movhlps VR128:$src, VR128:$src)),
              addr:$dst),
            (MOVHPSmr addr:$dst, VR128:$src)>;
}
749
let Predicates = [UseSSE2] in {
  // MOVHPD: unpack-low with a zero-extending 64-bit load folds into the
  // load form.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;

  // MOVHPD: storing element 0 of a SHUFP of the source with itself and
  // immediate 1 selects the store form.
  def : Pat<(store (f64 (extractelt (v2f64 (X86Shufp VR128:$src,
                                                     VR128:$src,
                                                     (i8 1))),
                                    (iPTR 0))),
                   addr:$dst),
            (MOVHPDmr addr:$dst, VR128:$src)>;

  // MOVLPD: movsd with a zero-extending 64-bit load folds into the load
  // form.
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
}
764
let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in {
  // Unless BLENDPD can be used, load into the low bits from a full vector
  // load with MOVLPD.
  def : Pat<(X86Movsd VR128:$src1,
                      (v2f64 (simple_load addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
}
771
772//===----------------------------------------------------------------------===//
773// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
774//===----------------------------------------------------------------------===//
775
// AVX three-operand register forms of MOVLHPS and MOVHLPS.
let Predicates = [UseAVX] in {
  def VMOVLHPSrr
      : VPSI<0x16, MRMSrcReg,
             (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
             "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
             [(set VR128:$dst,
                   (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
        VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG;

  // Only the HLPS form is marked commutable; it is also tagged
  // NotMemoryFoldable.
  let isCommutable = 1 in {
    def VMOVHLPSrr
        : VPSI<0x12, MRMSrcReg,
               (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
               "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               [(set VR128:$dst,
                     (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
          VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG,
          NotMemoryFoldable;
  }
}
// Two-operand register forms of MOVLHPS and MOVHLPS; $src1 is tied to $dst.
let Constraints = "$src1 = $dst" in {
  def MOVLHPSrr
      : PSI<0x16, MRMSrcReg,
            (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
            "movlhps\t{$src2, $dst|$dst, $src2}",
            [(set VR128:$dst,
                  (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
        Sched<[SchedWriteFShuffle.XMM]>;

  // Only the HLPS form is marked commutable; it is also tagged
  // NotMemoryFoldable.
  let isCommutable = 1 in {
    def MOVHLPSrr
        : PSI<0x12, MRMSrcReg,
              (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
              "movhlps\t{$src2, $dst|$dst, $src2}",
              [(set VR128:$dst,
                    (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
          Sched<[SchedWriteFShuffle.XMM]>, NotMemoryFoldable;
  }
}
807
808//===----------------------------------------------------------------------===//
809// SSE 1 & 2 - Conversion Instructions
810//===----------------------------------------------------------------------===//
811
/// sse12_cvt_s - Scalar conversion with a register (rr) and a memory (rm)
/// source form, both matching OpNode applied to the source.
///   SrcRC/DstRC - source and destination register classes.
///   x86memop    - memory operand type of the rm form.
///   ld_frag     - load fragment wrapped around addr:$src in the rm pattern.
///   asm         - mnemonic of the rr form.
///   mem         - mnemonic of the rm form.
///   sched       - scheduling class; the rm form uses sched.Folded.
///   d           - execution domain applied to both forms.
///   Int2Fpu     - second SchedRead entry of the rr form (ReadDefault unless
///                 overridden).
multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                     SDPatternOperator OpNode, X86MemOperand x86memop, PatFrag ld_frag,
                     string asm, string mem, X86FoldableSchedWrite sched,
                     Domain d,
                     SchedRead Int2Fpu = ReadDefault> {
  let ExeDomain = d in {
    def rr : SI<opc, MRMSrcReg,
                (outs DstRC:$dst), (ins SrcRC:$src),
                asm # "\t{$src, $dst|$dst, $src}",
                [(set DstRC:$dst, (OpNode SrcRC:$src))]>,
             Sched<[sched, Int2Fpu]>;

    def rm : SI<opc, MRMSrcMem,
                (outs DstRC:$dst), (ins x86memop:$src),
                !strconcat(mem, "\t{$src, $dst|$dst, $src}"),
                [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
             Sched<[sched.Folded]>;
  }
}
828
/// sse12_cvt_p - Packed conversion matching any_sint_to_fp from SrcTy to
/// DstTy, with a register (rr) and a memory (rm) source form. Both forms
/// use MXCSR and may raise FP exceptions.
///   RC       - register class of source and destination.
///   x86memop - memory operand type of the rm form.
///   ld_frag  - load fragment wrapped around addr:$src in the rm pattern.
///   asm      - complete assembly string, shared by both forms.
///   sched    - scheduling class; the rm form uses sched.Folded.
multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
                       ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
                       string asm, Domain d, X86FoldableSchedWrite sched> {
  let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in {
    def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
               [(set RC:$dst,
                     (DstTy (any_sint_to_fp (SrcTy RC:$src))))],
               d>,
             Sched<[sched]>;

    let mayLoad = 1 in {
      def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
                 [(set RC:$dst,
                       (DstTy (any_sint_to_fp
                                (SrcTy (ld_frag addr:$src)))))],
                 d>,
               Sched<[sched.Folded]>;
    }
  }
}
843
/// sse12_vcvt_avx - Pattern-less AVX scalar conversion taking an extra
/// DstRC:$src1 operand in addition to the converted source. All defs are
/// predicated on UseAVX.
///   x86memop - memory operand type of the rm form.
///   asm      - mnemonic.
///   mem      - suffix emitted in braces after the mnemonic in the rm form.
///   sched    - scheduling class; the rm form uses the folded variants.
///   d        - execution domain.
multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          X86MemOperand x86memop, string asm, string mem,
                          X86FoldableSchedWrite sched, Domain d> {
  let hasSideEffects = 0, Predicates = [UseAVX], ExeDomain = d in {
    def rr : SI<opc, MRMSrcReg,
                (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
                asm # "\t{$src, $src1, $dst|$dst, $src1, $src}",
                []>,
             Sched<[sched, ReadDefault, ReadInt2Fpu]>;

    // No pattern, so the load is declared explicitly.
    let mayLoad = 1 in {
      def rm : SI<opc, MRMSrcMem,
                  (outs DstRC:$dst), (ins DstRC:$src1, x86memop:$src),
                  !strconcat(asm, "{", mem,
                             "}\t{$src, $src1, $dst|$dst, $src1, $src}"),
                  []>,
               Sched<[sched.Folded, sched.ReadAfterFold]>;
    }
  } // hasSideEffects = 0
}
858
// Codegen-only AVX scalar FP -> integer conversions on FR32/FR64. All of
// them use MXCSR and may raise FP exceptions.
let isCodeGenOnly = 1, Predicates = [UseAVX] in {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  // Opcode 0x2C forms, matching any_fp_to_sint.
  defm VCVTTSS2SI
      : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
                    "cvttss2si", "cvttss2si",
                    WriteCvtSS2I, SSEPackedSingle>,
        XS, VEX, VEX_LIG;
  defm VCVTTSS2SI64
      : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
                    "cvttss2si", "cvttss2si",
                    WriteCvtSS2I, SSEPackedSingle>,
        XS, VEX, VEX_W, VEX_LIG;
  defm VCVTTSD2SI
      : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
                    "cvttsd2si", "cvttsd2si",
                    WriteCvtSD2I, SSEPackedDouble>,
        XD, VEX, VEX_LIG;
  defm VCVTTSD2SI64
      : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
                    "cvttsd2si", "cvttsd2si",
                    WriteCvtSD2I, SSEPackedDouble>,
        XD, VEX, VEX_W, VEX_LIG;

  // Opcode 0x2D forms, matching lrint (GR32) and llrint (GR64).
  defm VCVTSS2SI
      : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
                    "cvtss2si", "cvtss2si",
                    WriteCvtSS2I, SSEPackedSingle>,
        XS, VEX, VEX_LIG;
  defm VCVTSS2SI64
      : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
                    "cvtss2si", "cvtss2si",
                    WriteCvtSS2I, SSEPackedSingle>,
        XS, VEX, VEX_W, VEX_LIG;
  defm VCVTSD2SI
      : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
                    "cvtsd2si", "cvtsd2si",
                    WriteCvtSD2I, SSEPackedDouble>,
        XD, VEX, VEX_LIG;
  defm VCVTSD2SI64
      : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
                    "cvtsd2si", "cvtsd2si",
                    WriteCvtSD2I, SSEPackedDouble>,
        XD, VEX, VEX_W, VEX_LIG;
}
}
894
// For register-register forms the assembler can recognize a 64-bit
// instruction by seeing an rxx register. Forms that only use memory operands
// give it no such hint, so explicit "l" and "q" assembly forms are provided
// where appropriate.
let isCodeGenOnly = 1 in {
  defm VCVTSI2SS
      : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss", "l",
                       WriteCvtI2SS, SSEPackedSingle>,
        XS, VEX_4V, VEX_LIG, SIMD_EXC;
  defm VCVTSI642SS
      : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss", "q",
                       WriteCvtI2SS, SSEPackedSingle>,
        XS, VEX_4V, VEX_W, VEX_LIG, SIMD_EXC;
  // The GR32 -> FR64 form is the only one here without SIMD_EXC.
  defm VCVTSI2SD
      : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd", "l",
                       WriteCvtI2SD, SSEPackedDouble>,
        XD, VEX_4V, VEX_LIG;
  defm VCVTSI642SD
      : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd", "q",
                       WriteCvtI2SD, SSEPackedDouble>,
        XD, VEX_4V, VEX_W, VEX_LIG, SIMD_EXC;
} // isCodeGenOnly = 1
913
let Predicates = [UseAVX] in {
  // Integer -> FP from memory. The extra $src1 operand of the AVX forms is
  // filled with IMPLICIT_DEF.
  def : Pat<(f32 (any_sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f32 (any_sint_to_fp (loadi64 addr:$src))),
            (VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (any_sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (any_sint_to_fp (loadi64 addr:$src))),
            (VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;

  // Integer -> FP from a general purpose register.
  def : Pat<(f32 (any_sint_to_fp GR32:$src)),
            (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f32 (any_sint_to_fp GR64:$src)),
            (VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
  def : Pat<(f64 (any_sint_to_fp GR32:$src)),
            (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f64 (any_sint_to_fp GR64:$src)),
            (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;

  // lrint producing an i64 result selects the 64-bit conversions.
  def : Pat<(i64 (lrint FR32:$src)),
            (VCVTSS2SI64rr FR32:$src)>;
  def : Pat<(i64 (lrint (loadf32 addr:$src))),
            (VCVTSS2SI64rm addr:$src)>;

  def : Pat<(i64 (lrint FR64:$src)),
            (VCVTSD2SI64rr FR64:$src)>;
  def : Pat<(i64 (lrint (loadf64 addr:$src))),
            (VCVTSD2SI64rm addr:$src)>;
}
939
// Codegen-only SSE scalar conversions on FR32/FR64.
let isCodeGenOnly = 1 in {
  // Opcode 0x2C FP -> integer forms, matching any_fp_to_sint.
  defm CVTTSS2SI
      : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
                    "cvttss2si", "cvttss2si",
                    WriteCvtSS2I, SSEPackedSingle>,
        XS, SIMD_EXC;
  defm CVTTSS2SI64
      : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
                    "cvttss2si", "cvttss2si",
                    WriteCvtSS2I, SSEPackedSingle>,
        XS, REX_W, SIMD_EXC;
  defm CVTTSD2SI
      : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
                    "cvttsd2si", "cvttsd2si",
                    WriteCvtSD2I, SSEPackedDouble>,
        XD, SIMD_EXC;
  defm CVTTSD2SI64
      : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
                    "cvttsd2si", "cvttsd2si",
                    WriteCvtSD2I, SSEPackedDouble>,
        XD, REX_W, SIMD_EXC;

  // Opcode 0x2D FP -> integer forms, matching lrint (GR32) and llrint
  // (GR64).
  defm CVTSS2SI
      : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
                    "cvtss2si", "cvtss2si",
                    WriteCvtSS2I, SSEPackedSingle>,
        XS, SIMD_EXC;
  defm CVTSS2SI64
      : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
                    "cvtss2si", "cvtss2si",
                    WriteCvtSS2I, SSEPackedSingle>,
        XS, REX_W, SIMD_EXC;
  defm CVTSD2SI
      : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
                    "cvtsd2si", "cvtsd2si",
                    WriteCvtSD2I, SSEPackedDouble>,
        XD, SIMD_EXC;
  defm CVTSD2SI64
      : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
                    "cvtsd2si", "cvtsd2si",
                    WriteCvtSD2I, SSEPackedDouble>,
        XD, REX_W, SIMD_EXC;

  // Opcode 0x2A integer -> FP forms, matching any_sint_to_fp. The memory
  // forms carry an {l}/{q} suffix in their mnemonic. The GR32 -> FR64 form
  // is the only one without SIMD_EXC.
  defm CVTSI2SS
      : sse12_cvt_s<0x2A, GR32, FR32, any_sint_to_fp, i32mem, loadi32,
                    "cvtsi2ss", "cvtsi2ss{l}",
                    WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>,
        XS, SIMD_EXC;
  defm CVTSI642SS
      : sse12_cvt_s<0x2A, GR64, FR32, any_sint_to_fp, i64mem, loadi64,
                    "cvtsi2ss", "cvtsi2ss{q}",
                    WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>,
        XS, REX_W, SIMD_EXC;
  defm CVTSI2SD
      : sse12_cvt_s<0x2A, GR32, FR64, any_sint_to_fp, i32mem, loadi32,
                    "cvtsi2sd", "cvtsi2sd{l}",
                    WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>,
        XD;
  defm CVTSI642SD
      : sse12_cvt_s<0x2A, GR64, FR64, any_sint_to_fp, i64mem, loadi64,
                    "cvtsi2sd", "cvtsi2sd{q}",
                    WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>,
        XD, REX_W, SIMD_EXC;
} // isCodeGenOnly = 1
980
// lrint producing an i64 result selects the 64-bit conversions: the f32
// source needs SSE1, the f64 source needs SSE2.
let Predicates = [UseSSE1] in {
  def : Pat<(i64 (lrint FR32:$src)),
            (CVTSS2SI64rr FR32:$src)>;
  def : Pat<(i64 (lrint (loadf32 addr:$src))),
            (CVTSS2SI64rm addr:$src)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(i64 (lrint FR64:$src)),
            (CVTSD2SI64rr FR64:$src)>;
  def : Pat<(i64 (lrint (loadf64 addr:$src))),
            (CVTSD2SI64rm addr:$src)>;
}
990
991// Conversion Instructions Intrinsics - Match intrinsics which expect MM
992// and/or XMM operand(s).
993
/// sse12_cvt_sint - Intrinsic (_Int) conversion forms that match OpNode on
/// a SrcVT value and produce a DstVT result, with a register (rr_Int) and
/// a memory (rm_Int) source.
///   memop     - memory operand type of the rm_Int form.
///   mem_frags - load fragments wrapped around addr:$src in rm_Int.
///   asm       - mnemonic shared by both forms.
///   sched     - scheduling class; rm_Int uses sched.Folded.
///   d         - execution domain.
multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          ValueType DstVT, ValueType SrcVT, SDNode OpNode,
                          Operand memop, PatFrags mem_frags, string asm,
                          X86FoldableSchedWrite sched, Domain d> {
  let ExeDomain = d in {
    def rr_Int : SI<opc, MRMSrcReg,
                    (outs DstRC:$dst), (ins SrcRC:$src),
                    asm # "\t{$src, $dst|$dst, $src}",
                    [(set DstRC:$dst,
                          (DstVT (OpNode (SrcVT SrcRC:$src))))]>,
                 Sched<[sched]>;

    def rm_Int : SI<opc, MRMSrcMem,
                    (outs DstRC:$dst), (ins memop:$src),
                    asm # "\t{$src, $dst|$dst, $src}",
                    [(set DstRC:$dst,
                          (DstVT (OpNode
                                   (SrcVT (mem_frags addr:$src)))))]>,
                 Sched<[sched.Folded]>;
  }
}
1009
/// sse12_cvt_sint_3addr - Pattern-less intrinsic (_Int) conversion forms
/// taking a DstRC:$src1 operand besides the converted $src2.
///   x86memop - memory operand type of the rm_Int form.
///   asm      - mnemonic.
///   mem      - suffix emitted in braces after the mnemonic in rm_Int.
///   sched    - scheduling class; rm_Int uses the folded variants.
///   d        - execution domain.
///   Is2Addr  - when set (the default) the assembly string lists only $src2
///              and $dst; otherwise $src1 is listed as well.
multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
                    RegisterClass DstRC, X86MemOperand x86memop,
                    string asm, string mem, X86FoldableSchedWrite sched,
                    Domain d, bit Is2Addr = 1> {
  let hasSideEffects = 0, ExeDomain = d in {
    def rr_Int
        : SI<opc, MRMSrcReg,
             (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
             !if(Is2Addr,
                 asm # "\t{$src2, $dst|$dst, $src2}",
                 asm # "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             []>,
          Sched<[sched, ReadDefault, ReadInt2Fpu]>;

    // No pattern, so the load is declared explicitly.
    let mayLoad = 1 in {
      def rm_Int
          : SI<opc, MRMSrcMem,
               (outs DstRC:$dst), (ins DstRC:$src1, x86memop:$src2),
               !if(Is2Addr,
                   !strconcat(asm, "{", mem,
                              "}\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(asm, "{", mem,
                              "}\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               []>,
            Sched<[sched.Folded, sched.ReadAfterFold]>;
    }
  }
}
1029
// Intrinsic forms of (V)CVTSD2SI operating on a VR128 source via
// X86cvts2si. All of them use MXCSR and may raise FP exceptions.
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  let Predicates = [UseAVX] in {
    defm VCVTSD2SI
        : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, X86cvts2si,
                         sdmem, sse_load_f64, "cvtsd2si",
                         WriteCvtSD2I, SSEPackedDouble>,
          XD, VEX, VEX_LIG;
    defm VCVTSD2SI64
        : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si,
                         sdmem, sse_load_f64, "cvtsd2si",
                         WriteCvtSD2I, SSEPackedDouble>,
          XD, VEX, VEX_W, VEX_LIG;
  }

  defm CVTSD2SI
      : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, X86cvts2si,
                       sdmem, sse_load_f64, "cvtsd2si",
                       WriteCvtSD2I, SSEPackedDouble>,
        XD;
  defm CVTSD2SI64
      : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si,
                       sdmem, sse_load_f64, "cvtsd2si",
                       WriteCvtSD2I, SSEPackedDouble>,
        XD, REX_W;
}
1046
// AVX intrinsic forms of the integer -> FP conversions. Is2Addr is passed
// as 0 so the assembly string lists $src1 explicitly. The GR32 -> double
// form is the only one without SIMD_EXC.
let Predicates = [UseAVX] in {
  defm VCVTSI2SS
      : sse12_cvt_sint_3addr<0x2A, GR32, VR128, i32mem, "cvtsi2ss", "l",
                             WriteCvtI2SS, SSEPackedSingle, 0>,
        XS, VEX_4V, VEX_LIG, SIMD_EXC;
  defm VCVTSI642SS
      : sse12_cvt_sint_3addr<0x2A, GR64, VR128, i64mem, "cvtsi2ss", "q",
                             WriteCvtI2SS, SSEPackedSingle, 0>,
        XS, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC;
  defm VCVTSI2SD
      : sse12_cvt_sint_3addr<0x2A, GR32, VR128, i32mem, "cvtsi2sd", "l",
                             WriteCvtI2SD, SSEPackedDouble, 0>,
        XD, VEX_4V, VEX_LIG;
  defm VCVTSI642SD
      : sse12_cvt_sint_3addr<0x2A, GR64, VR128, i64mem, "cvtsi2sd", "q",
                             WriteCvtI2SD, SSEPackedDouble, 0>,
        XD, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC;
}
// SSE intrinsic forms of the integer -> FP conversions; $src1 is tied to
// $dst and the default two-address assembly string is used. The GR32 ->
// double form is the only one without SIMD_EXC.
let Constraints = "$src1 = $dst" in {
  defm CVTSI2SS
      : sse12_cvt_sint_3addr<0x2A, GR32, VR128, i32mem, "cvtsi2ss", "l",
                             WriteCvtI2SS, SSEPackedSingle>,
        XS, SIMD_EXC;
  defm CVTSI642SS
      : sse12_cvt_sint_3addr<0x2A, GR64, VR128, i64mem, "cvtsi2ss", "q",
                             WriteCvtI2SS, SSEPackedSingle>,
        XS, REX_W, SIMD_EXC;
  defm CVTSI2SD
      : sse12_cvt_sint_3addr<0x2A, GR32, VR128, i32mem, "cvtsi2sd", "l",
                             WriteCvtI2SD, SSEPackedDouble>,
        XD;
  defm CVTSI642SD
      : sse12_cvt_sint_3addr<0x2A, GR64, VR128, i64mem, "cvtsi2sd", "q",
                             WriteCvtI2SD, SSEPackedDouble>,
        XD, REX_W, SIMD_EXC;
}
1075
// "att" aliases for the AVX register forms with an {l}/{q} suffix.
def : InstAlias<"vcvtsi2ss{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VCVTSI2SSrr_Int VR128:$dst, VR128:$src1, GR32:$src2),
                0, "att">;
def : InstAlias<"vcvtsi2ss{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VCVTSI642SSrr_Int VR128:$dst, VR128:$src1, GR64:$src2),
                0, "att">;
def : InstAlias<"vcvtsi2sd{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VCVTSI2SDrr_Int VR128:$dst, VR128:$src1, GR32:$src2),
                0, "att">;
def : InstAlias<"vcvtsi2sd{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VCVTSI642SDrr_Int VR128:$dst, VR128:$src1, GR64:$src2),
                0, "att">;

// "att" aliases mapping the unsuffixed AVX mnemonics with a memory operand
// to the i32mem forms.
def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
                (VCVTSI2SSrm_Int VR128:$dst, VR128:$src1, i32mem:$src),
                0, "att">;
def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
                (VCVTSI2SDrm_Int VR128:$dst, VR128:$src1, i32mem:$src),
                0, "att">;

// "att" aliases for the SSE register forms with an {l}/{q} suffix.
def : InstAlias<"cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
                (CVTSI2SSrr_Int VR128:$dst, GR32:$src),
                0, "att">;
def : InstAlias<"cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
                (CVTSI642SSrr_Int VR128:$dst, GR64:$src),
                0, "att">;
def : InstAlias<"cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
                (CVTSI2SDrr_Int VR128:$dst, GR32:$src),
                0, "att">;
def : InstAlias<"cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
                (CVTSI642SDrr_Int VR128:$dst, GR64:$src),
                0, "att">;

// "att" aliases mapping the unsuffixed SSE mnemonics with a memory operand
// to the i32mem forms.
def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
                (CVTSI2SSrm_Int VR128:$dst, i32mem:$src),
                0, "att">;
def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
                (CVTSI2SDrm_Int VR128:$dst, i32mem:$src),
                0, "att">;
1103
1104/// SSE 1 Only
1105
1106// Aliases for intrinsics
// AVX intrinsic forms of the opcode 0x2C conversions, matching X86cvtts2Int
// on a VR128 source. All of them use MXCSR and may raise FP exceptions.
let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
                                ssmem, sse_load_f32, "cvttss2si",
                                WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
                               X86cvtts2Int, ssmem, sse_load_f32,
                               "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
                               XS, VEX, VEX_LIG, VEX_W;
// The double-precision sources are scheduled as WriteCvtSD2I, in line with
// the non-VEX CVTTSD2SI*_Int definitions and the codegen-only VCVTTSD2SI
// forms (previously these two used the single-precision WriteCvtSS2I).
defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
                                sdmem, sse_load_f64, "cvttsd2si",
                                WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
                              X86cvtts2Int, sdmem, sse_load_f64,
                              "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>,
                              XD, VEX, VEX_LIG, VEX_W;
}
// SSE intrinsic forms of the opcode 0x2C conversions, matching X86cvtts2Int
// on a VR128 source. All of them use MXCSR and may raise FP exceptions.
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  defm CVTTSS2SI
      : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
                       ssmem, sse_load_f32, "cvttss2si",
                       WriteCvtSS2I, SSEPackedSingle>,
        XS;
  defm CVTTSS2SI64
      : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32, X86cvtts2Int,
                       ssmem, sse_load_f32, "cvttss2si",
                       WriteCvtSS2I, SSEPackedSingle>,
        XS, REX_W;
  defm CVTTSD2SI
      : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
                       sdmem, sse_load_f64, "cvttsd2si",
                       WriteCvtSD2I, SSEPackedDouble>,
        XD;
  defm CVTTSD2SI64
      : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64, X86cvtts2Int,
                       sdmem, sse_load_f64, "cvttsd2si",
                       WriteCvtSD2I, SSEPackedDouble>,
        XD, REX_W;
}
1139
// "att" aliases giving the AVX vcvttss2si/vcvttsd2si intrinsic forms an
// explicit {l} (GR32 destination) or {q} (GR64 destination) suffix.
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrr_Int GR32:$dst, VR128:$src),
                0, "att">;
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrm_Int GR32:$dst, f32mem:$src),
                0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrr_Int GR32:$dst, VR128:$src),
                0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrm_Int GR32:$dst, f64mem:$src),
                0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rr_Int GR64:$dst, VR128:$src),
                0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rm_Int GR64:$dst, f32mem:$src),
                0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rr_Int GR64:$dst, VR128:$src),
                0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rm_Int GR64:$dst, f64mem:$src),
                0, "att">;
1156
// "att" aliases giving the SSE cvttss2si/cvttsd2si intrinsic forms an
// explicit {l} (GR32 destination) or {q} (GR64 destination) suffix.
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrr_Int GR32:$dst, VR128:$src),
                0, "att">;
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrm_Int GR32:$dst, f32mem:$src),
                0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrr_Int GR32:$dst, VR128:$src),
                0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrm_Int GR32:$dst, f64mem:$src),
                0, "att">;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rr_Int GR64:$dst, VR128:$src),
                0, "att">;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rm_Int GR64:$dst, f32mem:$src),
                0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rr_Int GR64:$dst, VR128:$src),
                0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rm_Int GR64:$dst, f64mem:$src),
                0, "att">;
1173
// AVX intrinsic forms of CVTSS2SI, matching X86cvts2si on a v4f32 VR128
// source. Both use MXCSR and may raise FP exceptions.
let Predicates = [UseAVX] in {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  defm VCVTSS2SI
      : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
                       ssmem, sse_load_f32, "cvtss2si",
                       WriteCvtSS2I, SSEPackedSingle>,
        XS, VEX, VEX_LIG;
  defm VCVTSS2SI64
      : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
                       ssmem, sse_load_f32, "cvtss2si",
                       WriteCvtSS2I, SSEPackedSingle>,
        XS, VEX, VEX_W, VEX_LIG;
}
}
// Everything in this group uses MXCSR and may raise FP exceptions.
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  // SSE intrinsic forms of CVTSS2SI, matching X86cvts2si on a v4f32 VR128
  // source.
  defm CVTSS2SI
      : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
                       ssmem, sse_load_f32, "cvtss2si",
                       WriteCvtSS2I, SSEPackedSingle>,
        XS;
  defm CVTSS2SI64
      : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
                       ssmem, sse_load_f32, "cvtss2si",
                       WriteCvtSS2I, SSEPackedSingle>,
        XS, REX_W;

  // Packed i32 -> f32 conversions: 128-bit and 256-bit AVX forms (both
  // requiring HasAVX and NoVLX) followed by the SSE2 form.
  defm VCVTDQ2PS
      : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load,
                    "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                    SSEPackedSingle, WriteCvtI2PS>,
        PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
  defm VCVTDQ2PSY
      : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load,
                    "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                    SSEPackedSingle, WriteCvtI2PSY>,
        PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;

  defm CVTDQ2PS
      : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop,
                    "cvtdq2ps\t{$src, $dst|$dst, $src}",
                    SSEPackedSingle, WriteCvtI2PS>,
        PS, Requires<[UseSSE2]>;
}
1204
1205// AVX aliases
// "att"-variant-only spellings of vcvtss2si / vcvtsd2si carrying an explicit
// {l} or {q} size suffix. Each mnemonic gets a register-source (VR128) and a
// memory-source (ssmem for ss, sdmem for sd) alias. {l} aliases resolve to the
// GR32 *_Int records and {q} aliases to the GR64 (…SI64…) records.
1206def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
1207                (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1208def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
1209                (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
1210def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
1211                (VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1212def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
1213                (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
1214def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
1215                (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1216def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
1217                (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
1218def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
1219                (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1220def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
1221                (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
1222
1223// SSE aliases
// Legacy-SSE equivalents of the suffixed spellings: cvtss2si / cvtsd2si with
// {l} (GR32 destination) or {q} (GR64 destination), for VR128 and ssmem/sdmem
// sources, each resolving to the matching non-VEX *_Int record. All are
// registered for the "att" variant only.
1224def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
1225                (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1226def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
1227                (CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
1228def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
1229                (CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
1230def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
1231                (CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
1232def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
1233                (CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1234def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
1235                (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
1236def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
1237                (CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
1238def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
1239                (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;
1240
1241/// SSE 2 Only
1242
1243// Convert scalar double to scalar single
// AVX, codegen-only forms on the scalar FP register classes (FR32 result, FR64
// or f64mem source). Both records carry an empty pattern list; hasSideEffects
// is cleared explicitly and the rm form is flagged mayLoad by hand.
// The instructions take an extra FR32:$src1 input ahead of the value being
// converted ($src2), matching the three-operand asm string.
// NOTE(review): the rr form is built from VSDI with the un-prefixed string
// "cvtsd2ss" and no explicit XD, while the rm form uses I with "vcvtsd2ss" and
// XD - VSDI presumably prepends the "v" and supplies the prefix; confirm in
// X86InstrFormats.td.
1244let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX],
1245    ExeDomain = SSEPackedSingle in {
1246def VCVTSD2SSrr  : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
1247                        (ins FR32:$src1, FR64:$src2),
1248                        "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1249                        VEX_4V, VEX_LIG, VEX_WIG,
1250                        Sched<[WriteCvtSD2SS]>, SIMD_EXC;
1251let mayLoad = 1 in
1252def VCVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst),
1253                     (ins FR32:$src1, f64mem:$src2),
1254                     "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1255                     XD, VEX_4V, VEX_LIG, VEX_WIG,
1256                     Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC;
1257}
1258
// Under UseAVX, select scalar f64 -> f32 rounding (any_fpround on an FR64) to
// VCVTSD2SSrr. The instruction's first (FR32) source is supplied as an
// IMPLICIT_DEF f32; the value being converted goes in the second operand.
1259def : Pat<(f32 (any_fpround FR64:$src)),
1260            (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
1261          Requires<[UseAVX]>;
1262
// Legacy-SSE, codegen-only scalar double -> single conversions on FR64/FR32.
// Unlike the AVX forms, the any_fpround patterns are attached directly to the
// instructions and there is a single source operand. The load-folding rm form
// (loadf64) additionally requires OptForSize.
// NOTE(review): CVTSD2SSrr names no predicate or prefix itself; the SDI class
// presumably supplies them - confirm in X86InstrFormats.td.
1263let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in {
1264def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
1265                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
1266                      [(set FR32:$dst, (any_fpround FR64:$src))]>,
1267                      Sched<[WriteCvtSD2SS]>, SIMD_EXC;
1268def CVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
1269                    "cvtsd2ss\t{$src, $dst|$dst, $src}",
1270                    [(set FR32:$dst, (any_fpround (loadf64 addr:$src)))]>,
1271                    XD, Requires<[UseSSE2, OptForSize]>,
1272                    Sched<[WriteCvtSD2SS.Folded]>, SIMD_EXC;
1273}
1274
// Vector-register (VR128) "_Int" forms of scalar double -> single conversion,
// selected from the X86frounds node. $src1 is the node's first operand and
// $src2 supplies the double to convert: a v2f64 register for the rr forms, or
// an sdmem operand matched through sse_load_f64 for the rm forms.
// All four records read MXCSR and may raise an FP exception.
1275let Uses = [MXCSR], mayRaiseFPException = 1, ExeDomain = SSEPackedSingle in {
// AVX forms: three-operand VEX_4V encoding, gated on UseAVX.
1276def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
1277                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1278                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1279                       [(set VR128:$dst,
1280                         (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
1281                       XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
1282                       Sched<[WriteCvtSD2SS]>;
1283def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
1284                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
1285                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1286                       [(set VR128:$dst,
1287                         (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
1288                       XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
1289                       Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
// Legacy-SSE forms are two-address: $src1 is tied to $dst, so the asm string
// names only $src2 and $dst. Gated on UseSSE2.
1290let Constraints = "$src1 = $dst" in {
1291def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
1292                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1293                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
1294                       [(set VR128:$dst,
1295                         (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
1296                       XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;
1297def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
1298                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
1299                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
1300                       [(set VR128:$dst,
1301                         (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
1302                       XD, Requires<[UseSSE2]>,
1303                       Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
1304}
1305}
1306
1307// Convert scalar single to scalar double
1308// SSE2 instructions with XS prefix
// AVX, codegen-only forms on FR64/FR32 with empty pattern lists; the records
// are three-operand (FR64:$src1 plus the FR32 or f32mem value to convert).
// The rm form is flagged mayLoad by hand and, unlike the rr form, additionally
// requires OptForSize.
1309let isCodeGenOnly = 1, hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
1310def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
1311                    (ins FR64:$src1, FR32:$src2),
1312                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1313                    XS, VEX_4V, VEX_LIG, VEX_WIG,
1314                    Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>, SIMD_EXC;
1315let mayLoad = 1 in
1316def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
1317                    (ins FR64:$src1, f32mem:$src2),
1318                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
1319                    XS, VEX_4V, VEX_LIG, VEX_WIG,
1320                    Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>,
1321                    Requires<[UseAVX, OptForSize]>, SIMD_EXC;
1322} // isCodeGenOnly = 1, hasSideEffects = 0
1323
// Selection patterns for AVX scalar f32 -> f64 extension. In both, the
// instruction's first (FR64) source is an IMPLICIT_DEF f64. The register
// pattern needs only UseAVX; the load-folding pattern (loadf32) also requires
// OptForSize, mirroring the predicates on VCVTSS2SDrm.
1324def : Pat<(f64 (any_fpextend FR32:$src)),
1325    (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
1326def : Pat<(any_fpextend (loadf32 addr:$src)),
1327    (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;
1328
// Legacy-SSE, codegen-only scalar single -> double conversions (opcode 0x5A,
// XS prefix) with the any_fpextend patterns attached directly. The rr form
// requires UseSSE2; the load-folding rm form (loadf32) also requires
// OptForSize.
1329let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in {
1330def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
1331                   "cvtss2sd\t{$src, $dst|$dst, $src}",
1332                   [(set FR64:$dst, (any_fpextend FR32:$src))]>,
1333                   XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>, SIMD_EXC;
1334def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
1335                   "cvtss2sd\t{$src, $dst|$dst, $src}",
1336                   [(set FR64:$dst, (any_fpextend (loadf32 addr:$src)))]>,
1337                   XS, Requires<[UseSSE2, OptForSize]>,
1338                   Sched<[WriteCvtSS2SD.Folded]>, SIMD_EXC;
1339} // isCodeGenOnly = 1
1340
// Vector-register (VR128) "_Int" forms of scalar single -> double conversion.
// All four records have empty pattern lists, so hasSideEffects is cleared here
// and the memory forms are flagged mayLoad explicitly. They read MXCSR and may
// raise an FP exception.
// NOTE(review): the AVX records here use Requires<[HasAVX]> whereas the
// corresponding cvtsd2ss _Int records use UseAVX - confirm the difference is
// intentional.
1341let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1,
1342    ExeDomain = SSEPackedSingle in {
1343def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
1344                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1345                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1346                    []>, XS, VEX_4V, VEX_LIG, VEX_WIG,
1347                    Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;
1348let mayLoad = 1 in
1349def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
1350                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
1351                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
1352                    []>, XS, VEX_4V, VEX_LIG, VEX_WIG, Requires<[HasAVX]>,
1353                    Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
// Two-address legacy-SSE forms: $src1 is tied to $dst, so only $src2 and $dst
// appear in the asm string.
1354let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
1355def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
1356                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
1357                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
1358                    []>, XS, Requires<[UseSSE2]>,
1359                    Sched<[WriteCvtSS2SD]>;
1360let mayLoad = 1 in
1361def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
1362                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
1363                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
1364                    []>, XS, Requires<[UseSSE2]>,
1365                    Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
1366}
1367} // hasSideEffects = 0
1368
1369// Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and
1370// (v)cvtss2sd intrinsic sequences from clang which produce unnecessary
1371// vmovs{s,d} instructions
//
// Shape shared by every pattern in this group: an X86Movss (v4f32) or X86Movsd
// (v2f64) whose first operand is VR128:$dst and whose second operand is a
// scalar_to_vector of a scalar conversion result. The whole expression is
// replaced by the corresponding *_Int instruction, which receives $dst as its
// first source and the conversion input as its second.
// Conversions covered under UseAVX: f64 -> f32 rounding and f32 -> f64
// extension of element 0 of a vector (extractelt ... (iPTR 0)), and
// any_sint_to_fp from GR32/GR64 or from loadi32/loadi64.
1372let Predicates = [UseAVX] in {
1373def : Pat<(v4f32 (X86Movss
1374                   (v4f32 VR128:$dst),
1375                   (v4f32 (scalar_to_vector
1376                     (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
1377          (VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
1378
1379def : Pat<(v2f64 (X86Movsd
1380                   (v2f64 VR128:$dst),
1381                   (v2f64 (scalar_to_vector
1382                     (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
1383          (VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
1384
1385def : Pat<(v4f32 (X86Movss
1386                   (v4f32 VR128:$dst),
1387                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
1388          (VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
1389
1390def : Pat<(v4f32 (X86Movss
1391                   (v4f32 VR128:$dst),
1392                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
1393          (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>;
1394
1395def : Pat<(v4f32 (X86Movss
1396                   (v4f32 VR128:$dst),
1397                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
1398          (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
1399
1400def : Pat<(v4f32 (X86Movss
1401                   (v4f32 VR128:$dst),
1402                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
1403          (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>;
1404
1405def : Pat<(v2f64 (X86Movsd
1406                   (v2f64 VR128:$dst),
1407                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
1408          (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
1409
1410def : Pat<(v2f64 (X86Movsd
1411                   (v2f64 VR128:$dst),
1412                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
1413          (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>;
1414
1415def : Pat<(v2f64 (X86Movsd
1416                   (v2f64 VR128:$dst),
1417                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
1418          (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
1419
1420def : Pat<(v2f64 (X86Movsd
1421                   (v2f64 VR128:$dst),
1422                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
1423          (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>;
1424} // Predicates = [UseAVX]
1425
// SSE2 versions of the X86Movss/X86Movsd + scalar_to_vector(conversion)
// folding patterns, selecting the non-VEX *_Int instructions. This group holds
// the f64 -> f32 rounding and f32 -> f64 extension patterns plus every
// integer -> f64 (cvtsi2sd) pattern; the integer -> f32 patterns are not here.
1426let Predicates = [UseSSE2] in {
1427def : Pat<(v4f32 (X86Movss
1428                   (v4f32 VR128:$dst),
1429                   (v4f32 (scalar_to_vector
1430                     (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
1431          (CVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
1432
1433def : Pat<(v2f64 (X86Movsd
1434                   (v2f64 VR128:$dst),
1435                   (v2f64 (scalar_to_vector
1436                     (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
1437          (CVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
1438
1439def : Pat<(v2f64 (X86Movsd
1440                   (v2f64 VR128:$dst),
1441                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
1442          (CVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
1443
1444def : Pat<(v2f64 (X86Movsd
1445                   (v2f64 VR128:$dst),
1446                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
1447          (CVTSI642SDrm_Int VR128:$dst, addr:$src)>;
1448
1449def : Pat<(v2f64 (X86Movsd
1450                   (v2f64 VR128:$dst),
1451                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
1452          (CVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
1453
1454def : Pat<(v2f64 (X86Movsd
1455                   (v2f64 VR128:$dst),
1456                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
1457          (CVTSI2SDrm_Int VR128:$dst, addr:$src)>;
1458} // Predicates = [UseSSE2]
1459
// SSE1 folding patterns: X86Movss of $dst with a scalar_to_vector of an
// integer -> f32 conversion (any_sint_to_fp from GR32/GR64 or from
// loadi32/loadi64) selects the non-VEX CVTSI2SS / CVTSI642SS *_Int records.
// Only the single-precision results are handled at this predicate level.
1460let Predicates = [UseSSE1] in {
1461def : Pat<(v4f32 (X86Movss
1462                   (v4f32 VR128:$dst),
1463                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
1464          (CVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
1465
1466def : Pat<(v4f32 (X86Movss
1467                   (v4f32 VR128:$dst),
1468                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
1469          (CVTSI642SSrm_Int VR128:$dst, addr:$src)>;
1470
1471def : Pat<(v4f32 (X86Movss
1472                   (v4f32 VR128:$dst),
1473                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
1474          (CVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
1475
1476def : Pat<(v4f32 (X86Movss
1477                   (v4f32 VR128:$dst),
1478                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
1479          (CVTSI2SSrm_Int VR128:$dst, addr:$src)>;
1480} // Predicates = [UseSSE1]
1481
// AVX packed single -> packed signed doubleword conversions (opcode 0x5B),
// selected from X86cvtp2Int. 128-bit forms map v4f32 -> v4i32 on VR128; the Y
// forms map v8f32 -> v8i32 on VR256 and add VEX_L. Memory forms fold loadv4f32
// / loadv8f32 and use the dedicated ...Ld scheduling classes.
// NOTE(review): the asm strings are un-prefixed ("cvtps2dq"); VPDI presumably
// prepends the "v" - confirm in X86InstrFormats.td.
1482let Predicates = [HasAVX, NoVLX] in {
1483// Convert packed single/double fp to doubleword
1484def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1485                       "cvtps2dq\t{$src, $dst|$dst, $src}",
1486                       [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
1487                       VEX, Sched<[WriteCvtPS2I]>, VEX_WIG, SIMD_EXC;
1488def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1489                       "cvtps2dq\t{$src, $dst|$dst, $src}",
1490                       [(set VR128:$dst,
1491                         (v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>,
1492                       VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG, SIMD_EXC;
1493def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
1494                        "cvtps2dq\t{$src, $dst|$dst, $src}",
1495                        [(set VR256:$dst,
1496                          (v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>,
1497                        VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG, SIMD_EXC;
1498def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
1499                        "cvtps2dq\t{$src, $dst|$dst, $src}",
1500                        [(set VR256:$dst,
1501                          (v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>,
1502                        VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, VEX_WIG, SIMD_EXC;
1503}
// Legacy-SSE packed single -> packed signed doubleword conversion (opcode
// 0x5B) via X86cvtp2Int, v4f32 -> v4i32 on VR128. The memory form folds
// through memopv4f32 rather than loadv4f32.
// NOTE(review): no predicate is named on these records; the PDI class
// presumably supplies it - confirm.
1504def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1505                     "cvtps2dq\t{$src, $dst|$dst, $src}",
1506                     [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
1507                     Sched<[WriteCvtPS2I]>, SIMD_EXC;
1508def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1509                     "cvtps2dq\t{$src, $dst|$dst, $src}",
1510                     [(set VR128:$dst,
1511                       (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>,
1512                     Sched<[WriteCvtPS2ILd]>, SIMD_EXC;
1513
1514
1515// Convert Packed Double FP to Packed DW Integers
// AVX forms (opcode 0xE6), selected from X86cvtp2Int. The result is always a
// v4i32 in VR128, whether the source is v2f64 (VR128/f128mem) or v4f64
// (VR256/f256mem); the Y forms add VEX_L. The enclosing `let` supplies the
// predicates and the MXCSR / FP-exception properties for all four records.
1516let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1517// The assembler can recognize rr 256-bit instructions by seeing a ymm
1518// register, but the same isn't true when using memory operands instead.
1519// Provide other assembly rr and rm forms to address this explicitly.
1520def VCVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1521                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
1522                       [(set VR128:$dst,
1523                         (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
1524                       VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
1525
1526// XMM only
// The memory form carries the {x} suffix in its asm string.
1527def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1528                      "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
1529                      [(set VR128:$dst,
1530                        (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
1531                      Sched<[WriteCvtPD2ILd]>, VEX_WIG;
1532
1533// YMM only
// The memory form carries the {y} suffix in its asm string.
1534def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1535                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
1536                       [(set VR128:$dst,
1537                         (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
1538                       VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
1539def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1540                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
1541                       [(set VR128:$dst,
1542                         (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
1543                       VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
1544}
1545
// "att"-variant-only aliases that accept the explicitly suffixed mnemonics
// vcvtpd2dqx / vcvtpd2dqy with a register source, resolving to the 128-bit
// (VR128 source) and 256-bit (VR256 source) register forms respectively.
1546def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
1547                (VCVTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
1548def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
1549                (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
1550
// Legacy-SSE packed double -> packed signed doubleword conversion (opcode
// 0xE6) via X86cvtp2Int, v2f64 -> v4i32 on VR128. The memory form folds
// through memopv2f64. Note that the rm record is declared before the rr one.
1551def CVTPD2DQrm  : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1552                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
1553                      [(set VR128:$dst,
1554                        (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>,
1555                      Sched<[WriteCvtPD2ILd]>, SIMD_EXC;
1556def CVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1557                      "cvtpd2dq\t{$src, $dst|$dst, $src}",
1558                      [(set VR128:$dst,
1559                        (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
1560                      Sched<[WriteCvtPD2I]>, SIMD_EXC;
1561
1562// Convert with truncation packed single/double fp to doubleword
1563// SSE2 packed instructions with XS prefix
// Truncating packed single -> packed signed doubleword conversions (opcode
// 0x5B), selected from X86any_cvttp2si. The outer `let` marks both the AVX and
// the legacy-SSE records as reading MXCSR and possibly raising an FP exception.
1564let Uses = [MXCSR], mayRaiseFPException = 1 in {
// AVX forms: v4f32 -> v4i32 on VR128 and v8f32 -> v8i32 on VR256 (VEX_L),
// folding loadv4f32 / loadv8f32 in the memory forms.
1565let Predicates = [HasAVX, NoVLX] in {
1566def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1567                         "cvttps2dq\t{$src, $dst|$dst, $src}",
1568                         [(set VR128:$dst,
1569                           (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
1570                         VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
1571def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1572                         "cvttps2dq\t{$src, $dst|$dst, $src}",
1573                         [(set VR128:$dst,
1574                           (v4i32 (X86any_cvttp2si (loadv4f32 addr:$src))))]>,
1575                         VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
1576def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
1577                          "cvttps2dq\t{$src, $dst|$dst, $src}",
1578                          [(set VR256:$dst,
1579                            (v8i32 (X86any_cvttp2si (v8f32 VR256:$src))))]>,
1580                          VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
1581def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
1582                          "cvttps2dq\t{$src, $dst|$dst, $src}",
1583                          [(set VR256:$dst,
1584                            (v8i32 (X86any_cvttp2si (loadv8f32 addr:$src))))]>,
1585                          VEX, VEX_L,
1586                          Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
1587}
1588
// Legacy-SSE forms: 128-bit only, with the memory form folding memopv4f32.
1589def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1590                       "cvttps2dq\t{$src, $dst|$dst, $src}",
1591                       [(set VR128:$dst,
1592                         (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
1593                       Sched<[WriteCvtPS2I]>;
1594def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1595                       "cvttps2dq\t{$src, $dst|$dst, $src}",
1596                       [(set VR128:$dst,
1597                         (v4i32 (X86any_cvttp2si (memopv4f32 addr:$src))))]>,
1598                       Sched<[WriteCvtPS2ILd]>;
1599}
1600
// AVX truncating packed double -> packed signed doubleword conversions
// (opcode 0xE6), selected from X86any_cvttp2si. The result is a v4i32 in VR128
// for both the v2f64 and the v4f64 source; the memory forms carry {x} / {y} in
// their asm strings.
1601// The assembler can recognize rr 256-bit instructions by seeing a ymm
1602// register, but the same isn't true when using memory operands instead.
1603// Provide other assembly rr and rm forms to address this explicitly.
1604let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1605// XMM only
1606def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1607                        "cvttpd2dq\t{$src, $dst|$dst, $src}",
1608                        [(set VR128:$dst,
1609                          (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
1610                        VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
1611def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1612                        "cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
1613                        [(set VR128:$dst,
1614                          (v4i32 (X86any_cvttp2si (loadv2f64 addr:$src))))]>,
1615                        VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG;
1616
1617// YMM only
1618def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1619                         "cvttpd2dq\t{$src, $dst|$dst, $src}",
1620                         [(set VR128:$dst,
1621                           (v4i32 (X86any_cvttp2si (v4f64 VR256:$src))))]>,
1622                         VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
1623def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1624                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
1625                         [(set VR128:$dst,
1626                           (v4i32 (X86any_cvttp2si (loadv4f64 addr:$src))))]>,
1627                         VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
1628} // Predicates = [HasAVX, NoVLX]
1629
// "att"-variant-only aliases that accept the explicitly suffixed mnemonics
// vcvttpd2dqx / vcvttpd2dqy with a register source, resolving to the VR128-
// and VR256-source register forms respectively.
1630def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
1631                (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
1632def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
1633                (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
1634
// Map the generic any_fp_to_sint node for v4f64 -> v4i32 onto the 256-bit
// truncating conversion: register source to VCVTTPD2DQYrr, folded loadv4f64 to
// VCVTTPD2DQYrm. Only the 256-bit source gets generic-node patterns here.
1635let Predicates = [HasAVX, NoVLX] in {
1636  def : Pat<(v4i32 (any_fp_to_sint (v4f64 VR256:$src))),
1637            (VCVTTPD2DQYrr VR256:$src)>;
1638  def : Pat<(v4i32 (any_fp_to_sint (loadv4f64 addr:$src))),
1639            (VCVTTPD2DQYrm addr:$src)>;
1640}
1641
// Legacy-SSE truncating packed double -> packed signed doubleword conversion
// (opcode 0xE6) via X86any_cvttp2si, v2f64 -> v4i32 on VR128. The memory form
// folds through memopv2f64.
1642def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1643                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
1644                      [(set VR128:$dst,
1645                        (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
1646                      Sched<[WriteCvtPD2I]>, SIMD_EXC;
1647def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
1648                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
1649                      [(set VR128:$dst,
1650                        (v4i32 (X86any_cvttp2si (memopv2f64 addr:$src))))]>,
1651                      Sched<[WriteCvtPD2ILd]>, SIMD_EXC;
1652
1653// Convert packed single to packed double
// AVX forms (opcode 0x5A, PS). The 128-bit register form selects from
// X86any_vfpext (v4f32 -> v2f64); the 256-bit register form selects from the
// generic any_fpextend (v4f32 in VR128 -> v4f64 in VR256). The memory operand
// is half the width of the result: f64mem matched by extloadv2f32 for the
// 128-bit form and f128mem matched by extloadv4f32 for the 256-bit form.
1654let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1655                  // SSE2 instructions without OpSize prefix
1656def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1657                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
1658                    [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
1659                    PS, VEX, Sched<[WriteCvtPS2PD]>, VEX_WIG;
1660def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
1661                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
1662                    [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
1663                    PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, VEX_WIG;
1664def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
1665                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
1666                     [(set VR256:$dst, (v4f64 (any_fpextend (v4f32 VR128:$src))))]>,
1667                     PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, VEX_WIG;
1668def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
1669                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
1670                     [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>,
1671                     PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, VEX_WIG;
1672}
1673
// Legacy-SSE packed single -> packed double conversion (opcode 0x5A, PS),
// gated on UseSSE2. The register form selects from X86any_vfpext; the memory
// form reads an f64mem operand and is matched by extloadv2f32 rather than by a
// memop fragment.
1674let Predicates = [UseSSE2], Uses = [MXCSR], mayRaiseFPException = 1 in {
1675def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1676                   "cvtps2pd\t{$src, $dst|$dst, $src}",
1677                   [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
1678                   PS, Sched<[WriteCvtPS2PD]>;
1679def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
1680                   "cvtps2pd\t{$src, $dst|$dst, $src}",
1681                   [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
1682                   PS, Sched<[WriteCvtPS2PD.Folded]>;
1683}
1684
1685// Convert Packed DW Integers to Packed Double FP
// AVX forms (opcode 0xE6). The 128-bit records select from X86any_VSintToFP;
// the 256-bit records select from the generic any_sint_to_fp (v4i32 -> v4f64).
// NOTE(review): these records carry no MXCSR / mayRaiseFPException marking,
// unlike the neighbouring conversions - presumably because int32 -> double is
// exact; confirm.
1686let Predicates = [HasAVX, NoVLX] in {
// The brace-less `let` applies only to the def that immediately follows it.
// That def reads an i64mem operand: its pattern is a 64-bit scalar load
// (loadi64) placed in a v2i64 via scalar_to_vector and bitcast to v4i32.
1687let hasSideEffects = 0, mayLoad = 1 in
1688def VCVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
1689                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1690                        [(set VR128:$dst,
1691                          (v2f64 (X86any_VSintToFP
1692                                  (bc_v4i32
1693                                   (v2i64 (scalar_to_vector
1694                                           (loadi64 addr:$src)))))))]>,
1695                        VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG;
1696def VCVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1697                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1698                        [(set VR128:$dst,
1699                          (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
1700                        VEX, Sched<[WriteCvtI2PD]>, VEX_WIG;
1701def VCVTDQ2PDYrm  : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
1702                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1703                         [(set VR256:$dst,
1704                           (v4f64 (any_sint_to_fp (loadv4i32 addr:$src))))]>,
1705                         VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>,
1706                         VEX_WIG;
1707def VCVTDQ2PDYrr  : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
1708                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
1709                         [(set VR256:$dst,
1710                           (v4f64 (any_sint_to_fp (v4i32 VR128:$src))))]>,
1711                         VEX, VEX_L, Sched<[WriteCvtI2PDY]>, VEX_WIG;
1712}
1713
// Legacy-SSE packed signed doubleword -> packed double conversion (opcode
// 0xE6), selected from X86any_VSintToFP. The brace-less `let` applies only to
// the rm def that follows it. That memory form reads an i64mem operand,
// matched as a loadi64 placed in a v2i64 (scalar_to_vector) and bitcast to
// v4i32.
1714let hasSideEffects = 0, mayLoad = 1 in
1715def CVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
1716                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
1717                       [(set VR128:$dst,
1718                         (v2f64 (X86any_VSintToFP
1719                                 (bc_v4i32
1720                                  (v2i64 (scalar_to_vector
1721                                          (loadi64 addr:$src)))))))]>,
1722                       Sched<[WriteCvtI2PDLd]>;
1723def CVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1724                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
1725                       [(set VR128:$dst,
1726                         (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
1727                       Sched<[WriteCvtI2PD]>;
1728
1729// AVX register conversion intrinsics
// Additional load-folding pattern: when the v4i32 source is a bitcast of an
// X86vzload64 (as a v2i64), select the memory form VCVTDQ2PDrm directly on the
// address.
1730let Predicates = [HasAVX, NoVLX] in {
1731  def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
1732            (VCVTDQ2PDrm addr:$src)>;
1733} // Predicates = [HasAVX, NoVLX]
1734
1735// SSE2 register conversion intrinsics
// Additional load-folding pattern: when the v4i32 source is a bitcast of an
// X86vzload64 (as a v2i64), select the memory form CVTDQ2PDrm directly on the
// address.
1736let Predicates = [UseSSE2] in {
1737  def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
1738            (CVTDQ2PDrm addr:$src)>;
1739} // Predicates = [UseSSE2]
1740
1741// Convert packed double to packed single
// AVX forms (opcode 0x5A), selected from X86any_vfpround. The result is a
// v4f32 in VR128 for both the v2f64 (VR128/f128mem) and the v4f64
// (VR256/f256mem) source; the Y forms add VEX_L and the memory forms carry
// {x} / {y} in their asm strings.
1742// The assembler can recognize rr 256-bit instructions by seeing a ymm
1743// register, but the same isn't true when using memory operands instead.
1744// Provide other assembly rr and rm forms to address this explicitly.
1745let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
1746// XMM only
1747def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1748                       "cvtpd2ps\t{$src, $dst|$dst, $src}",
1749                       [(set VR128:$dst, (v4f32 (X86any_vfpround (v2f64 VR128:$src))))]>,
1750                       VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG;
1751def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1752                       "cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
1753                       [(set VR128:$dst, (v4f32 (X86any_vfpround (loadv2f64 addr:$src))))]>,
1754                       VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG;
1755
// 256-bit source forms.
1756def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
1757                        "cvtpd2ps\t{$src, $dst|$dst, $src}",
1758                        [(set VR128:$dst, (v4f32 (X86any_vfpround (v4f64 VR256:$src))))]>,
1759                        VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG;
1760def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
1761                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
1762                        [(set VR128:$dst, (v4f32 (X86any_vfpround (loadv4f64 addr:$src))))]>,
1763                        VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG;
1764} // Predicates = [HasAVX, NoVLX]
1765
// "att"-variant-only aliases that accept the explicitly suffixed mnemonics
// vcvtpd2psx / vcvtpd2psy with a register source, resolving to the VR128- and
// VR256-source register forms respectively.
1766def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
1767                (VCVTPD2PSrr VR128:$dst, VR128:$src), 0, "att">;
1768def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
1769                (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0, "att">;
1770
// Legacy-SSE packed double -> packed single conversion (opcode 0x5A) via
// X86any_vfpround, v2f64 -> v4f32 on VR128. The memory form folds through
// memopv2f64.
1771def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
1772                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
1773                     [(set VR128:$dst, (v4f32 (X86any_vfpround (v2f64 VR128:$src))))]>,
1774                     Sched<[WriteCvtPD2PS]>, SIMD_EXC;
1775def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
1776                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
1777                     [(set VR128:$dst, (v4f32 (X86any_vfpround (memopv2f64 addr:$src))))]>,
1778                     Sched<[WriteCvtPD2PS.Folded]>, SIMD_EXC;
1779
1780//===----------------------------------------------------------------------===//
1781// SSE 1 & 2 - Compare Instructions
1782//===----------------------------------------------------------------------===//
1783
1784// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
//
// Emits four records that share opcode 0xC2 and an 8-bit immediate $cc:
//   rr_Int / rm_Int - operate on whole VR128 values typed as VT; the memory
//                     form takes a `memop` operand matched through mem_frags.
//   rr / rm         - isCodeGenOnly forms on the scalar register class RC; the
//                     memory form takes `x86memop` matched through ld_frag.
// Only the RC register-register form is marked commutable. All four records
// carry SIMD_EXC; `asm` is used verbatim as the assembly string.
1785multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
1786                            Operand memop, SDNode OpNode, ValueType VT,
1787                            PatFrag ld_frag, string asm,
1788                            X86FoldableSchedWrite sched,
1789                            PatFrags mem_frags> {
1790  def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
1791                    (ins VR128:$src1, VR128:$src2, u8imm:$cc), asm,
1792                    [(set VR128:$dst, (OpNode (VT VR128:$src1),
1793                                              VR128:$src2, timm:$cc))]>,
1794           Sched<[sched]>, SIMD_EXC;
1795  let mayLoad = 1 in
1796  def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
1797                    (ins VR128:$src1, memop:$src2, u8imm:$cc), asm,
1798                    [(set VR128:$dst, (OpNode (VT VR128:$src1),
1799                                              (mem_frags addr:$src2), timm:$cc))]>,
1800           Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
1801
  // Scalar-register-class variants, used only by the code generator.
1802  let isCodeGenOnly = 1 in {
1803    let isCommutable = 1 in
1804    def rr : SIi8<0xC2, MRMSrcReg,
1805                  (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
1806                  [(set RC:$dst, (OpNode RC:$src1, RC:$src2, timm:$cc))]>,
1807                  Sched<[sched]>, SIMD_EXC;
1808    def rm : SIi8<0xC2, MRMSrcMem,
1809                  (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
1810                  [(set RC:$dst, (OpNode RC:$src1,
1811                                         (ld_frag addr:$src2), timm:$cc))]>,
1812                  Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
1813  }
1814}
1815
// VEX-encoded scalar compares. The asm string names all of $dst, $src1, $src2
// and $cc, and no $src1 = $dst constraint is applied. Both use X86cmps; the
// f32 version is XS-prefixed, the f64 version XD-prefixed.
1816let ExeDomain = SSEPackedSingle in
1817defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
1818                 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1819                 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>,
1820                 XS, VEX_4V, VEX_LIG, VEX_WIG;
1821let ExeDomain = SSEPackedDouble in
1822defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
1823                 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1824                 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
1825                 XD, VEX_4V, VEX_LIG, VEX_WIG;
1826
// Legacy-encoded scalar compares. $src1 is tied to $dst, so the asm string
// only names $dst, $src2 and $cc. Template arguments otherwise mirror the
// VCMPSS / VCMPSD instantiations, minus the VEX modifiers.
1827let Constraints = "$src1 = $dst" in {
1828  let ExeDomain = SSEPackedSingle in
1829  defm CMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
1830                  "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1831                  SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
1832  let ExeDomain = SSEPackedDouble in
1833  defm CMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
1834                  "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1835                  SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
1836}
1837
1838// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
//
// Emits `rr` and `rm` records with no register outputs; the pattern's only
// result is EFLAGS, computed as (OpNode (vt $src1), $src2). The memory form is
// marked mayLoad and matches its operand through ld_frag. Both records are
// placed in execution domain `d` and carry SIMD_EXC. `sched` defaults to
// WriteFComX.
1839multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDPatternOperator OpNode,
1840                         ValueType vt, X86MemOperand x86memop,
1841                         PatFrag ld_frag, string OpcodeStr, Domain d,
1842                         X86FoldableSchedWrite sched = WriteFComX> {
1843  let ExeDomain = d in {
1844  def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
1845                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1846                     [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
1847          Sched<[sched]>, SIMD_EXC;
1848  let mayLoad = 1 in
1849  def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
1850                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1851                     [(set EFLAGS, (OpNode (vt RC:$src1),
1852                                           (ld_frag addr:$src2)))]>,
1853          Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
1854}
1855}
1856
1857// sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
//
// Same shape as sse12_ord_cmp, but the records are named rr_Int / rm_Int, the
// memory operand is a generic `Operand memop`, and the load is matched through
// a PatFrags (`mem_frags`) rather than a single PatFrag.
1858multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
1859                             ValueType vt, Operand memop,
1860                             PatFrags mem_frags, string OpcodeStr,
1861                             Domain d,
1862                             X86FoldableSchedWrite sched = WriteFComX> {
1863let ExeDomain = d in {
1864  def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
1865                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1866                     [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
1867          Sched<[sched]>, SIMD_EXC;
1868let mayLoad = 1 in
1869  def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
1870                     !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
1871                     [(set EFLAGS, (OpNode (vt RC:$src1),
1872                                           (mem_frags addr:$src2)))]>,
1873          Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
1874}
1875}
1876
// Instantiate the EFLAGS-defining scalar compares. Opcode 0x2E is ucomis*,
// 0x2F is comis*. Each name is instantiated twice: once through sse12_ord_cmp
// (FR32/FR64 records `rr`/`rm`) and once, under isCodeGenOnly, through
// sse12_ord_cmp_int (VR128 records `rr_Int`/`rm_Int`).
// The FR forms match X86any_fcmp for ucomis* and X86strict_fcmps for comis*;
// the intrinsic forms match X86ucomi / X86comi.
1877let Defs = [EFLAGS] in {
  // VEX-encoded scalar-register forms.
1878  defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
1879                               "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
1880  defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
1881                               "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
1882  defm VCOMISS  : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
1883                               "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
1884  defm VCOMISD  : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
1885                               "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
1886
  // VEX-encoded intrinsic (VR128) forms.
1887  let isCodeGenOnly = 1 in {
1888    defm VUCOMISS  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
1889                      sse_load_f32, "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
1890    defm VUCOMISD  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
1891                      sse_load_f64, "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
1892
1893    defm VCOMISS  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
1894                       sse_load_f32, "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
1895    defm VCOMISD  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
1896                       sse_load_f64, "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
1897  }
  // Legacy-encoded scalar-register forms.
1898  defm UCOMISS  : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
1899                                  "ucomiss", SSEPackedSingle>, PS;
1900  defm UCOMISD  : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
1901                                  "ucomisd", SSEPackedDouble>, PD;
1902  defm COMISS   : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
1903                                  "comiss", SSEPackedSingle>, PS;
1904  defm COMISD   : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
1905                                  "comisd", SSEPackedDouble>, PD;
1906
  // Legacy-encoded intrinsic (VR128) forms.
1907  let isCodeGenOnly = 1 in {
1908    defm UCOMISS  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
1909                            sse_load_f32, "ucomiss", SSEPackedSingle>, PS;
1910    defm UCOMISD  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
1911                            sse_load_f64, "ucomisd", SSEPackedDouble>, PD;
1912
1913    defm COMISS  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
1914                                sse_load_f32, "comiss", SSEPackedSingle>, PS;
1915    defm COMISD  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
1916                                    sse_load_f64, "comisd", SSEPackedDouble>, PD;
1917  }
1918} // Defs = [EFLAGS]
1919
1920// sse12_cmp_packed - sse 1 & 2 compare packed instructions
//
// Emits `rri` (register source) and `rmi` (memory source, matched through
// ld_frag) for opcode 0xC2 with an 8-bit immediate $cc. Both match
// X86any_cmpp with result type VT, are placed in domain `d`, and carry
// SIMD_EXC. Only `rri` is marked commutable.
1921multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
1922                            ValueType VT, string asm,
1923                            X86FoldableSchedWrite sched,
1924                            Domain d, PatFrag ld_frag> {
1925  let isCommutable = 1 in
1926  def rri : PIi8<0xC2, MRMSrcReg,
1927             (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
1928             [(set RC:$dst, (VT (X86any_cmpp RC:$src1, RC:$src2, timm:$cc)))], d>,
1929            Sched<[sched]>, SIMD_EXC;
1930  def rmi : PIi8<0xC2, MRMSrcMem,
1931             (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
1932             [(set RC:$dst,
1933               (VT (X86any_cmpp RC:$src1, (ld_frag addr:$src2), timm:$cc)))], d>,
1934            Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
1935}
1936
// VEX-encoded packed compares: 128-bit (VR128) and 256-bit (VR256, VEX_L)
// variants, all with a separate $src1 in the asm string and loadv* fragments
// for the memory form.
1937defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
1938               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1939               SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG;
1940defm VCMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
1941               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1942               SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG;
1943defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, v8f32,
1944               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1945               SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, VEX_WIG;
1946defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, v4f64,
1947               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
1948               SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, VEX_WIG;
// Legacy-encoded packed compares: 128-bit only, $src1 tied to $dst, and the
// memory form matches memopv* instead of loadv*.
1949let Constraints = "$src1 = $dst" in {
1950  defm CMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
1951                 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1952                 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS;
1953  defm CMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
1954                 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
1955                 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD;
1956}
1957
// CommutableCMPCC - matches a target-constant compare immediate whose low
// three bits are 0, 3, 4 or 7. It is used by the patterns that swap the
// compare's operands so a load in the first operand can be folded.
// NOTE(review): in the SSE compare-predicate encoding these values are EQ,
// UNORD, NEQ and ORD, i.e. the operand-order-independent predicates --
// confirm against the Intel SDM.
1958def CommutableCMPCC : PatLeaf<(timm), [{
1959  uint64_t Imm = N->getZExtValue() & 0x7;
1960  return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07);
1961}]>;
1962
1963// Patterns to select compares with loads in first operand.
// The instruction's memory operand is its second source, so each output swaps
// the operands: the loaded value becomes $src2 and the register becomes $src1.
// Only immediates accepted by CommutableCMPCC are matched.
1964let Predicates = [HasAVX] in {
1965  def : Pat<(v4f64 (X86any_cmpp (loadv4f64 addr:$src2), VR256:$src1,
1966                                CommutableCMPCC:$cc)),
1967            (VCMPPDYrmi VR256:$src1, addr:$src2, timm:$cc)>;
1968
1969  def : Pat<(v8f32 (X86any_cmpp (loadv8f32 addr:$src2), VR256:$src1,
1970                                CommutableCMPCC:$cc)),
1971            (VCMPPSYrmi VR256:$src1, addr:$src2, timm:$cc)>;
1972
1973  def : Pat<(v2f64 (X86any_cmpp (loadv2f64 addr:$src2), VR128:$src1,
1974                                CommutableCMPCC:$cc)),
1975            (VCMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;
1976
1977  def : Pat<(v4f32 (X86any_cmpp (loadv4f32 addr:$src2), VR128:$src1,
1978                                CommutableCMPCC:$cc)),
1979            (VCMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;
1980
  // Scalar compares: same operand swap, selecting the FR64/FR32 `rm` forms.
1981  def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
1982                          CommutableCMPCC:$cc)),
1983            (VCMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
1984
1985  def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
1986                          CommutableCMPCC:$cc)),
1987            (VCMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
1988}
1989
// Legacy SSE2 (f64) compares with the load in the first operand: swap the
// operands into the CMPPDrmi / CMPSDrm memory forms when the immediate is
// accepted by CommutableCMPCC. The packed pattern matches memopv2f64.
1990let Predicates = [UseSSE2] in {
1991  def : Pat<(v2f64 (X86any_cmpp (memopv2f64 addr:$src2), VR128:$src1,
1992                                CommutableCMPCC:$cc)),
1993            (CMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;
1994
1995  def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
1996                          CommutableCMPCC:$cc)),
1997            (CMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
1998}
1999
// Legacy SSE1 (f32) compares with the load in the first operand: swap the
// operands into the CMPPSrmi / CMPSSrm memory forms when the immediate is
// accepted by CommutableCMPCC. The packed pattern matches memopv4f32.
2000let Predicates = [UseSSE1] in {
2001  def : Pat<(v4f32 (X86any_cmpp (memopv4f32 addr:$src2), VR128:$src1,
2002                                CommutableCMPCC:$cc)),
2003            (CMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;
2004
2005  def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
2006                          CommutableCMPCC:$cc)),
2007            (CMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
2008}
2009
2010//===----------------------------------------------------------------------===//
2011// SSE 1 & 2 - Shuffle Instructions
2012//===----------------------------------------------------------------------===//
2013
2014/// sse12_shuffle - sse 1 & 2 fp shuffle instructions
///
/// Emits `rmi` (memory second source, matched through mem_frag) and `rri`
/// (register second source) for opcode 0xC6. Both match X86Shufp with result
/// type `vt` and an i8 target-constant immediate $src3, in domain `d`.
/// `IsCommutable` (default 0) controls isCommutable on the `rri` form only.
2015multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
2016                         ValueType vt, string asm, PatFrag mem_frag,
2017                         X86FoldableSchedWrite sched, Domain d,
2018                         bit IsCommutable = 0> {
2019  def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
2020                   (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
2021                   [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
2022                                       (i8 timm:$src3))))], d>,
2023            Sched<[sched.Folded, sched.ReadAfterFold]>;
2024  let isCommutable = IsCommutable in
2025  def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
2026                 (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
2027                 [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
2028                                     (i8 timm:$src3))))], d>,
2029            Sched<[sched]>;
2030}
2031
// VEX-encoded shufps / shufpd, 128-bit and 256-bit (VEX_L), gated on
// [HasAVX, NoVLX]. None of these pass IsCommutable, so it keeps its default.
2032let Predicates = [HasAVX, NoVLX] in {
2033  defm VSHUFPS  : sse12_shuffle<VR128, f128mem, v4f32,
2034           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2035           loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>,
2036           PS, VEX_4V, VEX_WIG;
2037  defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
2038           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2039           loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>,
2040           PS, VEX_4V, VEX_L, VEX_WIG;
2041  defm VSHUFPD  : sse12_shuffle<VR128, f128mem, v2f64,
2042           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2043           loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>,
2044           PD, VEX_4V, VEX_WIG;
2045  defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
2046           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
2047           loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>,
2048           PD, VEX_4V, VEX_L, VEX_WIG;
2049}
// Legacy-encoded shufps / shufpd: $src1 tied to $dst, memopv* fragments.
// SHUFPD is the only instantiation that passes IsCommutable = 1.
// NOTE(review): the commuted form is presumably produced by target-specific
// commute logic outside this file -- confirm.
2050let Constraints = "$src1 = $dst" in {
2051  defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
2052                    "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
2053                    memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2054  defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
2055                    "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
2056                    memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
2057}
2058
2059//===----------------------------------------------------------------------===//
2060// SSE 1 & 2 - Unpack FP Instructions
2061//===----------------------------------------------------------------------===//
2062
2063/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
///
/// Emits `rr` and `rm` for opcode `opc`, matching (vt (OpNode $src1, $src2))
/// in domain `d`; the memory form matches its operand through mem_frag.
/// `IsCommutable` (default 0) controls isCommutable on the `rr` form only.
2064multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
2065                                   PatFrag mem_frag, RegisterClass RC,
2066                                   X86MemOperand x86memop, string asm,
2067                                   X86FoldableSchedWrite sched, Domain d,
2068                                   bit IsCommutable = 0> {
2069    let isCommutable = IsCommutable in
2070    def rr : PI<opc, MRMSrcReg,
2071                (outs RC:$dst), (ins RC:$src1, RC:$src2),
2072                asm, [(set RC:$dst,
2073                           (vt (OpNode RC:$src1, RC:$src2)))], d>,
2074                Sched<[sched]>;
2075    def rm : PI<opc, MRMSrcMem,
2076                (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
2077                asm, [(set RC:$dst,
2078                           (vt (OpNode RC:$src1,
2079                                       (mem_frag addr:$src2))))], d>,
2080             Sched<[sched.Folded, sched.ReadAfterFold]>;
2081}
2082
// VEX-encoded unpck{h,l}p{s,d}. Opcode 0x15 pairs with X86Unpckh and 0x14 with
// X86Unpckl. All use the generic `load` fragment. Among the VEX forms only
// VUNPCKHPD passes IsCommutable = 1.
2083let Predicates = [HasAVX, NoVLX] in {
// 128-bit (VR128) forms.
2084defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load,
2085      VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2086                     SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
2087defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load,
2088      VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2089                     SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG;
2090defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load,
2091      VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2092                     SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
2093defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load,
2094      VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2095                     SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
2096
// 256-bit (VR256, VEX_L) forms.
2097defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load,
2098      VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2099                     SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
2100defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load,
2101      VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2102                     SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
2103defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load,
2104      VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2105                     SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
2106defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load,
2107      VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
2108                     SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
2109}// Predicates = [HasAVX, NoVLX]
2110
// Legacy-encoded unpck{h,l}p{s,d}: 128-bit only, $src1 tied to $dst, and the
// memory form matches `memop` instead of `load`. UNPCKHPD is the only one
// that passes IsCommutable = 1.
2111let Constraints = "$src1 = $dst" in {
2112  defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memop,
2113        VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
2114                       SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2115  defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memop,
2116        VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
2117                       SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
2118  defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop,
2119        VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
2120                       SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
2121  defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop,
2122        VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
2123                       SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
2124} // Constraints = "$src1 = $dst"
2125
// With HasAVX1Only, select 256-bit integer unpacks onto the FP unpack
// instructions: v8i32 -> VUNPCK{L,H}PSY, v4i64 -> VUNPCK{L,H}PDY, in both
// register and folded-load forms.
// NOTE(review): presumably needed because 256-bit integer unpack instructions
// are not available without AVX2 -- confirm.
2126let Predicates = [HasAVX1Only] in {
2127  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (loadv8i32 addr:$src2))),
2128            (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
2129  def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
2130            (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
2131  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (loadv8i32 addr:$src2))),
2132            (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
2133  def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
2134            (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
2135
2136  def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
2137            (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
2138  def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
2139            (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
2140  def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
2141            (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
2142  def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
2143            (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
2144}
2145
2146let Predicates = [UseSSE2] in {
2147  // Use MOVHPD if the load isn't aligned enough for UNPCKLPD.
  // Matches a v2f64 X86Unpckl whose second operand is a simple_load and
  // selects MOVHPDrm with the same register and address operands.
2148  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
2149                              (v2f64 (simple_load addr:$src2)))),
2150            (MOVHPDrm VR128:$src1, addr:$src2)>;
2151}
2152
2153//===----------------------------------------------------------------------===//
2154// SSE 1 & 2 - Extract Floating-Point Sign mask
2155//===----------------------------------------------------------------------===//
2156
2157/// sse12_extr_sign_mask - sse 1 & 2 extract floating-point sign mask
///
/// Emits a single `rr` record (opcode 0x50) that takes a vector register of
/// class RC typed as `vt` and writes the X86movmsk result to a GR32orGR64
/// destination, in domain `d`. `asm` is the bare mnemonic; the operand string
/// is appended here. There is no memory form.
2158multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
2159                                string asm, Domain d> {
2160  def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
2161              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
2162              [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], d>,
2163              Sched<[WriteFMOVMSK]>;
2164}
2165
// VEX-encoded movmskps / movmskpd for 128-bit (VR128) and 256-bit (VR256,
// VEX_L) sources, plus patterns that reuse them for integer vector types of
// the same element width.
2166let Predicates = [HasAVX] in {
2167  defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
2168                                        SSEPackedSingle>, PS, VEX, VEX_WIG;
2169  defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
2170                                        SSEPackedDouble>, PD, VEX, VEX_WIG;
2171  defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
2172                                         SSEPackedSingle>, PS, VEX, VEX_L, VEX_WIG;
2173  defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
2174                                         SSEPackedDouble>, PD, VEX, VEX_L, VEX_WIG;
2175
2176  // Also support integer VTs to avoid an int->fp bitcast in the DAG.
  // 32-bit-element types map to the PS form, 64-bit-element types to PD.
2177  def : Pat<(X86movmsk (v4i32 VR128:$src)),
2178            (VMOVMSKPSrr VR128:$src)>;
2179  def : Pat<(X86movmsk (v2i64 VR128:$src)),
2180            (VMOVMSKPDrr VR128:$src)>;
2181  def : Pat<(X86movmsk (v8i32 VR256:$src)),
2182            (VMOVMSKPSYrr VR256:$src)>;
2183  def : Pat<(X86movmsk (v4i64 VR256:$src)),
2184            (VMOVMSKPDYrr VR256:$src)>;
2185}
2186
// Legacy-encoded movmskps / movmskpd (128-bit only). No Predicates wrapper is
// applied at this instantiation site.
2187defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
2188                                     SSEPackedSingle>, PS;
2189defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
2190                                     SSEPackedDouble>, PD;
2191
2192let Predicates = [UseSSE2] in {
2193  // Also support integer VTs to avoid an int->fp bitcast in the DAG.
  // v4i32 maps to MOVMSKPSrr and v2i64 to MOVMSKPDrr.
2194  def : Pat<(X86movmsk (v4i32 VR128:$src)),
2195            (MOVMSKPSrr VR128:$src)>;
2196  def : Pat<(X86movmsk (v2i64 VR128:$src)),
2197            (MOVMSKPDrr VR128:$src)>;
2198}
2199
2200//===---------------------------------------------------------------------===//
2201// SSE2 - Packed Integer Logical Instructions
2202//===---------------------------------------------------------------------===//
2203
2204let ExeDomain = SSEPackedInt in { // SSE integer instructions
2205
2206/// PDI_binop_rm - Simple SSE2 binary operator.
///
/// Emits `rr` and `rm` records for opcode `opc`, matching
/// (OpVT (OpNode $src1, $src2)); the memory form matches its operand through
/// memop_frag. `IsCommutable` is applied to the `rr` form only. `Is2Addr`
/// selects the asm operand string: the two-operand form ($src2, $dst) when
/// set, otherwise the three-operand form ($src2, $src1, $dst).
2207multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
2208                        ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
2209                        X86MemOperand x86memop, X86FoldableSchedWrite sched,
2210                        bit IsCommutable, bit Is2Addr> {
2211  let isCommutable = IsCommutable in
2212  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
2213       (ins RC:$src1, RC:$src2),
2214       !if(Is2Addr,
2215           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2216           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
2217       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
2218       Sched<[sched]>;
2219  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
2220       (ins RC:$src1, x86memop:$src2),
2221       !if(Is2Addr,
2222           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
2223           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
2224       [(set RC:$dst, (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
2225       Sched<[sched.Folded, sched.ReadAfterFold]>;
2226}
2227} // ExeDomain = SSEPackedInt
2228
// PDI_binop_all - instantiate PDI_binop_rm three times for one operation:
//   V#NAME    - [HasAVX, prd]: VR128, OpVT128, `load`, "v"-prefixed mnemonic,
//               three-operand asm (Is2Addr = 0), VEX_4V.
//   NAME      - legacy form: VR128, OpVT128, `memop`, $src1 tied to $dst,
//               two-operand asm (Is2Addr = 1).
//   V#NAME#Y  - [HasAVX2, prd]: VR256, OpVT256, `load`, "v"-prefixed
//               mnemonic, three-operand asm, VEX_4V + VEX_L.
// `prd` is an extra predicate added to both VEX variants.
2229multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
2230                         ValueType OpVT128, ValueType OpVT256,
2231                         X86SchedWriteWidths sched, bit IsCommutable,
2232                         Predicate prd> {
2233let Predicates = [HasAVX, prd] in
2234  defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
2235                             VR128, load, i128mem, sched.XMM,
2236                             IsCommutable, 0>, VEX_4V, VEX_WIG;
2237
2238let Constraints = "$src1 = $dst" in
2239  defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
2240                           memop, i128mem, sched.XMM, IsCommutable, 1>;
2241
2242let Predicates = [HasAVX2, prd] in
2243  defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
2244                               OpVT256, VR256, load, i256mem, sched.YMM,
2245                               IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG;
2246}
2247
2248// These are ordered here for pattern ordering requirements with the fp versions
// Packed-integer logical ops, instantiated with v2i64 / v4i64 and NoVLX as the
// extra VEX predicate. PAND, POR and PXOR are commutable; PANDN (X86andnp) is
// not.
2249
2250defm PAND  : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
2251                           SchedWriteVecLogic, 1, NoVLX>;
2252defm POR   : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
2253                           SchedWriteVecLogic, 1, NoVLX>;
2254defm PXOR  : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
2255                           SchedWriteVecLogic, 1, NoVLX>;
2256defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
2257                           SchedWriteVecLogic, 0, NoVLX>;
2258
2259//===----------------------------------------------------------------------===//
2260// SSE 1 & 2 - Logical Instructions
2261//===----------------------------------------------------------------------===//
2262
2263/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
2264///
2265/// There are no patterns here because isel prefers integer versions for SSE2
2266/// and later. There are SSE1 v4f32 patterns later.
///
/// Instantiates sse12_fp_packed_logical_rm six times per operation: four VEX
/// variants (PS/PD x 128/256-bit) under [HasAVX, NoVLX], and two legacy
/// 128-bit variants (PS/PD) with $src1 tied to $dst. Every instantiation
/// passes two empty pattern lists; the mnemonic is OpcodeStr with a "ps" or
/// "pd" suffix appended.
/// NOTE(review): the trailing `0` on the VEX instantiations is presumably the
/// Is2Addr flag of sse12_fp_packed_logical_rm -- confirm at its definition.
2267multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
2268                                   X86SchedWriteWidths sched> {
2269  let Predicates = [HasAVX, NoVLX] in {
2270  defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
2271        !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM,
2272        [], [], 0>, PS, VEX_4V, VEX_L, VEX_WIG;
2273
2274  defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
2275        !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM,
2276        [], [], 0>, PD, VEX_4V, VEX_L, VEX_WIG;
2277
2278  defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
2279       !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
2280       [], [], 0>, PS, VEX_4V, VEX_WIG;
2281
2282  defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
2283       !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
2284       [], [], 0>, PD, VEX_4V, VEX_WIG;
2285  }
2286
  // Legacy two-address forms.
2287  let Constraints = "$src1 = $dst" in {
2288    defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
2289         !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
2290         [], []>, PS;
2291
2292    defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
2293         !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
2294         [], []>, PD;
2295  }
2296}
2297
// Packed FP logical ops: opcodes 0x54 (and), 0x55 (andn), 0x56 (or),
// 0x57 (xor). ANDN is wrapped in `let isCommutable = 0`.
// NOTE(review): this presumably overrides a commutable default set inside
// sse12_fp_packed_logical_rm, since and-not is not symmetric -- confirm.
2297defm AND  : sse12_fp_packed_logical<0x54, "and", SchedWriteFLogic>;
2298defm OR   : sse12_fp_packed_logical<0x56, "or", SchedWriteFLogic>;
2299defm XOR  : sse12_fp_packed_logical<0x57, "xor", SchedWriteFLogic>;
2300let isCommutable = 0 in
2301  defm ANDN : sse12_fp_packed_logical<0x55, "andn", SchedWriteFLogic>;
2303
// With AVX2 (and no VLX), select 256-bit and / or / xor / X86andnp on
// v32i8, v16i16 and v8i32 onto the VPAND / VPOR / VPXOR / VPANDN Y-forms.
// v4i64 is not repeated here: it is the 256-bit type the PAND / POR / PXOR /
// PANDN defms are instantiated with.
2303let Predicates = [HasAVX2, NoVLX] in {
  // Register-register forms.
2304  def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
2305            (VPANDYrr VR256:$src1, VR256:$src2)>;
2306  def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
2307            (VPANDYrr VR256:$src1, VR256:$src2)>;
2308  def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
2309            (VPANDYrr VR256:$src1, VR256:$src2)>;
2310
2311  def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
2312            (VPORYrr VR256:$src1, VR256:$src2)>;
2313  def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
2314            (VPORYrr VR256:$src1, VR256:$src2)>;
2315  def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
2316            (VPORYrr VR256:$src1, VR256:$src2)>;
2317
2318  def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
2319            (VPXORYrr VR256:$src1, VR256:$src2)>;
2320  def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
2321            (VPXORYrr VR256:$src1, VR256:$src2)>;
2322  def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
2323            (VPXORYrr VR256:$src1, VR256:$src2)>;
2324
2325  def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
2326            (VPANDNYrr VR256:$src1, VR256:$src2)>;
2327  def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
2328            (VPANDNYrr VR256:$src1, VR256:$src2)>;
2329  def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
2330            (VPANDNYrr VR256:$src1, VR256:$src2)>;
2331
  // Folded-load forms: the second operand is a typed 256-bit load.
2332  def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
2333            (VPANDYrm VR256:$src1, addr:$src2)>;
2334  def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
2335            (VPANDYrm VR256:$src1, addr:$src2)>;
2336  def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
2337            (VPANDYrm VR256:$src1, addr:$src2)>;
2338
2339  def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
2340            (VPORYrm VR256:$src1, addr:$src2)>;
2341  def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
2342            (VPORYrm VR256:$src1, addr:$src2)>;
2343  def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
2344            (VPORYrm VR256:$src1, addr:$src2)>;
2345
2346  def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
2347            (VPXORYrm VR256:$src1, addr:$src2)>;
2348  def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
2349            (VPXORYrm VR256:$src1, addr:$src2)>;
2350  def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
2351            (VPXORYrm VR256:$src1, addr:$src2)>;
2352
2353  def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
2354            (VPANDNYrm VR256:$src1, addr:$src2)>;
2355  def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
2356            (VPANDNYrm VR256:$src1, addr:$src2)>;
2357  def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
2358            (VPANDNYrm VR256:$src1, addr:$src2)>;
2359}
2361
2362// If only AVX1 is supported, we need to handle integer operations with
2363// floating point instructions since the integer versions aren't available.
2364let Predicates = [HasAVX1Only] in {
      // Register-register forms: and/or/xor/X86andnp on each 256-bit integer
      // vector type (v32i8, v16i16, v8i32, v4i64) are selected to the
      // packed-single instructions VANDPSYrr/VORPSYrr/VXORPSYrr/VANDNPSYrr.
2365  def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
2366            (VANDPSYrr VR256:$src1, VR256:$src2)>;
2367  def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
2368            (VANDPSYrr VR256:$src1, VR256:$src2)>;
2369  def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
2370            (VANDPSYrr VR256:$src1, VR256:$src2)>;
2371  def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)),
2372            (VANDPSYrr VR256:$src1, VR256:$src2)>;
2373
2374  def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
2375            (VORPSYrr VR256:$src1, VR256:$src2)>;
2376  def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
2377            (VORPSYrr VR256:$src1, VR256:$src2)>;
2378  def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
2379            (VORPSYrr VR256:$src1, VR256:$src2)>;
2380  def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)),
2381            (VORPSYrr VR256:$src1, VR256:$src2)>;
2382
2383  def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
2384            (VXORPSYrr VR256:$src1, VR256:$src2)>;
2385  def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
2386            (VXORPSYrr VR256:$src1, VR256:$src2)>;
2387  def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
2388            (VXORPSYrr VR256:$src1, VR256:$src2)>;
2389  def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)),
2390            (VXORPSYrr VR256:$src1, VR256:$src2)>;
2391
2392  def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
2393            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2394  def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
2395            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2396  def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
2397            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2398  def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)),
2399            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
2400
      // Register-memory forms: the same four operations with the second
      // operand matched by a loadv32i8/loadv16i16/loadv8i32/loadv4i64
      // fragment, selected to the corresponding ...Yrm instruction.
2401  def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
2402            (VANDPSYrm VR256:$src1, addr:$src2)>;
2403  def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
2404            (VANDPSYrm VR256:$src1, addr:$src2)>;
2405  def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
2406            (VANDPSYrm VR256:$src1, addr:$src2)>;
2407  def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)),
2408            (VANDPSYrm VR256:$src1, addr:$src2)>;
2409
2410  def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
2411            (VORPSYrm VR256:$src1, addr:$src2)>;
2412  def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
2413            (VORPSYrm VR256:$src1, addr:$src2)>;
2414  def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
2415            (VORPSYrm VR256:$src1, addr:$src2)>;
2416  def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)),
2417            (VORPSYrm VR256:$src1, addr:$src2)>;
2418
2419  def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
2420            (VXORPSYrm VR256:$src1, addr:$src2)>;
2421  def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
2422            (VXORPSYrm VR256:$src1, addr:$src2)>;
2423  def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
2424            (VXORPSYrm VR256:$src1, addr:$src2)>;
2425  def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)),
2426            (VXORPSYrm VR256:$src1, addr:$src2)>;
2427
2428  def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
2429            (VANDNPSYrm VR256:$src1, addr:$src2)>;
2430  def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
2431            (VANDNPSYrm VR256:$src1, addr:$src2)>;
2432  def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
2433            (VANDNPSYrm VR256:$src1, addr:$src2)>;
2434  def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)),
2435            (VANDNPSYrm VR256:$src1, addr:$src2)>;
2436}
2437
2438let Predicates = [HasAVX, NoVLX] in {
      // 128-bit integer logic: and/or/xor/X86andnp on v16i8, v8i16 and v4i32
      // are selected to VPANDrr/VPORrr/VPXORrr/VPANDNrr.
      // NOTE(review): v2i64 is not listed here; presumably it is matched by the
      // instruction definitions themselves -- confirm against their defm.
2439  def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
2440            (VPANDrr VR128:$src1, VR128:$src2)>;
2441  def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
2442            (VPANDrr VR128:$src1, VR128:$src2)>;
2443  def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
2444            (VPANDrr VR128:$src1, VR128:$src2)>;
2445
2446  def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
2447            (VPORrr VR128:$src1, VR128:$src2)>;
2448  def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
2449            (VPORrr VR128:$src1, VR128:$src2)>;
2450  def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
2451            (VPORrr VR128:$src1, VR128:$src2)>;
2452
2453  def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
2454            (VPXORrr VR128:$src1, VR128:$src2)>;
2455  def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
2456            (VPXORrr VR128:$src1, VR128:$src2)>;
2457  def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
2458            (VPXORrr VR128:$src1, VR128:$src2)>;
2459
2460  def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
2461            (VPANDNrr VR128:$src1, VR128:$src2)>;
2462  def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
2463            (VPANDNrr VR128:$src1, VR128:$src2)>;
2464  def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
2465            (VPANDNrr VR128:$src1, VR128:$src2)>;
2466
      // Register-memory forms: second operand matched by a
      // loadv16i8/loadv8i16/loadv4i32 fragment, selected to the ...rm
      // instruction.
2467  def : Pat<(and VR128:$src1, (loadv16i8 addr:$src2)),
2468            (VPANDrm VR128:$src1, addr:$src2)>;
2469  def : Pat<(and VR128:$src1, (loadv8i16 addr:$src2)),
2470            (VPANDrm VR128:$src1, addr:$src2)>;
2471  def : Pat<(and VR128:$src1, (loadv4i32 addr:$src2)),
2472            (VPANDrm VR128:$src1, addr:$src2)>;
2473
2474  def : Pat<(or VR128:$src1, (loadv16i8 addr:$src2)),
2475            (VPORrm VR128:$src1, addr:$src2)>;
2476  def : Pat<(or VR128:$src1, (loadv8i16 addr:$src2)),
2477            (VPORrm VR128:$src1, addr:$src2)>;
2478  def : Pat<(or VR128:$src1, (loadv4i32 addr:$src2)),
2479            (VPORrm VR128:$src1, addr:$src2)>;
2480
2481  def : Pat<(xor VR128:$src1, (loadv16i8 addr:$src2)),
2482            (VPXORrm VR128:$src1, addr:$src2)>;
2483  def : Pat<(xor VR128:$src1, (loadv8i16 addr:$src2)),
2484            (VPXORrm VR128:$src1, addr:$src2)>;
2485  def : Pat<(xor VR128:$src1, (loadv4i32 addr:$src2)),
2486            (VPXORrm VR128:$src1, addr:$src2)>;
2487
2488  def : Pat<(X86andnp VR128:$src1, (loadv16i8 addr:$src2)),
2489            (VPANDNrm VR128:$src1, addr:$src2)>;
2490  def : Pat<(X86andnp VR128:$src1, (loadv8i16 addr:$src2)),
2491            (VPANDNrm VR128:$src1, addr:$src2)>;
2492  def : Pat<(X86andnp VR128:$src1, (loadv4i32 addr:$src2)),
2493            (VPANDNrm VR128:$src1, addr:$src2)>;
2494}
2495
2496let Predicates = [UseSSE2] in {
      // 128-bit integer logic for SSE2: and/or/xor/X86andnp on v16i8, v8i16 and
      // v4i32 are selected to PANDrr/PORrr/PXORrr/PANDNrr.
2497  def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
2498            (PANDrr VR128:$src1, VR128:$src2)>;
2499  def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
2500            (PANDrr VR128:$src1, VR128:$src2)>;
2501  def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
2502            (PANDrr VR128:$src1, VR128:$src2)>;
2503
2504  def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
2505            (PORrr VR128:$src1, VR128:$src2)>;
2506  def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
2507            (PORrr VR128:$src1, VR128:$src2)>;
2508  def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
2509            (PORrr VR128:$src1, VR128:$src2)>;
2510
2511  def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
2512            (PXORrr VR128:$src1, VR128:$src2)>;
2513  def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
2514            (PXORrr VR128:$src1, VR128:$src2)>;
2515  def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
2516            (PXORrr VR128:$src1, VR128:$src2)>;
2517
2518  def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
2519            (PANDNrr VR128:$src1, VR128:$src2)>;
2520  def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
2521            (PANDNrr VR128:$src1, VR128:$src2)>;
2522  def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
2523            (PANDNrr VR128:$src1, VR128:$src2)>;
2524
      // Register-memory forms. Unlike the AVX patterns, these use the memopvNN
      // fragments rather than loadvNN.
      // NOTE(review): presumably memop restricts folding to loads the legacy
      // SSE encoding can accept (alignment) -- confirm in
      // X86InstrFragmentsSIMD.td.
2525  def : Pat<(and VR128:$src1, (memopv16i8 addr:$src2)),
2526            (PANDrm VR128:$src1, addr:$src2)>;
2527  def : Pat<(and VR128:$src1, (memopv8i16 addr:$src2)),
2528            (PANDrm VR128:$src1, addr:$src2)>;
2529  def : Pat<(and VR128:$src1, (memopv4i32 addr:$src2)),
2530            (PANDrm VR128:$src1, addr:$src2)>;
2531
2532  def : Pat<(or VR128:$src1, (memopv16i8 addr:$src2)),
2533            (PORrm VR128:$src1, addr:$src2)>;
2534  def : Pat<(or VR128:$src1, (memopv8i16 addr:$src2)),
2535            (PORrm VR128:$src1, addr:$src2)>;
2536  def : Pat<(or VR128:$src1, (memopv4i32 addr:$src2)),
2537            (PORrm VR128:$src1, addr:$src2)>;
2538
2539  def : Pat<(xor VR128:$src1, (memopv16i8 addr:$src2)),
2540            (PXORrm VR128:$src1, addr:$src2)>;
2541  def : Pat<(xor VR128:$src1, (memopv8i16 addr:$src2)),
2542            (PXORrm VR128:$src1, addr:$src2)>;
2543  def : Pat<(xor VR128:$src1, (memopv4i32 addr:$src2)),
2544            (PXORrm VR128:$src1, addr:$src2)>;
2545
2546  def : Pat<(X86andnp VR128:$src1, (memopv16i8 addr:$src2)),
2547            (PANDNrm VR128:$src1, addr:$src2)>;
2548  def : Pat<(X86andnp VR128:$src1, (memopv8i16 addr:$src2)),
2549            (PANDNrm VR128:$src1, addr:$src2)>;
2550  def : Pat<(X86andnp VR128:$src1, (memopv4i32 addr:$src2)),
2551            (PANDNrm VR128:$src1, addr:$src2)>;
2552}
2553
2554// Patterns for packed operations when we don't have integer type available.
// These map the X86fand/X86for/X86fxor/X86fandn nodes on v4f32 to the
// ANDPS/ORPS/XORPS/ANDNPS instructions. They are not wrapped in any
// `let Predicates` scope at this point in the file.
// Register-register forms:
2555def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
2556          (ANDPSrr VR128:$src1, VR128:$src2)>;
2557def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)),
2558          (ORPSrr VR128:$src1, VR128:$src2)>;
2559def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)),
2560          (XORPSrr VR128:$src1, VR128:$src2)>;
2561def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)),
2562          (ANDNPSrr VR128:$src1, VR128:$src2)>;
2563
// Register-memory forms; the second operand is matched with memopv4f32.
2564def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)),
2565          (ANDPSrm VR128:$src1, addr:$src2)>;
2566def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)),
2567          (ORPSrm VR128:$src1, addr:$src2)>;
2568def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)),
2569          (XORPSrm VR128:$src1, addr:$src2)>;
2570def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
2571          (ANDNPSrm VR128:$src1, addr:$src2)>;
2572
2573//===----------------------------------------------------------------------===//
2574// SSE 1 & 2 - Arithmetic Instructions
2575//===----------------------------------------------------------------------===//
2576
2577/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
2578/// vector forms.
2579///
2580/// In addition, we also have a special variant of the scalar form here to
2581/// represent the associated intrinsic operation.  This form is unlike the
2582/// plain scalar form, in that it takes an entire vector (instead of a scalar)
2583/// and leaves the top elements unmodified (therefore these cannot be commuted).
2584///
2585/// These three forms can each be reg+reg or reg+mem.
2586///
2587
2588/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
2589/// classes below
///
/// basic_sse12_fp_binop_p - packed ("ps"/"pd") forms of a binary FP op.
///   opc       - opcode byte shared by every form.
///   OpcodeStr - mnemonic stem; "ps" or "pd" is appended below.
///   OpNode    - DAG operator forwarded to sse12_fp_packed.
///   sched     - scheduling sizes; .PS/.PD and .XMM/.YMM select the entry.
/// Everything defined here carries Uses = [MXCSR] and
/// mayRaiseFPException = 1.
2590multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
2591                                  SDPatternOperator OpNode, X86SchedWriteSizes sched> {
2592let Uses = [MXCSR], mayRaiseFPException = 1 in {
      // V-prefixed forms: 128-bit (VR128) and 256-bit (VR256, VEX_L), using
      // the loadvNN fragments and a trailing 0 argument.
      // NOTE(review): the trailing 0 is presumably Is2Addr = 0 (three-operand
      // asm string), as in sse12_fp_scalar -- confirm in sse12_fp_packed.
2593  let Predicates = [HasAVX, NoVLX] in {
2594  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
2595                               VR128, v4f32, f128mem, loadv4f32,
2596                               SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX_4V, VEX_WIG;
2597  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
2598                               VR128, v2f64, f128mem, loadv2f64,
2599                               SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX_4V, VEX_WIG;
2600
2601  defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
2602                        OpNode, VR256, v8f32, f256mem, loadv8f32,
2603                        SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX_4V, VEX_L, VEX_WIG;
2604  defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
2605                        OpNode, VR256, v4f64, f256mem, loadv4f64,
2606                        SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX_4V, VEX_L, VEX_WIG;
2607  }
2608
      // Non-V forms: destination tied to the first source, memopvNN fragments
      // for the folded load, and no explicit Predicates set here.
2609  let Constraints = "$src1 = $dst" in {
2610    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
2611                              v4f32, f128mem, memopv4f32, SSEPackedSingle,
2612                              sched.PS.XMM>, PS;
2613    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
2614                              v2f64, f128mem, memopv2f64, SSEPackedDouble,
2615                              sched.PD.XMM>, PD;
2616  }
2617}
2618}
2619
/// basic_sse12_fp_binop_s - scalar ("ss"/"sd") forms of a binary FP op on
/// FR32/FR64, built from sse12_fp_scalar (defined at the top of this file,
/// which marks its rr/rm defs isCodeGenOnly).
///   opc       - opcode byte shared by every form.
///   OpcodeStr - mnemonic stem; "ss" or "sd" is appended below.
///   OpNode    - DAG operator used in the rr/rm selection patterns.
///   sched     - scheduling sizes; .PS.Scl is used for SS, .PD.Scl for SD.
/// Everything defined here carries Uses = [MXCSR] and
/// mayRaiseFPException = 1.
2620multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
2621                                  X86SchedWriteSizes sched> {
2622let Uses = [MXCSR], mayRaiseFPException = 1 in {
      // V-prefixed forms pass Is2Addr = 0, which makes sse12_fp_scalar use the
      // three-operand "$src2, $src1, $dst" asm string.
2623  defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
2624                         OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>,
2625                         XS, VEX_4V, VEX_LIG, VEX_WIG;
2626  defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
2627                         OpNode, FR64, f64mem, SSEPackedDouble, sched.PD.Scl, 0>,
2628                         XD, VEX_4V, VEX_LIG, VEX_WIG;
2629
      // Non-V forms keep the default Is2Addr = 1 and tie $src1 to $dst.
2630  let Constraints = "$src1 = $dst" in {
2631    defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
2632                              OpNode, FR32, f32mem, SSEPackedSingle,
2633                              sched.PS.Scl>, XS;
2634    defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
2635                              OpNode, FR64, f64mem, SSEPackedDouble,
2636                              sched.PD.Scl>, XD;
2637  }
2638}
2639}
2640
/// basic_sse12_fp_binop_s_int - scalar forms that operate on whole VR128
/// vectors (v4f32 / v2f64), built from sse12_fp_scalar_int. The memory
/// operand is ssmem/sdmem and the load is matched by sse_load_f32/f64.
///   opc       - opcode byte shared by every form.
///   OpcodeStr - mnemonic stem; "ss" or "sd" is appended below.
///   OpNode    - DAG operator forwarded to sse12_fp_scalar_int.
///   sched     - scheduling sizes; .PS.Scl is used for SS, .PD.Scl for SD.
/// Everything defined here carries Uses = [MXCSR] and
/// mayRaiseFPException = 1.
2641multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
2642                                      SDPatternOperator OpNode,
2643                                      X86SchedWriteSizes sched> {
2644let Uses = [MXCSR], mayRaiseFPException = 1 in {
      // V-prefixed forms pass a trailing 0.
      // NOTE(review): presumably Is2Addr = 0 as in sse12_fp_scalar -- confirm
      // in sse12_fp_scalar_int.
2645  defm V#NAME#SS : sse12_fp_scalar_int<opc, OpNode, VR128, v4f32,
2646                   !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
2647                   SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG;
2648  defm V#NAME#SD : sse12_fp_scalar_int<opc, OpNode, VR128, v2f64,
2649                   !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
2650                   SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG;
2651
      // Non-V forms tie $src1 to $dst.
2652  let Constraints = "$src1 = $dst" in {
2653    defm SS : sse12_fp_scalar_int<opc, OpNode, VR128, v4f32,
2654                   !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
2655                   SSEPackedSingle, sched.PS.Scl>, XS;
2656    defm SD : sse12_fp_scalar_int<opc, OpNode, VR128, v2f64,
2657                   !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
2658                   SSEPackedDouble, sched.PD.Scl>, XD;
2659  }
2660}
2661}
2662
2663// Binary Arithmetic instructions
// Each defm combines the packed, scalar and scalar-intrinsic multiclasses.
// ADD/MUL/SUB/DIV pass null_frag to the _s_int variant, while MAX/MIN pass
// X86fmaxs/X86fmins there.
// NOTE(review): with null_frag the *_Int defs presumably get no selection
// pattern of their own; the scalar_math_patterns instantiations later in this
// file select the ADD/SUB/MUL/DIV rr_Int/rm_Int forms -- confirm.
2664defm ADD : basic_sse12_fp_binop_p<0x58, "add", any_fadd, SchedWriteFAddSizes>,
2665           basic_sse12_fp_binop_s<0x58, "add", any_fadd, SchedWriteFAddSizes>,
2666           basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SchedWriteFAddSizes>;
2667defm MUL : basic_sse12_fp_binop_p<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
2668           basic_sse12_fp_binop_s<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
2669           basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SchedWriteFMulSizes>;
// SUB, DIV, MAX and MIN are defined with isCommutable forced to 0.
2670let isCommutable = 0 in {
2671  defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
2672             basic_sse12_fp_binop_s<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
2673             basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SchedWriteFAddSizes>;
2674  defm DIV : basic_sse12_fp_binop_p<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
2675             basic_sse12_fp_binop_s<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
2676             basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>;
2677  defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
2678             basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
2679             basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>;
2680  defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
2681             basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
2682             basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>;
2683}
2684
// MAXC/MINC reuse the opcodes (0x5F/0x5D) and mnemonics ("max"/"min") of
// MAX/MIN but are selected from X86fmaxc/X86fminc. They are marked
// isCodeGenOnly, have no _s_int variant, and -- unlike MAX/MIN -- are not
// wrapped in `let isCommutable = 0`.
2685let isCodeGenOnly = 1 in {
2686  defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>,
2687             basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>;
2688  defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SchedWriteFCmpSizes>,
2689             basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SchedWriteFCmpSizes>;
2690}
2691
2692// Patterns used to select SSE scalar fp arithmetic instructions from
2693// either:
2694//
2695// (1) a scalar fp operation followed by a blend
2696//
2697// The effect is that the backend no longer emits unnecessary vector
2698// insert instructions immediately after SSE scalar fp instructions
2699// like addss or mulss.
2700//
2701// For example, given the following code:
2702//   __m128 foo(__m128 A, __m128 B) {
2703//     A[0] += B[0];
2704//     return A;
2705//   }
2706//
2707// Previously we generated:
2708//   addss %xmm0, %xmm1
2709//   movss %xmm1, %xmm0
2710//
2711// We now generate:
2712//   addss %xmm1, %xmm0
2713//
2714// (2) a vector packed single/double fp operation followed by a vector insert
2715//
2716// The effect is that the backend converts the packed fp instruction
2717// followed by a vector insert into a single SSE scalar fp instruction.
2718//
2719// For example, given the following code:
2720//   __m128 foo(__m128 A, __m128 B) {
2721//     __m128 C = A + B;
2722//     return (__m128) {C[0], A[1], A[2], A[3]};
2723//   }
2724//
2725// Previously we generated:
2726//   addps %xmm0, %xmm1
2727//   movss %xmm1, %xmm0
2728//
2729// We now generate:
2730//   addss %xmm1, %xmm0
2731
2732// TODO: Some canonicalization in lowering would simplify the number of
2733// patterns we have to try to match.
//
// scalar_math_patterns - select the *_Int scalar instructions for
//   Move(dst, scalar_to_vector(Op(extractelt(dst, 0), src)))
// i.e. a scalar op on element 0 of $dst whose result is re-inserted into $dst.
//   Op            - scalar operator being matched.
//   OpcPrefix     - instruction name stem; "rr_Int"/"rm_Int" is appended, and
//                   "V" is prepended for the AVX patterns.
//   Move          - node that performs the low-element insert.
//   VT / EltTy    - vector type and its element type.
//   RC            - register class of the scalar $src operand.
//   ld_frag       - load fragment matched for the memory form.
//   BasePredicate - predicate guarding the non-AVX patterns.
2734multiclass scalar_math_patterns<SDPatternOperator Op, string OpcPrefix, SDNode Move,
2735                                ValueType VT, ValueType EltTy,
2736                                RegisterClass RC, PatFrag ld_frag,
2737                                Predicate BasePredicate> {
2738  let Predicates = [BasePredicate] in {
2739    // extracted scalar math op with insert via movss/movsd
        // Register form: the scalar $src is copied into VR128 so it can feed
        // the vector-typed rr_Int instruction.
2740    def : Pat<(VT (Move (VT VR128:$dst),
2741                        (VT (scalar_to_vector
2742                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2743                                 RC:$src))))),
2744              (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst,
2745               (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
        // Memory form: the scalar load is folded into rm_Int.
2746    def : Pat<(VT (Move (VT VR128:$dst),
2747                        (VT (scalar_to_vector
2748                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2749                                 (ld_frag addr:$src)))))),
2750              (!cast<Instruction>(OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
2751  }
2752
2753  // Repeat for AVX versions of the instructions.
      // These are always guarded by UseAVX, independent of BasePredicate.
2754  let Predicates = [UseAVX] in {
2755    // extracted scalar math op with insert via movss/movsd
2756    def : Pat<(VT (Move (VT VR128:$dst),
2757                        (VT (scalar_to_vector
2758                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2759                                 RC:$src))))),
2760              (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst,
2761               (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
2762    def : Pat<(VT (Move (VT VR128:$dst),
2763                        (VT (scalar_to_vector
2764                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
2765                                 (ld_frag addr:$src)))))),
2766              (!cast<Instruction>("V"#OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
2767  }
2768}
2769
// Single-precision instantiations: f32 element of v4f32, re-inserted with
// X86Movss, non-AVX patterns guarded by UseSSE1.
2770defm : scalar_math_patterns<any_fadd, "ADDSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2771defm : scalar_math_patterns<any_fsub, "SUBSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2772defm : scalar_math_patterns<any_fmul, "MULSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2773defm : scalar_math_patterns<any_fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
2774
// Double-precision instantiations: f64 element of v2f64, re-inserted with
// X86Movsd, non-AVX patterns guarded by UseSSE2.
2775defm : scalar_math_patterns<any_fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2776defm : scalar_math_patterns<any_fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2777defm : scalar_math_patterns<any_fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2778defm : scalar_math_patterns<any_fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
2779
2780/// Unop Arithmetic
2781/// In addition, we also have a special variant of the scalar form here to
2782/// represent the associated intrinsic operation.  This form is unlike the
2783/// plain scalar form, in that it takes an entire vector (instead of a
2784/// scalar) and leaves the top elements undefined.
2785///
2786/// And, we have a special variant form for a full-vector intrinsic form.
2787
2788/// sse_fp_unop_s - SSE1 unops in scalar form
2789/// For the non-AVX defs, we need $src1 to be tied to $dst because
2790/// the HW instructions are 2 operand / destructive.
///   opc       - opcode byte.
///   OpcodeStr - full mnemonic (the caller appends "ss"/"sd").
///   RC        - scalar register class for the r/m forms.
///   x86memop  - memory operand for the scalar m form.
///   intmemop  - memory operand for the vector m_Int form.
///   OpNode    - DAG operator matched by the r/m forms.
///   d         - execution domain.
///   sched     - scheduling info; .Folded is used for the memory forms.
///   target    - predicate required by the r/m forms.
2791multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
2792                          X86MemOperand x86memop, Operand intmemop,
2793                          SDPatternOperator OpNode, Domain d,
2794                          X86FoldableSchedWrite sched, Predicate target> {
      // Scalar forms on RC, marked isCodeGenOnly. The memory form additionally
      // requires OptForSize.
2795  let isCodeGenOnly = 1, hasSideEffects = 0 in {
2796  def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
2797              !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
2798            [(set RC:$dst, (OpNode RC:$src1))], d>, Sched<[sched]>,
2799            Requires<[target]>;
2800  let mayLoad = 1 in
2801  def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1),
2802            !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
2803            [(set RC:$dst, (OpNode (load addr:$src1)))], d>,
2804            Sched<[sched.Folded]>,
2805            Requires<[target, OptForSize]>;
2806  }
2807
      // Vector (VR128) forms: two inputs with $src1 tied to $dst, only $src2
      // printed in the asm string, and an empty pattern list.
2808  let hasSideEffects = 0, Constraints = "$src1 = $dst", ExeDomain = d in {
2809  def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
2810                !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
2811                Sched<[sched]>;
2812  let mayLoad = 1 in
2813  def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2),
2814                !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
2815                Sched<[sched.Folded, sched.ReadAfterFold]>;
2816  }
2817
2818}
2819
/// sse_fp_unop_s_intr - selection patterns that map the intrinsic Intr onto
/// the NAME#r_Int / NAME#m_Int instructions.
///   vt        - vector type used for the IMPLICIT_DEF pass-through operand.
///   mem_frags - fragments matching the folded scalar load.
///   Intr      - intrinsic being selected.
///   target    - predicate guarding both patterns.
2820multiclass sse_fp_unop_s_intr<ValueType vt, PatFrags mem_frags,
2821                              Intrinsic Intr, Predicate target> {
2822  let Predicates = [target] in {
2823  // These are unary operations, but they are modeled as having 2 source operands
2824  // because the high elements of the destination are unchanged in SSE.
      // The register form passes $src for both operands.
2825  def : Pat<(Intr VR128:$src),
2826            (!cast<Instruction>(NAME#r_Int) VR128:$src, VR128:$src)>;
2827  }
2828  // We don't want to fold scalar loads into these instructions unless
2829  // optimizing for size. This is because the folded instruction will have a
2830  // partial register update, while the unfolded sequence will not, e.g.
2831  // movss mem, %xmm0
2832  // rcpss %xmm0, %xmm0
2833  // which has a clobber before the rcp, vs.
2834  // rcpss mem, %xmm0
      // The memory form passes IMPLICIT_DEF as the first (pass-through)
      // operand.
2835  let Predicates = [target, OptForSize] in {
2836    def : Pat<(Intr (mem_frags addr:$src2)),
2837               (!cast<Instruction>(NAME#m_Int)
2838                      (vt (IMPLICIT_DEF)), addr:$src2)>;
2839  }
2840}
2841
/// avx_fp_unop_s_intr - selection patterns that map the intrinsic Intr onto
/// the NAME#r_Int / NAME#m_Int instructions; the callers in this file use it
/// for the V-prefixed defs.
///   vt        - vector type used for the IMPLICIT_DEF first operand.
///   mem_frags - fragments matching the folded scalar load.
///   Intr      - intrinsic being selected.
///   target    - predicate guarding both patterns.
2842multiclass avx_fp_unop_s_intr<ValueType vt, PatFrags mem_frags,
2843                              Intrinsic Intr, Predicate target> {
      // Register form: $src is passed for both source operands.
2844  let Predicates = [target] in {
2845   def : Pat<(Intr VR128:$src),
2846             (!cast<Instruction>(NAME#r_Int) VR128:$src,
2847                                 VR128:$src)>;
2848  }
      // Memory form: only selected under OptForSize; the first operand is
      // IMPLICIT_DEF.
2849  let Predicates = [target, OptForSize] in {
2850    def : Pat<(Intr (mem_frags addr:$src2)),
2851              (!cast<Instruction>(NAME#m_Int)
2852                    (vt (IMPLICIT_DEF)), addr:$src2)>;
2853  }
2854}
2855
/// avx_fp_unop_s - scalar unops in three-operand form, plus the patterns
/// that select them from OpNode.
///   opc       - opcode byte.
///   OpcodeStr - full mnemonic (the caller prepends "v" and appends "ss"/"sd").
///   RC        - scalar register class for the r/m forms.
///   ScalarVT  - scalar value type; used for IMPLICIT_DEF and the load pattern.
///   x86memop  - memory operand for the scalar m form.
///   intmemop  - memory operand for the vector m_Int form.
///   OpNode    - DAG operator matched by the patterns at the end.
///   d         - execution domain.
///   sched     - scheduling info.
///   target    - predicate guarding the selection patterns.
2856multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
2857                          ValueType ScalarVT, X86MemOperand x86memop,
2858                          Operand intmemop, SDPatternOperator OpNode, Domain d,
2859                          X86FoldableSchedWrite sched, Predicate target> {
      // Scalar forms on RC; they carry no inline pattern ([]) and are selected
      // by the Pat definitions at the bottom of this multiclass.
2860  let isCodeGenOnly = 1, hasSideEffects = 0 in {
2861  def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
2862            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2863            [], d>, Sched<[sched]>;
2864  let mayLoad = 1 in
2865  def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
2866             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2867            [], d>, Sched<[sched.Folded, sched.ReadAfterFold]>;
2868  }
      // Vector (VR128) forms, also without inline patterns. No
      // "$src1 = $dst" constraint is applied here.
2869  let hasSideEffects = 0, ExeDomain = d in {
2870  def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
2871                (ins VR128:$src1, VR128:$src2),
2872             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2873             []>, Sched<[sched]>;
2874  let mayLoad = 1 in
2875  def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
2876                (ins VR128:$src1, intmemop:$src2),
2877             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
2878             []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
2879  }
2880
2881  // We don't want to fold scalar loads into these instructions unless
2882  // optimizing for size. This is because the folded instruction will have a
2883  // partial register update, while the unfolded sequence will not, e.g.
2884  // vmovss mem, %xmm0
2885  // vrcpss %xmm0, %xmm0, %xmm0
2886  // which has a clobber before the rcp, vs.
2887  // vrcpss mem, %xmm0, %xmm0
2888  // TODO: In theory, we could fold the load, and avoid the stall caused by
2889  // the partial register store, either in BreakFalseDeps or with smarter RA.
      // Register form: the first source operand is IMPLICIT_DEF.
2890  let Predicates = [target] in {
2891   def : Pat<(OpNode RC:$src),  (!cast<Instruction>(NAME#r)
2892                                (ScalarVT (IMPLICIT_DEF)), RC:$src)>;
2893  }
      // Memory form: only selected under OptForSize.
2894  let Predicates = [target, OptForSize] in {
2895    def : Pat<(ScalarVT (OpNode (load addr:$src))),
2896              (!cast<Instruction>(NAME#m) (ScalarVT (IMPLICIT_DEF)),
2897            addr:$src)>;
2898  }
2899}
2900
2901/// sse1_fp_unop_p - SSE1 unops in packed form.
///   opc       - opcode byte.
///   OpcodeStr - mnemonic stem; "ps" is appended, and "v" is prepended for the
///               V-prefixed defs.
///   OpNode    - DAG operator matched by each def's pattern.
///   sched     - scheduling widths; .XMM / .YMM, with .Folded for memory forms.
///   prds      - predicate list applied to the V-prefixed defs only.
2902multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
2903                          X86SchedWriteWidths sched, list<Predicate> prds> {
    // V-prefixed 128-bit (v4f32) and 256-bit (v8f32, VEX_L) forms; memory forms
    // match loadv4f32 / loadv8f32.
2904let Predicates = prds in {
2905  def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2906                       !strconcat("v", OpcodeStr,
2907                                  "ps\t{$src, $dst|$dst, $src}"),
2908                       [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
2909                       VEX, Sched<[sched.XMM]>, VEX_WIG;
2910  def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2911                       !strconcat("v", OpcodeStr,
2912                                  "ps\t{$src, $dst|$dst, $src}"),
2913                       [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>,
2914                       VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
2915  def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
2916                        !strconcat("v", OpcodeStr,
2917                                   "ps\t{$src, $dst|$dst, $src}"),
2918                        [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>,
2919                        VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
2920  def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
2921                        !strconcat("v", OpcodeStr,
2922                                   "ps\t{$src, $dst|$dst, $src}"),
2923                        [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))]>,
2924                        VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
2925}
2926
      // Non-V 128-bit forms, outside the `prds` scope; the memory form matches
      // memopv4f32 instead of loadv4f32.
2927  def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2928                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
2929                [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
2930                Sched<[sched.XMM]>;
2931  def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2932                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
2933                [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>,
2934                Sched<[sched.XMM.Folded]>;
2935}
2936
2937/// sse2_fp_unop_p - SSE2 unops in vector forms.
///   opc       - opcode byte.
///   OpcodeStr - mnemonic stem; "pd" is appended, and "v" is prepended for the
///               V-prefixed defs.
///   OpNode    - DAG operator matched by each def's pattern.
///   sched     - scheduling widths; .XMM / .YMM, with .Folded for memory forms.
/// Unlike sse1_fp_unop_p, the predicate list for the V-prefixed defs is fixed
/// to [HasAVX, NoVLX] rather than passed in.
2938multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
2939                          SDPatternOperator OpNode, X86SchedWriteWidths sched> {
    // V-prefixed 128-bit (v2f64) and 256-bit (v4f64, VEX_L) forms; memory forms
    // match loadv2f64 / loadv4f64.
2940let Predicates = [HasAVX, NoVLX] in {
2941  def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2942                       !strconcat("v", OpcodeStr,
2943                                  "pd\t{$src, $dst|$dst, $src}"),
2944                       [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
2945                       VEX, Sched<[sched.XMM]>, VEX_WIG;
2946  def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2947                       !strconcat("v", OpcodeStr,
2948                                  "pd\t{$src, $dst|$dst, $src}"),
2949                       [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>,
2950                       VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
2951  def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
2952                        !strconcat("v", OpcodeStr,
2953                                   "pd\t{$src, $dst|$dst, $src}"),
2954                        [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>,
2955                        VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
2956  def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
2957                        !strconcat("v", OpcodeStr,
2958                                   "pd\t{$src, $dst|$dst, $src}"),
2959                        [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))]>,
2960                        VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
2961}
2962
      // Non-V 128-bit forms, outside the Predicates scope; the memory form
      // matches memopv2f64 instead of loadv2f64.
2963  def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
2964                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
2965                [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
2966                Sched<[sched.XMM]>;
2967  def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
2968                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
2969                [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>,
2970                Sched<[sched.XMM.Folded]>;
2971}
2972
/// sse1_fp_unop_s_intr - intrinsic selection patterns for the single-precision
/// scalar unop named OpcodeStr. The intrinsic is looked up by name as
/// int_x86_sse_<OpcodeStr>_ss.
///   OpcodeStr - mnemonic stem used to build the intrinsic name.
///   AVXTarget - predicate for the V-prefixed patterns; the non-V patterns
///               always use UseSSE1.
2973multiclass sse1_fp_unop_s_intr<string OpcodeStr, Predicate AVXTarget> {
2974  defm SS        :  sse_fp_unop_s_intr<v4f32, sse_load_f32,
2975                      !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss),
2976                      UseSSE1>, XS;
2977  defm V#NAME#SS  : avx_fp_unop_s_intr<v4f32, sse_load_f32,
2978                      !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss),
2979                      AVXTarget>,
2980                      XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
2981}
2982
/// sse1_fp_unop_s - single-precision scalar unop: the non-V defs come from
/// sse_fp_unop_s (guarded by UseSSE1), the V-prefixed defs from avx_fp_unop_s
/// (guarded by AVXTarget). Both operate on FR32 / f32mem / ssmem and use
/// sched.Scl.
///   opc       - opcode byte.
///   OpcodeStr - mnemonic stem; "ss" is appended ("v" prepended for AVX).
///   OpNode    - DAG operator to select from.
///   sched     - scheduling widths.
///   AVXTarget - predicate for the V-prefixed patterns.
2983multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
2984                          X86SchedWriteWidths sched, Predicate AVXTarget> {
2985  defm SS        :  sse_fp_unop_s<opc, OpcodeStr#ss, FR32, f32mem,
2986                      ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS;
2987  defm V#NAME#SS  : avx_fp_unop_s<opc, "v"#OpcodeStr#ss, FR32, f32,
2988                      f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>,
2989                       XS, VEX_4V, VEX_LIG, VEX_WIG;
2990}
2991
// sse2_fp_unop_s - Double-precision counterpart of sse1_fp_unop_s: the same
// two instantiations, but on FR64/f64mem/sdmem with the SSEPackedDouble
// domain, the "sd" mnemonic suffix, the XD prefix, and UseSSE2 as the
// predicate of the non-VEX form.
2992multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
2993                          X86SchedWriteWidths sched, Predicate AVXTarget> {
2994  defm SD         : sse_fp_unop_s<opc, OpcodeStr#sd, FR64, f64mem,
2995                         sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD;
2996  defm V#NAME#SD  : avx_fp_unop_s<opc, "v"#OpcodeStr#sd, FR64, f64,
2997                         f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>,
2998                         XD, VEX_4V, VEX_LIG, VEX_WIG;
2999}
3000
3001// Square root.
// Opcode 0x51, selected from any_fsqrt. Scalar and packed forms exist for
// both single and double precision; the whole group is tagged SIMD_EXC.
// The scalar VEX forms use UseAVX and the packed single form uses
// [HasAVX, NoVLX].
3002defm SQRT  : sse1_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, UseAVX>,
3003             sse1_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>,
3004             sse2_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64, UseAVX>,
3005             sse2_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64>, SIMD_EXC;
3006
3007// Reciprocal approximations. Note that these typically require refinement
3008// in order to obtain suitable precision.
// Single precision only (opcodes 0x52 and 0x53), each with scalar, scalar
// intrinsic, and packed forms. Unlike SQRT, the VEX forms are predicated on
// HasAVX (without NoVLX) and the group is not tagged SIMD_EXC.
3009defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
3010             sse1_fp_unop_s_intr<"rsqrt", HasAVX>,
3011             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>;
3012defm RCP   : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
3013             sse1_fp_unop_s_intr<"rcp", HasAVX>,
3014             sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>;
3015
3016// There is no f64 version of the reciprocal approximation instructions.
3017
// scalar_unary_math_patterns - Selection patterns for a scalar unary
// operation applied to element 0 of a vector and merged back into another
// vector. The matched DAG is
//   Move($dst, scalar_to_vector(OpNode(extractelt($src, 0))))
// and it is selected to the "<OpcPrefix>r_Int" instruction with operands
// ($dst, $src).
//   OpNode        - the scalar operation to match.
//   OpcPrefix     - instruction name prefix, e.g. "SQRTSS".
//   Move          - the merging node, e.g. X86Movss.
//   VT            - the vector type of $dst and $src.
//   BasePredicate - predicate guarding the non-VEX pattern.
3018multiclass scalar_unary_math_patterns<SDPatternOperator OpNode, string OpcPrefix, SDNode Move,
3019                                      ValueType VT, Predicate BasePredicate> {
3020  let Predicates = [BasePredicate] in {
3021    def : Pat<(VT (Move VT:$dst, (scalar_to_vector
3022                                  (OpNode (extractelt VT:$src, 0))))),
3023              (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3024  }
3025
3026  // Repeat for AVX versions of the instructions.
  // Same DAG, selected to the "V"-prefixed instruction under UseAVX.
3027  let Predicates = [UseAVX] in {
3028    def : Pat<(VT (Move VT:$dst, (scalar_to_vector
3029                                  (OpNode (extractelt VT:$src, 0))))),
3030              (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3031  }
3032}
3033
// Merge patterns for scalar square root: any_fsqrt on element 0, merged with
// X86Movss (v4f32, SSE1) or X86Movsd (v2f64, SSE2).
3034defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>;
3035defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>;
3036
// scalar_unary_math_intr_patterns - Like scalar_unary_math_patterns, but the
// operation is a whole-vector intrinsic call rather than a scalar node on an
// extracted element. The matched DAG is Move($dst, Intr($src)), selected to
// "<OpcPrefix>r_Int" with operands ($dst, $src).
// Note that the VEX variant here is guarded by HasAVX, whereas
// scalar_unary_math_patterns uses UseAVX.
3037multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix,
3038                                           SDNode Move, ValueType VT,
3039                                           Predicate BasePredicate> {
3040  let Predicates = [BasePredicate] in {
3041    def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
3042              (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3043  }
3044
3045  // Repeat for AVX versions of the instructions.
3046  let Predicates = [HasAVX] in {
3047    def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
3048              (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
3049  }
3050}
3051
// Merge patterns for the scalar reciprocal and reciprocal-square-root
// intrinsics (single precision only), merged with X86Movss.
3052defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss,
3053                                       v4f32, UseSSE1>;
3054defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss,
3055                                       v4f32, UseSSE1>;
3056
3057
3058//===----------------------------------------------------------------------===//
3059// SSE 1 & 2 - Non-temporal stores
3060//===----------------------------------------------------------------------===//
3061
// Non-temporal stores. Every instruction pattern and standalone Pat in this
// block is given AddedComplexity = 400 so that the non-temporal forms are
// preferred when they match.
3062let AddedComplexity = 400 in { // Prefer non-temporal versions
// VEX-encoded forms, selected only under [HasAVX, NoVLX].
3063let Predicates = [HasAVX, NoVLX] in {
// 128-bit floating-point stores (v4f32 / v2f64), opcode 0x2B.
3064let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
3065def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
3066                     (ins f128mem:$dst, VR128:$src),
3067                     "movntps\t{$src, $dst|$dst, $src}",
3068                     [(alignednontemporalstore (v4f32 VR128:$src),
3069                                               addr:$dst)]>, VEX, VEX_WIG;
3070def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
3071                     (ins f128mem:$dst, VR128:$src),
3072                     "movntpd\t{$src, $dst|$dst, $src}",
3073                     [(alignednontemporalstore (v2f64 VR128:$src),
3074                                               addr:$dst)]>, VEX, VEX_WIG;
3075} // SchedRW
3076
// 256-bit floating-point stores (v8f32 / v4f64), same opcode with VEX_L.
3077let SchedRW = [SchedWriteFMoveLSNT.YMM.MR] in {
3078def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
3079                     (ins f256mem:$dst, VR256:$src),
3080                     "movntps\t{$src, $dst|$dst, $src}",
3081                     [(alignednontemporalstore (v8f32 VR256:$src),
3082                                               addr:$dst)]>, VEX, VEX_L, VEX_WIG;
3083def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
3084                     (ins f256mem:$dst, VR256:$src),
3085                     "movntpd\t{$src, $dst|$dst, $src}",
3086                     [(alignednontemporalstore (v4f64 VR256:$src),
3087                                               addr:$dst)]>, VEX, VEX_L, VEX_WIG;
3088} // SchedRW
3089
// Integer-domain stores (v2i64 / v4i64), opcode 0xE7. Other integer element
// types are covered by the standalone patterns near the end of this block.
3090let ExeDomain = SSEPackedInt in {
3091def VMOVNTDQmr    : VPDI<0xE7, MRMDestMem, (outs),
3092                         (ins i128mem:$dst, VR128:$src),
3093                         "movntdq\t{$src, $dst|$dst, $src}",
3094                         [(alignednontemporalstore (v2i64 VR128:$src),
3095                                                   addr:$dst)]>, VEX, VEX_WIG,
3096                         Sched<[SchedWriteVecMoveLSNT.XMM.MR]>;
3097def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
3098                    (ins i256mem:$dst, VR256:$src),
3099                    "movntdq\t{$src, $dst|$dst, $src}",
3100                    [(alignednontemporalstore (v4i64 VR256:$src),
3101                                              addr:$dst)]>, VEX, VEX_L, VEX_WIG,
3102                    Sched<[SchedWriteVecMoveLSNT.YMM.MR]>;
3103} // ExeDomain
3104} // Predicates
3105
// Non-VEX 128-bit floating-point stores. No predicate is set here; any
// requirement must come from the PSI / PDI classes.
3106let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
3107def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3108                    "movntps\t{$src, $dst|$dst, $src}",
3109                    [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
3110def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3111                    "movntpd\t{$src, $dst|$dst, $src}",
3112                    [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>;
3113} // SchedRW
3114
// Non-VEX integer-domain store.
// NOTE(review): this uses f128mem for $dst while VMOVNTDQmr above uses
// i128mem - confirm whether the difference is intentional.
3115let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLSNT.XMM.MR] in
3116def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
3117                    "movntdq\t{$src, $dst|$dst, $src}",
3118                    [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)]>;
3119
// General-purpose-register non-temporal stores (32- and 64-bit), opcode 0xC3,
// both requiring HasSSE2. These match nontemporalstore rather than
// alignednontemporalstore.
3120let SchedRW = [WriteStoreNT] in {
3121// There is no AVX form for instructions below this point
3122def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
3123                 "movnti{l}\t{$src, $dst|$dst, $src}",
3124                 [(nontemporalstore (i32 GR32:$src), addr:$dst)]>,
3125               PS, Requires<[HasSSE2]>;
3126def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
3127                     "movnti{q}\t{$src, $dst|$dst, $src}",
3128                     [(nontemporalstore (i64 GR64:$src), addr:$dst)]>,
3129                  PS, Requires<[HasSSE2]>;
3130} // SchedRW = [WriteStoreNT]
3131
// The (V)MOVNTDQ definitions above only carry a pattern for the 64-bit
// element type; route the remaining integer vector types to the same
// instructions.
3132let Predicates = [HasAVX, NoVLX] in {
3133  def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst),
3134            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3135  def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
3136            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3137  def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
3138            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
3139
3140  def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
3141            (VMOVNTDQmr addr:$dst, VR128:$src)>;
3142  def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
3143            (VMOVNTDQmr addr:$dst, VR128:$src)>;
3144  def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
3145            (VMOVNTDQmr addr:$dst, VR128:$src)>;
3146}
3147
// Same extra element types for the non-VEX instruction, under UseSSE2.
3148let Predicates = [UseSSE2] in {
3149  def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
3150            (MOVNTDQmr addr:$dst, VR128:$src)>;
3151  def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
3152            (MOVNTDQmr addr:$dst, VR128:$src)>;
3153  def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
3154            (MOVNTDQmr addr:$dst, VR128:$src)>;
3155}
3156
3157} // AddedComplexity
3158
3159//===----------------------------------------------------------------------===//
3160// SSE 1 & 2 - Prefetch and memory fence
3161//===----------------------------------------------------------------------===//
3162
3163// Prefetch intrinsic.
// Four prefetch instructions sharing opcode 0x18 and differing in the MRMnm
// form. Each matches the generic prefetch node; the third pattern operand
// selects the instruction (3 = T0, 2 = T1, 1 = T2, 0 = NTA) and the fourth is
// fixed at 1. The second operand is matched as any imm.
// NOTE(review): the third operand is presumably the locality hint and the
// fourth the cache type of the prefetch node - confirm against its definition.
3164let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in {
3165def PREFETCHT0   : I<0x18, MRM1m, (outs), (ins i8mem:$src),
3166    "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>, TB;
3167def PREFETCHT1   : I<0x18, MRM2m, (outs), (ins i8mem:$src),
3168    "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>, TB;
3169def PREFETCHT2   : I<0x18, MRM3m, (outs), (ins i8mem:$src),
3170    "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>, TB;
3171def PREFETCHNTA  : I<0x18, MRM0m, (outs), (ins i8mem:$src),
3172    "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>, TB;
3173}
3174
3175// FIXME: How should flush instruction be modeled?
// For now it is scheduled as a load (WriteLoad).
3176let SchedRW = [WriteLoad] in {
3177// Flush cache
// Selected from int_x86_sse2_clflush on a byte memory operand; requires
// HasSSE2.
3178def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
3179               "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
3180               PS, Requires<[HasSSE2]>;
3181}
3182
3183let SchedRW = [WriteNop] in {
3184// Pause. This "instruction" is encoded as "rep; nop", so even though it
3185// was introduced with SSE2, it's backward compatible.
// Accordingly no Requires<> predicate is attached; it is selected from
// int_x86_sse2_pause and scheduled as a nop.
3186def PAUSE : I<0x90, RawFrm, (outs), (ins),
3187              "pause", [(int_x86_sse2_pause)]>, OBXS;
3188}
3189
// Memory fences. All three share opcode 0xAE and are distinguished by the
// MRMnX form (7 = sfence, 5 = lfence, 6 = mfence). Each is selected from its
// own intrinsic; sfence requires HasSSE1, lfence HasSSE2, mfence HasMFence.
3190let SchedRW = [WriteFence] in {
3191// Load, store, and memory fence
3192// TODO: As with mfence, we may want to ease the availability of sfence/lfence
3193// to include any 64-bit target.
3194def SFENCE : I<0xAE, MRM7X, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
3195               PS, Requires<[HasSSE1]>;
3196def LFENCE : I<0xAE, MRM5X, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>,
3197               PS, Requires<[HasSSE2]>;
3198def MFENCE : I<0xAE, MRM6X, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>,
3199               PS, Requires<[HasMFence]>;
3200} // SchedRW
3201
// The X86MFence node also selects MFENCE, in addition to the intrinsic.
3202def : Pat<(X86MFence), (MFENCE)>;
3203
3204//===----------------------------------------------------------------------===//
3205// SSE 1 & 2 - Load/Store MXCSR register
3206//===----------------------------------------------------------------------===//
3207
// Load (LDMXCSR) and store (STMXCSR) of the MXCSR register, in VEX-encoded
// (VLDMXCSR / VSTMXCSR) and legacy-encoded (LDMXCSR / STMXCSR) forms. All
// four share opcode 0xAE (MRM2m for the load, MRM3m for the store), take a
// 32-bit memory operand, select from the int_x86_sse_{ld,st}mxcsr intrinsics
// and are marked hasSideEffects.
//
// The load forms list MXCSR in Defs and the store forms list it in Uses, so
// the register dependency is stated explicitly for passes that track
// physical registers instead of resting on hasSideEffects alone.
// NOTE(review): assumes an MXCSR register record is declared in the X86
// register description and is the register that SIMD_EXC-tagged instructions
// (e.g. SQRT in this file) read - confirm.
3208let mayLoad=1, hasSideEffects=1, Defs = [MXCSR] in
3209def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
3210               "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
3211               VEX, Sched<[WriteLDMXCSR]>, VEX_WIG;
3212let mayStore=1, hasSideEffects=1, Uses = [MXCSR] in
3213def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
3214               "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
3215               VEX, Sched<[WriteSTMXCSR]>, VEX_WIG;
3216
3217let mayLoad=1, hasSideEffects=1, Defs = [MXCSR] in
3218def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
3219              "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
3220              PS, Sched<[WriteLDMXCSR]>;
3221let mayStore=1, hasSideEffects=1, Uses = [MXCSR] in
3222def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
3223              "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
3224              PS, Sched<[WriteSTMXCSR]>;
3225
3226//===---------------------------------------------------------------------===//
3227// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
3228//===---------------------------------------------------------------------===//
3229
3230let ExeDomain = SSEPackedInt in { // SSE integer instructions
3231
// VEX-encoded register-to-register integer vector moves (opcode 0x6F), in
// 128-bit and 256-bit (VEX_L) widths. The pattern lists are empty, so these
// are never chosen by pattern matching; hasSideEffects is set explicitly.
3232let hasSideEffects = 0 in {
3233def VMOVDQArr  : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3234                      "movdqa\t{$src, $dst|$dst, $src}", []>,
3235                      Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
3236def VMOVDQUrr  : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3237                      "movdqu\t{$src, $dst|$dst, $src}", []>,
3238                      Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
3239def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3240                      "movdqa\t{$src, $dst|$dst, $src}", []>,
3241                      Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
3242def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
3243                      "movdqu\t{$src, $dst|$dst, $src}", []>,
3244                      Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
3245}
3246
3247// For Disassembler
// Alternate register-to-register encodings using the store opcode (0x7F,
// MRMDestReg). They are isCodeGenOnly but ForceDisassemble, and each names
// its 0x6F counterpart through FoldGenData.
3248let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
3249def VMOVDQArr_REV  : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3250                          "movdqa\t{$src, $dst|$dst, $src}", []>,
3251                          Sched<[SchedWriteVecMoveLS.XMM.RR]>,
3252                          VEX, VEX_WIG, FoldGenData<"VMOVDQArr">;
3253def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
3254                          "movdqa\t{$src, $dst|$dst, $src}", []>,
3255                          Sched<[SchedWriteVecMoveLS.YMM.RR]>,
3256                          VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQAYrr">;
3257def VMOVDQUrr_REV  : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3258                          "movdqu\t{$src, $dst|$dst, $src}", []>,
3259                          Sched<[SchedWriteVecMoveLS.XMM.RR]>,
3260                          VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">;
3261def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
3262                          "movdqu\t{$src, $dst|$dst, $src}", []>,
3263                          Sched<[SchedWriteVecMoveLS.YMM.RR]>,
3264                          VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQUYrr">;
3265}
3266
// VEX-encoded integer vector loads under [HasAVX, NoVLX]; foldable and
// rematerializable. Only the 128-bit forms carry a selection pattern
// (aligned v2i64 load for movdqa, any v2i64 load for movdqu); the 256-bit
// forms have empty pattern lists here.
// The unaligned forms are written with the plain I class, spelling the "v"
// in the mnemonic and adding XS by hand, where the rr forms use VSSI.
3267let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
3268    hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
3269def VMOVDQArm  : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3270                      "movdqa\t{$src, $dst|$dst, $src}",
3271                      [(set VR128:$dst, (alignedloadv2i64 addr:$src))]>,
3272                      Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
3273def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
3274                      "movdqa\t{$src, $dst|$dst, $src}", []>,
3275                      Sched<[SchedWriteVecMoveLS.YMM.RM]>,
3276                      VEX, VEX_L, VEX_WIG;
3277def VMOVDQUrm  : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3278                   "vmovdqu\t{$src, $dst|$dst, $src}",
3279                   [(set VR128:$dst, (loadv2i64 addr:$src))]>,
3280                   Sched<[SchedWriteVecMoveLS.XMM.RM]>,
3281                   XS, VEX, VEX_WIG;
3282def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
3283                   "vmovdqu\t{$src, $dst|$dst, $src}", []>,
3284                   Sched<[SchedWriteVecMoveLS.YMM.RM]>,
3285                   XS, VEX, VEX_L, VEX_WIG;
3286}
3287
// VEX-encoded integer vector stores (opcode 0x7F) under [HasAVX, NoVLX].
// As with the loads, only the 128-bit forms carry a selection pattern
// (aligned v2i64 store for movdqa, any v2i64 store for movdqu).
3288let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
3289def VMOVDQAmr  : VPDI<0x7F, MRMDestMem, (outs),
3290                      (ins i128mem:$dst, VR128:$src),
3291                      "movdqa\t{$src, $dst|$dst, $src}",
3292                      [(alignedstore (v2i64 VR128:$src), addr:$dst)]>,
3293                      Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_WIG;
3294def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
3295                      (ins i256mem:$dst, VR256:$src),
3296                      "movdqa\t{$src, $dst|$dst, $src}", []>,
3297                     Sched<[SchedWriteVecMoveLS.YMM.MR]>, VEX, VEX_L, VEX_WIG;
3298def VMOVDQUmr  : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3299                   "vmovdqu\t{$src, $dst|$dst, $src}",
3300                   [(store (v2i64 VR128:$src), addr:$dst)]>,
3301                   Sched<[SchedWriteVecMoveLS.XMM.MR]>, XS, VEX, VEX_WIG;
3302def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
3303                   "vmovdqu\t{$src, $dst|$dst, $src}",[]>,
3304                   Sched<[SchedWriteVecMoveLS.YMM.MR]>, XS, VEX, VEX_L, VEX_WIG;
3305}
3306
// Non-VEX register-to-register integer vector moves and their reversed
// (0x7F, MRMDestReg) disassembler-only encodings. All have empty pattern
// lists. The movdqu forms use the plain I class with XS and
// Requires<[UseSSE2]>; the movdqa forms rely on PDI for any predicate.
3307let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in {
3308let hasSideEffects = 0 in {
3309def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3310                   "movdqa\t{$src, $dst|$dst, $src}", []>;
3311
3312def MOVDQUrr :   I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
3313                   "movdqu\t{$src, $dst|$dst, $src}", []>,
3314                   XS, Requires<[UseSSE2]>;
3315}
3316
3317// For Disassembler
3318let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
3319def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3320                       "movdqa\t{$src, $dst|$dst, $src}", []>,
3321                       FoldGenData<"MOVDQArr">;
3322
3323def MOVDQUrr_REV :   I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
3324                       "movdqu\t{$src, $dst|$dst, $src}", []>,
3325                       XS, Requires<[UseSSE2]>, FoldGenData<"MOVDQUrr">;
3326}
3327} // SchedRW
3328
// Non-VEX 128-bit integer vector loads. The selection patterns are commented
// out, so the pattern lists are effectively empty and these definitions are
// not chosen by pattern matching from here.
3329let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
3330    hasSideEffects = 0, SchedRW = [SchedWriteVecMoveLS.XMM.RM] in {
3331def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3332                   "movdqa\t{$src, $dst|$dst, $src}",
3333                   [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>;
3334def MOVDQUrm :   I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
3335                   "movdqu\t{$src, $dst|$dst, $src}",
3336                   [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>,
3337                 XS, Requires<[UseSSE2]>;
3338}
3339
// Non-VEX 128-bit integer vector stores. As with the loads, the selection
// patterns are commented out, leaving the pattern lists effectively empty.
3340let mayStore = 1, hasSideEffects = 0,
3341    SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
3342def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3343                   "movdqa\t{$src, $dst|$dst, $src}",
3344                   [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>;
3345def MOVDQUmr :   I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
3346                   "movdqu\t{$src, $dst|$dst, $src}",
3347                   [/*(store (v2i64 VR128:$src), addr:$dst)*/]>,
3348                 XS, Requires<[UseSSE2]>;
3349}
3350
3351} // ExeDomain = SSEPackedInt
3352
3353// Reversed version with ".s" suffix for GAS compatibility.
// Each alias maps a "<mnemonic>.s" spelling to the corresponding _REV
// (0x7F, MRMDestReg) register-to-register encoding.
// NOTE(review): the trailing 0 is presumably InstAlias's emit flag, keeping
// the alias out of printed output - confirm against the InstAlias class.
3354def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
3355                (VMOVDQArr_REV VR128:$dst, VR128:$src), 0>;
3356def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
3357                (VMOVDQAYrr_REV VR256:$dst, VR256:$src), 0>;
3358def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
3359                (VMOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
3360def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
3361                (VMOVDQUYrr_REV VR256:$dst, VR256:$src), 0>;
3362
3363// Reversed version with ".s" suffix for GAS compatibility.
// Non-VEX counterparts of the aliases above.
3364def : InstAlias<"movdqa.s\t{$src, $dst|$dst, $src}",
3365                (MOVDQArr_REV VR128:$dst, VR128:$src), 0>;
3366def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}",
3367                (MOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
3368
// The VMOVDQA/VMOVDQU 128-bit load and store definitions only carry v2i64
// patterns; the patterns below route the v4i32, v8i16 and v16i8 loads and
// stores to the same instructions. Aligned accesses go to the movdqa forms
// and everything else to the movdqu forms.
3369let Predicates = [HasAVX, NoVLX] in {
3370  // Additional patterns for other integer sizes.
3371  def : Pat<(alignedloadv4i32 addr:$src),
3372            (VMOVDQArm addr:$src)>;
3373  def : Pat<(alignedloadv8i16 addr:$src),
3374            (VMOVDQArm addr:$src)>;
3375  def : Pat<(alignedloadv16i8 addr:$src),
3376            (VMOVDQArm addr:$src)>;
3377  def : Pat<(loadv4i32 addr:$src),
3378            (VMOVDQUrm addr:$src)>;
3379  def : Pat<(loadv8i16 addr:$src),
3380            (VMOVDQUrm addr:$src)>;
3381  def : Pat<(loadv16i8 addr:$src),
3382            (VMOVDQUrm addr:$src)>;
3383
3384  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
3385            (VMOVDQAmr addr:$dst, VR128:$src)>;
3386  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
3387            (VMOVDQAmr addr:$dst, VR128:$src)>;
3388  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
3389            (VMOVDQAmr addr:$dst, VR128:$src)>;
3390  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
3391            (VMOVDQUmr addr:$dst, VR128:$src)>;
3392  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
3393            (VMOVDQUmr addr:$dst, VR128:$src)>;
3394  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
3395            (VMOVDQUmr addr:$dst, VR128:$src)>;
3396}
3397
3398//===---------------------------------------------------------------------===//
3399// SSE2 - Packed Integer Arithmetic Instructions
3400//===---------------------------------------------------------------------===//
3401
3402let ExeDomain = SSEPackedInt in { // SSE integer instructions
3403
3404/// PDI_binop_rm2 - SSE2 binary operator whose result type (DstVT) differs
/// from its source type (SrcVT). Emits two records:
///   rr - both sources in registers of class RC; marked commutable.
///   rm - second source loaded through memop_frag from an x86memop operand;
///        scheduled with the folded-load variant of sched.
/// Is2Addr picks the two-operand assembly string (default) or the
/// three-operand one.
3405multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
3406                         ValueType DstVT, ValueType SrcVT, RegisterClass RC,
3407                         PatFrag memop_frag, X86MemOperand x86memop,
3408                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
3409  let isCommutable = 1 in
3410  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
3411       (ins RC:$src1, RC:$src2),
3412       !if(Is2Addr,
3413           OpcodeStr # "\t{$src2, $dst|$dst, $src2}",
3414           OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3415       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
3416       Sched<[sched]>;
3417  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
3418       (ins RC:$src1, x86memop:$src2),
3419       !if(Is2Addr,
3420           OpcodeStr # "\t{$src2, $dst|$dst, $src2}",
3421           OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3422       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
3423                                     (memop_frag addr:$src2))))]>,
3424       Sched<[sched.Folded, sched.ReadAfterFold]>;
3425}
3426} // ExeDomain = SSEPackedInt
3427
// Packed integer arithmetic, all instantiated through PDI_binop_all. The
// arguments are, in order: opcode, mnemonic, the SelectionDAG node to match,
// the 128-bit and 256-bit vector types, the scheduling class, a 0/1 flag,
// and a predicate (NoVLX_Or_NoBWI for byte/word element types, NoVLX for
// dword/qword).
// NOTE(review): the 0/1 flag is 1 for add/mul/min/max/avg and 0 for every
// subtract, so it is presumably the commutable bit - confirm against
// PDI_binop_all, which is defined outside this section.
3428defm PADDB   : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
3429                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3430defm PADDW   : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
3431                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3432defm PADDD   : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
3433                             SchedWriteVecALU, 1, NoVLX>;
3434defm PADDQ   : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
3435                             SchedWriteVecALU, 1, NoVLX>;
3436defm PADDSB  : PDI_binop_all<0xEC, "paddsb", saddsat, v16i8, v32i8,
3437                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3438defm PADDSW  : PDI_binop_all<0xED, "paddsw", saddsat, v8i16, v16i16,
3439                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3440defm PADDUSB : PDI_binop_all<0xDC, "paddusb", uaddsat, v16i8, v32i8,
3441                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3442defm PADDUSW : PDI_binop_all<0xDD, "paddusw", uaddsat, v8i16, v16i16,
3443                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3444defm PMULLW  : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
3445                             SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
3446defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
3447                             SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
3448defm PMULHW  : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
3449                             SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
3450defm PSUBB   : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
3451                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3452defm PSUBW   : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
3453                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3454defm PSUBD   : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
3455                             SchedWriteVecALU, 0, NoVLX>;
3456defm PSUBQ   : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
3457                             SchedWriteVecALU, 0, NoVLX>;
3458defm PSUBSB  : PDI_binop_all<0xE8, "psubsb", ssubsat, v16i8, v32i8,
3459                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3460defm PSUBSW  : PDI_binop_all<0xE9, "psubsw", ssubsat, v8i16, v16i16,
3461                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3462defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", usubsat, v16i8, v32i8,
3463                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3464defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", usubsat, v8i16, v16i16,
3465                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
3466defm PMINUB  : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8,
3467                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3468defm PMINSW  : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16,
3469                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3470defm PMAXUB  : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8,
3471                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3472defm PMAXSW  : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
3473                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3474defm PAVGB   : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8,
3475                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3476defm PAVGW   : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
3477                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
3478defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64,
3479                             SchedWriteVecIMul, 1, NoVLX>;
3480
// Multiply-add of words into dwords (opcode 0xF5) via PDI_binop_rm2, whose
// source and destination types differ. Three instantiations:
//   VPMADDWD  - 128-bit VEX form, [HasAVX, NoVLX_Or_NoBWI], three-operand
//               assembly (Is2Addr = 0), folds with `load`.
//   VPMADDWDY - 256-bit VEX form (VEX_L), [HasAVX2, NoVLX_Or_NoBWI].
//   PMADDWD   - non-VEX form, destination tied to $src1, folds with `memop`.
3481let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
3482defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
3483                              load, i128mem, SchedWriteVecIMul.XMM, 0>,
3484                              VEX_4V, VEX_WIG;
3485
3486let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
3487defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
3488                               VR256, load, i256mem, SchedWriteVecIMul.YMM,
3489                               0>, VEX_4V, VEX_L, VEX_WIG;
3490let Constraints = "$src1 = $dst" in
3491defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
3492                             memop, i128mem, SchedWriteVecIMul.XMM>;
3493
// Sum of absolute byte differences (opcode 0xF6), byte sources producing
// 64-bit element results; same three-way split as PMADDWD above.
3494let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
3495defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
3496                             load, i128mem, SchedWritePSADBW.XMM, 0>,
3497                             VEX_4V, VEX_WIG;
3498let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
3499defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
3500                             load, i256mem, SchedWritePSADBW.YMM, 0>,
3501                             VEX_4V, VEX_L, VEX_WIG;
3502let Constraints = "$src1 = $dst" in
3503defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
3504                            memop, i128mem, SchedWritePSADBW.XMM>;
3505
3506//===---------------------------------------------------------------------===//
3507// SSE2 - Packed Integer Logical Instructions
3508//===---------------------------------------------------------------------===//
3509
/// PDI_binop_rmi - Packed integer operation whose second operand is either a
/// 128-bit vector (register or memory) or an 8-bit immediate. Emits:
///   rr - opcode opc, matches OpNode with $src2 in VR128.
///   rm - opcode opc, matches OpNode with $src2 loaded through ld_frag from
///        an i128mem operand; uses the folded-load variant of sched.
///   ri - opcode opc2 in format ImmForm, matches OpNode2 with an i8 timm;
///        scheduled with schedImm.
/// Is2Addr picks the two-operand assembly string (default) or the
/// three-operand one.
3510multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
3511                         string OpcodeStr, SDNode OpNode,
3512                         SDNode OpNode2, RegisterClass RC,
3513                         X86FoldableSchedWrite sched,
3514                         X86FoldableSchedWrite schedImm,
3515                         ValueType DstVT, ValueType SrcVT,
3516                         PatFrag ld_frag, bit Is2Addr = 1> {
3517  // The vector $src2 operand is VR128 / i128mem whatever RC is.
3518  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
3519       (ins RC:$src1, VR128:$src2),
3520       !if(Is2Addr,
3521           OpcodeStr # "\t{$src2, $dst|$dst, $src2}",
3522           OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3523       [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))]>,
3524       Sched<[sched]>;
3525  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
3526       (ins RC:$src1, i128mem:$src2),
3527       !if(Is2Addr,
3528           OpcodeStr # "\t{$src2, $dst|$dst, $src2}",
3529           OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3530       [(set RC:$dst, (DstVT (OpNode RC:$src1,
3531                       (SrcVT (ld_frag addr:$src2)))))]>,
3532       Sched<[sched.Folded, sched.ReadAfterFold]>;
3533  def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
3534       (ins RC:$src1, u8imm:$src2),
3535       !if(Is2Addr,
3536           OpcodeStr # "\t{$src2, $dst|$dst, $src2}",
3537           OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3538       [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 timm:$src2))))]>,
3539       Sched<[schedImm]>;
3540}
3541
/// PDI_binop_rmi_all - Instantiates PDI_binop_rmi three times for one
/// operation:
///   V<NAME>  - 128-bit VEX form under [HasAVX, prd], "v"-prefixed mnemonic,
///              three-operand assembly, folds with `load`.
///   V<NAME>Y - 256-bit VEX form (VEX_L) under [HasAVX2, prd].
///   <NAME>   - non-VEX form with the destination tied to $src1, folds with
///              `memop`.
/// SrcVT is shared by all three forms.
3542multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
3543                             string OpcodeStr, SDNode OpNode,
3544                             SDNode OpNode2, ValueType DstVT128,
3545                             ValueType DstVT256, ValueType SrcVT,
3546                             X86SchedWriteWidths sched,
3547                             X86SchedWriteWidths schedImm, Predicate prd> {
3548let Predicates = [HasAVX, prd] in
3549  defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, "v"#OpcodeStr,
3550                              OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM,
3551                              DstVT128, SrcVT, load, 0>, VEX_4V, VEX_WIG;
3552let Predicates = [HasAVX2, prd] in
3553  defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, "v"#OpcodeStr,
3554                                OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM,
3555                                DstVT256, SrcVT, load, 0>, VEX_4V, VEX_L,
3556                                VEX_WIG;
3557let Constraints = "$src1 = $dst" in
3558  defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
3559                            VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT,
3560                            memop>;
3561}
3562
/// PDI_binop_ri - Packed integer operation with a register source and an
/// 8-bit immediate only. Emits a single `ri` record in format ImmForm that
/// matches OpNode on ($src1, i8 timm) with result type VT. Is2Addr picks the
/// two-operand assembly string (default) or the three-operand one.
3563multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
3564                        SDNode OpNode, RegisterClass RC, ValueType VT,
3565                        X86FoldableSchedWrite sched, bit Is2Addr = 1> {
3566  def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
3567       !if(Is2Addr,
3568           OpcodeStr # "\t{$src2, $dst|$dst, $src2}",
3569           OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3570       [(set RC:$dst, (VT (OpNode RC:$src1, (i8 timm:$src2))))]>,
3571       Sched<[sched]>;
3572}
3573
/// PDI_binop_ri_all - Instantiates PDI_binop_ri three times for one opcode:
///   * V#NAME   - "v"-prefixed, VR128/v16i8, VEX_4V/VEX_WIG,
///                Predicates [HasAVX, NoVLX_Or_NoBWI], Is2Addr = 0.
///   * V#NAME#Y - "v"-prefixed, VR256/v32i8, additionally VEX_L,
///                Predicates [HasAVX2, NoVLX_Or_NoBWI], Is2Addr = 0.
///   * NAME     - unprefixed, VR128/v16i8, $src1 tied to $dst, Is2Addr left
///                at its default of 1.
/// The value types are fixed to byte vectors here rather than being
/// parameters.
3574multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr,
3575                            SDNode OpNode, X86SchedWriteWidths sched> {
3576let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
3577  defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
3578                             VR128, v16i8, sched.XMM, 0>, VEX_4V, VEX_WIG;
3579let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
3580  defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
3581                               VR256, v32i8, sched.YMM, 0>,
3582                               VEX_4V, VEX_L, VEX_WIG;
// No explicit Predicates on the non-VEX form.
// NOTE(review): presumably the PDIi8 base class supplies the SSE2 requirement
// - confirm in the instruction-format definitions.
3583let Constraints = "$src1 = $dst" in
3584  defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8,
3585                           sched.XMM>;
3586}
3587
// Packed integer shift instantiations, all in the SSEPackedInt domain.
// For the PDI_binop_rmi_all users the first opcode byte differs per
// instruction, while the second opcode byte is shared by element width
// (0x71 word, 0x72 dword, 0x73 qword); the Format argument (MRM6r / MRM2r /
// MRM4r) is what tells the PSLL / PSRL / PSRA immediate forms apart.
// Word-sized variants are guarded by NoVLX_Or_NoBWI, dword/qword variants by
// NoVLX (passed as the `prd` argument).
3588let ExeDomain = SSEPackedInt in {
  // PSLL{W,D,Q}: X86vshl / X86vshli nodes.
3589  defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
3590                                 v8i16, v16i16, v8i16, SchedWriteVecShift,
3591                                 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
3592  defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
3593                                 v4i32, v8i32, v4i32, SchedWriteVecShift,
3594                                 SchedWriteVecShiftImm, NoVLX>;
3595  defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
3596                                 v2i64, v4i64, v2i64, SchedWriteVecShift,
3597                                 SchedWriteVecShiftImm, NoVLX>;
3598
  // PSRL{W,D,Q}: X86vsrl / X86vsrli nodes.
3599  defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
3600                                 v8i16, v16i16, v8i16, SchedWriteVecShift,
3601                                 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
3602  defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
3603                                 v4i32, v8i32, v4i32, SchedWriteVecShift,
3604                                 SchedWriteVecShiftImm, NoVLX>;
3605  defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
3606                                 v2i64, v4i64, v2i64, SchedWriteVecShift,
3607                                 SchedWriteVecShiftImm, NoVLX>;
3608
  // PSRA{W,D}: X86vsra / X86vsrai nodes. Only word and dword variants are
  // instantiated in this group.
3609  defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
3610                                 v8i16, v16i16, v8i16, SchedWriteVecShift,
3611                                 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
3612  defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
3613                                 v4i32, v8i32, v4i32, SchedWriteVecShift,
3614                                 SchedWriteVecShiftImm, NoVLX>;
3615
  // Immediate-only forms built from PDI_binop_ri_all. Both use opcode 0x73
  // and are distinguished by the Format (MRM7r vs MRM3r); they use the
  // shuffle scheduling class rather than the vector-shift one.
3616  defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq,
3617                                 SchedWriteShuffle>;
3618  defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq,
3619                                 SchedWriteShuffle>;
3620} // ExeDomain = SSEPackedInt
3621
3622//===---------------------------------------------------------------------===//
3623// SSE2 - Packed Integer Comparison Instructions
3624//===---------------------------------------------------------------------===//
3625
// Packed integer compares for byte, word and dword elements. Each defm passes
// a 128-bit and a 256-bit value type, the SchedWriteVecALU scheduling group
// and TruePredicate to PDI_binop_all.
// NOTE(review): PDI_binop_all is defined outside this chunk; the 1/0 argument
// is presumably its commutability flag (1 for the X86pcmpeq users, 0 for the
// X86pcmpgt users) - confirm against the multiclass definition.
3626defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
3627                             SchedWriteVecALU, 1, TruePredicate>;
3628defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
3629                             SchedWriteVecALU, 1, TruePredicate>;
3630defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
3631                             SchedWriteVecALU, 1, TruePredicate>;
3632defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
3633                             SchedWriteVecALU, 0, TruePredicate>;
3634defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
3635                             SchedWriteVecALU, 0, TruePredicate>;
3636defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
3637                             SchedWriteVecALU, 0, TruePredicate>;
3638
3639//===---------------------------------------------------------------------===//
3640// SSE2 - Packed Integer Shuffle Instructions
3641//===---------------------------------------------------------------------===//
3642
3643let ExeDomain = SSEPackedInt in {
/// sse2_pshuffle - Defines six shuffle-with-immediate records, all on opcode
/// 0x70 with operands ($src1, u8imm $src2) -> $dst:
///   V#NAME#ri  / V#NAME#mi  - "v"-prefixed, VR128, VEX,        [HasAVX, prd]
///   V#NAME#Yri / V#NAME#Ymi - "v"-prefixed, VR256, VEX+VEX_L,  [HasAVX2, prd]
///   ri / mi                 - unprefixed,   VR128,             [UseSSE2]
///   OpcodeStr    - mnemonic without the "v" prefix.
///   vt128, vt256 - result value types of the 128-/256-bit patterns.
///   OpNode       - SDNode matched as (OpNode src, (i8 timm:$src2)).
///   sched        - .XMM/.YMM members are used for register forms and their
///                  .Folded counterparts for memory forms.
///   prd          - extra predicate applied to the VEX forms only.
/// None of the records ties $dst to a source; every asm string lists three
/// operands.
3644multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
3645                         SDNode OpNode, X86SchedWriteWidths sched,
3646                         Predicate prd> {
// VEX-encoded 128-bit forms.
3647let Predicates = [HasAVX, prd] in {
3648  def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
3649                      (ins VR128:$src1, u8imm:$src2),
3650                      !strconcat("v", OpcodeStr,
3651                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3652                      [(set VR128:$dst,
3653                        (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
3654                      VEX, Sched<[sched.XMM]>, VEX_WIG;
  // Memory form: the shuffled vector is read through the `load` fragment.
3655  def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
3656                      (ins i128mem:$src1, u8imm:$src2),
3657                      !strconcat("v", OpcodeStr,
3658                                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3659                     [(set VR128:$dst,
3660                       (vt128 (OpNode (load addr:$src1),
3661                        (i8 timm:$src2))))]>, VEX,
3662                  Sched<[sched.XMM.Folded]>, VEX_WIG;
3663}
3664
// VEX-encoded 256-bit forms (VEX_L set).
3665let Predicates = [HasAVX2, prd] in {
3666  def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
3667                       (ins VR256:$src1, u8imm:$src2),
3668                       !strconcat("v", OpcodeStr,
3669                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3670                       [(set VR256:$dst,
3671                         (vt256 (OpNode VR256:$src1, (i8 timm:$src2))))]>,
3672                       VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
3673  def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
3674                       (ins i256mem:$src1, u8imm:$src2),
3675                       !strconcat("v", OpcodeStr,
3676                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3677                      [(set VR256:$dst,
3678                        (vt256 (OpNode (load addr:$src1),
3679                         (i8 timm:$src2))))]>, VEX, VEX_L,
3680                   Sched<[sched.YMM.Folded]>, VEX_WIG;
3681}
3682
// Non-VEX forms. The memory pattern uses `memop` where the VEX forms above
// use `load`.
3683let Predicates = [UseSSE2] in {
3684  def ri : Ii8<0x70, MRMSrcReg,
3685               (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
3686               !strconcat(OpcodeStr,
3687                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3688               [(set VR128:$dst,
3689                 (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
3690               Sched<[sched.XMM]>;
3691  def mi : Ii8<0x70, MRMSrcMem,
3692               (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
3693               !strconcat(OpcodeStr,
3694                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
3695               [(set VR128:$dst,
3696                 (vt128 (OpNode (memop addr:$src1),
3697                        (i8 timm:$src2))))]>,
3698               Sched<[sched.XMM.Folded]>;
3699}
3700}
3701} // ExeDomain = SSEPackedInt
3702
// The three users of sse2_pshuffle. Since the multiclass hard-codes opcode
// 0x70 for every record, the trailing PD / XS / XD class is the only encoding
// difference between these three families. The dword shuffle is guarded by
// NoVLX, the two word shuffles by NoVLX_Or_NoBWI.
3703defm PSHUFD  : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd,
3704                             SchedWriteShuffle, NoVLX>, PD;
3705defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw,
3706                             SchedWriteShuffle, NoVLX_Or_NoBWI>, XS;
3707defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw,
3708                             SchedWriteShuffle, NoVLX_Or_NoBWI>, XD;
3709
3710//===---------------------------------------------------------------------===//
3711// Packed Integer Pack Instructions (SSE & AVX)
3712//===---------------------------------------------------------------------===//
3713
3714let ExeDomain = SSEPackedInt in {
/// sse2_pack - Defines a register/register (`rr`) and a register/memory
/// (`rm`) record on the PDI instruction class.
///   opc, OpcodeStr - opcode byte and mnemonic.
///   OutVT, ArgVT   - result value type and the type $src1 is matched as;
///                    they differ for every instantiation in this file.
///   OpNode         - SDNode matched as (OpNode (ArgVT $src1), $src2).
///   RC, x86memop   - register class and memory operand type.
///   sched          - scheduling write; `rm` uses sched.Folded together with
///                    sched.ReadAfterFold.
///   ld_frag        - pattern fragment wrapping addr:$src2 in `rm`.
///   Is2Addr        - 1 (default) selects the two-operand asm string, 0 the
///                    three-operand one.
3715multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
3716                     ValueType ArgVT, SDNode OpNode, RegisterClass RC,
3717                     X86MemOperand x86memop, X86FoldableSchedWrite sched,
3718                     PatFrag ld_frag, bit Is2Addr = 1> {
3719  def rr : PDI<opc, MRMSrcReg,
3720               (outs RC:$dst), (ins RC:$src1, RC:$src2),
3721               !if(Is2Addr,
3722                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3723                   !strconcat(OpcodeStr,
3724                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3725               [(set RC:$dst,
3726                     (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
3727               Sched<[sched]>;
3728  def rm : PDI<opc, MRMSrcMem,
3729               (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
3730               !if(Is2Addr,
3731                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3732                   !strconcat(OpcodeStr,
3733                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3734               [(set RC:$dst,
3735                     (OutVT (OpNode (ArgVT RC:$src1),
3736                                    (ld_frag addr:$src2))))]>,
3737               Sched<[sched.Folded, sched.ReadAfterFold]>;
3738}
3739
/// sse4_pack - Same parameters, operands, asm strings, patterns and
/// scheduling as sse2_pack; the only difference is that both records derive
/// from SS48I instead of PDI.
///   Is2Addr - 1 (default) selects the two-operand asm string, 0 the
///             three-operand one.
3740multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
3741                     ValueType ArgVT, SDNode OpNode, RegisterClass RC,
3742                     X86MemOperand x86memop, X86FoldableSchedWrite sched,
3743                     PatFrag ld_frag, bit Is2Addr = 1> {
3744  def rr : SS48I<opc, MRMSrcReg,
3745                 (outs RC:$dst), (ins RC:$src1, RC:$src2),
3746                 !if(Is2Addr,
3747                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3748                     !strconcat(OpcodeStr,
3749                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3750                 [(set RC:$dst,
3751                       (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
3752                 Sched<[sched]>;
3753  def rm : SS48I<opc, MRMSrcMem,
3754                 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
3755                 !if(Is2Addr,
3756                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
3757                     !strconcat(OpcodeStr,
3758                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3759                 [(set RC:$dst,
3760                       (OutVT (OpNode (ArgVT RC:$src1),
3761                                      (ld_frag addr:$src2))))]>,
3762                 Sched<[sched.Folded, sched.ReadAfterFold]>;
3763}
3764
// Pack instantiations. X86Packss is used for the PACKSS* families and
// X86Packus for the PACKUS* families. The *USDW variants are the only ones
// built from sse4_pack (opcode 0x2B); all others use sse2_pack.
//
// VEX 128-bit forms: "v"-prefixed mnemonic spelled out in the string, `load`
// fragment, Is2Addr = 0.
3765let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
3766  defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128,
3767                             i128mem, SchedWriteShuffle.XMM, load, 0>,
3768                             VEX_4V, VEX_WIG;
3769  defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128,
3770                             i128mem, SchedWriteShuffle.XMM, load, 0>,
3771                             VEX_4V, VEX_WIG;
3772
3773  defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128,
3774                             i128mem, SchedWriteShuffle.XMM, load, 0>,
3775                             VEX_4V, VEX_WIG;
3776  defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128,
3777                             i128mem, SchedWriteShuffle.XMM, load, 0>,
3778                             VEX_4V, VEX_WIG;
3779}
3780
// VEX 256-bit forms: same opcodes and mnemonics as above, on VR256/i256mem
// with doubled element counts and VEX_L added.
3781let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
3782  defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256,
3783                              i256mem, SchedWriteShuffle.YMM, load, 0>,
3784                              VEX_4V, VEX_L, VEX_WIG;
3785  defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256,
3786                              i256mem, SchedWriteShuffle.YMM, load, 0>,
3787                              VEX_4V, VEX_L, VEX_WIG;
3788
3789  defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256,
3790                              i256mem, SchedWriteShuffle.YMM, load, 0>,
3791                              VEX_4V, VEX_L, VEX_WIG;
3792  defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256,
3793                              i256mem, SchedWriteShuffle.YMM, load, 0>,
3794                              VEX_4V, VEX_L, VEX_WIG;
3795}
3796
// Non-VEX forms: $src1 tied to $dst, `memop` fragment, Is2Addr left at its
// default of 1. No Predicates are set explicitly in this group.
3797let Constraints = "$src1 = $dst" in {
3798  defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128,
3799                            i128mem, SchedWriteShuffle.XMM, memop>;
3800  defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128,
3801                            i128mem, SchedWriteShuffle.XMM, memop>;
3802
3803  defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128,
3804                            i128mem, SchedWriteShuffle.XMM, memop>;
3805
3806  defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128,
3807                            i128mem, SchedWriteShuffle.XMM, memop>;
3808}
3809} // ExeDomain = SSEPackedInt
3810
3811//===---------------------------------------------------------------------===//
3812// SSE2 - Packed Integer Unpack Instructions
3813//===---------------------------------------------------------------------===//
3814
3815let ExeDomain = SSEPackedInt in {
/// sse2_unpack - Defines a register/register (`rr`) and a register/memory
/// (`rm`) record on the PDI instruction class.
///   opc, OpcodeStr - opcode byte and mnemonic.
///   vt             - value type of the result pattern.
///   OpNode         - SDNode matched as (OpNode $src1, $src2).
///   RC, x86memop   - register class and memory operand type.
///   sched          - scheduling write; `rm` uses sched.Folded together with
///                    sched.ReadAfterFold.
///   ld_frag        - pattern fragment wrapping addr:$src2 in `rm`.
///   Is2Addr        - 1 (default) selects the two-operand asm string, 0 the
///                    three-operand one.
3816multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
3817                       SDNode OpNode, RegisterClass RC, X86MemOperand x86memop,
3818                       X86FoldableSchedWrite sched, PatFrag ld_frag,
3819                       bit Is2Addr = 1> {
3820  def rr : PDI<opc, MRMSrcReg,
3821      (outs RC:$dst), (ins RC:$src1, RC:$src2),
3822      !if(Is2Addr,
3823          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
3824          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3825      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
3826      Sched<[sched]>;
3827  def rm : PDI<opc, MRMSrcMem,
3828      (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
3829      !if(Is2Addr,
3830          !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
3831          !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
3832      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
3833      Sched<[sched.Folded, sched.ReadAfterFold]>;
3834}
3835
// Unpack instantiations of sse2_unpack. X86Unpckl is used for the PUNPCKL*
// families and X86Unpckh for the PUNPCKH* families. The VEX forms are split
// into separate `let Predicates` groups because byte/word element variants
// are guarded by NoVLX_Or_NoBWI while dword/qword variants are guarded by
// NoVLX.
//
// VEX 128-bit, byte/word elements.
3836let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
3837  defm VPUNPCKLBW  : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128,
3838                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
3839                                 VEX_4V, VEX_WIG;
3840  defm VPUNPCKLWD  : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128,
3841                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
3842                                 VEX_4V, VEX_WIG;
3843  defm VPUNPCKHBW  : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128,
3844                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
3845                                 VEX_4V, VEX_WIG;
3846  defm VPUNPCKHWD  : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128,
3847                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
3848                                 VEX_4V, VEX_WIG;
3849}
3850
// VEX 128-bit, dword/qword elements.
3851let Predicates = [HasAVX, NoVLX] in {
3852  defm VPUNPCKLDQ  : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128,
3853                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
3854                                 VEX_4V, VEX_WIG;
3855  defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128,
3856                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
3857                                 VEX_4V, VEX_WIG;
3858  defm VPUNPCKHDQ  : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128,
3859                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
3860                                 VEX_4V, VEX_WIG;
3861  defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128,
3862                                 i128mem, SchedWriteShuffle.XMM, load, 0>,
3863                                 VEX_4V, VEX_WIG;
3864}
3865
// VEX 256-bit, byte/word elements (VEX_L added).
3866let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
3867  defm VPUNPCKLBWY  : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256,
3868                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
3869                                  VEX_4V, VEX_L, VEX_WIG;
3870  defm VPUNPCKLWDY  : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256,
3871                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
3872                                  VEX_4V, VEX_L, VEX_WIG;
3873  defm VPUNPCKHBWY  : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256,
3874                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
3875                                  VEX_4V, VEX_L, VEX_WIG;
3876  defm VPUNPCKHWDY  : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256,
3877                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
3878                                  VEX_4V, VEX_L, VEX_WIG;
3879}
3880
// VEX 256-bit, dword/qword elements (VEX_L added).
3881let Predicates = [HasAVX2, NoVLX] in {
3882  defm VPUNPCKLDQY  : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256,
3883                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
3884                                  VEX_4V, VEX_L, VEX_WIG;
3885  defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256,
3886                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
3887                                  VEX_4V, VEX_L, VEX_WIG;
3888  defm VPUNPCKHDQY  : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256,
3889                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
3890                                  VEX_4V, VEX_L, VEX_WIG;
3891  defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256,
3892                                  i256mem, SchedWriteShuffle.YMM, load, 0>,
3893                                  VEX_4V, VEX_L, VEX_WIG;
3894}
3895
// Non-VEX forms: $src1 tied to $dst, `memop` fragment, Is2Addr left at its
// default of 1. No Predicates are set explicitly in this group.
3896let Constraints = "$src1 = $dst" in {
3897  defm PUNPCKLBW  : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128,
3898                                i128mem, SchedWriteShuffle.XMM, memop>;
3899  defm PUNPCKLWD  : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128,
3900                                i128mem, SchedWriteShuffle.XMM, memop>;
3901  defm PUNPCKLDQ  : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128,
3902                                i128mem, SchedWriteShuffle.XMM, memop>;
3903  defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128,
3904                                i128mem, SchedWriteShuffle.XMM, memop>;
3905
3906  defm PUNPCKHBW  : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128,
3907                                i128mem, SchedWriteShuffle.XMM, memop>;
3908  defm PUNPCKHWD  : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128,
3909                                i128mem, SchedWriteShuffle.XMM, memop>;
3910  defm PUNPCKHDQ  : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128,
3911                                i128mem, SchedWriteShuffle.XMM, memop>;
3912  defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128,
3913                                i128mem, SchedWriteShuffle.XMM, memop>;
3914}
3915} // ExeDomain = SSEPackedInt
3916
3917//===---------------------------------------------------------------------===//
3918// SSE2 - Packed Integer Extract and Insert
3919//===---------------------------------------------------------------------===//
3920
3921let ExeDomain = SSEPackedInt in {
/// sse2_pinsrw - Defines the `rr` and `rm` records for opcode 0xC4, both
/// matching X86pinsrw with a VR128 first source and a timm index in $src3.
///   rr - second source is a GR32orGR64 register.
///   rm - second source is an i16mem operand read through extloadi16.
///   Is2Addr - 1 (default) emits the literal "pinsrw" two-source asm string;
///             0 emits the literal "vpinsrw" string that also lists $src1.
/// Unlike the neighbouring multiclasses, the mnemonic is hard-coded in both
/// branches of the !if instead of being passed in as a parameter.
3922multiclass sse2_pinsrw<bit Is2Addr = 1> {
3923  def rr : Ii8<0xC4, MRMSrcReg,
3924       (outs VR128:$dst), (ins VR128:$src1,
3925        GR32orGR64:$src2, u8imm:$src3),
3926       !if(Is2Addr,
3927           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
3928           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
3929       [(set VR128:$dst,
3930         (X86pinsrw VR128:$src1, GR32orGR64:$src2, timm:$src3))]>,
3931       Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
3932  def rm : Ii8<0xC4, MRMSrcMem,
3933                      (outs VR128:$dst), (ins VR128:$src1,
3934                       i16mem:$src2, u8imm:$src3),
3935       !if(Is2Addr,
3936           "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
3937           "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
3938       [(set VR128:$dst,
3939         (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
3940                    timm:$src3))]>,
3941       Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
3942}
3943
3944// Extract
// Register forms of the word extract on opcode 0xC5: both match X86pextrw on
// a v8i16 source with a timm index and write a GR32orGR64 destination.
// The `let Predicates = [HasAVX, NoBWI] in` below has no braces, so it covers
// only VPEXTRWrr.
3945let Predicates = [HasAVX, NoBWI] in
3946def VPEXTRWrr : Ii8<0xC5, MRMSrcReg,
3947                    (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
3948                    "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
3949                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
3950                                            timm:$src2))]>,
3951                PD, VEX, VEX_WIG, Sched<[WriteVecExtract]>;
// NOTE(review): PEXTRWrr sets no Predicates here; presumably the PDIi8 base
// class supplies the SSE2 requirement - confirm in the format definitions.
3952def PEXTRWrr : PDIi8<0xC5, MRMSrcReg,
3953                    (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
3954                    "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
3955                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
3956                                            timm:$src2))]>,
3957               Sched<[WriteVecExtract]>;
3958
3959// Insert
// VEX form: Is2Addr = 0 selects the "vpinsrw" three-source asm string.
3960let Predicates = [HasAVX, NoBWI] in
3961defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V, VEX_WIG;
3962
// Non-VEX form: default Is2Addr = 1 ("pinsrw" string) with $src1 tied to
// $dst.
3963let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
3964defm PINSRW : sse2_pinsrw, PD;
3965
3966} // ExeDomain = SSEPackedInt
3967
3968//===---------------------------------------------------------------------===//
3969// SSE2 - Packed Mask Creation
3970//===---------------------------------------------------------------------===//
3971
// Byte-mask extraction on opcode 0xD7: each record matches X86movmsk on a
// byte vector and writes a GR32orGR64 destination.
3972let ExeDomain = SSEPackedInt in {
3973
// NOTE(review): the VEX records pass "pmovmskb" without a "v" prefix;
// presumably the VPDI class prepends it - confirm in the format definitions.
// VPMOVMSKBrr also has no explicit Predicates, unlike VPMOVMSKBYrr below.
3974def VPMOVMSKBrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
3975           (ins VR128:$src),
3976           "pmovmskb\t{$src, $dst|$dst, $src}",
3977           [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
3978           Sched<[WriteVecMOVMSK]>, VEX, VEX_WIG;
3979
// 256-bit source (v32i8 in VR256), VEX_L set, separate WriteVecMOVMSKY
// scheduling class.
3980let Predicates = [HasAVX2] in {
3981def VPMOVMSKBYrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
3982           (ins VR256:$src),
3983           "pmovmskb\t{$src, $dst|$dst, $src}",
3984           [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>,
3985           Sched<[WriteVecMOVMSKY]>, VEX, VEX_L, VEX_WIG;
3986}
3987
// Non-VEX form on the PDI class.
3988def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
3989           "pmovmskb\t{$src, $dst|$dst, $src}",
3990           [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
3991           Sched<[WriteVecMOVMSK]>;
3992
3993} // ExeDomain = SSEPackedInt
3994
3995//===---------------------------------------------------------------------===//
3996// SSE2 - Conditional Store
3997//===---------------------------------------------------------------------===//
3998
// Masked store on opcode 0xF7. Every record has no outputs, takes
// ($src, $mask) in VR128 and matches int_x86_sse2_maskmov_dqu with an
// implicit EDI or RDI third operand, which is also listed in `Uses`.
// Three records exist per encoding (VEX and non-VEX):
//   <name>     - Not64BitMode, uses EDI.
//   <name>64   - In64BitMode,  uses RDI, tagged AdSize64.
//   <name>X32  - In64BitMode,  uses EDI, tagged AdSize32, asm string carries
//                an "addr32" prefix and AsmVariantName is "NonParsable".
3999let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
// NOTE(review): the VEX records pass "maskmovdqu" without a "v" prefix;
// presumably the VPDI class prepends it - confirm in the format definitions.
4000let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in
4001def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
4002           (ins VR128:$src, VR128:$mask),
4003           "maskmovdqu\t{$mask, $src|$src, $mask}",
4004           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
4005           VEX, VEX_WIG;
4006let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
4007def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
4008           (ins VR128:$src, VR128:$mask),
4009           "maskmovdqu\t{$mask, $src|$src, $mask}",
4010           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>,
4011           VEX, VEX_WIG, AdSize64;
// The asm string argument is empty and AsmString is overridden in the body
// with the full "addr32 vmaskmovdqu" text.
// NOTE(review): presumably this avoids a class-prepended "v" landing in front
// of "addr32" - confirm.
4012let Uses = [EDI], Predicates = [HasAVX,In64BitMode] in
4013def VMASKMOVDQUX32 : VPDI<0xF7, MRMSrcReg, (outs),
4014           (ins VR128:$src, VR128:$mask), "",
4015           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
4016           VEX, VEX_WIG, AdSize32 {
4017  let AsmString = "addr32 vmaskmovdqu\t{$mask, $src|$src, $mask}";
4018  let AsmVariantName = "NonParsable";
4019}
4020
// Non-VEX records on the PDI class, guarded by UseSSE2.
4021let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in
4022def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
4023           "maskmovdqu\t{$mask, $src|$src, $mask}",
4024           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>;
4025let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
4026def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
4027           "maskmovdqu\t{$mask, $src|$src, $mask}",
4028           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>,
4029           AdSize64;
// Here the "addr32" text is passed directly as the asm string argument.
4030let Uses = [EDI], Predicates = [UseSSE2,In64BitMode] in
4031def MASKMOVDQUX32 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
4032           "addr32 maskmovdqu\t{$mask, $src|$src, $mask}",
4033           [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
4034           AdSize32 {
4035  let AsmVariantName = "NonParsable";
4036}
4037
4038} // ExeDomain = SSEPackedInt
4039
4040//===---------------------------------------------------------------------===//
4041// SSE2 - Move Doubleword/Quadword
4042//===---------------------------------------------------------------------===//
4043
4044//===---------------------------------------------------------------------===//
4045// Move Int Doubleword to Packed Double Int
4046//
// Moves from a general-purpose register or memory into an XMM/FR64 register,
// all on opcode 0x6E. The first five records are the VEX forms (VS2I / VRS2I
// classes plus VEX); the last five are their non-VEX counterparts (S2I /
// RS2I) with otherwise identical operands and patterns.
// NOTE(review): the VEX records pass "movd"/"movq" without a "v" prefix;
// presumably the VS2I/VRS2I classes prepend it - confirm in the format
// definitions.
4047let ExeDomain = SSEPackedInt in {
// GR32 -> v4i32 via scalar_to_vector.
4048def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
4049                        "movd\t{$src, $dst|$dst, $src}",
4050                        [(set VR128:$dst,
4051                          (v4i32 (scalar_to_vector GR32:$src)))]>,
4052                          VEX, Sched<[WriteVecMoveFromGpr]>;
// 32-bit load -> v4i32 via scalar_to_vector.
4053def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
4054                        "movd\t{$src, $dst|$dst, $src}",
4055                        [(set VR128:$dst,
4056                          (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
4057                        VEX, Sched<[WriteVecLoad]>;
// GR64 -> v2i64 via scalar_to_vector.
4058def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
4059                          "movq\t{$src, $dst|$dst, $src}",
4060                          [(set VR128:$dst,
4061                            (v2i64 (scalar_to_vector GR64:$src)))]>,
4062                          VEX, Sched<[WriteVecMoveFromGpr]>;
// Pattern-less 64-bit memory form: isCodeGenOnly with ForceDisassemble set,
// and mayLoad/hasSideEffects given explicitly since there is no pattern to
// infer them from.
4063let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
4064def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4065                          "movq\t{$src, $dst|$dst, $src}", []>,
4066                          VEX, Sched<[WriteVecLoad]>;
// GR64 -> FR64 bitconvert; codegen-only.
4067let isCodeGenOnly = 1 in
4068def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
4069                         "movq\t{$src, $dst|$dst, $src}",
4070                         [(set FR64:$dst, (bitconvert GR64:$src))]>,
4071                         VEX, Sched<[WriteVecMoveFromGpr]>;
4072
// Non-VEX counterparts of the five records above.
4073def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
4074                      "movd\t{$src, $dst|$dst, $src}",
4075                      [(set VR128:$dst,
4076                        (v4i32 (scalar_to_vector GR32:$src)))]>,
4077                      Sched<[WriteVecMoveFromGpr]>;
4078def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
4079                      "movd\t{$src, $dst|$dst, $src}",
4080                      [(set VR128:$dst,
4081                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
4082                      Sched<[WriteVecLoad]>;
4083def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
4084                        "movq\t{$src, $dst|$dst, $src}",
4085                        [(set VR128:$dst,
4086                          (v2i64 (scalar_to_vector GR64:$src)))]>,
4087                        Sched<[WriteVecMoveFromGpr]>;
4088let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
4089def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4090                        "movq\t{$src, $dst|$dst, $src}", []>,
4091                        Sched<[WriteVecLoad]>;
4092let isCodeGenOnly = 1 in
4093def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
4094                       "movq\t{$src, $dst|$dst, $src}",
4095                       [(set FR64:$dst, (bitconvert GR64:$src))]>,
4096                       Sched<[WriteVecMoveFromGpr]>;
4097} // ExeDomain = SSEPackedInt
4098
4099//===---------------------------------------------------------------------===//
4100// Move Int Doubleword to Single Scalar
4101//
// GR32 -> FR32 bitconvert on opcode 0x6E, in a VEX (VS2I + VEX) and a non-VEX
// (S2I) flavour. Both records are codegen-only via the enclosing `let`.
4102let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
4103  def VMOVDI2SSrr  : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
4104                        "movd\t{$src, $dst|$dst, $src}",
4105                        [(set FR32:$dst, (bitconvert GR32:$src))]>,
4106                        VEX, Sched<[WriteVecMoveFromGpr]>;
4107
4108  def MOVDI2SSrr  : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
4109                        "movd\t{$src, $dst|$dst, $src}",
4110                        [(set FR32:$dst, (bitconvert GR32:$src))]>,
4111                        Sched<[WriteVecMoveFromGpr]>;
4112
4113} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4114
4115//===---------------------------------------------------------------------===//
4116// Move Packed Doubleword Int first element to Doubleword Int (GR32 or memory)
4117//
// Opcode 0x7E records that take element 0 of a v4i32 in VR128 and either
// write it to a GR32 (`rr`, MRMDestReg) or store it as an i32 to memory
// (`mr`, MRMDestMem). The first two are the VEX forms, the last two the
// non-VEX forms.
4118let ExeDomain = SSEPackedInt in {
4119def VMOVPDI2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
4120                         "movd\t{$src, $dst|$dst, $src}",
4121                         [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
4122                                          (iPTR 0)))]>, VEX,
4123                         Sched<[WriteVecMoveToGpr]>;
4124def VMOVPDI2DImr  : VS2I<0x7E, MRMDestMem, (outs),
4125                         (ins i32mem:$dst, VR128:$src),
4126                         "movd\t{$src, $dst|$dst, $src}",
4127                         [(store (i32 (extractelt (v4i32 VR128:$src),
4128                                       (iPTR 0))), addr:$dst)]>,
4129                         VEX, Sched<[WriteVecStore]>;
4130def MOVPDI2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
4131                       "movd\t{$src, $dst|$dst, $src}",
4132                       [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
4133                                        (iPTR 0)))]>,
4134                   Sched<[WriteVecMoveToGpr]>;
4135def MOVPDI2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
4136                       "movd\t{$src, $dst|$dst, $src}",
4137                       [(store (i32 (extractelt (v4i32 VR128:$src),
4138                                     (iPTR 0))), addr:$dst)]>,
4139                       Sched<[WriteVecStore]>;
4140} // ExeDomain = SSEPackedInt
4141
4142//===---------------------------------------------------------------------===//
4143// Move Packed Quadword Int first element to Quadword Int (GR64 or i64mem)
4144//
// The rr forms set a GR64 from element 0 of the v2i64 source and share
// SchedRW = [WriteVecMoveToGpr].
4145let ExeDomain = SSEPackedInt in {
4146let SchedRW = [WriteVecMoveToGpr] in {
4147def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
4148                          "movq\t{$src, $dst|$dst, $src}",
4149                          [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
4150                                                        (iPTR 0)))]>,
4151                      VEX;
4152
4153def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
4154                        "movq\t{$src, $dst|$dst, $src}",
4155                        [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
4156                                                         (iPTR 0)))]>;
4157} //SchedRW
4158
// The mr forms have an empty pattern list. They are marked isCodeGenOnly and
// ForceDisassemble, and state mayStore = 1 / hasSideEffects = 0 explicitly.
4159let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
4160def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs),
4161                          (ins i64mem:$dst, VR128:$src),
4162                          "movq\t{$src, $dst|$dst, $src}", []>,
4163                          VEX, Sched<[WriteVecStore]>;
4164let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
4165def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4166                        "movq\t{$src, $dst|$dst, $src}", []>,
4167                        Sched<[WriteVecStore]>;
4168} // ExeDomain = SSEPackedInt
4169
4170//===---------------------------------------------------------------------===//
4171// Bitcast FR64 <-> GR64
4172//
// Both defs in this group are the FR64 -> GR64 direction: "movq" (opcode 0x7E)
// whose pattern is a bitconvert from FR64, once with VEX and once without.
// They are isCodeGenOnly.
4173let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
4174  def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
4175                           "movq\t{$src, $dst|$dst, $src}",
4176                           [(set GR64:$dst, (bitconvert FR64:$src))]>,
4177                           VEX, Sched<[WriteVecMoveToGpr]>;
4178
4179  def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
4180                         "movq\t{$src, $dst|$dst, $src}",
4181                         [(set GR64:$dst, (bitconvert FR64:$src))]>,
4182                         Sched<[WriteVecMoveToGpr]>;
4183} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4184
4185//===---------------------------------------------------------------------===//
4186// Move Scalar Single to Doubleword Int
4187//
// "movd" (opcode 0x7E) whose pattern is a bitconvert from FR32 to GR32, once
// with VEX and once without. Both defs are isCodeGenOnly.
4188let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
4189  def VMOVSS2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
4190                        "movd\t{$src, $dst|$dst, $src}",
4191                        [(set GR32:$dst, (bitconvert FR32:$src))]>,
4192                        VEX, Sched<[WriteVecMoveToGpr]>;
4193  def MOVSS2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
4194                        "movd\t{$src, $dst|$dst, $src}",
4195                        [(set GR32:$dst, (bitconvert FR32:$src))]>,
4196                        Sched<[WriteVecMoveToGpr]>;
4197} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
4198
// Selection patterns for the VEX forms: X86vzmovl of a GR32/GR64
// scalar_to_vector, and X86vzload32 at v4i32 and v8i32 (the v8i32 result is
// wrapped in SUBREG_TO_REG on sub_xmm).
4199let Predicates = [UseAVX] in {
4200  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
4201            (VMOVDI2PDIrr GR32:$src)>;
4202
4203  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
4204            (VMOV64toPQIrr GR64:$src)>;
4205
4206  // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
4207  // These instructions also write zeros in the high part of a 256-bit register.
4208  def : Pat<(v4i32 (X86vzload32 addr:$src)),
4209            (VMOVDI2PDIrm addr:$src)>;
4210  def : Pat<(v8i32 (X86vzload32 addr:$src)),
4211            (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
4212}
4213
// The same three 128-bit patterns for the non-VEX forms; no 256-bit case.
4214let Predicates = [UseSSE2] in {
4215  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
4216            (MOVDI2PDIrr GR32:$src)>;
4217
4218  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
4219            (MOV64toPQIrr GR64:$src)>;
4220  def : Pat<(v4i32 (X86vzload32 addr:$src)),
4221            (MOVDI2PDIrm addr:$src)>;
4222}
4223
4224// Before the MC layer of LLVM existed, clang emitted "movd" assembly instead of
4225// "movq" due to a MacOS parsing limitation. In order to parse old assembly, we
4226// add these aliases for the 64-bit GPR <-> XMM moves.
4227def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
4228                (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
4229def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
4230                (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
4231// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
4232def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
4233                (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
4234def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
4235                (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;
4236
4237//===---------------------------------------------------------------------===//
4238// SSE2 - Move Quadword
4239//===---------------------------------------------------------------------===//
4240
4241//===---------------------------------------------------------------------===//
4242// Move Quadword Int to Packed Quadword Int
4243//
4244
4245let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in {
4246def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4247                    "vmovq\t{$src, $dst|$dst, $src}",
4248                    [(set VR128:$dst,
4249                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
4250                    VEX, Requires<[UseAVX]>, VEX_WIG;
4251def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
4252                    "movq\t{$src, $dst|$dst, $src}",
4253                    [(set VR128:$dst,
4254                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
4255                    XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
4256} // ExeDomain, SchedRW
4257
4258//===---------------------------------------------------------------------===//
4259// Move Packed Quadword Int to Quadword Int
4260//
// Store forms (opcode 0xD6): the pattern stores element 0 of the v2i64 source
// to an i64mem operand. Both defs share SchedRW = [WriteVecStore].
4261let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in {
4262def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4263                        "movq\t{$src, $dst|$dst, $src}",
4264                        [(store (i64 (extractelt (v2i64 VR128:$src),
4265                                      (iPTR 0))), addr:$dst)]>,
4266                        VEX, VEX_WIG;
4267def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
4268                      "movq\t{$src, $dst|$dst, $src}",
4269                      [(store (i64 (extractelt (v2i64 VR128:$src),
4270                                    (iPTR 0))), addr:$dst)]>;
4271} // ExeDomain, SchedRW
4272
4273// For disassembler only
// Register-to-register encodings of opcode 0xD6 (MRMDestReg) with an empty
// pattern list; hasSideEffects is stated explicitly.
4274let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
4275    SchedRW = [SchedWriteVecLogic.XMM] in {
4276def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
4277                     "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_WIG;
4278def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
4279                      "movq\t{$src, $dst|$dst, $src}", []>;
4280}
4281
// The ".s" mnemonic spellings name the 0xD6 register forms defined above.
4282def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
4283                (VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
4284def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
4285                (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
4286
// VEX patterns: X86vzload64 at v2i64 and v4i64 (the latter wrapped in
// SUBREG_TO_REG on sub_xmm) select the movq load; X86vextractstore64 of a
// v2i64 selects the movq store.
4287let Predicates = [UseAVX] in {
4288  def : Pat<(v2i64 (X86vzload64 addr:$src)),
4289            (VMOVQI2PQIrm addr:$src)>;
4290  def : Pat<(v4i64 (X86vzload64 addr:$src)),
4291            (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
4292
4293  def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
4294            (VMOVPQI2QImr addr:$dst, VR128:$src)>;
4295}
4296
// Non-VEX equivalents of the 128-bit load and store patterns.
4297let Predicates = [UseSSE2] in {
4298  def : Pat<(v2i64 (X86vzload64 addr:$src)), (MOVQI2PQIrm addr:$src)>;
4299
4300  def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
4301            (MOVPQI2QImr addr:$dst, VR128:$src)>;
4302}
4303
4304//===---------------------------------------------------------------------===//
4305// Moving from XMM to XMM and clear upper 64 bits. Note: the IA32 document has
4306// a bug here; movq xmm1, xmm2 does clear the high bits.
4307//
// Register forms (opcode 0x7E, XS prefix) whose pattern is X86vzmovl on v2i64.
4308let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
4309def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4310                        "vmovq\t{$src, $dst|$dst, $src}",
4311                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
4312                         XS, VEX, Requires<[UseAVX]>, VEX_WIG;
4313def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4314                        "movq\t{$src, $dst|$dst, $src}",
4315                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
4316                        XS, Requires<[UseSSE2]>;
4317} // ExeDomain, SchedRW
4318
// X86vzmovl on v2f64 selects the same register movq used for v2i64.
4319let Predicates = [UseAVX] in {
4320  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
4321            (VMOVZPQILo2PQIrr VR128:$src)>;
4322}
4323let Predicates = [UseSSE2] in {
4324  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
4325            (MOVZPQILo2PQIrr VR128:$src)>;
4326}
4327
// 256-bit X86vzmovl (v4f64 / v4i64): extract sub_xmm, apply the 128-bit
// VMOVZPQILo2PQIrr, and wrap the result with SUBREG_TO_REG on sub_xmm.
4328let Predicates = [UseAVX] in {
4329  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
4330            (SUBREG_TO_REG (i32 0),
4331             (v2f64 (VMOVZPQILo2PQIrr
4332                     (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
4333             sub_xmm)>;
4334  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
4335            (SUBREG_TO_REG (i32 0),
4336             (v2i64 (VMOVZPQILo2PQIrr
4337                     (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
4338             sub_xmm)>;
4339}
4340
4341//===---------------------------------------------------------------------===//
4342// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
4343//===---------------------------------------------------------------------===//
4344
/// sse3_replicate_sfp - Unary S3SI instruction pair. rr applies OpNode to a
/// register source of class RC (result typed as vt); rm applies it to a
/// mem_frag load from an x86memop operand and is scheduled as sched.Folded.
4345multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
4346                              ValueType vt, RegisterClass RC, PatFrag mem_frag,
4347                              X86MemOperand x86memop, X86FoldableSchedWrite sched> {
4348def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
4349                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4350                      [(set RC:$dst, (vt (OpNode RC:$src)))]>,
4351                      Sched<[sched]>;
4352def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
4353                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4354                      [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>,
4355                      Sched<[sched.Folded]>;
4356}
4357
// VEX instantiations: opcode 0x16 is movshdup, 0x12 is movsldup. The 128-bit
// forms use VR128/loadv4f32/f128mem; the Y forms use VR256/loadv8f32/f256mem
// and add VEX_L.
4358let Predicates = [HasAVX, NoVLX] in {
4359  defm VMOVSHDUP  : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
4360                                       v4f32, VR128, loadv4f32, f128mem,
4361                                       SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
4362  defm VMOVSLDUP  : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
4363                                       v4f32, VR128, loadv4f32, f128mem,
4364                                       SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
4365  defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
4366                                       v8f32, VR256, loadv8f32, f256mem,
4367                                       SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
4368  defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
4369                                       v8f32, VR256, loadv8f32, f256mem,
4370                                       SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
4371}
// Non-VEX instantiations; these use the memopv4f32 fragment for the rm form.
4372defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
4373                                   memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
4374defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
4375                                   memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
4376
// Integer-typed (v4i32 / v8i32) X86Movshdup / X86Movsldup nodes select the
// same VEX instructions, for both register and load operands.
4377let Predicates = [HasAVX, NoVLX] in {
4378  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
4379            (VMOVSHDUPrr VR128:$src)>;
4380  def : Pat<(v4i32 (X86Movshdup (load addr:$src))),
4381            (VMOVSHDUPrm addr:$src)>;
4382  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
4383            (VMOVSLDUPrr VR128:$src)>;
4384  def : Pat<(v4i32 (X86Movsldup (load addr:$src))),
4385            (VMOVSLDUPrm addr:$src)>;
4386  def : Pat<(v8i32 (X86Movshdup VR256:$src)),
4387            (VMOVSHDUPYrr VR256:$src)>;
4388  def : Pat<(v8i32 (X86Movshdup (load addr:$src))),
4389            (VMOVSHDUPYrm addr:$src)>;
4390  def : Pat<(v8i32 (X86Movsldup VR256:$src)),
4391            (VMOVSLDUPYrr VR256:$src)>;
4392  def : Pat<(v8i32 (X86Movsldup (load addr:$src))),
4393            (VMOVSLDUPYrm addr:$src)>;
4394}
4395
// Non-VEX v4i32 equivalents; the memory patterns use memop instead of load.
4396let Predicates = [UseSSE3] in {
4397  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
4398            (MOVSHDUPrr VR128:$src)>;
4399  def : Pat<(v4i32 (X86Movshdup (memop addr:$src))),
4400            (MOVSHDUPrm addr:$src)>;
4401  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
4402            (MOVSLDUPrr VR128:$src)>;
4403  def : Pat<(v4i32 (X86Movsldup (memop addr:$src))),
4404            (MOVSLDUPrm addr:$src)>;
4405}
4406
4407//===---------------------------------------------------------------------===//
4408// SSE3 - Replicate Double FP - MOVDDUP
4409//===---------------------------------------------------------------------===//
4410
/// sse3_replicate_dfp - 128-bit movddup pair (S3DI, opcode 0x12). rr applies
/// X86Movddup to a VR128 source; rm applies it to a scalar_to_vector of a
/// loadf64 from an f64mem operand. Scheduling uses sched.XMM (Folded for rm).
4411multiclass sse3_replicate_dfp<string OpcodeStr, X86SchedWriteWidths sched> {
4412def rr  : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
4413                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4414                    [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))]>,
4415                    Sched<[sched.XMM]>;
4416def rm  : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
4417                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4418                    [(set VR128:$dst,
4419                      (v2f64 (X86Movddup
4420                              (scalar_to_vector (loadf64 addr:$src)))))]>,
4421                    Sched<[sched.XMM.Folded]>;
4422}
4423
4424// FIXME: Merge with above classes when there are patterns for the ymm version
/// sse3_replicate_dfp_y - 256-bit movddup pair (S3DI, opcode 0x12). rr takes a
/// VR256 source; rm takes a full loadv4f64 from an f256mem operand.
/// Scheduling uses sched.YMM (Folded for rm).
4425multiclass sse3_replicate_dfp_y<string OpcodeStr, X86SchedWriteWidths sched> {
4426def rr  : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
4427                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4428                    [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
4429                    Sched<[sched.YMM]>;
4430def rm  : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
4431                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4432                    [(set VR256:$dst,
4433                      (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
4434                    Sched<[sched.YMM.Folded]>;
4435}
4436
// VEX movddup: 128-bit (VMOVDDUP) and 256-bit (VMOVDDUPY, adds VEX_L).
4437let Predicates = [HasAVX, NoVLX] in {
4438  defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup", SchedWriteFShuffle>,
4439                                      VEX, VEX_WIG;
4440  defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup", SchedWriteFShuffle>,
4441                                        VEX, VEX_L, VEX_WIG;
4442}
4443
// Non-VEX movddup; instantiated outside any Predicates block in this section.
4444defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;
4445
4446
// Select the memory form of VMOVDDUP for a movddup of an X86vzload64. The
// predicates come solely from the enclosing let, matching the UseSSE3
// counterpart of this pattern.
4447let Predicates = [HasAVX, NoVLX] in {
4448  def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
4449            (VMOVDDUPrm addr:$src)>;
4450}
4451
// Select the memory form of MOVDDUP for a movddup of an X86vzload64.
4452let Predicates = [UseSSE3] in {
4453  def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
4454            (MOVDDUPrm addr:$src)>;
4455}
4456
4457//===---------------------------------------------------------------------===//
4458// SSE3 - Move Unaligned Integer
4459//===---------------------------------------------------------------------===//
4460
// VEX lddqu (S3DI, opcode 0xF0), load-only: the 128-bit form matches
// int_x86_sse3_ldu_dq and the 256-bit form matches int_x86_avx_ldu_dq_256.
4461let Predicates = [HasAVX] in {
4462  def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
4463                      "vlddqu\t{$src, $dst|$dst, $src}",
4464                      [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
4465                      Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
4466  def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
4467                       "vlddqu\t{$src, $dst|$dst, $src}",
4468                       [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
4469                       Sched<[SchedWriteVecMoveLS.YMM.RM]>, VEX, VEX_L, VEX_WIG;
4470} // Predicates
4471
// Non-VEX lddqu, matching the same 128-bit intrinsic.
4472def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
4473                   "lddqu\t{$src, $dst|$dst, $src}",
4474                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
4475                   Sched<[SchedWriteVecMoveLS.XMM.RM]>;
4476
4477//===---------------------------------------------------------------------===//
4478// SSE3 - Arithmetic
4479//===---------------------------------------------------------------------===//
4480
/// sse3_addsub - Binary X86Addsub instruction pair (opcode 0xD0). rr takes two
/// RC registers; rm takes RC and an ld_frag load from x86memop. Is2Addr picks
/// the two-operand asm string (default) or the three-operand one. Both defs
/// declare Uses = [MXCSR] and mayRaiseFPException = 1.
4481multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC,
4482                       X86MemOperand x86memop, X86FoldableSchedWrite sched,
4483                       PatFrag ld_frag, bit Is2Addr = 1> {
4484let Uses = [MXCSR], mayRaiseFPException = 1 in {
4485  def rr : I<0xD0, MRMSrcReg,
4486       (outs RC:$dst), (ins RC:$src1, RC:$src2),
4487       !if(Is2Addr,
4488           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4489           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4490       [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))]>,
4491       Sched<[sched]>;
4492  def rm : I<0xD0, MRMSrcMem,
4493       (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4494       !if(Is2Addr,
4495           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4496           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4497       [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>,
4498       Sched<[sched.Folded, sched.ReadAfterFold]>;
4499}
4500}
4501
// VEX addsub: three-operand asm (Is2Addr = 0), load fragments, VEX_4V. The PS
// forms carry the XD prefix and the PD forms carry PD; Y forms add VEX_L.
4502let Predicates = [HasAVX] in {
4503  let ExeDomain = SSEPackedSingle in {
4504    defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem,
4505                                 SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>,
4506                                 XD, VEX_4V, VEX_WIG;
4507    defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem,
4508                                  SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>,
4509                                  XD, VEX_4V, VEX_L, VEX_WIG;
4510  }
4511  let ExeDomain = SSEPackedDouble in {
4512    defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem,
4513                                 SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>,
4514                                 PD, VEX_4V, VEX_WIG;
4515    defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem,
4516                                  SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>,
4517                                  PD, VEX_4V, VEX_L, VEX_WIG;
4518  }
4519}
// Non-VEX addsub: default two-operand asm with $src1 tied to $dst, and memop
// fragments for the rm forms.
4520let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
4521  let ExeDomain = SSEPackedSingle in
4522  defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem,
4523                              SchedWriteFAddSizes.PS.XMM, memopv4f32>, XD;
4524  let ExeDomain = SSEPackedDouble in
4525  defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem,
4526                              SchedWriteFAddSizes.PD.XMM, memopv2f64>, PD;
4527}
4528
4529//===---------------------------------------------------------------------===//
4530// SSE3 Instructions
4531//===---------------------------------------------------------------------===//
4532
4533// Horizontal ops
/// S3D_Int - Binary instruction pair built on the S3DI base class. rr applies
/// OpNode to two RC registers; rm applies it to RC and an ld_frag load from
/// x86memop. Is2Addr picks the two-operand asm string (default) or the
/// three-operand one. Both defs declare Uses = [MXCSR] and
/// mayRaiseFPException = 1.
4534multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
4535                   X86MemOperand x86memop, SDNode OpNode,
4536                   X86FoldableSchedWrite sched, PatFrag ld_frag,
4537                   bit Is2Addr = 1> {
4538let Uses = [MXCSR], mayRaiseFPException = 1 in {
4539  def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
4540       !if(Is2Addr,
4541         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4542         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4543      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
4544      Sched<[sched]>;
4545
4546  def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4547       !if(Is2Addr,
4548         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4549         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4550      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
4551      Sched<[sched.Folded, sched.ReadAfterFold]>;
4552}
4553}
/// S3_Int - Same shape as S3D_Int (rr/rm pair over OpNode, Is2Addr-selected
/// asm string, Uses = [MXCSR], mayRaiseFPException = 1) but built on the S3I
/// base class instead of S3DI.
4554multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
4555                  X86MemOperand x86memop, SDNode OpNode,
4556                  X86FoldableSchedWrite sched, PatFrag ld_frag,
4557                  bit Is2Addr = 1> {
4558let Uses = [MXCSR], mayRaiseFPException = 1 in {
4559  def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
4560       !if(Is2Addr,
4561         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4562         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4563      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
4564        Sched<[sched]>;
4565
4566  def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
4567       !if(Is2Addr,
4568         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4569         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4570      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
4571        Sched<[sched.Folded, sched.ReadAfterFold]>;
4572}
4573}
4574
// VEX horizontal add/sub: opcode 0x7C is hadd (X86fhadd), 0x7D is hsub
// (X86fhsub). PS forms use S3D_Int and PD forms use S3_Int. All pass
// Is2Addr = 0 and load fragments; Y forms use WriteFHAddY and add VEX_L.
4575let Predicates = [HasAVX] in {
4576  let ExeDomain = SSEPackedSingle in {
4577    defm VHADDPS  : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
4578                            X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
4579    defm VHSUBPS  : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
4580                            X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG;
4581    defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
4582                            X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
4583    defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
4584                            X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
4585  }
4586  let ExeDomain = SSEPackedDouble in {
4587    defm VHADDPD  : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem,
4588                           X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
4589    defm VHSUBPD  : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem,
4590                           X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG;
4591    defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem,
4592                           X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
4593    defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem,
4594                           X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
4595  }
4596}
4597
// Non-VEX horizontal add/sub: default two-operand asm with $src1 tied to
// $dst, and memop fragments for the rm forms.
4598let Constraints = "$src1 = $dst" in {
4599  let ExeDomain = SSEPackedSingle in {
4600    defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
4601                          WriteFHAdd, memopv4f32>;
4602    defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
4603                          WriteFHAdd, memopv4f32>;
4604  }
4605  let ExeDomain = SSEPackedDouble in {
4606    defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
4607                         WriteFHAdd, memopv2f64>;
4608    defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
4609                         WriteFHAdd, memopv2f64>;
4610  }
4611}
4612
4613//===---------------------------------------------------------------------===//
4614// SSSE3 - Packed Absolute Instructions
4615//===---------------------------------------------------------------------===//
4616
4617/// SS3I_unop_rm - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
/// Defines a 128-bit SS38I pair: rr applies OpNode to a VR128 source, rm
/// applies it to an ld_frag load from i128mem. Scheduling uses sched.XMM
/// (Folded for rm).
4618multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
4619                        SDNode OpNode, X86SchedWriteWidths sched, PatFrag ld_frag> {
4620  def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
4621                 (ins VR128:$src),
4622                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4623                 [(set VR128:$dst, (vt (OpNode VR128:$src)))]>,
4624                 Sched<[sched.XMM]>;
4625
4626  def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
4627                 (ins i128mem:$src),
4628                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4629                 [(set VR128:$dst,
4630                   (vt (OpNode (ld_frag addr:$src))))]>,
4631                 Sched<[sched.XMM.Folded]>;
4632}
4633
4634/// SS3I_unop_rm_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
/// Defines the 256-bit SS38I pair Yrr/Yrm over VR256 and i256mem. The memory
/// form always uses the plain load fragment. Scheduling uses sched.YMM
/// (Folded for Yrm).
4635multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
4636                          SDNode OpNode, X86SchedWriteWidths sched> {
4637  def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
4638                  (ins VR256:$src),
4639                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4640                  [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
4641                  Sched<[sched.YMM]>;
4642
4643  def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
4644                  (ins i256mem:$src),
4645                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
4646                  [(set VR256:$dst,
4647                    (vt (OpNode (load addr:$src))))]>,
4648                  Sched<[sched.YMM.Folded]>;
4649}
4650
// VEX 128-bit pabs (opcodes 0x1C/0x1D/0x1E for B/W/D) over the abs node. The
// B and W forms are predicated on NoVLX_Or_NoBWI, the D form on NoVLX.
4651let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
4652  defm VPABSB  : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU,
4653                              load>, VEX, VEX_WIG;
4654  defm VPABSW  : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU,
4655                              load>, VEX, VEX_WIG;
4656}
4657let Predicates = [HasAVX, NoVLX] in {
4658  defm VPABSD  : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU,
4659                              load>, VEX, VEX_WIG;
4660}
// VEX 256-bit pabs under HasAVX2. These reuse the VPABS* defm names; the
// multiclass contributes the Yrr/Yrm suffixes.
4661let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4662  defm VPABSB  : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>,
4663                                VEX, VEX_L, VEX_WIG;
4664  defm VPABSW  : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SchedWriteVecALU>,
4665                                VEX, VEX_L, VEX_WIG;
4666}
4667let Predicates = [HasAVX2, NoVLX] in {
4668  defm VPABSD  : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SchedWriteVecALU>,
4669                                VEX, VEX_L, VEX_WIG;
4670}
4671
// Non-VEX pabs; the rm forms use the memop fragment.
4672defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU,
4673                          memop>;
4674defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU,
4675                          memop>;
4676defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU,
4677                          memop>;
4678
4679//===---------------------------------------------------------------------===//
4680// SSSE3 - Packed Binary Operator Instructions
4681//===---------------------------------------------------------------------===//
4682
4683/// SS3I_binop_rm - Simple SSSE3 bin op
/// Defines an SS38I pair over OpNode with result type DstVT and first-operand
/// type OpVT. rr takes two RC registers and is marked isCommutable = 1 inside
/// the multiclass; rm takes RC and a memop_frag load from x86memop. Is2Addr
/// picks the two-operand asm string (default) or the three-operand one.
4684multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
4685                         ValueType DstVT, ValueType OpVT, RegisterClass RC,
4686                         PatFrag memop_frag, X86MemOperand x86memop,
4687                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
4688  let isCommutable = 1 in
4689  def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
4690       (ins RC:$src1, RC:$src2),
4691       !if(Is2Addr,
4692         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4693         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4694       [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))]>,
4695       Sched<[sched]>;
4696  def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
4697       (ins RC:$src1, x86memop:$src2),
4698       !if(Is2Addr,
4699         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4700         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4701       [(set RC:$dst,
4702         (DstVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))]>,
4703       Sched<[sched.Folded, sched.ReadAfterFold]>;
4704}
4705
4706/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
/// Intrinsic-based 128-bit SS38I pair: the patterns call IntId128 directly on
/// VR128 operands (rr) or on VR128 and an ld_frag load from i128mem (rm).
/// Is2Addr picks the two-operand asm string (default) or the three-operand
/// one.
4707multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
4708                             Intrinsic IntId128, X86FoldableSchedWrite sched,
4709                             PatFrag ld_frag, bit Is2Addr = 1> {
4710  let isCommutable = 1 in
4711  def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
4712       (ins VR128:$src1, VR128:$src2),
4713       !if(Is2Addr,
4714         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4715         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4716       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
4717       Sched<[sched]>;
4718  def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
4719       (ins VR128:$src1, i128mem:$src2),
4720       !if(Is2Addr,
4721         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
4722         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
4723       [(set VR128:$dst,
4724         (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
4725       Sched<[sched.Folded, sched.ReadAfterFold]>;
4726}
4727
/// SS3I_binop_rm_int_y - Intrinsic-based 256-bit SS38I pair Yrr/Yrm. The
/// patterns call IntId256 on VR256 operands, with Yrm using a plain load from
/// i256mem. There is no Is2Addr parameter: the asm string is always the
/// three-operand form.
4728multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
4729                               Intrinsic IntId256,
4730                               X86FoldableSchedWrite sched> {
4731  let isCommutable = 1 in
4732  def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
4733       (ins VR256:$src1, VR256:$src2),
4734       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4735       [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
4736       Sched<[sched]>;
4737  def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
4738       (ins VR256:$src1, i256mem:$src2),
4739       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
4740       [(set VR256:$dst,
4741         (IntId256 VR256:$src1, (load addr:$src2)))]>,
4742       Sched<[sched.Folded, sched.ReadAfterFold]>;
4743}
4744
// VEX 128-bit SSSE3 binops predicated on NoVLX_Or_NoBWI, all with
// three-operand asm (Is2Addr = 0). VPSHUFB and VPMADDUBSW are wrapped in
// isCommutable = 0; VPMULHRSW is instantiated outside that wrapper.
4745let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
4746let isCommutable = 0 in {
4747  defm VPSHUFB    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
4748                                  VR128, load, i128mem,
4749                                  SchedWriteVarShuffle.XMM, 0>, VEX_4V, VEX_WIG;
4750  defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
4751                                  v16i8, VR128, load, i128mem,
4752                                  SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
4753}
4754defm VPMULHRSW    : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
4755                                  VR128, load, i128mem,
4756                                  SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
4757}
4758
// VEX 128-bit horizontal add/sub and sign ops under HasAVX, all wrapped in
// isCommutable = 0 and using three-operand asm (Is2Addr = 0). PHADD/PHSUB W/D
// go through the X86hadd / X86hsub nodes; PSIGN and the saturating
// PHADDSW/PHSUBSW go through int_x86_ssse3_* intrinsics.
4759let ImmT = NoImm, Predicates = [HasAVX] in {
4760let isCommutable = 0 in {
4761  defm VPHADDW    : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
4762                                  load, i128mem,
4763                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4764  defm VPHADDD    : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
4765                                  load, i128mem,
4766                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4767  defm VPHSUBW    : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
4768                                  load, i128mem,
4769                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4770  defm VPHSUBD    : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
4771                                  load, i128mem,
4772                                  SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
4773  defm VPSIGNB    : SS3I_binop_rm_int<0x08, "vpsignb",
4774                                      int_x86_ssse3_psign_b_128,
4775                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
4776  defm VPSIGNW    : SS3I_binop_rm_int<0x09, "vpsignw",
4777                                      int_x86_ssse3_psign_w_128,
4778                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
4779  defm VPSIGND    : SS3I_binop_rm_int<0x0A, "vpsignd",
4780                                      int_x86_ssse3_psign_d_128,
4781                                      SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
4782  defm VPHADDSW   : SS3I_binop_rm_int<0x03, "vphaddsw",
4783                                      int_x86_ssse3_phadd_sw_128,
4784                                      SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
4785  defm VPHSUBSW   : SS3I_binop_rm_int<0x07, "vphsubsw",
4786                                      int_x86_ssse3_phsub_sw_128,
4787                                      SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
4788}
4789}
4790
// VEX.L-encoded 256-bit (VR256 / i256mem) counterparts of VPSHUFB,
// VPMADDUBSW and VPMULHRSW, gated on [HasAVX2, NoVLX_Or_NoBWI]. As in the
// 128-bit group, only VPMULHRSWY is outside the isCommutable = 0 wrapper.
4791let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4792let isCommutable = 0 in {
4793  defm VPSHUFBY   : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
4794                                  VR256, load, i256mem,
4795                                  SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4796  defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
4797                                   v32i8, VR256, load, i256mem,
4798                                   SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4799}
4800defm VPMULHRSWY   : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
4801                                  VR256, load, i256mem,
4802                                  SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4803}
4804
// VEX.L-encoded 256-bit horizontal add/sub and sign definitions, gated on
// [HasAVX2] and wrapped in isCommutable = 0.
//   VPHADD{W,D}Y / VPHSUB{W,D}Y select on X86hadd / X86hsub.
//   The SS3I_binop_rm_int_y users keep the un-suffixed defm name (VPSIGNB,
//   VPHADDSW, ...) because that multiclass names its records Yrr / Yrm, so
//   the resulting records are e.g. VPSIGNBYrr / VPSIGNBYrm.
4805let ImmT = NoImm, Predicates = [HasAVX2] in {
4806let isCommutable = 0 in {
4807  defm VPHADDWY   : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
4808                                  VR256, load, i256mem,
4809                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4810  defm VPHADDDY   : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
4811                                  load, i256mem,
4812                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4813  defm VPHSUBWY   : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
4814                                  VR256, load, i256mem,
4815                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4816  defm VPHSUBDY   : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
4817                                  load, i256mem,
4818                                  SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4819  defm VPSIGNB   : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
4820                                       SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
4821  defm VPSIGNW   : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
4822                                       SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
4823  defm VPSIGND   : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
4824                                       SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
4825  defm VPHADDSW  : SS3I_binop_rm_int_y<0x03, "vphaddsw",
4826                                       int_x86_avx2_phadd_sw,
4827                                       SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
4828  defm VPHSUBSW  : SS3I_binop_rm_int_y<0x07, "vphsubsw",
4829                                       int_x86_avx2_phsub_sw,
4830                                       SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
4831}
4832}
4833
4834// None of these have i8 immediate fields.
// Legacy (non-VEX) 128-bit SSSE3 definitions. Every record is tied
// two-address ("$src1 = $dst") and uses the `memop` fragment for its memory
// form. Everything except PMULHRSW is wrapped in isCommutable = 0.
4835let ImmT = NoImm, Constraints = "$src1 = $dst" in {
4836let isCommutable = 0 in {
4837  defm PHADDW    : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
4838                                 memop, i128mem, SchedWritePHAdd.XMM>;
4839  defm PHADDD    : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
4840                                 memop, i128mem, SchedWritePHAdd.XMM>;
4841  defm PHSUBW    : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
4842                                 memop, i128mem, SchedWritePHAdd.XMM>;
4843  defm PHSUBD    : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
4844                                 memop, i128mem, SchedWritePHAdd.XMM>;
4845  defm PSIGNB    : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
4846                                     SchedWriteVecALU.XMM, memop>;
4847  defm PSIGNW    : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
4848                                     SchedWriteVecALU.XMM, memop>;
4849  defm PSIGND    : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
4850                                     SchedWriteVecALU.XMM, memop>;
4851  defm PSHUFB    : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
4852                                 memop, i128mem, SchedWriteVarShuffle.XMM>;
4853  defm PHADDSW   : SS3I_binop_rm_int<0x03, "phaddsw",
4854                                     int_x86_ssse3_phadd_sw_128,
4855                                     SchedWritePHAdd.XMM, memop>;
4856  defm PHSUBSW   : SS3I_binop_rm_int<0x07, "phsubsw",
4857                                     int_x86_ssse3_phsub_sw_128,
4858                                     SchedWritePHAdd.XMM, memop>;
4859  defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
4860                                 v16i8, VR128, memop, i128mem,
4861                                 SchedWriteVecIMul.XMM>;
4862}
4863defm PMULHRSW    : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
4864                                 VR128, memop, i128mem, SchedWriteVecIMul.XMM>;
4865}
4866
4867//===---------------------------------------------------------------------===//
4868// SSSE3 - Packed Align Instruction Patterns
4869//===---------------------------------------------------------------------===//
4870
/// ssse3_palignr - register (rri) and memory (rmi) forms of opcode 0x0F with
/// a trailing u8imm operand, both selecting on X86PAlignr.
///   asm        - mnemonic placed in front of the operand string.
///   VT / RC    - result value type and register class of dst/src1/src2.
///   memop_frag - load fragment used to match the memory operand of rmi.
///   x86memop   - memory operand class of rmi's $src2.
///   sched      - scheduling class; rmi uses its Folded / ReadAfterFold parts.
///   Is2Addr    - 1 selects the two-operand asm string (no $src1 printed),
///                0 selects the three-operand string.
4871multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
4872                         PatFrag memop_frag, X86MemOperand x86memop,
4873                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
4874  let hasSideEffects = 0 in {
4875  def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst),
4876      (ins RC:$src1, RC:$src2, u8imm:$src3),
4877      !if(Is2Addr,
4878        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
4879        !strconcat(asm,
4880                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
4881      [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 timm:$src3))))]>,
4882      Sched<[sched]>;
4883  let mayLoad = 1 in
4884  def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst),
4885      (ins RC:$src1, x86memop:$src2, u8imm:$src3),
4886      !if(Is2Addr,
4887        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
4888        !strconcat(asm,
4889                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
4890      [(set RC:$dst, (VT (X86PAlignr RC:$src1,
4891                                     (memop_frag addr:$src2),
4892                                     (i8 timm:$src3))))]>,
4893      Sched<[sched.Folded, sched.ReadAfterFold]>;
4894  }
4895}
4896
// Three instantiations of ssse3_palignr:
//   VPALIGNR  - VEX, 128-bit, three-operand asm (Is2Addr = 0), `load`.
//   VPALIGNRY - VEX.L, 256-bit, three-operand asm (Is2Addr = 0), `load`.
//   PALIGNR   - legacy, 128-bit, tied "$src1 = $dst", default Is2Addr, `memop`.
4897let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
4898  defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem,
4899                                SchedWriteShuffle.XMM, 0>, VEX_4V, VEX_WIG;
4900let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
4901  defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem,
4902                                 SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
4903let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
4904  defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem,
4905                               SchedWriteShuffle.XMM>;
4906
4907//===---------------------------------------------------------------------===//
4908// SSE3 - Thread synchronization
4909//===---------------------------------------------------------------------===//
4910
// MONITOR / MWAIT records, all scheduled as WriteSystem and requiring
// HasSSE3. None of them has explicit operands: the registers they read are
// modelled through `Uses`.
//   MONITOR32rrr - Uses EAX, ECX, EDX; only outside 64-bit mode.
//   MONITOR64rrr - Uses RAX, ECX, EDX; only in 64-bit mode.
//   MWAITrr      - Uses ECX, EAX; selected from int_x86_sse3_mwait.
// The two MONITOR records carry no selection pattern.
4911let SchedRW = [WriteSystem] in {
4912let Uses = [EAX, ECX, EDX] in
4913def MONITOR32rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
4914                     TB, Requires<[HasSSE3, Not64BitMode]>;
4915let Uses = [RAX, ECX, EDX] in
4916def MONITOR64rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
4917                     TB, Requires<[HasSSE3, In64BitMode]>;
4918
4919let Uses = [ECX, EAX] in
4920def MWAITrr   : I<0x01, MRM_C9, (outs), (ins), "mwait",
4921                  [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
4922} // SchedRW
4923
// Assembler aliases that accept mwait / monitor written with explicit
// register operands. Each alias maps onto the operand-less record and is
// restricted to the CPU mode whose register names it spells out
// (e*x outside 64-bit mode, r*x in 64-bit mode).
4924def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
4925def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;
4926
4927def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITOR32rrr)>,
4928      Requires<[Not64BitMode]>;
4929def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITOR64rrr)>,
4930      Requires<[In64BitMode]>;
4931
4932//===----------------------------------------------------------------------===//
4933// SSE4.1 - Packed Move with Sign/Zero Extend
4934// NOTE: Any Extend is promoted to Zero Extend in X86ISelDAGToDAG.cpp
4935//===----------------------------------------------------------------------===//
4936
/// SS41I_pmovx_rrrm - one register form (rr) and one memory form (rm) of a
/// packed-extend opcode. Both records are defined with an empty pattern list;
/// selection patterns are attached separately by `def : Pat` records.
///   MemOp        - memory operand class of rm's $src.
///   OutRC / InRC - destination and source register classes.
///   sched        - rr uses sched, rm uses sched.Folded.
4937multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
4938                            RegisterClass OutRC, RegisterClass InRC,
4939                            X86FoldableSchedWrite sched> {
4940  def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
4941                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
4942                 Sched<[sched]>;
4943
4944  def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
4945                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
4946                 Sched<[sched.Folded]>;
4947}
4948
/// SS41I_pmovx_rm_all - expands one packed-extend opcode into three families:
///   NAME      - legacy form, VR128 -> VR128, memory operand MemOp.
///   V#NAME    - VEX form with a "v"-prefixed mnemonic, VR128 -> VR128,
///               gated on [HasAVX, prd].
///   V#NAME#Y  - VEX.L form, VR128 -> VR256, memory operand MemYOp,
///               scheduled as WriteVPMOV256, gated on [HasAVX2, prd].
/// No Predicates are set here for the legacy family.
4949multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
4950                              X86MemOperand MemOp, X86MemOperand MemYOp,
4951                              Predicate prd> {
4952  defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128,
4953                               SchedWriteShuffle.XMM>;
4954  let Predicates = [HasAVX, prd] in
4955    defm V#NAME   : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
4956                                     VR128, VR128, SchedWriteShuffle.XMM>,
4957                                     VEX, VEX_WIG;
4958  let Predicates = [HasAVX2, prd] in
4959    defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
4960                                     VR256, VR128, WriteVPMOV256>,
4961                                     VEX, VEX_L, VEX_WIG;
4962}
4963
/// SS41I_pmovx_rm - defines the sign- and zero-extending families for one
/// element-size suffix (NAME is e.g. BW):
///   PMOVSX#NAME uses opcode `opc` and mnemonic "pmovsx" # OpcodeStr.
///   PMOVZX#NAME uses opcode `opc + 0x10` and mnemonic "pmovzx" # OpcodeStr.
4964multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
4965                          X86MemOperand MemYOp, Predicate prd> {
4966  defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
4967                                        MemOp, MemYOp, prd>;
4968  defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
4969                                        !strconcat("pmovzx", OpcodeStr),
4970                                        MemOp, MemYOp, prd>;
4971}
4972
// Instantiate the six PMOVSX/PMOVZX suffix families. Arguments are: the
// sign-extend opcode, the mnemonic suffix, the memory operand of the 128-bit
// forms, the memory operand of the 256-bit (Y) forms, and the extra predicate
// applied to the VEX forms. The groups below are ordered by extension ratio
// (x2, x4, x8), which is why the memory operand halves from group to group.
4973defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
4974defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>;
4975defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>;
4976
4977defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>;
4978defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;
4979
4980defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;
4981
4982// AVX2 Patterns
/// SS41I_pmovx_avx2_patterns - selection patterns for the 256-bit (Y) packed
/// extend records named OpcPrefix # {BW,BD,BQ,WD,WQ,DQ} # Y{rr,rm}.
///   OpcPrefix - record-name prefix looked up through !cast<I>.
///   ExtTy     - prefix of the ext-load fragment name (ExtTy # "extloadvi8",
///               "extloadvi16", "extloadvi32").
///   ExtOp     - node used where the whole 128-bit source vector is extended
///               (BW, WD, DQ).
///   InVecOp   - node used where only part of the source vector is extended
///               (BD, BQ, WQ).
/// The BW patterns are gated on [HasAVX2, NoVLX_Or_NoBWI]; all others on
/// [HasAVX2, NoVLX].
4983multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy,
4984                                     SDNode ExtOp, SDNode InVecOp> {
4985  // Register-Register patterns
4986  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
4987  def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
4988            (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
4989  }
4990  let Predicates = [HasAVX2, NoVLX] in {
4991  def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))),
4992            (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
4993  def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))),
4994            (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;
4995
4996  def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
4997            (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
4998  def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))),
4999            (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;
5000
5001  def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
5002            (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
5003  }
5004
5005  // Simple Register-Memory patterns
5006  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
5007  def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5008            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
5009
5010  def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
5011            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
5012  }
5013
5014  let Predicates = [HasAVX2, NoVLX] in {
5015  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5016            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5017  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5018            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5019
5020  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5021            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
5022  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5023            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5024
5025  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
5026            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
5027  }
5028
5029  // AVX2 Register-Memory patterns
      // These fold a full-vector load, or a scalar load wrapped in
      // scalar_to_vector / X86vzload and bitcast, into the rm record.
5030  let Predicates = [HasAVX2, NoVLX] in {
5031  def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
5032            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
5033
5034  def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5035            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5036  def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5037            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5038  def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
5039            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
5040
5041  def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
5042            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
5043
5044  def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5045            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
      // NOTE(review): the X86vzload32 result below is typed v2i64, whereas the
      // scalar_to_vector pattern just above types the same 32-bit load as
      // v4i32 - confirm the v2i64 typing is intended.
5046  def : Pat<(v4i64 (InVecOp (bc_v16i8 (v2i64 (X86vzload32 addr:$src))))),
5047            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
5048
5049  def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5050            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5051  def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5052            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5053  def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
5054            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
5055  }
5056}
5057
// Attach the AVX2 patterns to the VPMOVSX*Y / VPMOVZX*Y records: the signed
// set uses sext / sext_invec and "s"-prefixed ext-load fragments, the
// unsigned set uses zext / zext_invec and "z"-prefixed fragments.
5058defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>;
5059defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>;
5060
5061// SSE4.1/AVX patterns.
/// SS41I_pmovx_patterns - selection patterns for the 128-bit packed extend
/// records named OpcPrefix # {BW,BD,BQ,WD,WQ,DQ} # {rr,rm}.
///   OpcPrefix - record-name prefix looked up through !cast<I>.
///   ExtTy     - prefix of the ext-load fragment name (ExtTy # "extloadvi8",
///               "extloadvi16", "extloadvi32").
///   ExtOp     - the extend node matched by every pattern.
/// The body sets Predicates to [HasAVX, NoVLX_Or_NoBWI] for the BW patterns
/// and [HasAVX, NoVLX] for the rest. Three pattern groups follow: register
/// sources, ext-load fragments, and explicit loads (scalar loads wrapped in
/// scalar_to_vector / X86vzload plus full-vector loads).
5062multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
5063                                SDNode ExtOp> {
5064  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5065  def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
5066            (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
5067  }
5068  let Predicates = [HasAVX, NoVLX] in {
5069  def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
5070            (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
5071  def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
5072            (!cast<I>(OpcPrefix#BQrr) VR128:$src)>;
5073
5074  def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
5075            (!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
5076  def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
5077            (!cast<I>(OpcPrefix#WQrr) VR128:$src)>;
5078
5079  def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
5080            (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;
5081  }
5082  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5083  def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5084            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5085  }
5086  let Predicates = [HasAVX, NoVLX] in {
5087  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5088            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5089  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
5090            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5091
5092  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5093            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5094  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
5095            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5096
5097  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
5098            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5099  }
5100  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
5101  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5102            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5103  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5104            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5105  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
5106            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5107  def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))),
5108            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
5109  }
5110  let Predicates = [HasAVX, NoVLX] in {
5111  def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5112            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5113  def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
5114            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5115  def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))),
5116            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
5117
      // The BQ scalar source is matched as extloadi32i16 placed into a v4i32.
5118  def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
5119            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5120  def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))),
5121            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
5122
5123  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5124            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5125  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5126            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5127  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
5128            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5129  def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))),
5130            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
5131
5132  def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
5133            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5134  def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))),
5135            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5136  def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))),
5137            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
5138
5139  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
5140            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5141  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
5142            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5143  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
5144            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5145  def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))),
5146            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
5147  }
5148}
5149
// Attach the 128-bit patterns to the VEX records (VPMOVSX* / VPMOVZX*) and,
// under UseSSE41, to the legacy records (PMOVSX* / PMOVZX*). Both sets match
// the *_invec extend nodes.
// NOTE(review): the multiclass body sets its own HasAVX-based Predicates;
// presumably the enclosing `let Predicates = [UseSSE41]` takes precedence for
// the legacy instantiations - confirm against TableGen's let semantics.
5150defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec>;
5151defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec>;
5152
5153let Predicates = [UseSSE41] in {
5154  defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec>;
5155  defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec>;
5156}
5157
5158//===----------------------------------------------------------------------===//
5159// SSE4.1 - Extract Instructions
5160//===----------------------------------------------------------------------===//
5161
5162/// SS41I_extract8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
///   rr - destination is GR32orGR64; selects on X86pextrb of a v16i8 source.
///   mr - destination is an i8mem operand; selects on a store of the
///        truncated (i8) X86pextrb result. Marked mayStore, no side effects.
/// The element index is the u8imm operand $src2 (matched as timm).
5163multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
5164  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5165                 (ins VR128:$src1, u8imm:$src2),
5166                 !strconcat(OpcodeStr,
5167                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5168                 [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
5169                                         timm:$src2))]>,
5170                  Sched<[WriteVecExtract]>;
5171  let hasSideEffects = 0, mayStore = 1 in
5172  def mr : SS4AIi8<opc, MRMDestMem, (outs),
5173                 (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
5174                 !strconcat(OpcodeStr,
5175                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5176                 [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), timm:$src2))),
5177                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5178}
5179
// Byte extract: VEX form gated on [HasAVX, NoBWI]; the legacy form sets no
// Predicates at this point. Both share opcode 0x14.
5180let Predicates = [HasAVX, NoBWI] in
5181  defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX, VEX_WIG;
5182
5183defm PEXTRB      : SS41I_extract8<0x14, "pextrb">;
5184
5185
5186/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
///   rr_REV - register-destination (MRMDestReg) encoding with no selection
///            pattern; it is isCodeGenOnly but ForceDisassemble, and is
///            linked to NAME#rr through FoldGenData.
///   mr     - destination is an i16mem operand; selects on a store of the
///            truncated (i16) X86pextrw result. Marked mayStore.
5187multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
5188  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
5189  def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5190                   (ins VR128:$src1, u8imm:$src2),
5191                   !strconcat(OpcodeStr,
5192                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
5193                   Sched<[WriteVecExtract]>, FoldGenData<NAME#rr>;
5194
5195  let hasSideEffects = 0, mayStore = 1 in
5196  def mr : SS4AIi8<opc, MRMDestMem, (outs),
5197                 (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
5198                 !strconcat(OpcodeStr,
5199                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5200                 [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), timm:$src2))),
5201                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5202}
5203
// Word extract (opcode 0x15 encodings): VEX form gated on [HasAVX, NoBWI];
// the legacy form sets no Predicates at this point.
5204let Predicates = [HasAVX, NoBWI] in
5205  defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX, VEX_WIG;
5206
5207defm PEXTRW      : SS41I_extract16<0x15, "pextrw">;
5208
5209
5210/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
///   rr - destination is GR32; selects on extractelt of a v4i32 source.
///   mr - destination is an i32mem operand; selects on a store of the
///        extracted element.
/// Unlike the 8/16-bit variants, the index is matched as `imm`, and mr does
/// not set mayStore / hasSideEffects explicitly.
5211multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
5212  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
5213                 (ins VR128:$src1, u8imm:$src2),
5214                 !strconcat(OpcodeStr,
5215                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5216                 [(set GR32:$dst,
5217                  (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
5218                  Sched<[WriteVecExtract]>;
5219  def mr : SS4AIi8<opc, MRMDestMem, (outs),
5220                 (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
5221                 !strconcat(OpcodeStr,
5222                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5223                 [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
5224                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5225}
5226
// Dword extract (opcode 0x16): VEX form gated on [HasAVX, NoDQI]; the legacy
// form sets no Predicates at this point.
5227let Predicates = [HasAVX, NoDQI] in
5228  defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;
5229
5230defm PEXTRD      : SS41I_extract32<0x16, "pextrd">;
5231
5232/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
///   rr - destination is GR64; selects on extractelt of a v2i64 source.
///   mr - destination is an i64mem operand; selects on a store of the
///        extracted element.
/// The index is the u8imm operand $src2, matched as `imm`.
5233multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
5234  def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
5235                 (ins VR128:$src1, u8imm:$src2),
5236                 !strconcat(OpcodeStr,
5237                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5238                 [(set GR64:$dst,
5239                  (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
5240                  Sched<[WriteVecExtract]>;
5241  def mr : SS4AIi8<opc, MRMDestMem, (outs),
5242                 (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
5243                 !strconcat(OpcodeStr,
5244                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5245                 [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
5246                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5247}
5248
// Qword extract: same opcode (0x16) as the dword form; the VEX record adds
// VEX_W and the legacy record adds REX_W. The VEX form is gated on
// [HasAVX, NoDQI].
5249let Predicates = [HasAVX, NoDQI] in
5250  defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;
5251
5252defm PEXTRQ      : SS41I_extract64<0x16, "pextrq">, REX_W;
5253
5254/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
5255/// destination
///   rr - destination is GR32orGR64; the v4f32 source is bitcast to v4i32
///        (bc_v4i32) and the element is selected with extractelt.
///   mr - destination is an f32mem operand; selects on a store of the same
///        bitcast-and-extract expression.
5256multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
5257  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
5258                   (ins VR128:$src1, u8imm:$src2),
5259                   !strconcat(OpcodeStr,
5260                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5261                   [(set GR32orGR64:$dst,
5262                      (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
5263                   Sched<[WriteVecExtract]>;
5264  def mr : SS4AIi8<opc, MRMDestMem, (outs),
5265                   (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
5266                   !strconcat(OpcodeStr,
5267                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
5268                   [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
5269                            addr:$dst)]>, Sched<[WriteVecExtractSt]>;
5270}
5271
// EXTRACTPS (opcode 0x17) in the SSEPackedSingle execution domain: the VEX
// form is gated on [UseAVX]; the legacy form sets no Predicates here.
5272let ExeDomain = SSEPackedSingle in {
5273  let Predicates = [UseAVX] in
5274    defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, VEX_WIG;
5275  defm EXTRACTPS   : SS41I_extractf32<0x17, "extractps">;
5276}
5277
5278//===----------------------------------------------------------------------===//
5279// SSE4.1 - Insert Instructions
5280//===----------------------------------------------------------------------===//
5281
/// SS41I_insert8 - insert an 8-bit value into a VR128 vector, selecting on
/// X86pinsrb with the u8imm index $src3 (matched as timm).
///   rr - value comes from a GR32orGR64 register; scheduled as
///        [WriteVecInsert, ReadDefault, ReadInt2Fpu].
///   rm - value comes from an i8mem operand matched through extloadi8;
///        scheduled with WriteVecInsert.Folded / .ReadAfterFold.
///   Is2Addr - 1 selects the two-operand asm string (no $src1 printed),
///             0 selects the three-operand string.
5282multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
5283  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5284      (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
5285      !if(Is2Addr,
5286        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5287        !strconcat(asm,
5288                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5289      [(set VR128:$dst,
5290        (X86pinsrb VR128:$src1, GR32orGR64:$src2, timm:$src3))]>,
5291      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
5292  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5293      (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
5294      !if(Is2Addr,
5295        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5296        !strconcat(asm,
5297                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5298      [(set VR128:$dst,
5299        (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), timm:$src3))]>,
5300                   Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
5301}
5302
// Byte insert (opcode 0x20): the VEX form is three-operand (Is2Addr = 0) and
// gated on [HasAVX, NoBWI]; the legacy form is tied "$src1 = $dst" and uses
// the default two-operand asm string.
5303let Predicates = [HasAVX, NoBWI] in
5304  defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V, VEX_WIG;
5305let Constraints = "$src1 = $dst" in
5306  defm PINSRB  : SS41I_insert8<0x20, "pinsrb">;
5307
/// SS41I_insert32 - insert a 32-bit value into a v4i32 VR128 vector,
/// selecting on insertelt with the u8imm index $src3 (matched as imm).
///   rr - value comes from a GR32 register; scheduled as
///        [WriteVecInsert, ReadDefault, ReadInt2Fpu].
///   rm - value comes from an i32mem operand matched through loadi32;
///        scheduled with WriteVecInsert.Folded / .ReadAfterFold.
///   Is2Addr - 1 selects the two-operand asm string (no $src1 printed),
///             0 selects the three-operand string.
5308multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
5309  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5310      (ins VR128:$src1, GR32:$src2, u8imm:$src3),
5311      !if(Is2Addr,
5312        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5313        !strconcat(asm,
5314                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5315      [(set VR128:$dst,
5316        (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
5317      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
5318  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5319      (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
5320      !if(Is2Addr,
5321        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5322        !strconcat(asm,
5323                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5324      [(set VR128:$dst,
5325        (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), imm:$src3)))]>,
5326                   Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
5327}
5328
// Dword insert (opcode 0x22): the VEX form is three-operand (Is2Addr = 0) and
// gated on [HasAVX, NoDQI]; the legacy form is tied "$src1 = $dst".
5329let Predicates = [HasAVX, NoDQI] in
5330  defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
5331let Constraints = "$src1 = $dst" in
5332  defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
5333
/// SS41I_insert64 - insert a 64-bit value into a v2i64 VR128 vector,
/// selecting on insertelt with the u8imm index $src3 (matched as imm).
///   rr - value comes from a GR64 register; scheduled as
///        [WriteVecInsert, ReadDefault, ReadInt2Fpu].
///   rm - value comes from an i64mem operand matched through loadi64;
///        scheduled with WriteVecInsert.Folded / .ReadAfterFold.
///   Is2Addr - 1 selects the two-operand asm string (no $src1 printed),
///             0 selects the three-operand string.
5334multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
5335  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5336      (ins VR128:$src1, GR64:$src2, u8imm:$src3),
5337      !if(Is2Addr,
5338        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5339        !strconcat(asm,
5340                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5341      [(set VR128:$dst,
5342        (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
5343      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
5344  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5345      (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
5346      !if(Is2Addr,
5347        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5348        !strconcat(asm,
5349                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5350      [(set VR128:$dst,
5351        (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), imm:$src3)))]>,
5352                   Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
5353}
5354
// Qword insert: same opcode (0x22) as the dword form; the VEX record adds
// VEX_W and the legacy record adds REX_W. The VEX form is three-operand
// (Is2Addr = 0) and gated on [HasAVX, NoDQI]; the legacy form is tied
// "$src1 = $dst".
5355let Predicates = [HasAVX, NoDQI] in
5356  defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
5357let Constraints = "$src1 = $dst" in
5358  defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
5359
5360// insertps has a few different modes, there's the first two here below which
5361// are optimized inserts that won't zero arbitrary elements in the destination
5362// vector. The next one matches the intrinsic and could zero arbitrary elements
5363// in the target vector.
/// SS41I_insertf32 - register and memory forms selecting on X86insertps with
/// the u8imm control operand $src3 (matched as timm).
///   rr - both sources are VR128; defined under isCommutable = 1.
///   rm - second source is an f32mem operand matched as a loadf32 wrapped in
///        scalar_to_vector (v4f32); scheduled with the Folded / ReadAfterFold
///        parts of SchedWriteFShuffle.XMM.
///   Is2Addr - 1 selects the two-operand asm string (no $src1 printed),
///             0 selects the three-operand string.
5364multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
5365  let isCommutable = 1 in
5366  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
5367      (ins VR128:$src1, VR128:$src2, u8imm:$src3),
5368      !if(Is2Addr,
5369        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5370        !strconcat(asm,
5371                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5372      [(set VR128:$dst,
5373        (X86insertps VR128:$src1, VR128:$src2, timm:$src3))]>,
5374      Sched<[SchedWriteFShuffle.XMM]>;
5375  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
5376      (ins VR128:$src1, f32mem:$src2, u8imm:$src3),
5377      !if(Is2Addr,
5378        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
5379        !strconcat(asm,
5380                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
5381      [(set VR128:$dst,
5382        (X86insertps VR128:$src1,
5383                   (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
5384                    timm:$src3))]>,
5385      Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
5386}
5387
// insertps instantiations, both in the packed-single execution domain.
let ExeDomain = SSEPackedSingle in {
  // VEX form, three-operand syntax.
  let Predicates = [UseAVX] in {
    defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V, VEX_WIG;
  }
  // Legacy form, $src1 tied to $dst.
  let Constraints = "$src1 = $dst" in {
    defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>;
  }
}
5395
//===----------------------------------------------------------------------===//
// SSE4.1 - Round Instructions
//===----------------------------------------------------------------------===//
5399
/// sse41_fp_unop_p - packed unary FP operation taking an 8-bit immediate.
/// Defines a register form (`r`) and a memory form (`m`); both are marked as
/// using MXCSR and as possibly raising FP exceptions.
///   x86memop / mem_frag - memory operand and load fragment for the `m` form.
///   RC / VT             - register class and value type of the result.
///   OpNode              - node matched by both selection patterns.
///   sched               - scheduling class; `m` uses its Folded variant.
multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
                           X86MemOperand x86memop, RegisterClass RC,
                           ValueType VT, PatFrag mem_frag, SDPatternOperator OpNode,
                           X86FoldableSchedWrite sched> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  // Vector operation, register source.
  def r : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
            (ins RC:$src1, i32u8imm:$src2),
            OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
            [(set RC:$dst, (VT (OpNode RC:$src1, timm:$src2)))]>,
          Sched<[sched]>;

  // Vector operation, memory source.
  def m : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
            (ins x86memop:$src1, i32u8imm:$src2),
            OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
            [(set RC:$dst,
                  (VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>,
          Sched<[sched.Folded]>;
} // Uses = [MXCSR], mayRaiseFPException = 1
}
5424
/// avx_fp_unop_rm - scalar three-source forms on FR32 (SSr/SSm, opcode opcss)
/// and FR64 (SDr/SDm, opcode opcsd). All four defs have an empty pattern list
/// and are marked isCodeGenOnly with hasSideEffects = 0; the memory forms are
/// additionally marked mayLoad.
multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
                          string OpcodeStr, X86FoldableSchedWrite sched> {
let hasSideEffects = 0, isCodeGenOnly = 1 in {
  let ExeDomain = SSEPackedSingle in {
    def SSr : SS4AIi8<opcss, MRMSrcReg, (outs FR32:$dst),
                (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
                OpcodeStr #
                  "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                []>,
              Sched<[sched]>;

    let mayLoad = 1 in {
      def SSm : SS4AIi8<opcss, MRMSrcMem, (outs FR32:$dst),
                  (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
                  OpcodeStr #
                    "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                  []>,
                Sched<[sched.Folded, sched.ReadAfterFold]>;
    }
  } // ExeDomain = SSEPackedSingle

  let ExeDomain = SSEPackedDouble in {
    def SDr : SS4AIi8<opcsd, MRMSrcReg, (outs FR64:$dst),
                (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
                OpcodeStr #
                  "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                []>,
              Sched<[sched]>;

    let mayLoad = 1 in {
      def SDm : SS4AIi8<opcsd, MRMSrcMem, (outs FR64:$dst),
                  (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
                  OpcodeStr #
                    "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                  []>,
                Sched<[sched.Folded, sched.ReadAfterFold]>;
    }
  } // ExeDomain = SSEPackedDouble
} // hasSideEffects = 0, isCodeGenOnly = 1
}
5457
/// sse41_fp_unop_s - scalar two-operand forms on FR32 (SSr/SSm, opcode opcss)
/// and FR64 (SDr/SDm, opcode opcsd). All four defs have an empty pattern list,
/// are isCodeGenOnly with hasSideEffects = 0, use MXCSR and may raise FP
/// exceptions; the memory forms are additionally marked mayLoad.
/// NOTE(review): SSm/SDm list sched.ReadAfterFold although their only inputs
/// are a memory operand and an immediate - confirm this is intended.
multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
                           string OpcodeStr, X86FoldableSchedWrite sched> {
let Uses = [MXCSR], mayRaiseFPException = 1,
    hasSideEffects = 0, isCodeGenOnly = 1 in {
  let ExeDomain = SSEPackedSingle in {
    def SSr : SS4AIi8<opcss, MRMSrcReg, (outs FR32:$dst),
                (ins FR32:$src1, i32u8imm:$src2),
                OpcodeStr # "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                []>,
              Sched<[sched]>;

    let mayLoad = 1 in {
      def SSm : SS4AIi8<opcss, MRMSrcMem, (outs FR32:$dst),
                  (ins f32mem:$src1, i32u8imm:$src2),
                  OpcodeStr # "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  []>,
                Sched<[sched.Folded, sched.ReadAfterFold]>;
    }
  } // ExeDomain = SSEPackedSingle

  let ExeDomain = SSEPackedDouble in {
    def SDr : SS4AIi8<opcsd, MRMSrcReg, (outs FR64:$dst),
                (ins FR64:$src1, i32u8imm:$src2),
                OpcodeStr # "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                []>,
              Sched<[sched]>;

    let mayLoad = 1 in {
      def SDm : SS4AIi8<opcsd, MRMSrcMem, (outs FR64:$dst),
                  (ins f64mem:$src1, i32u8imm:$src2),
                  OpcodeStr # "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  []>,
                Sched<[sched.Folded, sched.ReadAfterFold]>;
    }
  } // ExeDomain = SSEPackedDouble
} // Uses = [MXCSR], mayRaiseFPException = 1, hasSideEffects = 0,
  // isCodeGenOnly = 1
}
5492
/// sse41_fp_binop_s - scalar binary forms operating on whole VR128 registers
/// (the *_Int defs). SSr_Int/SSm_Int use opcss, SDr_Int/SDm_Int use opcsd.
/// All defs use MXCSR and may raise FP exceptions.
///   VT32 / VT64 - result types wrapped around the register-form patterns.
///   OpNode      - node matched by every pattern.
///   Is2Addr     - when set, the printed syntax omits $src1.
multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
                            string OpcodeStr, X86FoldableSchedWrite sched,
                            ValueType VT32, ValueType VT64,
                            SDNode OpNode, bit Is2Addr = 1> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  let ExeDomain = SSEPackedSingle in {
    def SSr_Int : SS4AIi8<opcss, MRMSrcReg, (outs VR128:$dst),
          (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
          !strconcat(OpcodeStr, !if(Is2Addr,
            "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}",
            "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
          [(set VR128:$dst,
                (VT32 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
        Sched<[sched]>;

    // Memory operand is matched through sse_load_f32.
    def SSm_Int : SS4AIi8<opcss, MRMSrcMem, (outs VR128:$dst),
          (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
          !strconcat(OpcodeStr, !if(Is2Addr,
            "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}",
            "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
          [(set VR128:$dst,
                (OpNode VR128:$src1, (sse_load_f32 addr:$src2),
                        timm:$src3))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
  } // ExeDomain = SSEPackedSingle

  let ExeDomain = SSEPackedDouble in {
    def SDr_Int : SS4AIi8<opcsd, MRMSrcReg, (outs VR128:$dst),
          (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
          !strconcat(OpcodeStr, !if(Is2Addr,
            "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
            "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
          [(set VR128:$dst,
                (VT64 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
        Sched<[sched]>;

    // Memory operand is matched through sse_load_f64.
    def SDm_Int : SS4AIi8<opcsd, MRMSrcMem, (outs VR128:$dst),
          (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
          !strconcat(OpcodeStr, !if(Is2Addr,
            "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
            "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
          [(set VR128:$dst,
                (OpNode VR128:$src1, (sse_load_f64 addr:$src2),
                        timm:$src3))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
  } // ExeDomain = SSEPackedDouble
} // Uses = [MXCSR], mayRaiseFPException = 1
}
5545
// FP round - roundss, roundps, roundsd, roundpd
// Packed VEX forms (128- and 256-bit). Every instantiation selects
// X86any_VRndScale and is marked as using MXCSR / possibly raising FP
// exceptions.
let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
  let ExeDomain = SSEPackedSingle in {
    defm VROUNDPS  : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32,
                                     loadv4f32, X86any_VRndScale,
                                     SchedWriteFRnd.XMM>,
                     VEX, VEX_WIG;
    defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32,
                                     loadv8f32, X86any_VRndScale,
                                     SchedWriteFRnd.YMM>,
                     VEX, VEX_L, VEX_WIG;
  } // ExeDomain = SSEPackedSingle

  let ExeDomain = SSEPackedDouble in {
    defm VROUNDPD  : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64,
                                     loadv2f64, X86any_VRndScale,
                                     SchedWriteFRnd.XMM>,
                     VEX, VEX_WIG;
    defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64,
                                     loadv4f64, X86any_VRndScale,
                                     SchedWriteFRnd.YMM>,
                     VEX, VEX_L, VEX_WIG;
  } // ExeDomain = SSEPackedDouble
}
// Scalar VEX rounding. VROUND is instantiated twice: the VR128 *_Int forms
// come from sse41_fp_binop_s (three-operand syntax, Is2Addr = 0) and the
// pattern-less FR32/FR64 forms come from avx_fp_unop_rm.
let Predicates = [UseAVX] in {
  defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
                                 v4f32, v2f64, X86RndScales, 0>,
                VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC;
  defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>,
                VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC;

  // Scalar X86any_VRndScale on FR32/FR64 registers. IMPLICIT_DEF is supplied
  // for the first source operand of the three-source instruction.
  def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>;
  def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>;
}

// Load-folding variants of the patterns above, enabled only with OptForSize.
let Predicates = [UseAVX, OptForSize] in {
  def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
            (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
  def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
            (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
}
5588
// Legacy (non-VEX) packed rounding; memory operands use the memop fragments.
let ExeDomain = SSEPackedSingle in {
  defm ROUNDPS : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32,
                                 memopv4f32, X86any_VRndScale,
                                 SchedWriteFRnd.XMM>;
}
let ExeDomain = SSEPackedDouble in {
  defm ROUNDPD : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64,
                                 memopv2f64, X86any_VRndScale,
                                 SchedWriteFRnd.XMM>;
}

// Legacy scalar rounding: pattern-less FR32/FR64 forms first, then the VR128
// *_Int forms with $src1 tied to $dst.
defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>;

let Constraints = "$src1 = $dst" in {
  defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
                                v4f32, v2f64, X86RndScales>;
}

// Scalar X86any_VRndScale on FR32/FR64 registers.
let Predicates = [UseSSE41] in {
  def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
            (ROUNDSSr FR32:$src1, timm:$src2)>;
  def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
            (ROUNDSDr FR64:$src1, timm:$src2)>;
}

// Load-folding variants, enabled only with OptForSize.
let Predicates = [UseSSE41, OptForSize] in {
  def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
            (ROUNDSSm addr:$src1, timm:$src2)>;
  def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
            (ROUNDSDm addr:$src1, timm:$src2)>;
}
5615
//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Bit Test
//===----------------------------------------------------------------------===//
5619
// ptest: X86ISelLowering lowers to the X86ptest node, primarily from the
// corresponding Intel intrinsic. The VEX forms below have no register
// outputs; their patterns set EFLAGS only.
// NOTE(review): the 128-bit memory form is typed f128mem while the 256-bit
// memory form is typed i256mem - confirm the mix is intentional.
let Defs = [EFLAGS], Predicates = [HasAVX] in {
  // 128-bit forms.
  def VPTESTrr  : SS48I<0x17, MRMSrcReg, (outs),
                        (ins VR128:$src1, VR128:$src2),
                        "vptest\t{$src2, $src1|$src1, $src2}",
                        [(set EFLAGS,
                              (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
                  Sched<[SchedWriteVecTest.XMM]>, VEX, VEX_WIG;
  def VPTESTrm  : SS48I<0x17, MRMSrcMem, (outs),
                        (ins VR128:$src1, f128mem:$src2),
                        "vptest\t{$src2, $src1|$src1, $src2}",
                        [(set EFLAGS,
                              (X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
                  Sched<[SchedWriteVecTest.XMM.Folded,
                         SchedWriteVecTest.XMM.ReadAfterFold]>,
                  VEX, VEX_WIG;

  // 256-bit forms.
  def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs),
                        (ins VR256:$src1, VR256:$src2),
                        "vptest\t{$src2, $src1|$src1, $src2}",
                        [(set EFLAGS,
                              (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
                  Sched<[SchedWriteVecTest.YMM]>, VEX, VEX_L, VEX_WIG;
  def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs),
                        (ins VR256:$src1, i256mem:$src2),
                        "vptest\t{$src2, $src1|$src1, $src2}",
                        [(set EFLAGS,
                              (X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
                  Sched<[SchedWriteVecTest.YMM.Folded,
                         SchedWriteVecTest.YMM.ReadAfterFold]>,
                  VEX, VEX_L, VEX_WIG;
}
5643
// Legacy (non-VEX) ptest. Same patterns as the 128-bit VEX forms, except that
// the memory form matches memopv2i64 instead of loadv2i64.
let Defs = [EFLAGS] in {
  def PTESTrr : SS48I<0x17, MRMSrcReg, (outs),
                      (ins VR128:$src1, VR128:$src2),
                      "ptest\t{$src2, $src1|$src1, $src2}",
                      [(set EFLAGS,
                            (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
                Sched<[SchedWriteVecTest.XMM]>;
  def PTESTrm : SS48I<0x17, MRMSrcMem, (outs),
                      (ins VR128:$src1, f128mem:$src2),
                      "ptest\t{$src2, $src1|$src1, $src2}",
                      [(set EFLAGS,
                            (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
                Sched<[SchedWriteVecTest.XMM.Folded,
                       SchedWriteVecTest.XMM.ReadAfterFold]>;
}
5654
// The bit test instructions below are AVX only.
/// avx_bittest - register (`rr`) and memory (`rm`) forms that select X86testp
/// and set EFLAGS; neither form has a register output. Both are VEX encoded.
///   RC / vt             - register class and value type of the operands.
///   x86memop / mem_frag - memory operand and load fragment for `rm`.
multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, PatFrag mem_frag, ValueType vt,
                       X86FoldableSchedWrite sched> {
  def rr : SS48I<opc, MRMSrcReg, (outs),
             (ins RC:$src1, RC:$src2),
             OpcodeStr # "\t{$src2, $src1|$src1, $src2}",
             [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
           Sched<[sched]>, VEX;
  def rm : SS48I<opc, MRMSrcMem, (outs),
             (ins RC:$src1, x86memop:$src2),
             OpcodeStr # "\t{$src2, $src1|$src1, $src2}",
             [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
           Sched<[sched.Folded, sched.ReadAfterFold]>, VEX;
}
5668
// vtestps / vtestpd in 128-bit and 256-bit (VEX_L) widths.
let Defs = [EFLAGS], Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem,
                                loadv4f32, v4f32, SchedWriteFTest.XMM>;
    defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem,
                                loadv8f32, v8f32, SchedWriteFTest.YMM>,
                    VEX_L;
  } // ExeDomain = SSEPackedSingle

  let ExeDomain = SSEPackedDouble in {
    defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem,
                                loadv2f64, v2f64, SchedWriteFTest.XMM>;
    defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem,
                                loadv4f64, v4f64, SchedWriteFTest.YMM>,
                    VEX_L;
  } // ExeDomain = SSEPackedDouble
}
5683
//===----------------------------------------------------------------------===//
// SSE4.1 - Misc Instructions
//===----------------------------------------------------------------------===//
5687
// popcnt for 16-, 32- and 64-bit GPRs, each with a register and a memory
// source form. Every pattern matches ctpop and lists EFLAGS as an implicit
// result.
let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
  // 16-bit operand size.
  def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop GR16:$src)),
                      (implicit EFLAGS)]>,
                   Sched<[WritePOPCNT]>, OpSize16, XS;
  def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
                      (implicit EFLAGS)]>,
                   Sched<[WritePOPCNT.Folded]>, OpSize16, XS;

  // 32-bit operand size.
  def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop GR32:$src)),
                      (implicit EFLAGS)]>,
                   Sched<[WritePOPCNT]>, OpSize32, XS;
  def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
                      (implicit EFLAGS)]>,
                   Sched<[WritePOPCNT.Folded]>, OpSize32, XS;

  // 64-bit operand size (RI instruction class).
  def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop GR64:$src)),
                       (implicit EFLAGS)]>,
                   Sched<[WritePOPCNT]>, XS;
  def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
                       (implicit EFLAGS)]>,
                   Sched<[WritePOPCNT.Folded]>, XS;
}
5720
/// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
/// `rr` reads a VR128 register, `rm` reads an i128mem operand through
/// ld_frag; the scheduling class of `rm` is the Folded variant of Sched.
multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
                                 SDNode OpNode, PatFrag ld_frag,
                                 X86FoldableSchedWrite Sched> {
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
             (ins VR128:$src),
             OpcodeStr # "\t{$src, $dst|$dst, $src}",
             [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>,
           Sched<[Sched]>;
  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
             (ins i128mem:$src),
             OpcodeStr # "\t{$src, $dst|$dst, $src}",
             [(set VR128:$dst, (v8i16 (OpNode (ld_frag addr:$src))))]>,
           Sched<[Sched.Folded]>;
}
5737
// phminposuw in VEX and legacy encodings; both select X86phminpos.
// NOTE(review): an earlier remark here said PHMIN shares PSAD's profile and
// therefore reuses its scheduling model "although the naming is misleading".
// Both defms below pass WritePHMINPOS, so that remark may be stale - confirm.
let Predicates = [HasAVX] in {
  defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw", X86phminpos,
                                           load, WritePHMINPOS>,
                     VEX, VEX_WIG;
}
defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw", X86phminpos,
                                        memop, WritePHMINPOS>;
5747
/// SS48I_binop_rm - Simple SSE41 binary operator.
///   OpNode / OpVT - node matched by both patterns and its result type.
///   RC            - register class of the destination and sources.
///   memop_frag    - load fragment for the memory form.
///   Is2Addr       - when set, the printed syntax omits $src1.
multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, X86FoldableSchedWrite sched,
                          bit Is2Addr = 1> {
  // Only the register form is marked commutable.
  let isCommutable = 1 in {
    def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
               (ins RC:$src1, RC:$src2),
               !strconcat(OpcodeStr, !if(Is2Addr,
                 "\t{$src2, $dst|$dst, $src2}",
                 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
             Sched<[sched]>;
  }

  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
             (ins RC:$src1, x86memop:$src2),
             !strconcat(OpcodeStr, !if(Is2Addr,
               "\t{$src2, $dst|$dst, $src2}",
               "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
             [(set RC:$dst,
                   (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
           Sched<[sched.Folded, sched.ReadAfterFold]>;
}
5770
// 128-bit VEX min/max and pmuldq, three-operand syntax (Is2Addr = 0).
// Dword/qword element forms.
let Predicates = [HasAVX, NoVLX] in {
  defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128, load,
                                i128mem, SchedWriteVecALU.XMM, 0>,
                 VEX_4V, VEX_WIG;
  defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128, load,
                                i128mem, SchedWriteVecALU.XMM, 0>,
                 VEX_4V, VEX_WIG;
  defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128, load,
                                i128mem, SchedWriteVecALU.XMM, 0>,
                 VEX_4V, VEX_WIG;
  defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128, load,
                                i128mem, SchedWriteVecALU.XMM, 0>,
                 VEX_4V, VEX_WIG;
  defm VPMULDQ : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128, load,
                                i128mem, SchedWriteVecIMul.XMM, 0>,
                 VEX_4V, VEX_WIG;
}
// Byte/word element forms.
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128, load,
                                i128mem, SchedWriteVecALU.XMM, 0>,
                 VEX_4V, VEX_WIG;
  defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128, load,
                                i128mem, SchedWriteVecALU.XMM, 0>,
                 VEX_4V, VEX_WIG;
  defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128, load,
                                i128mem, SchedWriteVecALU.XMM, 0>,
                 VEX_4V, VEX_WIG;
  defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128, load,
                                i128mem, SchedWriteVecALU.XMM, 0>,
                 VEX_4V, VEX_WIG;
}
5802
// 256-bit VEX (VEX_L) min/max and pmuldq, three-operand syntax (Is2Addr = 0).
// Dword/qword element forms.
let Predicates = [HasAVX2, NoVLX] in {
  defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256, load,
                                 i256mem, SchedWriteVecALU.YMM, 0>,
                  VEX_4V, VEX_L, VEX_WIG;
  defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256, load,
                                 i256mem, SchedWriteVecALU.YMM, 0>,
                  VEX_4V, VEX_L, VEX_WIG;
  defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256, load,
                                 i256mem, SchedWriteVecALU.YMM, 0>,
                  VEX_4V, VEX_L, VEX_WIG;
  defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256, load,
                                 i256mem, SchedWriteVecALU.YMM, 0>,
                  VEX_4V, VEX_L, VEX_WIG;
  defm VPMULDQY : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256,
                                 load, i256mem, SchedWriteVecIMul.YMM, 0>,
                  VEX_4V, VEX_L, VEX_WIG;
}
// Byte/word element forms.
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256, load,
                                 i256mem, SchedWriteVecALU.YMM, 0>,
                  VEX_4V, VEX_L, VEX_WIG;
  defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256, load,
                                 i256mem, SchedWriteVecALU.YMM, 0>,
                  VEX_4V, VEX_L, VEX_WIG;
  defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256, load,
                                 i256mem, SchedWriteVecALU.YMM, 0>,
                  VEX_4V, VEX_L, VEX_WIG;
  defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256, load,
                                 i256mem, SchedWriteVecALU.YMM, 0>,
                  VEX_4V, VEX_L, VEX_WIG;
}
5834
// Legacy SSE4.1 min/max and pmuldq: $src1 is tied to $dst, two-address
// syntax (Is2Addr = 1), memory operands matched with memop.
let Constraints = "$src1 = $dst" in {
  defm PMINSB : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128, memop,
                               i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMINSD : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128, memop,
                               i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMINUD : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128, memop,
                               i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128, memop,
                               i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128, memop,
                               i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128, memop,
                               i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128, memop,
                               i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128, memop,
                               i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMULDQ : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128, memop,
                               i128mem, SchedWriteVecIMul.XMM, 1>;
}
5855
// VEX pmulld / pcmpeqq, 128-bit. Note the differing predicates: vpmulld is
// additionally gated on NoVLX, vpcmpeqq is not.
let Predicates = [HasAVX, NoVLX] in {
  defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128, load,
                                i128mem, SchedWritePMULLD.XMM, 0>,
                 VEX_4V, VEX_WIG;
}
let Predicates = [HasAVX] in {
  defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                  VEX_4V, VEX_WIG;
}

// VEX pmulld / pcmpeqq, 256-bit (VEX_L).
let Predicates = [HasAVX2, NoVLX] in {
  defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256, load,
                                 i256mem, SchedWritePMULLD.YMM, 0>,
                  VEX_4V, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2] in {
  defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                   VEX_4V, VEX_L, VEX_WIG;
}
5873
// Legacy SSE4.1 pmulld / pcmpeqq: $src1 tied to $dst, two-address syntax.
let Constraints = "$src1 = $dst" in {
  defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128, memop,
                                i128mem, SchedWritePMULLD.XMM, 1>;
  defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
                                memop, i128mem, SchedWriteVecALU.XMM, 1>;
}
5880
/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate,
/// selected from the intrinsic IntId.
///   RC         - register class of the destination and sources.
///   memop_frag - load fragment for the memory form.
///   Is2Addr    - when set, the printed syntax omits $src1.
multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
                 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
                 X86MemOperand x86memop, bit Is2Addr,
                 X86FoldableSchedWrite sched> {
  // Only the register form is marked commutable.
  let isCommutable = 1 in {
    def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
          (ins RC:$src1, RC:$src2, u8imm:$src3),
          !strconcat(OpcodeStr, !if(Is2Addr,
            "\t{$src3, $src2, $dst|$dst, $src2, $src3}",
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
          [(set RC:$dst, (IntId RC:$src1, RC:$src2, timm:$src3))]>,
        Sched<[sched]>;
  }

  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !strconcat(OpcodeStr, !if(Is2Addr,
          "\t{$src3, $src2, $dst|$dst, $src2, $src3}",
          "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
              (IntId RC:$src1, (memop_frag addr:$src2), timm:$src3))]>,
      Sched<[sched.Folded, sched.ReadAfterFold]>;
}
5907
/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate, selected
/// from the SDNode OpNode with result type OpVT.
///   RC         - register class of the destination and sources.
///   memop_frag - load fragment for the memory form.
///   Is2Addr    - when set, the printed syntax omits $src1.
multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                           X86MemOperand x86memop, bit Is2Addr,
                           X86FoldableSchedWrite sched> {
  // Only the register form is marked commutable.
  let isCommutable = 1 in {
    def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
          (ins RC:$src1, RC:$src2, u8imm:$src3),
          !strconcat(OpcodeStr, !if(Is2Addr,
            "\t{$src3, $src2, $dst|$dst, $src2, $src3}",
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
          [(set RC:$dst,
                (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
        Sched<[sched]>;
  }

  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !strconcat(OpcodeStr, !if(Is2Addr,
          "\t{$src3, $src2, $dst|$dst, $src2, $src3}",
          "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
              (OpVT (OpNode RC:$src1, (memop_frag addr:$src2),
                            timm:$src3)))]>,
      Sched<[sched.Folded, sched.ReadAfterFold]>;
}
5934
// Keep the low 2 bits of the immediate and invert them.
def BlendCommuteImm2 : SDNodeXForm<timm, [{
  const uint64_t LaneMask = 0x03;
  uint8_t Flipped = (N->getZExtValue() & LaneMask) ^ LaneMask;
  return getI8Imm(Flipped, SDLoc(N));
}]>;
5939
// Keep the low 4 bits of the immediate and invert them.
def BlendCommuteImm4 : SDNodeXForm<timm, [{
  const uint64_t LaneMask = 0x0f;
  uint8_t Flipped = (N->getZExtValue() & LaneMask) ^ LaneMask;
  return getI8Imm(Flipped, SDLoc(N));
}]>;
5944
// Keep the low 8 bits of the immediate and invert them.
def BlendCommuteImm8 : SDNodeXForm<timm, [{
  const uint64_t LaneMask = 0xff;
  uint8_t Flipped = (N->getZExtValue() & LaneMask) ^ LaneMask;
  return getI8Imm(Flipped, SDLoc(N));
}]>;
5949
// Turn a 4-bit blendi immediate to 8-bit for use with pblendw.
// Each of the four input bits is widened to a pair of adjacent output bits.
def BlendScaleImm4 : SDNodeXForm<timm, [{
  uint8_t Src = N->getZExtValue();
  uint8_t Widened = 0;
  for (unsigned Bit = 0, Shift = 0; Bit < 4; ++Bit, Shift += 2)
    if ((Src >> Bit) & 1)
      Widened |= 0x3 << Shift;
  return getI8Imm(Widened, SDLoc(N));
}]>;
5960
// Turn a 2-bit blendi immediate to 8-bit for use with pblendw.
// Input bit 0 selects the low nibble of the result, bit 1 the high nibble.
def BlendScaleImm2 : SDNodeXForm<timm, [{
  uint8_t Src = N->getZExtValue();
  uint8_t Widened = 0;
  if (Src & 0x1)
    Widened |= 0x0f;
  if (Src & 0x2)
    Widened |= 0xf0;
  return getI8Imm(Widened, SDLoc(N));
}]>;
5971
// Turn a 2-bit blendi immediate to 4-bit for use with pblendd.
// Input bit 0 selects result bits 1:0, bit 1 selects result bits 3:2.
def BlendScaleImm2to4 : SDNodeXForm<timm, [{
  uint8_t Src = N->getZExtValue();
  uint8_t Widened = 0;
  if (Src & 0x1)
    Widened |= 0x3;
  if (Src & 0x2)
    Widened |= 0xc;
  return getI8Imm(Widened, SDLoc(N));
}]>;
5982
// Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it.
// Each of the four input bits is widened to a pair of adjacent output bits,
// then all eight result bits are flipped.
def BlendScaleCommuteImm4 : SDNodeXForm<timm, [{
  uint8_t Src = N->getZExtValue();
  uint8_t Widened = 0;
  for (unsigned Bit = 0, Shift = 0; Bit < 4; ++Bit, Shift += 2)
    if ((Src >> Bit) & 1)
      Widened |= 0x3 << Shift;
  return getI8Imm(Widened ^ 0xff, SDLoc(N));
}]>;
5993
// Widen a 2-bit blendi immediate to the 8-bit pblendw immediate and invert it
// (the widened mask for the same blend with its sources swapped).
def BlendScaleCommuteImm2 : SDNodeXForm<timm, [{
  uint8_t Narrow = N->getZExtValue();
  // A set source bit leaves its nibble clear in the inverted result; a clear
  // source bit leaves its nibble fully set.
  uint8_t LowNibble = (Narrow & 0x1) ? 0x00 : 0x0f;
  uint8_t HighNibble = (Narrow & 0x2) ? 0x00 : 0xf0;
  return getI8Imm(LowNibble | HighNibble, SDLoc(N));
}]>;
6004
// Widen a 2-bit blendi immediate to the 4-bit pblendd immediate and invert it
// (the widened mask for the same blend with its sources swapped).
def BlendScaleCommuteImm2to4 : SDNodeXForm<timm, [{
  uint8_t Narrow = N->getZExtValue();
  uint8_t Wide = 0;
  for (unsigned Lane = 2; Lane-- > 0;)
    if ((Narrow >> Lane) & 1)
      Wide |= 0x3 << (2 * Lane);
  // Only four result bits are meaningful, so only those four are flipped.
  return getI8Imm(Wide ^ 0xf, SDLoc(N));
}]>;
6015
// VEX-encoded mpsadbw and dot-product instructions, available with HasAVX.
// All of them are instantiated from SS41I_binop_rmi_int on the corresponding
// intrinsic and fold memory through the `load` fragment.
6016let Predicates = [HasAVX] in {
  // vmpsadbw is explicitly marked non-commutable.
6017  let isCommutable = 0 in {
6018    defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
6019                                        VR128, load, i128mem, 0,
6020                                        SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG;
6021  }
6022
// The dot products read MXCSR and are flagged as able to raise FP exceptions.
6023let Uses = [MXCSR], mayRaiseFPException = 1 in {
6024  let ExeDomain = SSEPackedSingle in
6025  defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
6026                                   VR128, load, f128mem, 0,
6027                                   SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG;
6028  let ExeDomain = SSEPackedDouble in
6029  defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
6030                                   VR128, load, f128mem, 0,
6031                                   SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG;
  // 256-bit vdpps (VEX_L). There is no 256-bit vdppd definition in this group.
  // NOTE(review): this uses i256mem while the 128-bit dot products above use
  // f128mem - confirm whether f256mem was intended.
6032  let ExeDomain = SSEPackedSingle in
6033  defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
6034                                    VR256, load, i256mem, 0,
6035                                    SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG;
6036}
6037}
6038
// 256-bit (VEX_L) vmpsadbw, built on the AVX2 intrinsic and gated on HasAVX2.
// Like the 128-bit form it is explicitly marked non-commutable.
6039let Predicates = [HasAVX2] in {
6040  let isCommutable = 0 in {
6041  defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
6042                                  VR256, load, i256mem, 0,
6043                                  SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, VEX_WIG;
6044  }
6045}
6046
// Legacy (non-VEX) SSE4.1 mpsadbw / dpps / dppd. The destination register is
// tied to the first source through the "$src1 = $dst" constraint, and memory
// operands are folded through `memop` instead of `load`.
6047let Constraints = "$src1 = $dst" in {
  // mpsadbw is explicitly marked non-commutable.
6048  let isCommutable = 0 in {
6049  defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
6050                                     VR128, memop, i128mem, 1,
6051                                     SchedWriteMPSAD.XMM>;
6052  }
6053
  // NOTE(review): SIMD_EXC presumably supplies the MXCSR use / FP-exception
  // flags that the AVX forms set with an explicit `let` - confirm against its
  // definition.
6054  let ExeDomain = SSEPackedSingle in
6055  defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
6056                                  VR128, memop, f128mem, 1,
6057                                  SchedWriteDPPS.XMM>, SIMD_EXC;
6058  let ExeDomain = SSEPackedDouble in
6059  defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
6060                                  VR128, memop, f128mem, 1,
6061                                  SchedWriteDPPD.XMM>, SIMD_EXC;
6062}
6063
6064/// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate
///
/// Each instantiation produces two instructions and one anonymous pattern:
///   rri - register, register, immediate form; marked commutable.
///   rmi - the same operation with the second source folded from memory
///         through memop_frag.
/// The trailing Pat covers a load that appears as the *first* blend operand:
/// it swaps the sources into the rmi form and rewrites the immediate with
/// commuteXForm.
///
/// Is2Addr selects the two-operand asm string and ties $src1 to $dst; `d`
/// becomes the ExeDomain of both instructions.
6065multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
6066                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
6067                           X86MemOperand x86memop, bit Is2Addr, Domain d,
6068                           X86FoldableSchedWrite sched, SDNodeXForm commuteXForm> {
6069let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
6070  let isCommutable = 1 in
6071  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
6072        (ins RC:$src1, RC:$src2, u8imm:$src3),
6073        !if(Is2Addr,
6074            !strconcat(OpcodeStr,
6075                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6076            !strconcat(OpcodeStr,
6077                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6078        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
6079        Sched<[sched]>;
6080  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
6081        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
6082        !if(Is2Addr,
6083            !strconcat(OpcodeStr,
6084                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
6085            !strconcat(OpcodeStr,
6086                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
6087        [(set RC:$dst,
6088          (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
6089        Sched<[sched.Folded, sched.ReadAfterFold]>;
6090}
6091
6092  // Pattern to commute if load is in first source.
6093  def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, timm:$src3)),
6094            (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
6095                                            (commuteXForm timm:$src3))>;
6096}
6097
// VEX-encoded immediate blends (three-operand form, Is2Addr = 0), gated on
// HasAVX. The commute transform passed to each instantiation is sized to the
// number of elements in the vector type: v2f64 -> BlendCommuteImm2,
// v4f32/v4f64 -> BlendCommuteImm4, v8f32/v8i16 -> BlendCommuteImm8.
6098let Predicates = [HasAVX] in {
6099  defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
6100                                  VR128, load, f128mem, 0, SSEPackedSingle,
6101                                  SchedWriteFBlend.XMM, BlendCommuteImm4>,
6102                                  VEX_4V, VEX_WIG;
6103  defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
6104                                   VR256, load, f256mem, 0, SSEPackedSingle,
6105                                   SchedWriteFBlend.YMM, BlendCommuteImm8>,
6106                                   VEX_4V, VEX_L, VEX_WIG;
6107  defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
6108                                  VR128, load, f128mem, 0, SSEPackedDouble,
6109                                  SchedWriteFBlend.XMM, BlendCommuteImm2>,
6110                                  VEX_4V, VEX_WIG;
6111  defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
6112                                   VR256, load, f256mem, 0, SSEPackedDouble,
6113                                   SchedWriteFBlend.YMM, BlendCommuteImm4>,
6114                                   VEX_4V, VEX_L, VEX_WIG;
6115  defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
6116                                  VR128, load, i128mem, 0, SSEPackedInt,
6117                                  SchedWriteBlend.XMM, BlendCommuteImm8>,
6118                                  VEX_4V, VEX_WIG;
6119}
6120
// 256-bit (VEX_L) vpblendw on v16i16, gated on HasAVX2. The immediate operand
// is still a u8imm, so the 8-bit commute transform is used here as well.
6121let Predicates = [HasAVX2] in {
6122  defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
6123                                   VR256, load, i256mem, 0, SSEPackedInt,
6124                                   SchedWriteBlend.YMM, BlendCommuteImm8>,
6125                                   VEX_4V, VEX_L, VEX_WIG;
6126}
6127
6128// Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw.
6129// ExecutionDomainFixPass will cleanup domains later on.
//
// Each element type gets three patterns: reg/reg, load as the second source,
// and load as the first source. The last one swaps the operands and therefore
// uses the commuting variant of the immediate transform.
6130let Predicates = [HasAVX1Only] in {
// v4i64: same lane count as v4f64, so the immediate is passed through as is.
6131def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
6132          (VBLENDPDYrri VR256:$src1, VR256:$src2, timm:$src3)>;
6133def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
6134          (VBLENDPDYrmi VR256:$src1, addr:$src2, timm:$src3)>;
6135def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
6136          (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 timm:$src3))>;
6137
6138// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6139// it from becoming movsd via commuting under optsize.
// The 2-bit v2i64 mask is widened to the 8-bit pblendw mask.
6140def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
6141          (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
6142def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
6143          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
6144def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
6145          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;
6146
// v8i32: same lane count as v8f32, so the immediate is passed through as is.
6147def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), timm:$src3),
6148          (VBLENDPSYrri VR256:$src1, VR256:$src2, timm:$src3)>;
6149def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), timm:$src3),
6150          (VBLENDPSYrmi VR256:$src1, addr:$src2, timm:$src3)>;
6151def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, timm:$src3),
6152          (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 timm:$src3))>;
6153
6154// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6155// it from becoming movss via commuting under optsize.
// The 4-bit v4i32 mask is widened to the 8-bit pblendw mask.
6156def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
6157          (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
6158def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), timm:$src3),
6159          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
6160def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, timm:$src3),
6161          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
6162}
6163
// Legacy (non-VEX) SSE4.1 immediate blends. Is2Addr = 1, so SS41I_blend_rmi
// ties $src1 to $dst and uses the two-operand asm string; memory operands are
// folded through `memop`.
6164defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
6165                               VR128, memop, f128mem, 1, SSEPackedSingle,
6166                               SchedWriteFBlend.XMM, BlendCommuteImm4>;
6167defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64,
6168                               VR128, memop, f128mem, 1, SSEPackedDouble,
6169                               SchedWriteFBlend.XMM, BlendCommuteImm2>;
6170defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
6171                               VR128, memop, i128mem, 1, SSEPackedInt,
6172                               SchedWriteBlend.XMM, BlendCommuteImm8>;
6173
// SSE4.1 counterparts of the AVX1-only 128-bit integer blend patterns: v2i64
// and v4i32 blends are selected to pblendw with a widened immediate.
6174let Predicates = [UseSSE41] in {
6175// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
6176// it from becoming movsd/movss via commuting under optsize.
// v2i64: the 2-bit mask is widened to the 8-bit pblendw mask.
6177def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
6178          (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
6179def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), timm:$src3),
6180          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
6181def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, timm:$src3),
6182          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;
6183
// v4i32: the 4-bit mask is widened to the 8-bit pblendw mask.
6184def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
6185          (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
6186def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), timm:$src3),
6187          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
6188def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, timm:$src3),
6189          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
6190}
6191
6192// For insertion into the zero index (low half) of a 256-bit vector, it is
6193// more efficient to generate a blend with immediate instead of an insert*128.
//
// The 128-bit value is first placed in the low half of an otherwise undefined
// 256-bit register (INSERT_SUBREG into IMPLICIT_DEF) and then blended with the
// 256-bit vector.
// NOTE(review): the immediates assume a set bit selects the element from the
// second blend source - confirm against the instruction definition.
6194let Predicates = [HasAVX] in {
// Register forms: the widened 128-bit value is the second source, so the low
// mask bits (0x3 for 4 x f64, 0xf for 8 x f32) are set.
6195def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)),
6196          (VBLENDPDYrri VR256:$src1,
6197                        (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
6198                                       VR128:$src2, sub_xmm), 0x3)>;
6199def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
6200          (VBLENDPSYrri VR256:$src1,
6201                        (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
6202                                       VR128:$src2, sub_xmm), 0xf)>;
6203
// Load forms: the loaded 256-bit vector is the memory (second) source and the
// widened 128-bit value is the first, so the high mask bits are set instead
// (0xc / 0xf0).
6204def : Pat<(insert_subvector (loadv4f64 addr:$src2), (v2f64 VR128:$src1), (iPTR 0)),
6205          (VBLENDPDYrmi (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
6206                                       VR128:$src1, sub_xmm), addr:$src2, 0xc)>;
6207def : Pat<(insert_subvector (loadv8f32 addr:$src2), (v4f32 VR128:$src1), (iPTR 0)),
6208          (VBLENDPSYrmi (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
6209                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
6210}
6211
6212/// SS41I_quaternary_avx - AVX SSE 4.1 with 4 operands
///
/// Defines an rr and an rm instruction that take three register-class sources
/// (the second one being memory in the rm form). Note that the selection
/// pattern passes the operands to OpNode in reverse order:
/// (OpNode $src3, $src2, $src1). Both instructions are placed in the
/// SSEPackedInt domain by the class arguments and carry TAPD and VEX_4V.
6213multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC,
6214                                X86MemOperand x86memop, ValueType VT,
6215                                PatFrag mem_frag, SDNode OpNode,
6216                                X86FoldableSchedWrite sched> {
6217  def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
6218                  (ins RC:$src1, RC:$src2, RC:$src3),
6219                  !strconcat(OpcodeStr,
6220                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
6221                  [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))],
6222                  SSEPackedInt>, TAPD, VEX_4V,
6223                Sched<[sched]>;
6224
  // Unlike rr, this pattern is not wrapped in VT; the result type is left to
  // be inferred (presumably from mem_frag).
6225  def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
6226                  (ins RC:$src1, x86memop:$src2, RC:$src3),
6227                  !strconcat(OpcodeStr,
6228                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
6229                  [(set RC:$dst,
6230                        (OpNode RC:$src3, (mem_frag addr:$src2),
6231                                RC:$src1))], SSEPackedInt>, TAPD, VEX_4V,
6232                Sched<[sched.Folded, sched.ReadAfterFold,
6233                       // x86memop:$src2
6234                       ReadDefault, ReadDefault, ReadDefault, ReadDefault,
6235                       ReadDefault,
6236                       // RC:$src3
6237                       sched.ReadAfterFold]>;
6238}
6239
// VEX-encoded variable blends (mask supplied in a register operand), gated on
// HasAVX. The FP forms override the execution domain of the multiclass with
// an outer `let ExeDomain`; vpblendvb keeps the multiclass default.
6240let Predicates = [HasAVX] in {
6241let ExeDomain = SSEPackedDouble in {
6242defm VBLENDVPD  : SS41I_quaternary_avx<0x4B, "vblendvpd", VR128, f128mem,
6243                                       v2f64, loadv2f64, X86Blendv,
6244                                       SchedWriteFVarBlend.XMM>;
6245defm VBLENDVPDY : SS41I_quaternary_avx<0x4B, "vblendvpd", VR256, f256mem,
6246                                       v4f64, loadv4f64, X86Blendv,
6247                                       SchedWriteFVarBlend.YMM>, VEX_L;
6248} // ExeDomain = SSEPackedDouble
6249let ExeDomain = SSEPackedSingle in {
6250defm VBLENDVPS  : SS41I_quaternary_avx<0x4A, "vblendvps", VR128, f128mem,
6251                                       v4f32, loadv4f32, X86Blendv,
6252                                       SchedWriteFVarBlend.XMM>;
6253defm VBLENDVPSY : SS41I_quaternary_avx<0x4A, "vblendvps", VR256, f256mem,
6254                                       v8f32, loadv8f32, X86Blendv,
6255                                       SchedWriteFVarBlend.YMM>, VEX_L;
6256} // ExeDomain = SSEPackedSingle
6257defm VPBLENDVB  : SS41I_quaternary_avx<0x4C, "vpblendvb", VR128, i128mem,
6258                                       v16i8, loadv16i8, X86Blendv,
6259                                       SchedWriteVarBlend.XMM>;
6260}
6261
// 256-bit (VEX_L) vpblendvb on v32i8, gated on HasAVX2.
6262let Predicates = [HasAVX2] in {
6263defm VPBLENDVBY : SS41I_quaternary_avx<0x4C, "vpblendvb", VR256, i256mem,
6264                                       v32i8, loadv32i8, X86Blendv,
6265                                       SchedWriteVarBlend.YMM>, VEX_L;
6266}
6267
// Select 32- and 64-bit integer X86Blendv nodes to the FP variable-blend
// instructions with the same element width (vblendvps / vblendvpd).
// The node's (mask, src1, src2) operands are passed to the instruction as
// (src2, src1, mask), mirroring the reversed operand order used by the
// SS41I_quaternary_avx rr pattern.
6268let Predicates = [HasAVX] in {
6269  def : Pat<(v4i32 (X86Blendv (v4i32 VR128:$mask), (v4i32 VR128:$src1),
6270                              (v4i32 VR128:$src2))),
6271            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
6272  def : Pat<(v2i64 (X86Blendv (v2i64 VR128:$mask), (v2i64 VR128:$src1),
6273                              (v2i64 VR128:$src2))),
6274            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
6275  def : Pat<(v8i32 (X86Blendv (v8i32 VR256:$mask), (v8i32 VR256:$src1),
6276                              (v8i32 VR256:$src2))),
6277            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
6278  def : Pat<(v4i64 (X86Blendv (v4i64 VR256:$mask), (v4i64 VR256:$src1),
6279                              (v4i64 VR256:$src2))),
6280            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
6281}
6282
6283// When optimizing for speed, select blends instead of movss/movsd. These were
6284// changed to use blends because blends have better throughput on Sandy Bridge
6285// and Haswell; movs[s/d] are 1-2 bytes shorter, so they stay preferred for size.
6286let Predicates = [HasAVX, OptForSpeed] in {
  // Zero-extending low-element move: blend the source into an all-zero vector
  // (V_SET0). The v4i32 form goes through vpblendw with mask 3, the v4f32
  // form through vblendps with mask 1.
6287  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
6288            (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
6289  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
6290            (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
6291
  // movss as a blend. When the load is the first operand the sources are
  // swapped and the 4-bit mask is inverted (1 -> 0xe).
6292  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
6293            (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
6294  def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
6295            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
6296  def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
6297            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
6298
  // movsd as a blend. When the load is the first operand the sources are
  // swapped and the 2-bit mask is inverted (1 -> 2).
6299  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
6300            (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
6301  def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
6302            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
6303  def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
6304            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
6305
6306  // Move low f32 and clear high bits.
  // The 256-bit forms perform the 128-bit blend on the low xmm subregister
  // and wrap the result with SUBREG_TO_REG.
6307  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
6308            (SUBREG_TO_REG (i32 0),
6309             (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
6310                          (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
6311                          (i8 1))), sub_xmm)>;
6312  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
6313            (SUBREG_TO_REG (i32 0),
6314             (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
6315                          (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
6316                          (i8 3))), sub_xmm)>;
6317}
6318
6319// When optimizing for speed, select blends instead of movss/movsd. These were
6320// changed to use blends because blends have better throughput on Sandy Bridge
6321// and Haswell; movs[s/d] are 1-2 bytes shorter, so they stay preferred for size.
6322let Predicates = [UseSSE41, OptForSpeed] in {
6323  // With SSE41 we can use blends for these patterns.
  // Zero-extending low-element move: blend the source into an all-zero vector.
6324  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
6325            (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
6326  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
6327            (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
6328
  // movss as a blend; the load-first form swaps the sources and inverts the
  // 4-bit mask (1 -> 0xe).
6329  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
6330            (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
6331  def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))),
6332            (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
6333  def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
6334            (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;
6335
  // movsd as a blend; the load-first form swaps the sources and inverts the
  // 2-bit mask (1 -> 2).
6336  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
6337            (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
6338  def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))),
6339            (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
6340  def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
6341            (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
6342}
6343
6344
6345/// SS41I_ternary - SSE 4.1 ternary operator
///
/// Legacy variable-blend shape: two explicit operands plus XMM0 as an implicit
/// third input (declared through `Uses = [XMM0]`). $src1 is tied to $dst. The
/// selection pattern passes the operands to OpNode as (XMM0, $src2, $src1).
/// rr0 takes the second source from a register, rm0 folds it from memory
/// through mem_frag.
6346let Uses = [XMM0], Constraints = "$src1 = $dst" in {
6347  multiclass SS41I_ternary<bits<8> opc, string OpcodeStr, ValueType VT,
6348                           PatFrag mem_frag, X86MemOperand x86memop,
6349                           SDNode OpNode, X86FoldableSchedWrite sched> {
6350    def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
6351                    (ins VR128:$src1, VR128:$src2),
6352                    !strconcat(OpcodeStr,
6353                     "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6354                    [(set VR128:$dst,
6355                      (VT (OpNode XMM0, VR128:$src2, VR128:$src1)))]>,
6356                    Sched<[sched]>;
6357
6358    def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
6359                    (ins VR128:$src1, x86memop:$src2),
6360                    !strconcat(OpcodeStr,
6361                     "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6362                    [(set VR128:$dst,
6363                      (OpNode XMM0, (mem_frag addr:$src2), VR128:$src1))]>,
6364                    Sched<[sched.Folded, sched.ReadAfterFold]>;
6365  }
6366}
6367
// Legacy SSE4.1 variable blends with the mask implicitly in XMM0. The two FP
// forms set their execution domain explicitly; pblendvb has no ExeDomain
// override here.
6368let ExeDomain = SSEPackedDouble in
6369defm BLENDVPD : SS41I_ternary<0x15, "blendvpd", v2f64, memopv2f64, f128mem,
6370                              X86Blendv, SchedWriteFVarBlend.XMM>;
6371let ExeDomain = SSEPackedSingle in
6372defm BLENDVPS : SS41I_ternary<0x14, "blendvps", v4f32, memopv4f32, f128mem,
6373                              X86Blendv, SchedWriteFVarBlend.XMM>;
6374defm PBLENDVB : SS41I_ternary<0x10, "pblendvb", v16i8, memopv16i8, i128mem,
6375                              X86Blendv, SchedWriteVarBlend.XMM>;
6376
6377// Aliases with the implicit xmm0 argument
// These accept the two-operand spelling, in which xmm0 is not written out,
// for both the register and the memory form of each instruction.
// NOTE(review): the trailing 0 presumably keeps the alias from being used when
// printing - confirm against the InstAlias definition.
6378def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
6379                (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>;
6380def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
6381                (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>;
6382def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
6383                (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>;
6384def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
6385                (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>;
6386def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
6387                (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>;
6388def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
6389                (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>;
6390
// Select v4i32 / v2i64 X86Blendv nodes whose mask is in XMM0 to the FP
// variable blends of matching element width. The node's (src1, src2) operands
// are passed to the instruction as (src2, src1), matching the operand order
// used by the SS41I_ternary patterns.
6391let Predicates = [UseSSE41] in {
6392  def : Pat<(v4i32 (X86Blendv (v4i32 XMM0), (v4i32 VR128:$src1),
6393                              (v4i32 VR128:$src2))),
6394            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
6395  def : Pat<(v2i64 (X86Blendv (v2i64 XMM0), (v2i64 VR128:$src1),
6396                              (v2i64 VR128:$src2))),
6397            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
6398}
6399
// Non-temporal aligned loads ((v)movntdqa). The instructions themselves carry
// no selection pattern ([]); instead, separate Pats map an
// alignednontemporalload of every 128-/256-bit vector type onto them. The
// whole group gets AddedComplexity = 400.
6400let AddedComplexity = 400 in { // Prefer non-temporal versions
6401
// Instruction definitions: VEX 128-bit, VEX 256-bit, and the legacy encoding.
// The legacy definition has no Predicates of its own.
6402let Predicates = [HasAVX, NoVLX] in
6403def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
6404                        "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
6405                        Sched<[SchedWriteVecMoveLSNT.XMM.RM]>, VEX, VEX_WIG;
6406let Predicates = [HasAVX2, NoVLX] in
6407def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
6408                         "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
6409                         Sched<[SchedWriteVecMoveLSNT.YMM.RM]>, VEX, VEX_L, VEX_WIG;
6410def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
6411                       "movntdqa\t{$src, $dst|$dst, $src}", []>,
6412                       Sched<[SchedWriteVecMoveLSNT.XMM.RM]>;
6413
// 256-bit vector types -> VMOVNTDQAYrm.
6414let Predicates = [HasAVX2, NoVLX] in {
6415  def : Pat<(v8f32 (alignednontemporalload addr:$src)),
6416            (VMOVNTDQAYrm addr:$src)>;
6417  def : Pat<(v4f64 (alignednontemporalload addr:$src)),
6418            (VMOVNTDQAYrm addr:$src)>;
6419  def : Pat<(v4i64 (alignednontemporalload addr:$src)),
6420            (VMOVNTDQAYrm addr:$src)>;
6421  def : Pat<(v8i32 (alignednontemporalload addr:$src)),
6422            (VMOVNTDQAYrm addr:$src)>;
6423  def : Pat<(v16i16 (alignednontemporalload addr:$src)),
6424            (VMOVNTDQAYrm addr:$src)>;
6425  def : Pat<(v32i8 (alignednontemporalload addr:$src)),
6426            (VMOVNTDQAYrm addr:$src)>;
6427}
6428
// 128-bit vector types -> VMOVNTDQArm (VEX encoding).
6429let Predicates = [HasAVX, NoVLX] in {
6430  def : Pat<(v4f32 (alignednontemporalload addr:$src)),
6431            (VMOVNTDQArm addr:$src)>;
6432  def : Pat<(v2f64 (alignednontemporalload addr:$src)),
6433            (VMOVNTDQArm addr:$src)>;
6434  def : Pat<(v2i64 (alignednontemporalload addr:$src)),
6435            (VMOVNTDQArm addr:$src)>;
6436  def : Pat<(v4i32 (alignednontemporalload addr:$src)),
6437            (VMOVNTDQArm addr:$src)>;
6438  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
6439            (VMOVNTDQArm addr:$src)>;
6440  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
6441            (VMOVNTDQArm addr:$src)>;
6442}
6443
// 128-bit vector types -> MOVNTDQArm (legacy encoding).
6444let Predicates = [UseSSE41] in {
6445  def : Pat<(v4f32 (alignednontemporalload addr:$src)),
6446            (MOVNTDQArm addr:$src)>;
6447  def : Pat<(v2f64 (alignednontemporalload addr:$src)),
6448            (MOVNTDQArm addr:$src)>;
6449  def : Pat<(v2i64 (alignednontemporalload addr:$src)),
6450            (MOVNTDQArm addr:$src)>;
6451  def : Pat<(v4i32 (alignednontemporalload addr:$src)),
6452            (MOVNTDQArm addr:$src)>;
6453  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
6454            (MOVNTDQArm addr:$src)>;
6455  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
6456            (MOVNTDQArm addr:$src)>;
6457}
6458
6459} // AddedComplexity
6460
6461//===----------------------------------------------------------------------===//
6462// SSE4.2 - Compare Instructions
6463//===----------------------------------------------------------------------===//
6464
6465/// SS42I_binop_rm - Simple SSE 4.2 binary operator
///
/// Defines an rr instruction and an rm instruction whose second source is
/// folded from memory through memop_frag. Is2Addr (default 1) only selects
/// between the two-operand and three-operand asm strings here; any
/// "$src1 = $dst" tie has to be supplied by the instantiation site.
6466multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
6467                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
6468                          X86MemOperand x86memop, X86FoldableSchedWrite sched,
6469                          bit Is2Addr = 1> {
6470  def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
6471       (ins RC:$src1, RC:$src2),
6472       !if(Is2Addr,
6473           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
6474           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
6475       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
6476       Sched<[sched]>;
6477  def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
6478       (ins RC:$src1, x86memop:$src2),
6479       !if(Is2Addr,
6480           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
6481           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
6482       [(set RC:$dst,
6483         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
6484       Sched<[sched.Folded, sched.ReadAfterFold]>;
6485}
6486
// 64-bit element greater-than compare (X86pcmpgt on v2i64 / v4i64).
// VEX 128-bit form: three-operand asm (Is2Addr = 0), gated on HasAVX.
6487let Predicates = [HasAVX] in
6488  defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
6489                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
6490                                 VEX_4V, VEX_WIG;
6491
// VEX 256-bit form (VEX_L), gated on HasAVX2.
6492let Predicates = [HasAVX2] in
6493  defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
6494                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
6495                                  VEX_4V, VEX_L, VEX_WIG;
6496
// Legacy form: Is2Addr is left at its default of 1 and $src1 is tied to $dst.
6497let Constraints = "$src1 = $dst" in
6498  defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
6499                                memop, i128mem, SchedWriteVecALU.XMM>;
6500
6501//===----------------------------------------------------------------------===//
6502// SSE4.2 - String/text Processing Instructions
6503//===----------------------------------------------------------------------===//
6504
// pcmpistrm: rr and rm forms with no explicit outputs and no selection
// pattern. The rm form is marked mayLoad by hand because, without a pattern,
// that cannot be inferred.
6505multiclass pcmpistrm_SS42AI<string asm> {
6506  def rr : SS42AI<0x62, MRMSrcReg, (outs),
6507    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6508    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6509    []>, Sched<[WritePCmpIStrM]>;
6510  let mayLoad = 1 in
6511  def rm :SS42AI<0x62, MRMSrcMem, (outs),
6512    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6513    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6514    []>, Sched<[WritePCmpIStrM.Folded, WritePCmpIStrM.ReadAfterFold]>;
6515}
6516
// Results are produced implicitly in XMM0 and EFLAGS. Only the VEX form is
// gated on HasAVX (the `let Predicates` applies to the next defm only).
6517let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
6518  let Predicates = [HasAVX] in
6519  defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX, VEX_WIG;
6520  defm PCMPISTRM  : pcmpistrm_SS42AI<"pcmpistrm"> ;
6521}
6522
// pcmpestrm: rr and rm forms with no explicit outputs and no selection
// pattern; the rm form is marked mayLoad by hand. The explicit operands are
// named $src1, $src3 and $src5.
6523multiclass SS42AI_pcmpestrm<string asm> {
6524  def rr : SS42AI<0x60, MRMSrcReg, (outs),
6525    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
6526    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6527    []>, Sched<[WritePCmpEStrM]>;
6528  let mayLoad = 1 in
6529  def rm : SS42AI<0x60, MRMSrcMem, (outs),
6530    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
6531    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6532    []>, Sched<[WritePCmpEStrM.Folded, WritePCmpEStrM.ReadAfterFold]>;
6533}
6534
// Results are produced implicitly in XMM0 and EFLAGS; EAX and EDX are
// implicit inputs. Only the VEX form is gated on HasAVX.
6535let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
6536  let Predicates = [HasAVX] in
6537  defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX, VEX_WIG;
6538  defm PCMPESTRM :  SS42AI_pcmpestrm<"pcmpestrm">;
6539}
6540
// pcmpistri: rr and rm forms with no explicit outputs and no selection
// pattern; the rm form is marked mayLoad by hand.
6541multiclass SS42AI_pcmpistri<string asm> {
6542  def rr : SS42AI<0x63, MRMSrcReg, (outs),
6543    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6544    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6545    []>, Sched<[WritePCmpIStrI]>;
6546  let mayLoad = 1 in
6547  def rm : SS42AI<0x63, MRMSrcMem, (outs),
6548    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6549    !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
6550    []>, Sched<[WritePCmpIStrI.Folded, WritePCmpIStrI.ReadAfterFold]>;
6551}
6552
// Results are produced implicitly in ECX and EFLAGS. Only the VEX form is
// gated on HasAVX.
6553let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
6554  let Predicates = [HasAVX] in
6555  defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX, VEX_WIG;
6556  defm PCMPISTRI  : SS42AI_pcmpistri<"pcmpistri">;
6557}
6558
// pcmpestri: rr and rm forms with no explicit outputs and no selection
// pattern; the rm form is marked mayLoad by hand. The explicit operands are
// named $src1, $src3 and $src5.
6559multiclass SS42AI_pcmpestri<string asm> {
6560  def rr : SS42AI<0x61, MRMSrcReg, (outs),
6561    (ins VR128:$src1, VR128:$src3, u8imm:$src5),
6562    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6563    []>, Sched<[WritePCmpEStrI]>;
6564  let mayLoad = 1 in
6565  def rm : SS42AI<0x61, MRMSrcMem, (outs),
6566    (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
6567    !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
6568    []>, Sched<[WritePCmpEStrI.Folded, WritePCmpEStrI.ReadAfterFold]>;
6569}
6570
// Results are produced implicitly in ECX and EFLAGS; EAX and EDX are implicit
// inputs. Only the VEX form is gated on HasAVX.
6571let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
6572  let Predicates = [HasAVX] in
6573  defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX, VEX_WIG;
6574  defm PCMPESTRI  : SS42AI_pcmpestri<"pcmpestri">;
6575}
6576
6577//===----------------------------------------------------------------------===//
6578// SSE4.2 - CRC Instructions
6579//===----------------------------------------------------------------------===//
6580
6581// No CRC instructions have AVX equivalents
6582
6583// crc intrinsic instruction
6584// This set of instructions is only rm; the only difference is the size
6585// of r and m.
//
// Register-source form: $src1 and $dst use RCOut, the data operand $src2 uses
// RCIn, and the instruction is selected from `Int` applied to both sources.
6586class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
6587                   RegisterClass RCIn, SDPatternOperator Int> :
6588  CRC32I<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
6589         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
6590         [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))]>,
6591         Sched<[WriteCRC32]>;
6592
// Memory-source crc32 form: $src1 and $dst use RCOut, and the data operand
// $src2 is read from memory (x86memop) via `load` in the selection pattern.
6593class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
6594                   X86MemOperand x86memop, SDPatternOperator Int> :
6595  CRC32I<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
6596         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
6597         [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))]>,
6598         Sched<[WriteCRC32.Folded, WriteCRC32.ReadAfterFold]>;
6599
// crc32 instruction definitions. The accumulator $src1 is tied to $dst.
// Naming is CRC32r<accumulator bits>{r,m}<data bits>. The 8-bit data forms use
// opcode 0xF0; all others use 0xF1 and are told apart by OpSize16 / OpSize32 /
// REX_W.
6600let Constraints = "$src1 = $dst" in {
6601  def CRC32r32m8  : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
6602                                 int_x86_sse42_crc32_32_8>;
6603  def CRC32r32r8  : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
6604                                 int_x86_sse42_crc32_32_8>;
6605  def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
6606                                 int_x86_sse42_crc32_32_16>, OpSize16;
6607  def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
6608                                 int_x86_sse42_crc32_32_16>, OpSize16;
6609  def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
6610                                 int_x86_sse42_crc32_32_32>, OpSize32;
6611  def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
6612                                 int_x86_sse42_crc32_32_32>, OpSize32;
6613  def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
6614                                 int_x86_sse42_crc32_64_64>, REX_W;
6615  def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
6616                                 int_x86_sse42_crc32_64_64>, REX_W;
  // 64-bit accumulator with 8-bit data: these use null_frag, so no selection
  // pattern is generated for them. hasSideEffects is therefore set by hand,
  // and mayLoad is set on the memory form only (the `let mayLoad` has no
  // braces, so it covers just the next def).
6617  let hasSideEffects = 0 in {
6618    let mayLoad = 1 in
6619    def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
6620                                   null_frag>, REX_W;
6621    def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
6622                                   null_frag>, REX_W;
6623  }
6624}
6625
6626//===----------------------------------------------------------------------===//
6627// SHA-NI Instructions
6628//===----------------------------------------------------------------------===//
6629
6630// FIXME: Is there a better scheduler class for SHA than WriteVecIMul?
// SHAI_binop - register/register (rr) and register/memory (rm) forms of a
// two-source SHA instruction matched through intrinsic IntId.
//   Opc       - opcode byte.
//   OpcodeStr - mnemonic prepended to the operand string.
//   sched     - scheduling class; the rm form uses its Folded/ReadAfterFold.
//   UsesXMM0  - when set, XMM0 is passed as a third intrinsic argument and is
//               spelled out in the asm string.
6631multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
6632                      X86FoldableSchedWrite sched, bit UsesXMM0 = 0> {
6633  def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
6634             (ins VR128:$src1, VR128:$src2),
6635             !if(UsesXMM0,
6636                 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6637                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
6638             [!if(UsesXMM0,
6639                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
6640                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>,
6641             T8PS, Sched<[sched]>;
6642
  // Same as rr, with $src2 folded from memory through the memop fragment.
6643  def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
6644             (ins VR128:$src1, i128mem:$src2),
6645             !if(UsesXMM0,
6646                 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
6647                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
6648             [!if(UsesXMM0,
6649                  (set VR128:$dst, (IntId VR128:$src1,
6650                    (memop addr:$src2), XMM0)),
6651                  (set VR128:$dst, (IntId VR128:$src1,
6652                    (memop addr:$src2))))]>, T8PS,
6653             Sched<[sched.Folded, sched.ReadAfterFold]>;
6654}
6655
// SHA instruction definitions. All are gated on HasSHA and tie $src1 to $dst.
6656let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
  // sha1rnds4 takes an additional 8-bit immediate ($src3), so it is defined
  // directly with Ii8 instead of through SHAI_binop.
6657  def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
6658                         (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6659                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6660                         [(set VR128:$dst,
6661                           (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
6662                            (i8 timm:$src3)))]>, TAPS,
6663                         Sched<[SchedWriteVecIMul.XMM]>;
6664  def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
6665                         (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6666                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6667                         [(set VR128:$dst,
6668                           (int_x86_sha1rnds4 VR128:$src1,
6669                            (memop addr:$src2),
6670                            (i8 timm:$src3)))]>, TAPS,
6671                         Sched<[SchedWriteVecIMul.XMM.Folded,
6672                                SchedWriteVecIMul.XMM.ReadAfterFold]>;
6673
6674  defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte,
6675                              SchedWriteVecIMul.XMM>;
6676  defm SHA1MSG1  : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1,
6677                              SchedWriteVecIMul.XMM>;
6678  defm SHA1MSG2  : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2,
6679                              SchedWriteVecIMul.XMM>;
6680
  // sha256rnds2 is the only SHAI_binop user with UsesXMM0 = 1; XMM0 is also
  // declared as an implicit use.
6681  let Uses=[XMM0] in
6682  defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2,
6683                                SchedWriteVecIMul.XMM, 1>;
6684
6685  defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1,
6686                               SchedWriteVecIMul.XMM>;
6687  defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2,
6688                               SchedWriteVecIMul.XMM>;
6689}
6690
6691// Aliases accepting sha256rnds2 written without the explicit %xmm0 operand.
// (The SHA256RNDS2rr/rm asm strings themselves spell out %xmm0.)
6692def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
6693                (SHA256RNDS2rr VR128:$dst, VR128:$src2), 0>;
6694def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
6695                (SHA256RNDS2rm VR128:$dst, i128mem:$src2), 0>;
6696
6697//===----------------------------------------------------------------------===//
6698// AES-NI Instructions
6699//===----------------------------------------------------------------------===//
6700
// AESI_binop_rm_int - rr and rm forms of a two-source AES instruction matched
// through intrinsic IntId.
//   ld_frag - load fragment folded into the rm form.
//   Is2Addr - selects the two-operand asm string; otherwise the three-operand
//             string is used.
//   RC / MemOp - register class and memory operand (default VR128 / i128mem).
// The defs pass an empty asm string because AsmString is supplied by the
// enclosing let.
6701multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
6702                             Intrinsic IntId, PatFrag ld_frag,
6703                             bit Is2Addr = 0, RegisterClass RC = VR128,
6704                             X86MemOperand MemOp = i128mem> {
6705  let AsmString = OpcodeStr#
6706                  !if(Is2Addr, "\t{$src2, $dst|$dst, $src2}",
6707                               "\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
6708    def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst),
6709                   (ins RC:$src1, RC:$src2), "",
6710                   [(set RC:$dst, (IntId RC:$src1, RC:$src2))]>,
6711                   Sched<[WriteAESDecEnc]>;
6712    def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst),
6713                   (ins RC:$src1, MemOp:$src2), "",
6714                   [(set RC:$dst, (IntId RC:$src1, (ld_frag addr:$src2)))]>,
6715                   Sched<[WriteAESDecEnc.Folded, WriteAESDecEnc.ReadAfterFold]>;
6716  }
6717}
6718
6719// Perform One Round of an AES Encryption/Decryption Flow
// VEX-encoded 128-bit forms: three-operand asm (Is2Addr defaults to 0) and the
// `load` fragment for the folded operand.
6720let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in {
6721  defm VAESENC          : AESI_binop_rm_int<0xDC, "vaesenc",
6722                         int_x86_aesni_aesenc, load>, VEX_4V, VEX_WIG;
6723  defm VAESENCLAST      : AESI_binop_rm_int<0xDD, "vaesenclast",
6724                         int_x86_aesni_aesenclast, load>, VEX_4V, VEX_WIG;
6725  defm VAESDEC          : AESI_binop_rm_int<0xDE, "vaesdec",
6726                         int_x86_aesni_aesdec, load>, VEX_4V, VEX_WIG;
6727  defm VAESDECLAST      : AESI_binop_rm_int<0xDF, "vaesdeclast",
6728                         int_x86_aesni_aesdeclast, load>, VEX_4V, VEX_WIG;
6729}
6730
// 256-bit VEX forms (VR256 / i256mem, VEX_L) using the *_256 intrinsics; gated
// on [NoVLX, HasVAES]. Opcodes and mnemonics match the 128-bit forms.
6731let Predicates = [NoVLX, HasVAES] in {
6732  defm VAESENCY         : AESI_binop_rm_int<0xDC, "vaesenc",
6733                         int_x86_aesni_aesenc_256, load, 0, VR256,
6734                         i256mem>, VEX_4V, VEX_L, VEX_WIG;
6735  defm VAESENCLASTY     : AESI_binop_rm_int<0xDD, "vaesenclast",
6736                         int_x86_aesni_aesenclast_256, load, 0, VR256,
6737                         i256mem>, VEX_4V, VEX_L, VEX_WIG;
6738  defm VAESDECY         : AESI_binop_rm_int<0xDE, "vaesdec",
6739                         int_x86_aesni_aesdec_256, load, 0, VR256,
6740                         i256mem>, VEX_4V, VEX_L, VEX_WIG;
6741  defm VAESDECLASTY     : AESI_binop_rm_int<0xDF, "vaesdeclast",
6742                         int_x86_aesni_aesdeclast_256, load, 0, VR256,
6743                         i256mem>, VEX_4V, VEX_L, VEX_WIG;
6744}
6745
// Legacy (non-VEX) forms: two-operand asm (Is2Addr = 1) with $src1 tied to
// $dst, and the `memop` fragment for the folded operand.
6746let Constraints = "$src1 = $dst" in {
6747  defm AESENC          : AESI_binop_rm_int<0xDC, "aesenc",
6748                         int_x86_aesni_aesenc, memop, 1>;
6749  defm AESENCLAST      : AESI_binop_rm_int<0xDD, "aesenclast",
6750                         int_x86_aesni_aesenclast, memop, 1>;
6751  defm AESDEC          : AESI_binop_rm_int<0xDE, "aesdec",
6752                         int_x86_aesni_aesdec, memop, 1>;
6753  defm AESDECLAST      : AESI_binop_rm_int<0xDF, "aesdeclast",
6754                         int_x86_aesni_aesdeclast, memop, 1>;
6755}
6756
6757// Perform the AES InvMixColumn Transformation
// Single-source instruction (opcode 0xDB) matched through
// int_x86_aesni_aesimc. The VEX forms fold `load`; the legacy forms below fold
// `memop`.
6758let Predicates = [HasAVX, HasAES] in {
6759  def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
6760      (ins VR128:$src1),
6761      "vaesimc\t{$src1, $dst|$dst, $src1}",
6762      [(set VR128:$dst,
6763        (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
6764      VEX, VEX_WIG;
6765  def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
6766      (ins i128mem:$src1),
6767      "vaesimc\t{$src1, $dst|$dst, $src1}",
6768      [(set VR128:$dst, (int_x86_aesni_aesimc (load addr:$src1)))]>,
6769      Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG;
6770}
// Legacy (non-VEX) encodings of the same operation.
6771def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
6772  (ins VR128:$src1),
6773  "aesimc\t{$src1, $dst|$dst, $src1}",
6774  [(set VR128:$dst,
6775    (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>;
6776def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
6777  (ins i128mem:$src1),
6778  "aesimc\t{$src1, $dst|$dst, $src1}",
6779  [(set VR128:$dst, (int_x86_aesni_aesimc (memop addr:$src1)))]>,
6780  Sched<[WriteAESIMC.Folded]>;
6781
6782// AES Round Key Generation Assist
// One vector source plus an 8-bit immediate ($src2), matched through
// int_x86_aesni_aeskeygenassist. The VEX forms fold `load`; the legacy forms
// below fold `memop`.
6783let Predicates = [HasAVX, HasAES] in {
6784  def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
6785      (ins VR128:$src1, u8imm:$src2),
6786      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6787      [(set VR128:$dst,
6788        (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
6789      Sched<[WriteAESKeyGen]>, VEX, VEX_WIG;
6790  def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
6791      (ins i128mem:$src1, u8imm:$src2),
6792      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6793      [(set VR128:$dst,
6794        (int_x86_aesni_aeskeygenassist (load addr:$src1), timm:$src2))]>,
6795      Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG;
6796}
// Legacy (non-VEX) encodings of the same operation.
6797def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
6798  (ins VR128:$src1, u8imm:$src2),
6799  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6800  [(set VR128:$dst,
6801    (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
6802  Sched<[WriteAESKeyGen]>;
6803def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
6804  (ins i128mem:$src1, u8imm:$src2),
6805  "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6806  [(set VR128:$dst,
6807    (int_x86_aesni_aeskeygenassist (memop addr:$src1), timm:$src2))]>,
6808  Sched<[WriteAESKeyGen.Folded]>;
6809
6810//===----------------------------------------------------------------------===//
6811// PCLMUL Instructions
6812//===----------------------------------------------------------------------===//
6813
6814// Immediate transform to help with commuting.
// Swaps the upper and lower 4-bit halves of the 8-bit immediate and returns
// the result as an i8 immediate.
6815def PCLMULCommuteImm : SDNodeXForm<timm, [{
6816  uint8_t Imm = N->getZExtValue();
6817  return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N));
6818}]>;
6819
6820// SSE carry-less Multiplication instructions
// Legacy two-operand forms, gated on [NoAVX, HasPCLMUL]; $src3 is an 8-bit
// immediate passed straight to int_x86_pclmulqdq.
6821let Predicates = [NoAVX, HasPCLMUL] in {
6822  let Constraints = "$src1 = $dst" in {
6823    let isCommutable = 1 in
6824    def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
6825              (ins VR128:$src1, VR128:$src2, u8imm:$src3),
6826              "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6827              [(set VR128:$dst,
6828                (int_x86_pclmulqdq VR128:$src1, VR128:$src2, timm:$src3))]>,
6829                Sched<[WriteCLMul]>;
6830
6831    def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
6832              (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
6833              "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
6834              [(set VR128:$dst,
6835                 (int_x86_pclmulqdq VR128:$src1, (memop addr:$src2),
6836                  timm:$src3))]>,
6837              Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
6838  } // Constraints = "$src1 = $dst"
6839
  // A load in the first intrinsic operand is still folded: the sources are
  // swapped and the immediate is rewritten with PCLMULCommuteImm.
6840  def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1,
6841                                (i8 timm:$src3)),
6842            (PCLMULQDQrm VR128:$src1, addr:$src2,
6843                          (PCLMULCommuteImm timm:$src3))>;
6844} // Predicates = [NoAVX, HasPCLMUL]
6845
6846// SSE aliases
// Defines pclmul{hq,lq}{hq,lq}dq as aliases of PCLMULQDQrr/rm with a fixed
// immediate: bit 4 is set when LO is "hq" and bit 0 is set when HI is "hq".
6847foreach HI = ["hq","lq"] in
6848foreach LO = ["hq","lq"] in {
6849  def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
6850                  (PCLMULQDQrr VR128:$dst, VR128:$src,
6851                   !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
6852  def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
6853                  (PCLMULQDQrm VR128:$dst, i128mem:$src,
6854                   !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
6855}
6856
6857// AVX carry-less Multiplication instructions
// vpclmulqdq - three-operand rr and rm forms over register class RC, matched
// through intrinsic IntId. MemOp is the memory operand type and LdFrag the
// load fragment folded into the rm form.
6858multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
6859                      PatFrag LdFrag, Intrinsic IntId> {
6860  let isCommutable = 1 in
6861  def rr : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst),
6862            (ins RC:$src1, RC:$src2, u8imm:$src3),
6863            "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
6864            [(set RC:$dst,
6865              (IntId RC:$src1, RC:$src2, timm:$src3))]>,
6866            Sched<[WriteCLMul]>;
6867
6868  def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst),
6869            (ins RC:$src1, MemOp:$src2, u8imm:$src3),
6870            "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
6871            [(set RC:$dst,
6872               (IntId RC:$src1, (LdFrag addr:$src2), timm:$src3))]>,
6873            Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
6874
6875  // We can commute a load in the first operand by swapping the sources and
6876  // rotating the immediate.
6877  def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 timm:$src3)),
6878            (!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2,
6879                                           (PCLMULCommuteImm timm:$src3))>;
6880}
6881
// 128-bit VEX form, gated on [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL].
6882let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in
6883defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, load,
6884                             int_x86_pclmulqdq>, VEX_4V, VEX_WIG;
6885
// 256-bit VEX form (VEX_L), gated on [NoVLX, HasVPCLMULQDQ].
6886let Predicates = [NoVLX, HasVPCLMULQDQ] in
6887defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load,
6888                              int_x86_pclmulqdq_256>, VEX_4V, VEX_L, VEX_WIG;
6889
// vpclmulqdq_aliases_impl - defines one vpclmul<Hi><Lo>dq alias for the rr and
// rm instructions named InstStr#"rr" / InstStr#"rm". The fixed immediate has
// bit 4 set when Lo is "hq" and bit 0 set when Hi is "hq".
6890multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC,
6891                                   X86MemOperand MemOp, string Hi, string Lo> {
6892  def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6893                  (!cast<Instruction>(InstStr # "rr") RC:$dst, RC:$src1, RC:$src2,
6894                        !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
6895  def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
6896                  (!cast<Instruction>(InstStr # "rm") RC:$dst, RC:$src1, MemOp:$src2,
6897                        !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
6898}
6899
// vpclmulqdq_aliases - instantiates vpclmulqdq_aliases_impl for all four
// Hi/Lo combinations of "hq" and "lq".
6900multiclass vpclmulqdq_aliases<string InstStr, RegisterClass RC,
6901                              X86MemOperand MemOp> {
6902  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "hq">;
6903  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "lq">;
6904  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "hq">;
6905  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "lq">;
6906}
6907
6908// AVX aliases
// One set for the 128-bit instructions and one for the 256-bit instructions.
6909defm : vpclmulqdq_aliases<"VPCLMULQDQ", VR128, i128mem>;
6910defm : vpclmulqdq_aliases<"VPCLMULQDQY", VR256, i256mem>;
6911
6912//===----------------------------------------------------------------------===//
6913// SSE4A Instructions
6914//===----------------------------------------------------------------------===//
6915
// Everything in this block is gated on HasSSE4A.
6916let Predicates = [HasSSE4A] in {
6917
// EXTRQ / INSERTQ: register-only forms with $src tied to $dst. The *I variants
// take two 8-bit immediates ($len, $idx); the others take a VR128 $mask and
// are matched through the int_x86_sse4a_* intrinsics.
6918let ExeDomain = SSEPackedInt in {
6919let Constraints = "$src = $dst" in {
6920def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
6921                 (ins VR128:$src, u8imm:$len, u8imm:$idx),
6922                 "extrq\t{$idx, $len, $src|$src, $len, $idx}",
6923                 [(set VR128:$dst, (X86extrqi VR128:$src, timm:$len,
6924                                    timm:$idx))]>,
6925                 PD, Sched<[SchedWriteVecALU.XMM]>;
6926def EXTRQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
6927              (ins VR128:$src, VR128:$mask),
6928              "extrq\t{$mask, $src|$src, $mask}",
6929              [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
6930                                 VR128:$mask))]>,
6931              PD, Sched<[SchedWriteVecALU.XMM]>;
6932
6933def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
6934                   (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
6935                   "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
6936                   [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
6937                                      timm:$len, timm:$idx))]>,
6938                   XD, Sched<[SchedWriteVecALU.XMM]>;
6939def INSERTQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
6940                 (ins VR128:$src, VR128:$mask),
6941                 "insertq\t{$mask, $src|$src, $mask}",
6942                 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
6943                                    VR128:$mask))]>,
6944                 XD, Sched<[SchedWriteVecALU.XMM]>;
6945}
6946} // ExeDomain = SSEPackedInt
6947
6948// Non-temporal (unaligned) scalar stores.
// The instructions are defined without patterns; the Pat records below select
// them for nontemporalstore of FR32/FR64 after copying the value to VR128.
6949let AddedComplexity = 400 in { // Prefer non-temporal versions
6950let hasSideEffects = 0, mayStore = 1, SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in {
6951def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
6952                "movntss\t{$src, $dst|$dst, $src}", []>, XS;
6953
6954def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
6955                "movntsd\t{$src, $dst|$dst, $src}", []>, XD;
6956} // SchedRW
6957
6958def : Pat<(nontemporalstore FR32:$src, addr:$dst),
6959          (MOVNTSS addr:$dst, (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
6960
6961def : Pat<(nontemporalstore FR64:$src, addr:$dst),
6962          (MOVNTSD addr:$dst, (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
6963
6964} // AddedComplexity
6965} // HasSSE4A
6966
6967//===----------------------------------------------------------------------===//
6968// AVX Instructions
6969//===----------------------------------------------------------------------===//
6970
6971//===----------------------------------------------------------------------===//
6972// VBROADCAST - Load from memory and broadcast to all elements of the
6973//              destination operand
6974//
// avx_broadcast_rm - memory-source broadcast. The pattern produces a value of
// type VT in RC from bcast_frag applied to addr:$src; Sched is the scheduling
// class.
6975class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
6976                           X86MemOperand x86memop, ValueType VT,
6977                           PatFrag bcast_frag, SchedWrite Sched> :
6978  AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
6979        !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
6980        [(set RC:$dst, (VT (bcast_frag addr:$src)))]>,
6981        Sched<[Sched]>, VEX;
6982
6983// AVX2 adds register forms
// avx2_broadcast_rr - register-source broadcast. The source is always VR128
// (viewed as OpVT); the result is X86VBroadcast of it with type ResVT in RC.
6984class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
6985                        ValueType ResVT, ValueType OpVT, SchedWrite Sched> :
6986  AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
6987         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
6988         [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
6989         Sched<[Sched]>, VEX;
6990
// Memory-source scalar broadcasts, gated on [HasAVX, NoVLX]: vbroadcastss to
// VR128/VR256 and vbroadcastsd to VR256 only.
6991let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in {
6992  def VBROADCASTSSrm  : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
6993                                         f32mem, v4f32, X86VBroadcastld32,
6994                                         SchedWriteFShuffle.XMM.Folded>;
6995  def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
6996                                         f32mem, v8f32, X86VBroadcastld32,
6997                                         SchedWriteFShuffle.XMM.Folded>, VEX_L;
6998}
6999let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in
7000def VBROADCASTSDYrm  : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
7001                                        v4f64, X86VBroadcastld64,
7002                                        SchedWriteFShuffle.XMM.Folded>, VEX_L;
7003
// Register-source forms of the same broadcasts, gated on [HasAVX2, NoVLX].
7004let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in {
7005  def VBROADCASTSSrr  : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
7006                                          v4f32, v4f32, SchedWriteFShuffle.XMM>;
7007  def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
7008                                          v8f32, v4f32, WriteFShuffle256>, VEX_L;
7009}
7010let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in
7011def VBROADCASTSDYrr  : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
7012                                         v4f64, v2f64, WriteFShuffle256>, VEX_L;
7013
7014//===----------------------------------------------------------------------===//
7015// VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both
7016//                  halves of a 256-bit vector.
7017//
// Both definitions have an empty pattern list, so mayLoad and hasSideEffects
// are given explicitly. The integer form is gated on HasAVX2, the FP form on
// HasAVX.
7018let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in
7019def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
7020                           (ins i128mem:$src),
7021                           "vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
7022                           Sched<[WriteShuffleLd]>, VEX, VEX_L;
7023
7024let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX],
7025    ExeDomain = SSEPackedSingle in
7026def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
7027                           (ins f128mem:$src),
7028                           "vbroadcastf128\t{$src, $dst|$dst, $src}", []>,
7029                           Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L;
7030
// Select VBROADCASTF128 for X86SubVBroadcastld128 of every 256-bit vector
// type, floating-point and integer alike, when [HasAVX, NoVLX] holds.
7031let Predicates = [HasAVX, NoVLX] in {
7032def : Pat<(v4f64 (X86SubVBroadcastld128 addr:$src)),
7033          (VBROADCASTF128 addr:$src)>;
7034def : Pat<(v8f32 (X86SubVBroadcastld128 addr:$src)),
7035          (VBROADCASTF128 addr:$src)>;
7036// NOTE: We're using FP instructions here, but execution domain fixing can
7037// convert to integer when profitable.
7038def : Pat<(v4i64 (X86SubVBroadcastld128 addr:$src)),
7039          (VBROADCASTF128 addr:$src)>;
7040def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)),
7041          (VBROADCASTF128 addr:$src)>;
7042def : Pat<(v16i16 (X86SubVBroadcastld128 addr:$src)),
7043          (VBROADCASTF128 addr:$src)>;
7044def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)),
7045          (VBROADCASTF128 addr:$src)>;
7046}
7047
7048//===----------------------------------------------------------------------===//
7049// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
7050//
7051
// rr and rm forms of vperm2f128 with an 8-bit immediate ($src3). Both have an
// empty pattern list; only the rr form is marked commutable.
7052let ExeDomain = SSEPackedSingle in {
7053let isCommutable = 1 in
7054def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
7055          (ins VR256:$src1, VR256:$src2, u8imm:$src3),
7056          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
7057          VEX_4V, VEX_L, Sched<[WriteFShuffle256]>;
7058def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
7059          (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
7060          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
7061          VEX_4V, VEX_L, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
7062}
7063
7064// Immediate transform to help with commuting.
// Returns the immediate XORed with 0x22 (toggles bit 1 and bit 5) as an i8
// immediate.
7065def Perm2XCommuteImm : SDNodeXForm<timm, [{
7066  return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N));
7067}]>;
7068
// vperm2x128_lowering - selection patterns mapping X86VPerm2x128 on type VT to
// the instructions named InstrStr#"rr" and InstrStr#"rm".
//   InstrStr   - base record name of the instruction pair.
//   VT         - 256-bit value type being matched.
//   memop_frag - load fragment folded into the rm form.
// The suffixes are quoted string literals, matching the other !cast uses in
// this file, rather than relying on the paste operator treating an undefined
// right-hand identifier as a verbatim string.
7069multiclass vperm2x128_lowering<string InstrStr, ValueType VT, PatFrag memop_frag> {
7070  def : Pat<(VT (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))),
7071            (!cast<Instruction>(InstrStr#"rr") VR256:$src1, VR256:$src2, timm:$imm)>;
7072  def : Pat<(VT (X86VPerm2x128 VR256:$src1, (memop_frag addr:$src2), (i8 timm:$imm))),
7073            (!cast<Instruction>(InstrStr#"rm") VR256:$src1, addr:$src2, timm:$imm)>;
7074  // Pattern with load in other operand.
  // The sources are swapped and the immediate is rewritten by Perm2XCommuteImm.
7075  def : Pat<(VT (X86VPerm2x128 (memop_frag addr:$src2), VR256:$src1, (i8 timm:$imm))),
7076            (!cast<Instruction>(InstrStr#"rm") VR256:$src1, addr:$src2,
7077                                             (Perm2XCommuteImm timm:$imm))>;
7078}
7079
// Floating-point types use VPERM2F128 whenever HasAVX holds.
7080let Predicates = [HasAVX] in {
7081  defm : vperm2x128_lowering<"VPERM2F128", v4f64, loadv4f64>;
7082  defm : vperm2x128_lowering<"VPERM2F128", v8f32, loadv8f32>;
7083}
7084
// Integer types use VPERM2F128 only under HasAVX1Only.
7085let Predicates = [HasAVX1Only] in {
7086  defm : vperm2x128_lowering<"VPERM2F128", v4i64,  loadv4i64>;
7087  defm : vperm2x128_lowering<"VPERM2F128", v8i32,  loadv8i32>;
7088  defm : vperm2x128_lowering<"VPERM2F128", v16i16, loadv16i16>;
7089  defm : vperm2x128_lowering<"VPERM2F128", v32i8,  loadv32i8>;
7090}
7091
7092//===----------------------------------------------------------------------===//
7093// VINSERTF128 - Insert packed floating-point values
7094//
// rr and rm forms of vinsertf128: a VR256 source, a 128-bit source (register
// or f128mem) and an 8-bit immediate. Both have an empty pattern list, so
// hasSideEffects and mayLoad are given explicitly.
7095let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
7096def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
7097          (ins VR256:$src1, VR128:$src2, u8imm:$src3),
7098          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7099          []>, Sched<[WriteFShuffle256]>, VEX_4V, VEX_L;
7100let mayLoad = 1 in
7101def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
7102          (ins VR256:$src1, f128mem:$src2, u8imm:$src3),
7103          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7104          []>, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
7105}
7106
7107// To create a 256-bit all ones value, we should produce VCMPTRUEPS
7108// with a YMM register containing zero.
7109// FIXME: Avoid producing vxorps to clear the fake inputs.
// Both compare inputs are AVX_SET0 and the compare immediate is 0xf.
7110let Predicates = [HasAVX1Only] in {
7111def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>;
7112}
7113
// vinsert_lowering - selection patterns for vinsert128_insert of a 128-bit
// `From` value into a 256-bit `To` value.
//   InstrStr       - base name of the insert instruction (rr / rm forms).
//   PermStr        - base name of the perm2x128 instruction whose rm form is
//                    used when the `To` vector is the loaded operand.
//   frommemop_frag - load fragment for a folded `From` operand.
//   tomemop_frag   - load fragment for a folded `To` operand.
// The suffixes are quoted string literals, matching the other !cast uses in
// this file, rather than relying on the paste operator treating an undefined
// right-hand identifier as a verbatim string.
7114multiclass vinsert_lowering<string InstrStr, string PermStr,
7115                            ValueType From, ValueType To,
7116                            PatFrag frommemop_frag, PatFrag tomemop_frag> {
7117  def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2),
7118                                   (iPTR imm)),
7119            (!cast<Instruction>(InstrStr#"rr") VR256:$src1, VR128:$src2,
7120                                       (INSERT_get_vinsert128_imm VR256:$ins))>;
7121  def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
7122                                    (From (frommemop_frag addr:$src2)),
7123                                    (iPTR imm)),
7124            (!cast<Instruction>(InstrStr#"rm") VR256:$src1, addr:$src2,
7125                                       (INSERT_get_vinsert128_imm VR256:$ins))>;
7126  // Folding "To" vector - convert to perm2x128 and commute inputs.
7127  def : Pat<(vinsert128_insert:$ins (To (tomemop_frag addr:$src1)),
7128                                    (From VR128:$src2),
7129                                    (iPTR imm)),
7130            (!cast<Instruction>(PermStr#"rm")
7131              (INSERT_SUBREG (To (IMPLICIT_DEF)), VR128:$src2, sub_xmm),
7132              addr:$src1, (INSERT_get_vperm2x128_commutedimm VR256:$ins))>;
7133}
7134
// Floating-point types use VINSERTF128 / VPERM2F128 under [HasAVX, NoVLX].
7135let Predicates = [HasAVX, NoVLX] in {
7136  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v4f32, v8f32, loadv4f32, loadv8f32>;
7137  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v2f64, v4f64, loadv2f64, loadv4f64>;
7138}
7139
// Integer types use the same FP instructions only under HasAVX1Only.
7140let Predicates = [HasAVX1Only] in {
7141  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v2i64, v4i64,  loadv2i64, loadv4i64>;
7142  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v4i32, v8i32,  loadv4i32, loadv8i32>;
7143  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v8i16, v16i16, loadv8i16, loadv16i16>;
7144  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v16i8, v32i8,  loadv16i8, loadv32i8>;
7145}
7146
7147//===----------------------------------------------------------------------===//
7148// VEXTRACTF128 - Extract packed floating-point values
7149//
// Register-destination (rr) and memory-destination (mr) forms of vextractf128:
// a VR256 source plus an 8-bit immediate. Both have an empty pattern list, so
// hasSideEffects and mayStore are given explicitly.
7150let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
7151def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
7152          (ins VR256:$src1, u8imm:$src2),
7153          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7154          []>, Sched<[WriteFShuffle256]>, VEX, VEX_L;
7155let mayStore = 1 in
7156def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
7157          (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
7158          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7159          []>, Sched<[WriteFStoreX]>, VEX, VEX_L;
7160}
7161
// vextract_lowering - selection patterns for vextract128_extract of a 128-bit
// `To` value out of a 256-bit `From` value.
//   InstrStr - base name of the extract instruction; the rr form produces the
//              value in a register, the mr form is used when the extracted
//              value is stored straight to addr:$dst.
// The suffixes are quoted string literals, matching the other !cast uses in
// this file, rather than relying on the paste operator treating an undefined
// right-hand identifier as a verbatim string.
7162multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
7163  def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
7164            (To (!cast<Instruction>(InstrStr#"rr")
7165                                    (From VR256:$src1),
7166                                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
7167  def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1),
7168                                                 (iPTR imm))), addr:$dst),
7169            (!cast<Instruction>(InstrStr#"mr") addr:$dst, VR256:$src1,
7170             (EXTRACT_get_vextract128_imm VR128:$ext))>;
7171}
7172
7173// AVX1 patterns
// Floating-point types use VEXTRACTF128 under [HasAVX, NoVLX].
7174let Predicates = [HasAVX, NoVLX] in {
7175  defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>;
7176  defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>;
7177}
7178
// Integer types use VEXTRACTF128 only under HasAVX1Only.
7179let Predicates = [HasAVX1Only] in {
7180  defm : vextract_lowering<"VEXTRACTF128", v4i64,  v2i64>;
7181  defm : vextract_lowering<"VEXTRACTF128", v8i32,  v4i32>;
7182  defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
7183  defm : vextract_lowering<"VEXTRACTF128", v32i8,  v16i8>;
7184}
7185
7186//===----------------------------------------------------------------------===//
7187// VMASKMOV - Conditional SIMD Packed Loads and Stores
7188//
// avx_movmask_rm - 128-bit and 256-bit load (rm, Yrm; opcode opc_rm) and store
// (mr, Ymr; opcode opc_mr) forms matched through intrinsics.
//   IntLd / IntLd256 - load intrinsics; they receive addr:$src2 first and the
//                      register $src1 second.
//   IntSt / IntSt256 - store intrinsics; they receive addr:$dst, $src1, $src2.
//   schedX / schedY  - scheduling classes for the 128-bit and 256-bit forms
//                      (.RM for loads, .MR for stores).
7189multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
7190                          Intrinsic IntLd, Intrinsic IntLd256,
7191                          Intrinsic IntSt, Intrinsic IntSt256,
7192                          X86SchedWriteMaskMove schedX,
7193                          X86SchedWriteMaskMove schedY> {
7194  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
7195             (ins VR128:$src1, f128mem:$src2),
7196             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7197             [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
7198             VEX_4V, Sched<[schedX.RM]>;
7199  def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
7200             (ins VR256:$src1, f256mem:$src2),
7201             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7202             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
7203             VEX_4V, VEX_L, Sched<[schedY.RM]>;
7204  def mr  : AVX8I<opc_mr, MRMDestMem, (outs),
7205             (ins f128mem:$dst, VR128:$src1, VR128:$src2),
7206             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7207             [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>,
7208             VEX_4V, Sched<[schedX.MR]>;
7209  def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
7210             (ins f256mem:$dst, VR256:$src1, VR256:$src2),
7211             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7212             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
7213             VEX_4V, VEX_L, Sched<[schedY.MR]>;
7214}
7215
// Single-precision flavour: load opcode 0x2C, store opcode 0x2E, 32-bit
// mask-move scheduling classes.
7216let ExeDomain = SSEPackedSingle in
7217defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
7218                                 int_x86_avx_maskload_ps,
7219                                 int_x86_avx_maskload_ps_256,
7220                                 int_x86_avx_maskstore_ps,
7221                                 int_x86_avx_maskstore_ps_256,
7222                                 WriteFMaskMove32, WriteFMaskMove32Y>;
// Double-precision flavour: load opcode 0x2D, store opcode 0x2F, 64-bit
// mask-move scheduling classes.
7223let ExeDomain = SSEPackedDouble in
7224defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
7225                                 int_x86_avx_maskload_pd,
7226                                 int_x86_avx_maskload_pd_256,
7227                                 int_x86_avx_maskstore_pd,
7228                                 int_x86_avx_maskstore_pd_256,
7229                                 WriteFMaskMove64, WriteFMaskMove64Y>;
7230
7231//===----------------------------------------------------------------------===//
7232// AVX_VNNI
7233//===----------------------------------------------------------------------===//
// avx_vnni_rm - VEX-encoded AVX-VNNI dot-product instructions.
// Defines rr / rm (128-bit) and Yrr / Yrm (256-bit) forms of one opcode.
// $src1 is tied to $dst, so the assembly strings print only $src2 and $src3.
// IsCommutable is applied to the register forms only.
//
// Scheduling: the 128-bit forms use SchedWriteVecIMul.XMM and the 256-bit
// forms use SchedWriteVecIMul.YMM. The memory forms use the folded-load write
// followed by one ReadAfterFold per register source ($src1 and $src2), so the
// register operands are not required to be ready before the load issues.
7234let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI], Constraints = "$src1 = $dst",
7235    ExplicitVEXPrefix = 1, checkVEXPredicate = 1 in
7236multiclass avx_vnni_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
7237                       bit IsCommutable> {
7238  let isCommutable = IsCommutable in
7239  def rr  : AVX8I<opc, MRMSrcReg, (outs VR128:$dst),
7240             (ins VR128:$src1, VR128:$src2, VR128:$src3),
7241             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
7242             [(set VR128:$dst, (v4i32 (OpNode VR128:$src1,
7243                                       VR128:$src2, VR128:$src3)))]>,
7244             VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
7245
7246  def rm  : AVX8I<opc, MRMSrcMem, (outs VR128:$dst),
7247             (ins VR128:$src1, VR128:$src2, i128mem:$src3),
7248             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
7249             [(set VR128:$dst, (v4i32 (OpNode VR128:$src1, VR128:$src2,
7250                                      (loadv4i32 addr:$src3))))]>,
7251             VEX_4V, Sched<[SchedWriteVecIMul.XMM.Folded,
                            SchedWriteVecIMul.XMM.ReadAfterFold,
                            SchedWriteVecIMul.XMM.ReadAfterFold]>;
7252
7253  let isCommutable = IsCommutable in
7254  def Yrr  : AVX8I<opc, MRMSrcReg, (outs VR256:$dst),
7255             (ins VR256:$src1, VR256:$src2, VR256:$src3),
7256             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
7257             [(set VR256:$dst, (v8i32 (OpNode VR256:$src1,
7258                                       VR256:$src2, VR256:$src3)))]>,
7259             VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
7260
7261  def Yrm  : AVX8I<opc, MRMSrcMem, (outs VR256:$dst),
7262             (ins VR256:$src1, VR256:$src2, i256mem:$src3),
7263             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
7264             [(set VR256:$dst, (v8i32 (OpNode VR256:$src1, VR256:$src2,
7265                                      (loadv8i32 addr:$src3))))]>,
7266             VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM.Folded,
                                   SchedWriteVecIMul.YMM.ReadAfterFold,
                                   SchedWriteVecIMul.YMM.ReadAfterFold]>;
7267}
7268
// Instantiate the four AVX-VNNI opcodes (0x50-0x53). The last argument is the
// IsCommutable bit: cleared for the BUSD forms, set for the WSSD forms.
7269defm VPDPBUSD   : avx_vnni_rm<0x50, "vpdpbusd", X86Vpdpbusd, 0>;
7270defm VPDPBUSDS  : avx_vnni_rm<0x51, "vpdpbusds", X86Vpdpbusds, 0>;
7271defm VPDPWSSD   : avx_vnni_rm<0x52, "vpdpwssd",  X86Vpdpwssd, 1>;
7272defm VPDPWSSDS  : avx_vnni_rm<0x53, "vpdpwssds", X86Vpdpwssds, 1>;
7273
// X86vpmaddwd_su - Matches an X86vpmaddwd node only when that node has exactly
// one use (the "_su" suffix presumably stands for "single use").
7274def X86vpmaddwd_su : PatFrag<(ops node:$lhs, node:$rhs),
7275                             (X86vpmaddwd node:$lhs, node:$rhs), [{
7276  return N->hasOneUse();
7277}]>;
7278
// Select add(acc, vpmaddwd(a, b)) as a single VPDPWSSD when the vpmaddwd
// result has no other user (enforced by X86vpmaddwd_su). The accumulator
// becomes $src1; register and folded-load variants exist for both the
// 256-bit and 128-bit vector types.
7279let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI] in {
7280  def : Pat<(v8i32 (add VR256:$src1,
7281                        (X86vpmaddwd_su VR256:$src2, VR256:$src3))),
7282            (VPDPWSSDYrr VR256:$src1, VR256:$src2, VR256:$src3)>;
7283  def : Pat<(v8i32 (add VR256:$src1,
7284                        (X86vpmaddwd_su VR256:$src2, (load addr:$src3)))),
7285            (VPDPWSSDYrm VR256:$src1, VR256:$src2, addr:$src3)>;
7286  def : Pat<(v4i32 (add VR128:$src1,
7287                        (X86vpmaddwd_su VR128:$src2, VR128:$src3))),
7288            (VPDPWSSDrr VR128:$src1, VR128:$src2, VR128:$src3)>;
7289  def : Pat<(v4i32 (add VR128:$src1,
7290                        (X86vpmaddwd_su VR128:$src2, (load addr:$src3)))),
7291            (VPDPWSSDrm VR128:$src1, VR128:$src2, addr:$src3)>;
7292}
7293
7294//===----------------------------------------------------------------------===//
7295// VPERMIL - Permute Single and Double Floating-Point Values
7296//
7297
// avx_permil - Defines the four forms of a VPERMILPS/VPERMILPD-style permute
// for one register class:
//   rr / rm : variable permute (X86VPermilpv); the control is a register or a
//             load through x86memop_i, typed i_vt. Uses opcode opc_rm.
//   ri / mi : immediate permute (X86VPermilpi) of a register or of a load
//             through x86memop_f. Uses opcode opc_rmi.
// f_vt is the data vector type, i_vt the control vector type. `sched` is used
// for the immediate forms and `varsched` for the variable forms.
7298multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
7299                      RegisterClass RC, X86MemOperand x86memop_f,
7300                      X86MemOperand x86memop_i,
7301                      ValueType f_vt, ValueType i_vt,
7302                      X86FoldableSchedWrite sched,
7303                      X86FoldableSchedWrite varsched> {
7304  let Predicates = [HasAVX, NoVLX] in {
7305    def rr  : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
7306               (ins RC:$src1, RC:$src2),
7307               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7308               [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V,
7309               Sched<[varsched]>;
    // NOTE(review): the write comes from `varsched` but the ReadAfterFold from
    // `sched`; confirm this mix is intentional.
7310    def rm  : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
7311               (ins RC:$src1, x86memop_i:$src2),
7312               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7313               [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
7314                              (i_vt (load addr:$src2)))))]>, VEX_4V,
7315               Sched<[varsched.Folded, sched.ReadAfterFold]>;
7316
7317    def ri  : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
7318             (ins RC:$src1, u8imm:$src2),
7319             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7320             [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 timm:$src2))))]>, VEX,
7321             Sched<[sched]>;
    // The only non-immediate source is the load, so no ReadAfterFold is listed.
7322    def mi  : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
7323             (ins x86memop_f:$src1, u8imm:$src2),
7324             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7325             [(set RC:$dst,
7326               (f_vt (X86VPermilpi (load addr:$src1), (i8 timm:$src2))))]>, VEX,
7327             Sched<[sched.Folded]>;
7328  }// Predicates = [HasAVX, NoVLX]
7329}
7330
// VPERMILPS: variable opcode 0x0C, immediate opcode 0x04. The Y instantiation
// is the 256-bit variant (VR256, VEX_L).
7331let ExeDomain = SSEPackedSingle in {
7332  defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
7333                               v4f32, v4i32, SchedWriteFShuffle.XMM,
7334                               SchedWriteFVarShuffle.XMM>;
7335  defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
7336                               v8f32, v8i32, SchedWriteFShuffle.YMM,
7337                               SchedWriteFVarShuffle.YMM>, VEX_L;
7338}
// VPERMILPD: variable opcode 0x0D, immediate opcode 0x05.
7339let ExeDomain = SSEPackedDouble in {
7340  defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
7341                               v2f64, v2i64, SchedWriteFShuffle.XMM,
7342                               SchedWriteFVarShuffle.XMM>;
7343  defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
7344                               v4f64, v4i64, SchedWriteFShuffle.YMM,
7345                               SchedWriteFVarShuffle.YMM>, VEX_L;
7346}
7347
7348//===----------------------------------------------------------------------===//
7349// VZERO - Zero YMM registers
7350// Note: These instructions do not affect YMM16-YMM31.
7351//
7352
// Both instructions share opcode 0x77 and differ only in VEX_L: VZEROALL sets
// it, VZEROUPPER does not. Each is modelled as defining YMM0-YMM15 and is
// selected from its intrinsic.
7353let SchedRW = [WriteSystem] in {
7354let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
7355            YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
7356  // Zero All YMM registers
7357  def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
7358                  [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L,
7359                  Requires<[HasAVX]>, VEX_WIG;
7360
7361  // Zero Upper bits of YMM registers
7362  def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
7363                     [(int_x86_avx_vzeroupper)]>, PS, VEX,
7364                     Requires<[HasAVX]>, VEX_WIG;
7365} // Defs
7366} // SchedRW
7367
7368//===----------------------------------------------------------------------===//
7369// Half precision conversion instructions
7370//
7371
// f16c_ph2ps - VCVTPH2PS (half -> single) for destination class RC.
//   rr : source is always a VR128 register; selected from X86any_cvtph2ps.
//   rm : memory source through x86memop. It carries no selection pattern, so
//        mayLoad / hasSideEffects are set explicitly; the load-folding
//        patterns are separate `Pat` records.
7372multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop,
7373                      X86FoldableSchedWrite sched> {
7374  def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
7375             "vcvtph2ps\t{$src, $dst|$dst, $src}",
7376             [(set RC:$dst, (X86any_cvtph2ps VR128:$src))]>,
7377             T8PD, VEX, Sched<[sched]>;
7378  let hasSideEffects = 0, mayLoad = 1 in
7379  def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
7380             "vcvtph2ps\t{$src, $dst|$dst, $src}",
7381             []>, T8PD, VEX, Sched<[sched.Folded]>;
7382}
7383
// f16c_ps2ph - VCVTPS2PH (single -> half) for source class RC.
//   rr : result is always a VR128 register; $src2 is the 8-bit immediate
//        passed through to X86any_cvtps2ph.
//   mr : stores the result through x86memop. It carries no selection pattern,
//        so mayStore / hasSideEffects are set explicitly.
// RR and MR are the scheduling writes for the register and store forms.
7384multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
7385                      SchedWrite RR, SchedWrite MR> {
7386  def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
7387               (ins RC:$src1, i32u8imm:$src2),
7388               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
7389               [(set VR128:$dst, (X86any_cvtps2ph RC:$src1, timm:$src2))]>,
7390               TAPD, VEX, Sched<[RR]>;
7391  let hasSideEffects = 0, mayStore = 1 in
7392  def mr : Ii8<0x1D, MRMDestMem, (outs),
7393               (ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
7394               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
7395               TAPD, VEX, Sched<[MR]>;
7396}
7397
// F16C instantiations and the load/store folding patterns that the rm / mr
// records (which have empty pattern lists) rely on.
7398let Predicates = [HasF16C, NoVLX] in {
  // The 128-bit forms use a 64-bit memory operand, the 256-bit forms a
  // 128-bit one.
7399  defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>, SIMD_EXC;
7400  defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L, SIMD_EXC;
7401  defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH,
7402                               WriteCvtPS2PHSt>, SIMD_EXC;
7403  defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY,
7404                               WriteCvtPS2PHYSt>, VEX_L, SIMD_EXC;
7405
7406  // Pattern match vcvtph2ps of a scalar i64 load.
7407  def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
7408            (VCVTPH2PSrm addr:$src)>;
7409  def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16
7410              (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
7411            (VCVTPH2PSrm addr:$src)>;
  // 256-bit result: fold a full v8i16 load.
7412  def : Pat<(v8f32 (X86any_cvtph2ps (loadv8i16 addr:$src))),
7413            (VCVTPH2PSYrm addr:$src)>;
7414
  // Fold a store of element 0 of the 128-bit conversion result, viewed as
  // either f64 or i64, into the memory-destination form.
7415  def : Pat<(store (f64 (extractelt
7416                         (bc_v2f64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))),
7417                         (iPTR 0))), addr:$dst),
7418            (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
7419  def : Pat<(store (i64 (extractelt
7420                         (bc_v2i64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))),
7421                         (iPTR 0))), addr:$dst),
7422            (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
  // 256-bit source: the whole v8i16 result is stored.
7423  def : Pat<(store (v8i16 (X86any_cvtps2ph VR256:$src1, timm:$src2)), addr:$dst),
7424            (VCVTPS2PHYmr addr:$dst, VR256:$src1, timm:$src2)>;
7425}
7426
7427//===----------------------------------------------------------------------===//
7428// AVX2 Instructions
7429//===----------------------------------------------------------------------===//
7430
7431/// AVX2_blend_rmi - AVX2 blend with 8-bit immediate
/// Defines rri (register/register) and rmi (register/folded load) forms for
/// register class RC and vector type OpVT, plus an anonymous pattern that
/// handles a load appearing as the first blend source by swapping the
/// operands and rewriting the immediate with commuteXForm.
7432multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
7433                          ValueType OpVT, X86FoldableSchedWrite sched,
7434                          RegisterClass RC,
7435                          X86MemOperand x86memop, SDNodeXForm commuteXForm> {
7436  let isCommutable = 1 in
7437  def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
7438        (ins RC:$src1, RC:$src2, u8imm:$src3),
7439        !strconcat(OpcodeStr,
7440            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
7441        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
7442        Sched<[sched]>, VEX_4V;
7443  def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
7444        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
7445        !strconcat(OpcodeStr,
7446            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
7447        [(set RC:$dst,
7448          (OpVT (OpNode RC:$src1, (load addr:$src2), timm:$src3)))]>,
7449        Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V;
7450
7451  // Pattern to commute if load is in first source.
7452  def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, timm:$src3)),
7453            (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
7454                                            (commuteXForm timm:$src3))>;
7455}
7456
// VPBLENDD instantiations, plus patterns that select X86Blendi on 64-bit
// element vectors as VPBLENDD after transforming the immediate
// (BlendScaleImm*). The *Commute* transforms are used when the load is the
// first blend source and the operands therefore have to be swapped.
7457let Predicates = [HasAVX2] in {
7458defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
7459                               SchedWriteBlend.XMM, VR128, i128mem,
7460                               BlendCommuteImm4>;
7461defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
7462                                SchedWriteBlend.YMM, VR256, i256mem,
7463                                BlendCommuteImm8>, VEX_L;
7464
// v4i64 blends via the 256-bit VPBLENDD.
7465def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
7466          (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 timm:$src3))>;
7467def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
7468          (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
7469def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
7470          (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
7471
// v2i64 blends via the 128-bit VPBLENDD.
7472def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
7473          (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 timm:$src3))>;
7474def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
7475          (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 timm:$src3))>;
7476def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
7477          (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 timm:$src3))>;
7478}
7479
7480// For insertion into the zero index (low half) of a 256-bit vector, it is
7481// more efficient to generate a blend with immediate instead of an insert*128.
7482// NOTE: We're using FP instructions here, but execution domain fixing should
7483// take care of using integer instructions when profitable.
// Every pattern widens the 128-bit value to 256 bits with INSERT_SUBREG over
// an IMPLICIT_DEF, then blends it with the 256-bit vector using VBLENDPS,
// regardless of the integer element type.
7484let Predicates = [HasAVX] in {
// Register destination vector: the widened value is the second blend source,
// immediate 0xf.
7485def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
7486          (VBLENDPSYrri VR256:$src1,
7487                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7488                                       VR128:$src2, sub_xmm), 0xf)>;
7489def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
7490          (VBLENDPSYrri VR256:$src1,
7491                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7492                                       VR128:$src2, sub_xmm), 0xf)>;
7493def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
7494          (VBLENDPSYrri VR256:$src1,
7495                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7496                                       VR128:$src2, sub_xmm), 0xf)>;
7497def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
7498          (VBLENDPSYrri VR256:$src1,
7499                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7500                                       VR128:$src2, sub_xmm), 0xf)>;
7501
// Loaded destination vector: the load is folded as the second blend source,
// so the widened value becomes the first source and the immediate is 0xf0
// (presumably the complement of the 0xf used above because the operand
// roles are swapped).
7502def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1), (iPTR 0)),
7503          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7504                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7505def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0)),
7506          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7507                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7508def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)),
7509          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7510                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7511def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)),
7512          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7513                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
7514}
7515
7516//===----------------------------------------------------------------------===//
7517// VPBROADCAST - Load from memory and broadcast to all elements of the
7518//               destination operand
7519//
// avx2_broadcast - Defines one VPBROADCAST* instruction family:
//   rr / rm   : 128-bit destination, from a VR128 register or from memory.
//   Yrr / Yrm : 256-bit destination; the register source is still VR128.
// bcast_frag is the broadcast-load fragment matched by the memory forms.
// OpVT128 / OpVT256 are the 128/256-bit vector types, and `prd` is an extra
// predicate combined with HasAVX2.
7520multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
7521                          X86MemOperand x86memop, PatFrag bcast_frag,
7522                          ValueType OpVT128, ValueType OpVT256, Predicate prd> {
7523  let Predicates = [HasAVX2, prd] in {
7524    def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
7525                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7526                  [(set VR128:$dst,
7527                   (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
7528                  Sched<[SchedWriteShuffle.XMM]>, VEX;
7529    def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
7530                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7531                  [(set VR128:$dst,
7532                   (OpVT128 (bcast_frag addr:$src)))]>,
7533                  Sched<[SchedWriteShuffle.XMM.Folded]>, VEX;
7534    def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
7535                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7536                   [(set VR256:$dst,
7537                    (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>,
7538                   Sched<[WriteShuffle256]>, VEX, VEX_L;
    // NOTE(review): the 256-bit load form reuses the XMM folded shuffle class;
    // confirm this is intentional.
7539    def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
7540                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
7541                   [(set VR256:$dst,
7542                    (OpVT256 (bcast_frag addr:$src)))]>,
7543                   Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L;
7544
7545    // Provide aliases for broadcast from the same register class that
7546    // automatically does the extract.
    // That is, a broadcast whose source is a 256-bit register is selected as
    // Yrr applied to the register's sub_xmm subregister.
7547    def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))),
7548              (!cast<Instruction>(NAME#"Yrr")
7549                  (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>;
7550  }
7551}
7552
// Byte and word broadcasts are guarded by NoVLX_Or_NoBWI, dword and qword
// broadcasts by NoVLX.
7553defm VPBROADCASTB  : avx2_broadcast<0x78, "vpbroadcastb", i8mem, X86VBroadcastld8,
7554                                    v16i8, v32i8, NoVLX_Or_NoBWI>;
7555defm VPBROADCASTW  : avx2_broadcast<0x79, "vpbroadcastw", i16mem, X86VBroadcastld16,
7556                                    v8i16, v16i16, NoVLX_Or_NoBWI>;
7557defm VPBROADCASTD  : avx2_broadcast<0x58, "vpbroadcastd", i32mem, X86VBroadcastld32,
7558                                    v4i32, v8i32, NoVLX>;
7559defm VPBROADCASTQ  : avx2_broadcast<0x59, "vpbroadcastq", i64mem, X86VBroadcastld64,
7560                                    v2i64, v4i64, NoVLX>;
7561
// AVX2 broadcasts from scalar FP registers: the scalar is first copied into
// VR128, then broadcast with the register form.
7562let Predicates = [HasAVX2, NoVLX] in {
7563  // Provide fallback in case the load node that is used in the patterns above
7564  // is used by additional users, which prevents the pattern selection.
7565    def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
7566              (VBROADCASTSSrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
7567    def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
7568              (VBROADCASTSSYrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
7569    def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
7570              (VBROADCASTSDYrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
7571}
7572
// Broadcasts from 8/16-bit GPRs: the narrow register is placed in the low
// subregister of an otherwise undefined i32, moved to a vector register with
// VMOVDI2PDIrr, then broadcast.
7573let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
7574  def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
7575        (VPBROADCASTBrr (VMOVDI2PDIrr
7576                         (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7577                                             GR8:$src, sub_8bit))))>;
7578  def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
7579        (VPBROADCASTBYrr (VMOVDI2PDIrr
7580                          (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7581                                              GR8:$src, sub_8bit))))>;
7582
7583  def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
7584        (VPBROADCASTWrr (VMOVDI2PDIrr
7585                         (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7586                                             GR16:$src, sub_16bit))))>;
7587  def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
7588        (VPBROADCASTWYrr (VMOVDI2PDIrr
7589                          (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
7590                                              GR16:$src, sub_16bit))))>;
7591}
// Broadcasts from 32/64-bit GPRs: move to a vector register, then broadcast.
7592let Predicates = [HasAVX2, NoVLX] in {
7593  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
7594            (VPBROADCASTDrr (VMOVDI2PDIrr GR32:$src))>;
7595  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
7596            (VPBROADCASTDYrr (VMOVDI2PDIrr GR32:$src))>;
7597  def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
7598            (VPBROADCASTQrr (VMOV64toPQIrr GR64:$src))>;
7599  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
7600            (VPBROADCASTQYrr (VMOV64toPQIrr GR64:$src))>;
7601}
7602
7603// AVX1 broadcast patterns
// Under HasAVX1Only, integer broadcast loads are selected as the
// floating-point VBROADCASTSS / VBROADCASTSD memory forms.
7604let Predicates = [HasAVX1Only] in {
7605def : Pat<(v8i32 (X86VBroadcastld32 addr:$src)),
7606          (VBROADCASTSSYrm addr:$src)>;
7607def : Pat<(v4i64 (X86VBroadcastld64 addr:$src)),
7608          (VBROADCASTSDYrm addr:$src)>;
7609def : Pat<(v4i32 (X86VBroadcastld32 addr:$src)),
7610          (VBROADCASTSSrm addr:$src)>;
7611}
7612
7613  // Provide fallback in case the load node that is used in the patterns above
7614  // is used by additional users, which prevents the pattern selection.
// v2f64 broadcasts are selected as VMOVDDUP for scalar-register, load and
// vector-register sources.
7615let Predicates = [HasAVX, NoVLX] in {
7616  // 128bit broadcasts:
7617  def : Pat<(v2f64 (X86VBroadcast f64:$src)),
7618            (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
7619  def : Pat<(v2f64 (X86VBroadcastld64 addr:$src)),
7620            (VMOVDDUPrm addr:$src)>;
7621
7622  def : Pat<(v2f64 (X86VBroadcast v2f64:$src)),
7623            (VMOVDDUPrr VR128:$src)>;
7624}
7625
// AVX1-only register broadcasts. A 128-bit broadcast is built with an
// in-lane shuffle (VPERMILPS / VMOVDDUP / VPSHUFD). A 256-bit broadcast
// places that 128-bit result in the low half via INSERT_SUBREG and inserts
// the same value again at index 1 with VINSERTF128.
7626let Predicates = [HasAVX1Only] in {
  // Floating-point sources.
7627  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
7628            (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>;
7629  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
7630            (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
7631              (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm),
7632              (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>;
7633  def : Pat<(v8f32 (X86VBroadcast v4f32:$src)),
7634            (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
7635              (v4f32 (VPERMILPSri VR128:$src, 0)), sub_xmm),
7636              (v4f32 (VPERMILPSri VR128:$src, 0)), 1)>;
7637  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
7638            (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
7639              (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm),
7640              (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>;
7641  def : Pat<(v4f64 (X86VBroadcast v2f64:$src)),
7642            (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
7643              (v2f64 (VMOVDDUPrr VR128:$src)), sub_xmm),
7644              (v2f64 (VMOVDDUPrr VR128:$src)), 1)>;
7645
  // GPR sources: move to a vector register, then VPSHUFD. Immediate 0 is used
  // for 32-bit elements and 0x44 for 64-bit elements (presumably dword
  // selectors 0,1,0,1 - confirm against the PSHUFD encoding).
7646  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
7647            (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)>;
7648  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
7649            (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
7650              (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), sub_xmm),
7651              (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), 1)>;
7652  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
7653            (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
7654              (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), sub_xmm),
7655              (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), 1)>;
7656
7657  def : Pat<(v2i64 (X86VBroadcast i64:$src)),
7658            (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)>;
  // Integer 64-bit broadcast load reuses the FP VMOVDDUP memory form.
7659  def : Pat<(v2i64 (X86VBroadcastld64 addr:$src)),
7660            (VMOVDDUPrm addr:$src)>;
7661}
7662
7663//===----------------------------------------------------------------------===//
7664// VPERM - Permute instructions
7665//
7666
// avx2_perm - 256-bit variable permute (X86VPermv) with register (Yrr) and
// folded-load (Yrm) second operand. OpVT is the result vector type, Sched the
// foldable scheduling class, and memOp the memory operand kind used by Yrm.
7667multiclass avx2_perm<bits<8> opc, string OpcodeStr,
7668                     ValueType OpVT, X86FoldableSchedWrite Sched,
7669                     X86MemOperand memOp> {
7670  let Predicates = [HasAVX2, NoVLX] in {
7671    def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
7672                     (ins VR256:$src1, VR256:$src2),
7673                     !strconcat(OpcodeStr,
7674                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7675                     [(set VR256:$dst,
7676                       (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
7677                     Sched<[Sched]>, VEX_4V, VEX_L;
7678    def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
7679                     (ins VR256:$src1, memOp:$src2),
7680                     !strconcat(OpcodeStr,
7681                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7682                     [(set VR256:$dst,
7683                       (OpVT (X86VPermv VR256:$src1,
7684                              (load addr:$src2))))]>,
7685                     Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX_4V, VEX_L;
7686  }
7687}
7688
// Integer (opcode 0x36) and single-precision (opcode 0x16) instantiations.
7689defm VPERMD : avx2_perm<0x36, "vpermd", v8i32, WriteVarShuffle256, i256mem>;
7690let ExeDomain = SSEPackedSingle in
7691defm VPERMPS : avx2_perm<0x16, "vpermps", v8f32, WriteFVarShuffle256, f256mem>;
7692
// avx2_perm_imm - 256-bit permute by immediate (X86VPermi) with a register
// (Yri) or folded-load (Ymi) source. mem_frag is the load fragment matched by
// Ymi, OpVT the result vector type, and memOp the memory operand kind.
7693multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
7694                         ValueType OpVT, X86FoldableSchedWrite Sched,
7695                         X86MemOperand memOp> {
7696  let Predicates = [HasAVX2, NoVLX] in {
7697    def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
7698                       (ins VR256:$src1, u8imm:$src2),
7699                       !strconcat(OpcodeStr,
7700                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7701                       [(set VR256:$dst,
7702                         (OpVT (X86VPermi VR256:$src1, (i8 timm:$src2))))]>,
7703                       Sched<[Sched]>, VEX, VEX_L;
    // NOTE(review): Ymi has no register source, yet a ReadAfterFold entry is
    // listed; confirm whether it is needed.
7704    def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
7705                       (ins memOp:$src1, u8imm:$src2),
7706                       !strconcat(OpcodeStr,
7707                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7708                       [(set VR256:$dst,
7709                         (OpVT (X86VPermi (mem_frag addr:$src1),
7710                                (i8 timm:$src2))))]>,
7711                       Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VEX_L;
7712  }
7713}
7714
// 64-bit-element immediate permutes; both instantiations set VEX_W.
7715defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
7716                            WriteShuffle256, i256mem>, VEX_W;
7717let ExeDomain = SSEPackedDouble in
7718defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
7719                             WriteFShuffle256, f256mem>, VEX_W;
7720
7721//===----------------------------------------------------------------------===//
7722// VPERM2I128 - Permute Integer vector Values in 128-bit chunks
7723//
// Register and memory forms of VPERM2I128. Neither carries a selection
// pattern; selection goes through the vperm2x128_lowering instantiations
// below. Only the register form is marked commutable.
7724let isCommutable = 1 in
7725def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
7726          (ins VR256:$src1, VR256:$src2, u8imm:$src3),
7727          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
7728          Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
7729def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
7730          (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
7731          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
7732          Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
7733
// One lowering instantiation per 256-bit integer vector type.
7734let Predicates = [HasAVX2] in {
7735  defm : vperm2x128_lowering<"VPERM2I128", v4i64,  loadv4i64>;
7736  defm : vperm2x128_lowering<"VPERM2I128", v8i32,  loadv8i32>;
7737  defm : vperm2x128_lowering<"VPERM2I128", v16i16, loadv16i16>;
7738  defm : vperm2x128_lowering<"VPERM2I128", v32i8,  loadv32i8>;
7739}
7740
7741//===----------------------------------------------------------------------===//
7742// VINSERTI128 - Insert packed integer values
7743//
// Register and memory forms of VINSERTI128. Both have empty pattern lists, so
// hasSideEffects (and mayLoad for the memory form) are set explicitly.
// Selection goes through the vinsert_lowering instantiations below.
7744let hasSideEffects = 0 in {
7745def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
7746          (ins VR256:$src1, VR128:$src2, u8imm:$src3),
7747          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7748          []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
7749let mayLoad = 1 in
7750def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
7751          (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
7752          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
7753          []>, Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
7754}
7755
// One lowering instantiation per 128 -> 256-bit integer vector type pair;
// "VPERM2I128" is passed as the second instruction-name argument.
7756let Predicates = [HasAVX2, NoVLX] in {
7757  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v2i64, v4i64,  loadv2i64,  loadv4i64>;
7758  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v4i32, v8i32,  loadv4i32,  loadv8i32>;
7759  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8i16, v16i16, loadv8i16,  loadv16i16>;
7760  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v16i8, v32i8,  loadv16i8,  loadv32i8>;
7761}
7762
7763//===----------------------------------------------------------------------===//
7764// VEXTRACTI128 - Extract packed integer values
7765//
// Register-destination and memory-destination forms of VEXTRACTI128. Both
// have empty pattern lists; the store form sets mayStore / hasSideEffects
// explicitly. Selection goes through the vextract_lowering instantiations
// below.
7766def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
7767          (ins VR256:$src1, u8imm:$src2),
7768          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
7769          Sched<[WriteShuffle256]>, VEX, VEX_L;
7770let hasSideEffects = 0, mayStore = 1 in
7771def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
7772          (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
7773          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
7774          Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_L;
7775
// One lowering instantiation per 256 -> 128-bit integer vector type pair.
7776let Predicates = [HasAVX2, NoVLX] in {
7777  defm : vextract_lowering<"VEXTRACTI128", v4i64,  v2i64>;
7778  defm : vextract_lowering<"VEXTRACTI128", v8i32,  v4i32>;
7779  defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
7780  defm : vextract_lowering<"VEXTRACTI128", v32i8,  v16i8>;
7781}
7782
7783//===----------------------------------------------------------------------===//
7784// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
7785//
// avx2_pmovmask - Integer counterpart of avx_movmask_rm with fixed opcodes:
// 0x8c for the loads (rm / Yrm) and 0x8e for the stores (mr / Ymr).
// IntLd128 / IntLd256 / IntSt128 / IntSt256 are the intrinsics each form is
// selected from; schedX / schedY provide the .RM and .MR scheduling classes
// for the XMM and YMM forms.
7786multiclass avx2_pmovmask<string OpcodeStr,
7787                         Intrinsic IntLd128, Intrinsic IntLd256,
7788                         Intrinsic IntSt128, Intrinsic IntSt256,
7789                         X86SchedWriteMaskMove schedX,
7790                         X86SchedWriteMaskMove schedY> {
  // 128-bit load.
7791  def rm  : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
7792             (ins VR128:$src1, i128mem:$src2),
7793             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7794             [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>,
7795             VEX_4V, Sched<[schedX.RM]>;
  // 256-bit load (VEX_L).
7796  def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
7797             (ins VR256:$src1, i256mem:$src2),
7798             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7799             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
7800             VEX_4V, VEX_L, Sched<[schedY.RM]>;
  // 128-bit store.
7801  def mr  : AVX28I<0x8e, MRMDestMem, (outs),
7802             (ins i128mem:$dst, VR128:$src1, VR128:$src2),
7803             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7804             [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>,
7805             VEX_4V, Sched<[schedX.MR]>;
  // 256-bit store (VEX_L).
7806  def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
7807             (ins i256mem:$dst, VR256:$src1, VR256:$src2),
7808             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7809             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
7810             VEX_4V, VEX_L, Sched<[schedY.MR]>;
7811}
7812
// Instantiate the masked-move forms for the "d" and "q" variants.
// VPMASKMOVD uses the *_d intrinsics and the WriteVecMaskMove32/32Y
// scheduling info; VPMASKMOVQ uses the *_q intrinsics, the
// WriteVecMaskMove64/64Y scheduling info, and additionally carries VEX_W.
7813defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
7814                                int_x86_avx2_maskload_d,
7815                                int_x86_avx2_maskload_d_256,
7816                                int_x86_avx2_maskstore_d,
7817                                int_x86_avx2_maskstore_d_256,
7818                                WriteVecMaskMove32, WriteVecMaskMove32Y>;
7819defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
7820                                int_x86_avx2_maskload_q,
7821                                int_x86_avx2_maskload_q_256,
7822                                int_x86_avx2_maskstore_q,
7823                                int_x86_avx2_maskstore_q_256,
7824                                WriteVecMaskMove64, WriteVecMaskMove64Y>, VEX_W;
7825
// maskmov_lowering - selection patterns that map the generic masked_store /
// masked_load nodes onto a masked-move instruction family.
//
// Parameters:
//   InstrStr - instruction name prefix; "mr" is appended for the store form
//              and "rm" for the load form before the !cast<Instruction>.
//   RC       - register class used for both the data and the mask operand.
//   VT       - value type of the data being loaded or stored.
//   MaskVT   - value type of the mask.
//
// Only masked loads whose pass-through operand is undef or all-zeros are
// matched here. Note the operand order of the selected instruction: the
// store takes (ptr, mask, src) and the load takes (mask, ptr).
7826multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
7827                          ValueType MaskVT> {
7828    // masked store
7829    def: Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)),
7830             (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
7831    // masked load
    // Pass-through operand is undef.
7832    def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)),
7833             (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
    // Pass-through operand is the all-zeros vector.
7834    def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask),
7835                              (VT immAllZerosV))),
7836             (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
7837}
// Floating-point data types: use VMASKMOVPS/VMASKMOVPD (and their Y forms)
// with an integer mask type of matching element width, whenever HasAVX holds.
7838let Predicates = [HasAVX] in {
7839  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32>;
7840  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64>;
7841  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32>;
7842  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64>;
7843}
7844let Predicates = [HasAVX1Only] in {
7845  // Masked load/store of i32/i64 vectors is not supported here; use the ps/pd versions.
7846  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32>;
7847  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64>;
7848  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32>;
7849  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64>;
7850}
// Integer data types under HasAVX2: use the VPMASKMOVD/VPMASKMOVQ forms.
7851let Predicates = [HasAVX2] in {
7852  defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32>;
7853  defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64>;
7854  defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32>;
7855  defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64>;
7856}
7857
7858//===----------------------------------------------------------------------===//
7859// Variable Bit Shifts
7860//
// avx2_var_shift - defines the four forms of an AVX2 per-element variable
// shift: rr / rm operate on VR128 with type vt128, Yrr / Yrm operate on VR256
// with type vt256 and add VEX_L.
//
// Parameters:
//   opc          - opcode byte shared by all four forms.
//   OpcodeStr    - mnemonic, prepended to the operand string.
//   OpNode       - DAG node matched by the patterns; applied to ($src1, $src2).
//   vt128, vt256 - value types used for the 128-bit and 256-bit patterns.
//
// The rm / Yrm forms match a load of the second operand and use the Folded /
// ReadAfterFold scheduling entries.
7861multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
7862                          ValueType vt128, ValueType vt256> {
  // 128-bit, register second operand.
7863  def rr  : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
7864             (ins VR128:$src1, VR128:$src2),
7865             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7866             [(set VR128:$dst,
7867               (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
7868             VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>;
  // 128-bit, second operand loaded from memory.
7869  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
7870             (ins VR128:$src1, i128mem:$src2),
7871             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7872             [(set VR128:$dst,
7873               (vt128 (OpNode VR128:$src1,
7874                       (vt128 (load addr:$src2)))))]>,
7875             VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded,
7876                            SchedWriteVarVecShift.XMM.ReadAfterFold]>;
  // 256-bit, register second operand.
7877  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
7878             (ins VR256:$src1, VR256:$src2),
7879             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7880             [(set VR256:$dst,
7881               (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
7882             VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>;
  // 256-bit, second operand loaded from memory.
7883  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
7884             (ins VR256:$src1, i256mem:$src2),
7885             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
7886             [(set VR256:$dst,
7887               (vt256 (OpNode VR256:$src1,
7888                       (vt256 (load addr:$src2)))))]>,
7889             VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded,
7890                                   SchedWriteVarVecShift.YMM.ReadAfterFold]>;
7891}
7892
// Instantiate the variable shifts under HasAVX2 and NoVLX.
//   opcode 0x47 / X86vshlv : VPSLLVD, VPSLLVQ
//   opcode 0x45 / X86vsrlv : VPSRLVD, VPSRLVQ
//   opcode 0x46 / X86vsrav : VPSRAVD (only a D form is instantiated here)
// The Q forms share the D form's opcode and additionally carry VEX_W.
7893let Predicates = [HasAVX2, NoVLX] in {
7894  defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", X86vshlv, v4i32, v8i32>;
7895  defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", X86vshlv, v2i64, v4i64>, VEX_W;
7896  defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", X86vsrlv, v4i32, v8i32>;
7897  defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", X86vsrlv, v2i64, v4i64>, VEX_W;
7898  defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", X86vsrav, v4i32, v8i32>;
7899}
7900
7901//===----------------------------------------------------------------------===//
7902// VGATHER - GATHER Operations
7903
7904// FIXME: Improve scheduling of gather instructions.
// avx2_gather - defines the two forms (rm and Yrm) of an AVX2 gather.
//
// Parameters:
//   opc       - opcode byte shared by both forms.
//   OpcodeStr - mnemonic, prepended to the operand string.
//   RC256     - register class used for $dst, $src1 and the mask in the Yrm
//               form. It is a parameter because some instantiations pass
//               VR128 here. The rm form always uses VR128.
//   memop128, memop256 - memory operand kinds for the rm and Yrm forms.
//
// Each def has two outputs ($dst and $mask_wb) and three inputs ($src1,
// $src2, $mask); the asm string prints only $mask, $src2 and $dst. The
// pattern lists are empty, so no selection pattern is attached here.
7905multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
7906                       X86MemOperand memop128, X86MemOperand memop256> {
7907let mayLoad = 1, hasSideEffects = 0 in {
7908  def rm  : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
7909            (ins VR128:$src1, memop128:$src2, VR128:$mask),
7910            !strconcat(OpcodeStr,
7911              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
7912            []>, VEX, Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
  // Same as rm but with RC256 / memop256 and VEX_L.
7913  def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
7914            (ins RC256:$src1, memop256:$src2, RC256:$mask),
7915            !strconcat(OpcodeStr,
7916              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
7917            []>, VEX, VEX_L, Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
7918}
7919}
7920
// Instantiate the AVX2 gathers under HasAVX2.
//
// The Constraints string applies to every def below: $dst and $mask_wb are
// marked early-clobber, $src1 is tied to $dst, and $mask is tied to $mask_wb.
// mayLoad / hasSideEffects are set here as well as inside avx2_gather.
//
// Integer gathers use opcodes 0x90 / 0x91 and floating-point gathers use
// 0x92 / 0x93. The DQ, QQ, DPD and QPD variants carry VEX_W. The QD and QPS
// variants pass VR128 as RC256, so their Yrm forms use 128-bit registers.
7921let Predicates = [HasAVX2] in {
7922  let mayLoad = 1, hasSideEffects = 0, Constraints
7923    = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
7924    in {
7925    defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq",
7926                                  VR256, vx128mem, vx256mem>, VEX_W;
7927    defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq",
7928                                  VR256, vx128mem, vy256mem>, VEX_W;
7929    defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd",
7930                                  VR256, vx128mem, vy256mem>;
7931    defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd",
7932                                  VR128, vx64mem, vy128mem>;
7933
    // Double-precision gathers are placed in the SSEPackedDouble domain.
7934    let ExeDomain = SSEPackedDouble in {
7935      defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd",
7936                                    VR256, vx128mem, vx256mem>, VEX_W;
7937      defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd",
7938                                    VR256, vx128mem, vy256mem>, VEX_W;
7939    }
7940
    // Single-precision gathers are placed in the SSEPackedSingle domain.
7941    let ExeDomain = SSEPackedSingle in {
7942      defm VGATHERDPS : avx2_gather<0x92, "vgatherdps",
7943                                    VR256, vx128mem, vy256mem>;
7944      defm VGATHERQPS : avx2_gather<0x93, "vgatherqps",
7945                                    VR128, vx64mem, vy128mem>;
7946    }
7947  }
7948}
7949
7950//===----------------------------------------------------------------------===//
7951// GFNI instructions
7952//===----------------------------------------------------------------------===//
7953
// GF2P8MULB_rm - defines the rr and rm forms of GF2P8MULB (opcode 0xCF).
//
// Parameters:
//   OpcodeStr - mnemonic, prepended to the operand string.
//   OpVT      - value type of the result pattern.
//   RC        - register class for $dst, $src1 and (in rr) $src2.
//   MemOpFrag - pattern fragment used to match the memory operand in rm.
//   X86MemOp  - memory operand kind for $src2 in rm.
//   Is2Addr   - when set, the asm string omits $src1 (two-operand syntax);
//               otherwise the three-operand syntax is used. Defaults to 0.
//
// Both defs pass "" as their asm string; the enclosing `let AsmString`
// supplies the real one. Only rr is marked commutable.
// NOTE(review): both forms use the .XMM scheduling entries regardless of RC,
// including when the multiclass is instantiated with VR256 -- confirm this is
// intended.
7954multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
7955                        RegisterClass RC, PatFrag MemOpFrag,
7956                        X86MemOperand X86MemOp, bit Is2Addr = 0> {
7957  let ExeDomain = SSEPackedInt,
7958      AsmString = !if(Is2Addr,
7959        OpcodeStr#"\t{$src2, $dst|$dst, $src2}",
7960        OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
7961    let isCommutable = 1 in
7962    def rr : PDI<0xCF, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
7963                 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
7964             Sched<[SchedWriteVecALU.XMM]>, T8PD;
7965
7966    def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
7967                 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1,
7968                                 (MemOpFrag addr:$src2))))]>,
7969             Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>, T8PD;
7970  }
7971}
7972
// GF2P8AFFINE_rmi - defines the rri and rmi forms of a GF2P8AFFINE-style
// instruction that takes two vector operands and an 8-bit immediate.
//
// Parameters:
//   Op        - opcode byte shared by both forms.
//   OpStr     - mnemonic, prepended to the operand string.
//   OpVT      - value type of the result pattern.
//   OpNode    - DAG node matched; applied to ($src1, $src2, timm:$src3).
//   RC        - register class for $dst, $src1 and (in rri) $src2.
//   MemOpFrag - pattern fragment used to match the memory operand in rmi.
//   X86MemOp  - memory operand kind for $src2 in rmi.
//   Is2Addr   - when set, the asm string omits $src1; otherwise all of
//               $src1, $src2 and $src3 are printed. Defaults to 0.
//
// Both defs pass "" as their asm string; the enclosing `let AsmString`
// supplies the real one.
7973multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
7974                           SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag,
7975                           X86MemOperand X86MemOp, bit Is2Addr = 0> {
7976  let AsmString = !if(Is2Addr,
7977      OpStr#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
7978      OpStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in {
7979  def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst),
7980              (ins RC:$src1, RC:$src2, u8imm:$src3), "",
7981              [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))],
7982              SSEPackedInt>, Sched<[SchedWriteVecALU.XMM]>;
7983  def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst),
7984              (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
7985              [(set RC:$dst, (OpVT (OpNode RC:$src1,
7986                                    (MemOpFrag addr:$src2),
7987                              timm:$src3)))], SSEPackedInt>,
7988              Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>;
7989  }
7990}
7991
// GF2P8AFFINE_common - instantiates GF2P8AFFINE_rmi three times for one
// opcode / node pair:
//   NAME      - legacy SSE form on VR128, two-address ($src1 tied to $dst),
//               guarded by HasGFNI and UseSSE2.
//   V#NAME    - VEX form on VR128, three-operand.
//   V#NAME#Y  - VEX form on VR256, three-operand, with VEX_L.
// The VEX forms are guarded by HasGFNI, HasAVX and NoVLX_Or_NoBWI and carry
// VEX_4V and VEX_W.
//
// Parameters:
//   Op     - opcode byte.
//   OpStr  - mnemonic of the legacy form; "v" is prepended for the VEX forms.
//   OpNode - DAG node matched by the patterns.
//
// The legacy SSE form uses `memop` rather than plain `load` for its memory
// fragment, matching the legacy SSE GF2P8MULB instantiation in this file:
// a non-VEX 128-bit memory operand must not be folded from an unaligned load.
// The VEX forms have no such restriction and keep `load`.
7992multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
7993  let Constraints = "$src1 = $dst",
7994      Predicates  = [HasGFNI, UseSSE2] in
7995  defm NAME         : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
7996                                      VR128, memop, i128mem, 1>;
7997  let Predicates  = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
7998    defm V#NAME    : GF2P8AFFINE_rmi<Op, "v"#OpStr, v16i8, OpNode, VR128,
7999                                      load, i128mem>, VEX_4V, VEX_W;
8000    defm V#NAME#Y : GF2P8AFFINE_rmi<Op, "v"#OpStr, v32i8, OpNode, VR256,
8001                                      load, i256mem>, VEX_4V, VEX_L, VEX_W;
8002  }
8003}
8004
8005// GF2P8MULB
// Legacy SSE form: two-address ($src1 tied to $dst, Is2Addr = 1), VR128, with
// the `memop` fragment for the memory operand. Guarded by HasGFNI and UseSSE2.
8006let Constraints = "$src1 = $dst",
8007    Predicates  = [HasGFNI, UseSSE2] in
8008defm GF2P8MULB      : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop,
8009                                    i128mem, 1>;
// VEX forms: three-operand (Is2Addr left at its default of 0), using the
// plain `load` fragment. The Y form uses VR256 / v32i8 and adds VEX_L.
8010let Predicates  = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
8011  defm VGF2P8MULB   : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load,
8012                                   i128mem>, VEX_4V;
8013  defm VGF2P8MULBY  : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load,
8014                                   i256mem>, VEX_4V, VEX_L;
8015}
8016// GF2P8AFFINEINVQB, GF2P8AFFINEQB
// Each defm expands GF2P8AFFINE_common, producing the legacy form under the
// given name plus "V"-prefixed VEX forms. isCommutable is explicitly cleared
// for all of them, and each instantiation carries TAPD.
// GF2P8AFFINEINVQB uses opcode 0xCF; GF2P8AFFINEQB uses opcode 0xCE.
8017let isCommutable = 0 in {
8018  defm GF2P8AFFINEINVQB : GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb",
8019                                             X86GF2P8affineinvqb>, TAPD;
8020  defm GF2P8AFFINEQB    : GF2P8AFFINE_common<0xCE, "gf2p8affineqb",
8021                                             X86GF2P8affineqb>, TAPD;
8022}
8023
8024