//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file describes the X86 SSE instruction set, defining the instructions
// and the properties of those instructions that are needed for code
// generation, machine code emission, and analysis.
//
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// SSE 1 & 2 Instruction Classes
//===----------------------------------------------------------------------===//

/// sse12_fp_scalar - SSE 1 & 2 scalar instruction class
multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, X86MemOperand x86memop,
                           Domain d, X86FoldableSchedWrite sched,
                           bit Is2Addr = 1> {
let isCodeGenOnly = 1 in {
  let isCommutable = 1 in {
    def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
                !if(Is2Addr,
                    !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                    !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], d>,
                Sched<[sched]>;
  }
  def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>,
              Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}

/// sse12_fp_scalar_int - SSE 1 & 2 scalar instruction intrinsics class
multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
                               SDPatternOperator OpNode, RegisterClass RC,
                               ValueType VT, string asm, Operand memopr,
                               PatFrags mem_frags, Domain d,
                               X86FoldableSchedWrite sched, bit Is2Addr = 1> {
let hasSideEffects = 0 in {
  def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
                      !if(Is2Addr,
                          !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                          !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                      [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], d>,
                      Sched<[sched]>;
  let mayLoad = 1 in
  def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
                      !if(Is2Addr,
                          !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                          !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                      [(set RC:$dst, (VT (OpNode RC:$src1, (mem_frags addr:$src2))))], d>,
                      Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}

/// sse12_fp_packed - SSE 1 & 2 packed instruction class
multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, ValueType vt,
                           X86MemOperand x86memop, PatFrag mem_frag,
                           Domain d, X86FoldableSchedWrite sched,
                           bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>,
              Sched<[sched]>;
  let mayLoad = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
              d>,
              Sched<[sched.Folded, sched.ReadAfterFold]>;
}

/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed logical instruction class
multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
                                      string OpcodeStr, X86MemOperand x86memop,
                                      X86FoldableSchedWrite sched,
                                      list<dag> pat_rr, list<dag> pat_rm,
                                      bit Is2Addr = 1> {
  let isCommutable = 1, hasSideEffects = 0 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              pat_rr, d>,
              Sched<[sched]>;
  let hasSideEffects = 0, mayLoad = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              pat_rm, d>,
              Sched<[sched.Folded, sched.ReadAfterFold]>;
}


// Alias instructions that map fld0 to xorps for SSE or vxorps for AVX.
// These are expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
  def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
                   [(set FR64:$dst, fp64imm0)]>, Requires<[HasSSE2, NoAVX512]>;
  def FsFLD0F128 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                     [(set VR128:$dst, fp128imm0)]>, Requires<[HasSSE1, NoAVX512]>;
}

//===----------------------------------------------------------------------===//
// AVX & SSE - Zero/One Vectors
//===----------------------------------------------------------------------===//

// Alias instruction that maps a zero vector to pxor / xorp* for SSE.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
// swizzled to pxor by ExecutionDomainFix.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
               [(set VR128:$dst, (v4f32 immAllZerosV))]>;
}

let Predicates = [NoAVX512] in {
def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
}


// The same as done above but for AVX.
// The 256-bit AVX1 ISA doesn't support PI, and doesn't need it because on
// Sandy Bridge the register is set to zero at the rename stage without using
// any execution unit, so SET0PSY and SET0PDY can be used for vector int
// instructions without penalty.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                 [(set VR256:$dst, (v8i32 immAllZerosV))]>;
}

let Predicates = [NoAVX512] in {
def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
def : Pat<(v8f32 immAllZerosV), (AVX_SET0)>;
def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
}

// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-ones value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
  let Predicates = [HasAVX1Only, OptForMinSize] in {
  def AVX1_SETALLONES: I<0, Pseudo, (outs VR256:$dst), (ins), "",
                         [(set VR256:$dst, (v8i32 immAllOnesV))]>;
  }
  let Predicates = [HasAVX2] in
  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move FP Scalar Instructions
//
// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
// register copies because it's a partial register update; register-to-register
// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
// that the insert be implementable in terms of a copy, and, as just mentioned,
// we don't use movss/movsd for copies.
188//===----------------------------------------------------------------------===// 189 190multiclass sse12_move_rr<SDNode OpNode, ValueType vt, 191 X86MemOperand x86memop, string base_opc, 192 string asm_opr, Domain d, string Name> { 193 let isCommutable = 1 in 194 def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst), 195 (ins VR128:$src1, VR128:$src2), 196 !strconcat(base_opc, asm_opr), 197 [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], d>, 198 Sched<[SchedWriteFShuffle.XMM]>; 199 200 // For the disassembler 201 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in 202 def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst), 203 (ins VR128:$src1, VR128:$src2), 204 !strconcat(base_opc, asm_opr), []>, 205 Sched<[SchedWriteFShuffle.XMM]>, FoldGenData<Name#rr>; 206} 207 208multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt, 209 X86MemOperand x86memop, string OpcodeStr, 210 Domain d, string Name, Predicate pred> { 211 // AVX 212 let Predicates = [UseAVX, OptForSize] in 213 defm V#NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr, 214 "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d, 215 "V"#Name>, 216 VEX_4V, VEX_LIG, VEX_WIG; 217 218 def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), 219 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 220 [(store RC:$src, addr:$dst)], d>, 221 VEX, VEX_LIG, Sched<[WriteFStore]>, VEX_WIG; 222 // SSE1 & 2 223 let Constraints = "$src1 = $dst" in { 224 let Predicates = [pred, NoSSE41_Or_OptForSize] in 225 defm NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr, 226 "\t{$src2, $dst|$dst, $src2}", d, Name>; 227 } 228 229 def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), 230 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 231 [(store RC:$src, addr:$dst)], d>, 232 Sched<[WriteFStore]>; 233 234 def : InstAlias<"v"#OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}", 235 (!cast<Instruction>("V"#NAME#"rr_REV") 236 VR128:$dst, VR128:$src1, VR128:$src2), 0>; 237 def : InstAlias<OpcodeStr#".s\t{$src2, $dst|$dst, $src2}", 238 (!cast<Instruction>(NAME#"rr_REV") 239 VR128:$dst, VR128:$src2), 0>; 240} 241 242// Loading from memory automatically zeroing upper bits. 243multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop, 244 PatFrag mem_pat, PatFrag vzloadfrag, string OpcodeStr, 245 Domain d> { 246 def V#NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), 247 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 248 [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>, 249 VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG; 250 def NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), 251 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 252 [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>, 253 Sched<[WriteFLoad]>; 254 255 // _alt version uses FR32/FR64 register class. 
  let isCodeGenOnly = 1 in {
  def V#NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                         [(set RC:$dst, (mem_pat addr:$src))], d>,
                         VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
  def NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                       !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                       [(set RC:$dst, (mem_pat addr:$src))], d>,
                       Sched<[WriteFLoad]>;
  }
}

defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
                        SSEPackedSingle, "MOVSS", UseSSE1>, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
                        SSEPackedDouble, "MOVSD", UseSSE2>, XD;

let canFoldAsLoad = 1, isReMaterializable = 1 in {
  defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss",
                             SSEPackedSingle>, XS;
  defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, X86vzload64, "movsd",
                             SSEPackedDouble>, XD;
}

// Patterns
let Predicates = [UseAVX] in {
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (VMOVSSrm addr:$src)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (VMOVSDrm addr:$src)>;

  // Represent the same patterns as above, but in the form they appear in for
  // 256-bit types.
  def : Pat<(v8f32 (X86vzload32 addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
  def : Pat<(v4f64 (X86vzload64 addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
}

let Predicates = [UseAVX, OptForSize] in {
  // Move scalar to XMM zero-extended: zero a VR128, then do a MOVSS to the
  // lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VMOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VMOVSSrr (v4i32 (V_SET0)), VR128:$src)>;

  // Move low f32 and clear high bits.
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4f32 (VMOVSSrr (v4f32 (V_SET0)),
              (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))), sub_xmm)>;
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4i32 (VMOVSSrr (v4i32 (V_SET0)),
              (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
}

let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
// Move scalar to XMM zero-extended: zero a VR128, then do a MOVSS to the
// lower bits.
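// (X86vzmovl keeps element 0 of its operand and zeroes the remaining
// elements, hence the freshly zeroed V_SET0 register as the merge source.)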
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
          (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
          (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
}

let Predicates = [UseSSE2] in
def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
          (MOVSDrm addr:$src)>;

let Predicates = [UseSSE1] in
def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
          (MOVSSrm addr:$src)>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
                            X86MemOperand x86memop, PatFrag ld_frag,
                            string asm, Domain d,
                            X86SchedWriteMoveLS sched> {
let hasSideEffects = 0, isMoveReg = 1 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
              Sched<[sched.RR]>;
let canFoldAsLoad = 1, isReMaterializable = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set RC:$dst, (ld_frag addr:$src))], d>,
              Sched<[sched.RM]>;
}

let Predicates = [HasAVX, NoVLX] in {
defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                                PS, VEX, VEX_WIG;
defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                                PD, VEX, VEX_WIG;
defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                                PS, VEX, VEX_WIG;
defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                                PD, VEX, VEX_WIG;

defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps",
                                 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
                                 PS, VEX, VEX_L, VEX_WIG;
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd",
                                 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
                                 PD, VEX, VEX_L, VEX_WIG;
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups",
                                 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
                                 PS, VEX, VEX_L, VEX_WIG;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
                                 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
                                 PD, VEX, VEX_L, VEX_WIG;
}

let Predicates = [UseSSE1] in {
defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
                               SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                               PS;
defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
                               SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                               PS;
}
let Predicates = [UseSSE2] in {
defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
                               SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                               PD;
defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
                               SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                               PD;
}

let Predicates = [HasAVX, NoVLX] in {
let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movaps\t{$src, $dst|$dst, $src}",
                     [(alignedstore (v4f32 VR128:$src), addr:$dst)]>,
                     VEX, VEX_WIG;
def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movapd\t{$src, $dst|$dst, $src}",
                     [(alignedstore (v2f64 VR128:$src), addr:$dst)]>,
                     VEX, VEX_WIG;
def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movups\t{$src, $dst|$dst, $src}",
                     [(store (v4f32 VR128:$src), addr:$dst)]>,
                     VEX, VEX_WIG;
def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movupd\t{$src, $dst|$dst, $src}",
                     [(store (v2f64 VR128:$src), addr:$dst)]>,
                     VEX, VEX_WIG;
} // SchedRW

let SchedRW = [SchedWriteFMoveLS.YMM.MR] in {
def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                      "movaps\t{$src, $dst|$dst, $src}",
                      [(alignedstore (v8f32 VR256:$src), addr:$dst)]>,
                      VEX, VEX_L, VEX_WIG;
def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                      "movapd\t{$src, $dst|$dst, $src}",
                      [(alignedstore (v4f64 VR256:$src), addr:$dst)]>,
                      VEX, VEX_L, VEX_WIG;
def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                      "movups\t{$src, $dst|$dst, $src}",
                      [(store (v8f32 VR256:$src), addr:$dst)]>,
                      VEX, VEX_L, VEX_WIG;
def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                      "movupd\t{$src, $dst|$dst, $src}",
                      [(store (v4f64 VR256:$src), addr:$dst)]>,
                      VEX, VEX_L, VEX_WIG;
} // SchedRW
} // Predicate

// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    isMoveReg = 1 in {
let SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
  def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movaps\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVAPSrr">;
  def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movapd\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVAPDrr">;
  def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movups\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVUPSrr">;
  def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movupd\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVUPDrr">;
} // SchedRW

let SchedRW = [SchedWriteFMoveLS.YMM.RR] in {
  def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movaps\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPSYrr">;
  def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movapd\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPDYrr">;
  def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movups\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPSYrr">;
  def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movupd\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPDYrr">;
} // SchedRW
} // Predicate

// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                (VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                (VMOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                (VMOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                (VMOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                (VMOVAPSYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                (VMOVAPDYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                (VMOVUPSYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                (VMOVUPDYrr_REV VR256:$dst, VR256:$src), 0>;

let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)]>;
def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)]>;
} // SchedRW

// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movaps\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVAPSrr">;
  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movapd\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVAPDrr">;
  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movups\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVUPSrr">;
  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movupd\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVUPDrr">;
}

// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"movaps.s\t{$src, $dst|$dst, $src}",
                (MOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movapd.s\t{$src, $dst|$dst, $src}",
                (MOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movups.s\t{$src, $dst|$dst, $src}",
                (MOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movupd.s\t{$src, $dst|$dst, $src}",
                (MOVUPDrr_REV VR128:$dst, VR128:$src), 0>;

let Predicates = [HasAVX, NoVLX] in {
  // 256-bit loads/stores need to use floating-point load/store in case we
  // don't have AVX2. Execution domain fixing will convert to integer if AVX2
  // is available and changing the domain is beneficial.
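  // For example, (loadv4i64 addr) below selects VMOVUPSYrm; the domain-fix
  // pass may later turn that into vmovdqu when the integer domain is
  // preferable.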
  def : Pat<(alignedloadv4i64 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv8i32 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv16i16 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv32i8 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(loadv4i64 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv8i32 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv16i16 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv32i8 addr:$src),
            (VMOVUPSYrm addr:$src)>;

  def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v16i16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v32i8 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
}

// Use movaps / movups for SSE integer load / store (one byte shorter).
// The instructions selected below are then converted to MOVDQA/MOVDQU
// during the SSE domain pass.
let Predicates = [UseSSE1] in {
  def : Pat<(alignedloadv2i64 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv4i32 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv8i16 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv16i8 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv4i32 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv8i16 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv16i8 addr:$src),
            (MOVUPSrm addr:$src)>;

  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low packed FP Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_mov_hilo_packed_base<bits<8> opc, SDNode pdnode,
                                      string base_opc, string asm_opr> {
  // No patterns, as they need to be special-cased between high and low.
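  // The multiclass appends "s"/"d" to base_opc, so e.g. base_opc "movlp"
  // produces movlps (packed-single domain) and movlpd (packed-double).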
  let hasSideEffects = 0, mayLoad = 1 in
  def PSrm : PI<opc, MRMSrcMem,
                (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                !strconcat(base_opc, "s", asm_opr),
                [], SSEPackedSingle>, PS,
                Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;

  def PDrm : PI<opc, MRMSrcMem,
                (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                !strconcat(base_opc, "d", asm_opr),
                [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
                                  (scalar_to_vector (loadf64 addr:$src2)))))],
                SSEPackedDouble>, PD,
                Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
}

multiclass sse12_mov_hilo_packed<bits<8> opc, SDPatternOperator pdnode,
                                 string base_opc> {
  let Predicates = [UseAVX] in
  defm V#NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
                                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
                                           VEX_4V, VEX_WIG;

  let Constraints = "$src1 = $dst" in
  defm NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
                                         "\t{$src2, $dst|$dst, $src2}">;
}

defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;

let SchedRW = [WriteFStore] in {
let Predicates = [UseAVX] in {
let mayStore = 1, hasSideEffects = 0 in
def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movlps\t{$src, $dst|$dst, $src}",
                     []>,
                     VEX, VEX_WIG;
def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movlpd\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt (v2f64 VR128:$src),
                                   (iPTR 0))), addr:$dst)]>,
                     VEX, VEX_WIG;
} // UseAVX
let mayStore = 1, hasSideEffects = 0 in
def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlps\t{$src, $dst|$dst, $src}",
                   []>;
def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt (v2f64 VR128:$src),
                                 (iPTR 0))), addr:$dst)]>;
} // SchedRW

let Predicates = [UseSSE1] in {
  // This pattern helps select MOVLPS on SSE1-only targets. With SSE2 we'll
  // end up with a movsd or blend instead of shufp.
  // No need for an aligned load, we're only loading 64 bits.
  def : Pat<(X86Shufp (v4f32 (simple_load addr:$src2)), VR128:$src1,
                      (i8 -28)),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)), VR128:$src1, (i8 -28)),
            (MOVLPSrm VR128:$src1, addr:$src2)>;

  def : Pat<(v4f32 (X86vzload64 addr:$src)),
            (MOVLPSrm (v4f32 (V_SET0)), addr:$src)>;
  def : Pat<(X86vextractstore64 (v4f32 VR128:$src), addr:$dst),
            (MOVLPSmr addr:$dst, VR128:$src)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Hi packed FP Instructions
//===----------------------------------------------------------------------===//

defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">;

let SchedRW = [WriteFStore] in {
// v2f64 extract element 1 is always custom lowered to unpack high to low
// and extract element 0, so the non-store version isn't too horrible.
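// That is, (store (extractelt (v2f64 VR128:$src), 1), addr) arrives here as
// an unpckh of $src with itself followed by an extract of element 0, which
// the MOVHPD store patterns below match directly.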
let Predicates = [UseAVX] in {
let mayStore = 1, hasSideEffects = 0 in
def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movhps\t{$src, $dst|$dst, $src}",
                     []>, VEX, VEX_WIG;
def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movhpd\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt
                                   (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                   (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
} // UseAVX
let mayStore = 1, hasSideEffects = 0 in
def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhps\t{$src, $dst|$dst, $src}",
                   []>;
def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt
                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                 (iPTR 0))), addr:$dst)]>;
} // SchedRW

let Predicates = [UseAVX] in {
  // MOVHPD patterns
  def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;

  def : Pat<(store (f64 (extractelt
                         (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
                         (iPTR 0))), addr:$dst),
            (VMOVHPDmr addr:$dst, VR128:$src)>;

  // MOVLPD patterns
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;
}

let Predicates = [UseSSE1] in {
  // This pattern helps select MOVHPS on SSE1-only targets. With SSE2 we'll
  // end up with a movsd or blend instead of shufp.
  // No need for an aligned load, we're only loading 64 bits.
  def : Pat<(X86Movlhps VR128:$src1, (v4f32 (simple_load addr:$src2))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Movlhps VR128:$src1, (v4f32 (X86vzload64 addr:$src2))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;

  def : Pat<(X86vextractstore64 (v4f32 (X86Movhlps VR128:$src, VR128:$src)),
                                addr:$dst),
            (MOVHPSmr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE2] in {
  // MOVHPD patterns
  def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;

  def : Pat<(store (f64 (extractelt
                         (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
                         (iPTR 0))), addr:$dst),
            (MOVHPDmr addr:$dst, VR128:$src)>;

  // MOVLPD patterns
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
}

let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in {
  // Use MOVLPD to load into the low bits from a full vector unless we can
  // use BLENDPD.
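  // (NoSSE41_Or_OptForSize: SSE4.1 is unavailable, or we're optimizing for
  // size; otherwise a BLENDPD of the loaded vector is preferred.)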
  def : Pat<(X86Movsd VR128:$src1, (v2f64 (simple_load addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
//===----------------------------------------------------------------------===//

let Predicates = [UseAVX] in {
  def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
                        (ins VR128:$src1, VR128:$src2),
                        "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                          (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
                        VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG;
  let isCommutable = 1 in
  def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
                        (ins VR128:$src1, VR128:$src2),
                        "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                          (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
                        VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG,
                        NotMemoryFoldable;
}
let Constraints = "$src1 = $dst" in {
  def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
                      Sched<[SchedWriteFShuffle.XMM]>;
  let isCommutable = 1 in
  def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
                      Sched<[SchedWriteFShuffle.XMM]>, NotMemoryFoldable;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Conversion Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                       SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
                       string asm, string mem, X86FoldableSchedWrite sched,
                       Domain d,
                       SchedRead Int2Fpu = ReadDefault> {
  let ExeDomain = d in {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set DstRC:$dst, (OpNode SrcRC:$src))]>,
              Sched<[sched, Int2Fpu]>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
              mem#"\t{$src, $dst|$dst, $src}",
              [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
              Sched<[sched.Folded]>;
  }
}

multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
                       ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
                       string asm, Domain d, X86FoldableSchedWrite sched> {
let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in {
  def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
             [(set RC:$dst, (DstTy (any_sint_to_fp (SrcTy RC:$src))))], d>,
             Sched<[sched]>;
  let mayLoad = 1 in
  def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
             [(set RC:$dst, (DstTy (any_sint_to_fp
                             (SrcTy (ld_frag addr:$src)))))], d>,
             Sched<[sched.Folded]>;
}
}

multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          X86MemOperand x86memop, string asm, string mem,
                          X86FoldableSchedWrite sched, Domain d> {
let hasSideEffects = 0, Predicates = [UseAVX], ExeDomain = d in {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
              !strconcat(asm, "\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
              Sched<[sched, ReadDefault, ReadInt2Fpu]>;
  let mayLoad = 1 in
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
              (ins DstRC:$src1, x86memop:$src),
              asm#"{"#mem#"}\t{$src, $src1, $dst|$dst, $src1, $src}", []>,
              Sched<[sched.Folded, sched.ReadAfterFold]>;
} // hasSideEffects = 0
}

let isCodeGenOnly = 1, Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
                              "cvttss2si", "cvttss2si",
                              WriteCvtSS2I, SSEPackedSingle>,
                              XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
                                "cvttss2si", "cvttss2si",
                                WriteCvtSS2I, SSEPackedSingle>,
                                XS, VEX, VEX_W, VEX_LIG;
defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
                              "cvttsd2si", "cvttsd2si",
                              WriteCvtSD2I, SSEPackedDouble>,
                              XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
                                "cvttsd2si", "cvttsd2si",
                                WriteCvtSD2I, SSEPackedDouble>,
                                XD, VEX, VEX_W, VEX_LIG;

defm VCVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
                             "cvtss2si", "cvtss2si",
                             WriteCvtSS2I, SSEPackedSingle>,
                             XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
                               "cvtss2si", "cvtss2si",
                               WriteCvtSS2I, SSEPackedSingle>,
                               XS, VEX, VEX_W, VEX_LIG;
defm VCVTSD2SI : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
                             "cvtsd2si", "cvtsd2si",
                             WriteCvtSD2I, SSEPackedDouble>,
                             XD, VEX, VEX_LIG;
defm VCVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
                               "cvtsd2si", "cvtsd2si",
                               WriteCvtSD2I, SSEPackedDouble>,
                               XD, VEX, VEX_W, VEX_LIG;
}

// The assembler can recognize rr 64-bit instructions by seeing an rxx
// register, but the same isn't true when only using memory operands;
// provide other assembly "l" and "q" forms to address this explicitly
// where appropriate.
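// For example, in AT&T syntax "vcvtsi2ssl 4(%rsp), %xmm0, %xmm0" converts a
// 32-bit integer and "vcvtsi2ssq 8(%rsp), %xmm0, %xmm0" a 64-bit one; the
// register forms need no suffix, since the GPR operand implies the width.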
let isCodeGenOnly = 1 in {
defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss", "l",
                                WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V,
                                VEX_LIG, SIMD_EXC;
defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss", "q",
                                  WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V,
                                  VEX_W, VEX_LIG, SIMD_EXC;
defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd", "l",
                                WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V,
                                VEX_LIG;
defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd", "q",
                                  WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V,
                                  VEX_W, VEX_LIG, SIMD_EXC;
} // isCodeGenOnly = 1

let Predicates = [UseAVX] in {
  def : Pat<(f32 (any_sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f32 (any_sint_to_fp (loadi64 addr:$src))),
            (VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (any_sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (any_sint_to_fp (loadi64 addr:$src))),
            (VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;

  def : Pat<(f32 (any_sint_to_fp GR32:$src)),
            (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f32 (any_sint_to_fp GR64:$src)),
            (VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
  def : Pat<(f64 (any_sint_to_fp GR32:$src)),
            (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f64 (any_sint_to_fp GR64:$src)),
            (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;

  def : Pat<(i64 (lrint FR32:$src)), (VCVTSS2SI64rr FR32:$src)>;
  def : Pat<(i64 (lrint (loadf32 addr:$src))), (VCVTSS2SI64rm addr:$src)>;

  def : Pat<(i64 (lrint FR64:$src)), (VCVTSD2SI64rr FR64:$src)>;
  def : Pat<(i64 (lrint (loadf64 addr:$src))), (VCVTSD2SI64rm addr:$src)>;
}

let isCodeGenOnly = 1 in {
defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
                             "cvttss2si", "cvttss2si",
                             WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC;
defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
                               "cvttss2si", "cvttss2si",
                               WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC;
defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
                             "cvttsd2si", "cvttsd2si",
                             WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC;
defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
                               "cvttsd2si", "cvttsd2si",
                               WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC;

defm CVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
                            "cvtss2si", "cvtss2si",
                            WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC;
defm CVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
                              "cvtss2si", "cvtss2si",
                              WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC;
defm CVTSD2SI : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
                            "cvtsd2si", "cvtsd2si",
                            WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC;
defm CVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
                              "cvtsd2si", "cvtsd2si",
                              WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC;

defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, any_sint_to_fp, i32mem, loadi32,
                            "cvtsi2ss", "cvtsi2ss{l}",
                            WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, SIMD_EXC;
defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, any_sint_to_fp, i64mem, loadi64,
                              "cvtsi2ss", "cvtsi2ss{q}",
                              WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, REX_W, SIMD_EXC;
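// Note that CVTSI2SD (i32 -> f64, below) carries no SIMD_EXC: every i32
// value is exactly representable in an f64, so the conversion can never
// raise a floating-point exception.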
defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, any_sint_to_fp, i32mem, loadi32,
                            "cvtsi2sd", "cvtsi2sd{l}",
                            WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD;
defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, any_sint_to_fp, i64mem, loadi64,
                              "cvtsi2sd", "cvtsi2sd{q}",
                              WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD, REX_W, SIMD_EXC;
} // isCodeGenOnly = 1

let Predicates = [UseSSE1] in {
  def : Pat<(i64 (lrint FR32:$src)), (CVTSS2SI64rr FR32:$src)>;
  def : Pat<(i64 (lrint (loadf32 addr:$src))), (CVTSS2SI64rm addr:$src)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(i64 (lrint FR64:$src)), (CVTSD2SI64rr FR64:$src)>;
  def : Pat<(i64 (lrint (loadf64 addr:$src))), (CVTSD2SI64rm addr:$src)>;
}

// Conversion instruction intrinsics - Match intrinsics that expect MM
// and/or XMM operand(s).

multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          ValueType DstVT, ValueType SrcVT, SDNode OpNode,
                          Operand memop, PatFrags mem_frags, string asm,
                          X86FoldableSchedWrite sched, Domain d> {
let ExeDomain = d in {
  def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
                  !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                  [(set DstRC:$dst, (DstVT (OpNode (SrcVT SrcRC:$src))))]>,
                  Sched<[sched]>;
  def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
                  !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                  [(set DstRC:$dst, (DstVT (OpNode (SrcVT (mem_frags addr:$src)))))]>,
                  Sched<[sched.Folded]>;
}
}

multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
                                RegisterClass DstRC, X86MemOperand x86memop,
                                string asm, string mem, X86FoldableSchedWrite sched,
                                Domain d, bit Is2Addr = 1> {
let hasSideEffects = 0, ExeDomain = d in {
  def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
                  !if(Is2Addr,
                      !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                      !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                  []>, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
  let mayLoad = 1 in
  def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst),
                  (ins DstRC:$src1, x86memop:$src2),
                  !if(Is2Addr,
                      asm#"{"#mem#"}\t{$src2, $dst|$dst, $src2}",
                      asm#"{"#mem#"}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}

let Uses = [MXCSR], mayRaiseFPException = 1 in {
let Predicates = [UseAVX] in {
defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64,
                                X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
                                WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64,
                                  X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
                                  WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_W, VEX_LIG;
}
defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, X86cvts2si,
                               sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I,
                               SSEPackedDouble>, XD;
defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si,
                                 sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I,
                                 SSEPackedDouble>, XD, REX_W;
}

let Predicates = [UseAVX] in {
defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                   i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle, 0>,
                   XS, VEX_4V, VEX_LIG, SIMD_EXC;
defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                   i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle, 0>,
                   XS,
                   VEX_4V, VEX_LIG, VEX_W, SIMD_EXC;
defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                   i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble, 0>,
                   XD, VEX_4V, VEX_LIG;
defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                   i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble, 0>,
                   XD, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC;
}
let Constraints = "$src1 = $dst" in {
  defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                    i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle>,
                    XS, SIMD_EXC;
  defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                    i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle>,
                    XS, REX_W, SIMD_EXC;
  defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                    i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble>,
                    XD;
  defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                    i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble>,
                    XD, REX_W, SIMD_EXC;
}

def : InstAlias<"vcvtsi2ss{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VCVTSI2SSrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
def : InstAlias<"vcvtsi2ss{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VCVTSI642SSrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
def : InstAlias<"vcvtsi2sd{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VCVTSI2SDrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
def : InstAlias<"vcvtsi2sd{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VCVTSI642SDrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;

def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
                (VCVTSI2SSrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
                (VCVTSI2SDrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;

def : InstAlias<"cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
                (CVTSI2SSrr_Int VR128:$dst, GR32:$src), 0, "att">;
def : InstAlias<"cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
                (CVTSI642SSrr_Int VR128:$dst, GR64:$src), 0, "att">;
def : InstAlias<"cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
                (CVTSI2SDrr_Int VR128:$dst, GR32:$src), 0, "att">;
def : InstAlias<"cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
                (CVTSI642SDrr_Int VR128:$dst, GR64:$src), 0, "att">;

def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
                (CVTSI2SSrm_Int VR128:$dst, i32mem:$src), 0, "att">;
def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
                (CVTSI2SDrm_Int VR128:$dst, i32mem:$src), 0, "att">;

/// SSE 1 Only

// Aliases for intrinsics
let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
                                 ssmem, sse_load_f32, "cvttss2si",
                                 WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
                                   X86cvtts2Int, ssmem, sse_load_f32,
                                   "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
                                   XS, VEX, VEX_LIG, VEX_W;
defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
                                 sdmem, sse_load_f64, "cvttsd2si",
                                 WriteCvtSS2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
                                   X86cvtts2Int, sdmem, sse_load_f64,
                                   "cvttsd2si", WriteCvtSS2I, SSEPackedDouble>,
                                   XD, VEX, VEX_LIG, VEX_W;
}
let Uses = [MXCSR], mayRaiseFPException = 1 in {
defm CVTTSS2SI :
    sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
                   ssmem, sse_load_f32, "cvttss2si",
                   WriteCvtSS2I, SSEPackedSingle>, XS;
defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
                                  X86cvtts2Int, ssmem, sse_load_f32,
                                  "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
                                  XS, REX_W;
defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
                                sdmem, sse_load_f64, "cvttsd2si",
                                WriteCvtSD2I, SSEPackedDouble>, XD;
defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
                                  X86cvtts2Int, sdmem, sse_load_f64,
                                  "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>,
                                  XD, REX_W;
}

def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;

def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;

let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
                                ssmem, sse_load_f32, "cvtss2si",
                                WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_W, VEX_LIG;
}
let Uses = [MXCSR], mayRaiseFPException = 1 in {
defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
                               ssmem, sse_load_f32, "cvtss2si",
                               WriteCvtSS2I, SSEPackedSingle>, XS;
defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
                                 ssmem, sse_load_f32, "cvtss2si",
                                 WriteCvtSS2I, SSEPackedSingle>, XS, REX_W;

defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load,
                             "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                             SSEPackedSingle, WriteCvtI2PS>,
                             PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load,
                              "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                              SSEPackedSingle, WriteCvtI2PSY>,
                              PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;

defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop,
                            "cvtdq2ps\t{$src, $dst|$dst, $src}",
                            SSEPackedSingle, WriteCvtI2PS>,
                            PS, Requires<[UseSSE2]>;
}

// AVX aliases
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;

// SSE aliases
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;

/// SSE 2 Only

// Convert scalar double to scalar single
let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX] in {
def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
                       (ins FR32:$src1, FR64:$src2),
                       "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                       VEX_4V, VEX_LIG, VEX_WIG,
                       Sched<[WriteCvtSD2SS]>, SIMD_EXC;
let mayLoad = 1 in
def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
                    (ins FR32:$src1, f64mem:$src2),
                    "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                    XD, VEX_4V, VEX_LIG, VEX_WIG,
                    Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC;
}

def : Pat<(f32 (any_fpround FR64:$src)),
          (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
      Requires<[UseAVX]>;

let isCodeGenOnly = 1 in {
def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
                     "cvtsd2ss\t{$src, $dst|$dst, $src}",
                     [(set FR32:$dst, (any_fpround FR64:$src))]>,
                     Sched<[WriteCvtSD2SS]>, SIMD_EXC;
def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
                   "cvtsd2ss\t{$src, $dst|$dst, $src}",
                   [(set FR32:$dst, (any_fpround (loadf64 addr:$src)))]>,
                   XD, Requires<[UseSSE2, OptForSize]>,
                   Sched<[WriteCvtSD2SS.Folded]>, SIMD_EXC;
}

let Uses = [MXCSR], mayRaiseFPException = 1 in {
def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
                       XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
                       Sched<[WriteCvtSD2SS]>;
def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
                       XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
                       Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
let Constraints = "$src1 = $dst" in {
def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                      "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
                      XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;
def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                      "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
                      XD, Requires<[UseSSE2]>,
                      Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
}
}

// Convert scalar single to scalar double
// SSE2 instructions with XS prefix
let isCodeGenOnly = 1, hasSideEffects = 0 in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
                    (ins FR64:$src1, FR32:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                    XS, VEX_4V, VEX_LIG, VEX_WIG,
                    Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>, SIMD_EXC;
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
                    (ins FR64:$src1, f32mem:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                    XS, VEX_4V, VEX_LIG, VEX_WIG,
                    Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>,
                    Requires<[UseAVX, OptForSize]>, SIMD_EXC;
} // isCodeGenOnly = 1, hasSideEffects = 0

def : Pat<(f64 (any_fpextend FR32:$src)),
          (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
def : Pat<(any_fpextend (loadf32 addr:$src)),
          (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;

let isCodeGenOnly = 1 in {
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (any_fpextend FR32:$src))]>,
                   XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>, SIMD_EXC;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (any_fpextend (loadf32 addr:$src)))]>,
                   XS, Requires<[UseSSE2, OptForSize]>,
                   Sched<[WriteCvtSS2SD.Folded]>, SIMD_EXC;
} // isCodeGenOnly = 1

let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in {
def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       []>, XS, VEX_4V, VEX_LIG, VEX_WIG,
                       Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;
let
let Predicates = [UseAVX] in {
def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector
                     (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
          (VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector
                     (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
          (VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
          (VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
          (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
          (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
          (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
          (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
          (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
          (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
          (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseAVX]

let Predicates = [UseSSE2] in {
def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector
                     (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
          (CVTSD2SSrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector
                     (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
          (CVTSS2SDrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
          (CVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
          (CVTSI642SDrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
          (CVTSI2SDrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
          (CVTSI2SDrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseSSE2]

let Predicates = [UseSSE1] in {
def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
          (CVTSI642SSrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
          (CVTSI642SSrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
          (CVTSI2SSrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
          (CVTSI2SSrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseSSE1]

let Predicates = [HasAVX, NoVLX] in {
// Convert packed single/double fp to doubleword
def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
                       VEX, Sched<[WriteCvtPS2I]>, VEX_WIG, SIMD_EXC;
def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>,
                       VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG, SIMD_EXC;
def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG, SIMD_EXC;
def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, VEX_WIG, SIMD_EXC;
}
def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
                     Sched<[WriteCvtPS2I]>, SIMD_EXC;
def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>,
                     Sched<[WriteCvtPS2ILd]>, SIMD_EXC;


// Convert Packed Double FP to Packed DW Integers
let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
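// For example, "vcvtpd2dq (%rax), %xmm0" leaves the width of the memory
// source ambiguous, so AT&T syntax also accepts the explicit
// vcvtpd2dqx/vcvtpd2dqy spellings (see the InstAliases below).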
def VCVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
                       VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;

// XMM only
def VCVTPD2DQrm  : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
                       Sched<[WriteCvtPD2ILd]>, VEX_WIG;

// YMM only
def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
                       VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
                       VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
}

def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
                (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;

def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtpd2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>,
                     Sched<[WriteCvtPD2ILd]>, SIMD_EXC;
def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtpd2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
                     Sched<[WriteCvtPD2I]>, SIMD_EXC;

// Convert with truncation packed single/double fp to doubleword
// SSE2 packed instructions with XS prefix
let Uses = [MXCSR], mayRaiseFPException = 1 in {
let Predicates = [HasAVX, NoVLX] in {
def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "cvttps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
                         VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvttps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86any_cvttp2si (loadv4f32 addr:$src))))]>,
                         VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst,
                            (v8i32 (X86any_cvttp2si (v8f32 VR256:$src))))]>,
                          VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst,
                            (v8i32 (X86any_cvttp2si (loadv8f32 addr:$src))))]>,
                          VEX, VEX_L,
                          Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
}

def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
                       Sched<[WriteCvtPS2I]>;
def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86any_cvttp2si (memopv4f32 addr:$src))))]>,
                       Sched<[WriteCvtPS2ILd]>;
}

// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
// XMM only
def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvttpd2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
                        VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        "cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (X86any_cvttp2si (loadv2f64 addr:$src))))]>,
                        VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG;

// YMM only
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                         "cvttpd2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86any_cvttp2si (v4f64 VR256:$src))))]>,
                         VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86any_cvttp2si (loadv4f64 addr:$src))))]>,
                         VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
} // Predicates = [HasAVX, NoVLX]

def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (any_fp_to_sint (v4f64 VR256:$src))),
            (VCVTTPD2DQYrr VR256:$src)>;
  def : Pat<(v4i32 (any_fp_to_sint (loadv4f64 addr:$src))),
            (VCVTTPD2DQYrm addr:$src)>;
}

def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
                      Sched<[WriteCvtPD2I]>, SIMD_EXC;
def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86any_cvttp2si (memopv2f64 addr:$src))))]>,
                      Sched<[WriteCvtPD2ILd]>, SIMD_EXC;

// Convert packed single to packed double
let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
// SSE2 instructions without OpSize prefix
def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
                    PS, VEX, Sched<[WriteCvtPS2PD]>, VEX_WIG;
def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
                    PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, VEX_WIG;
def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst, (v4f64 (any_fpextend (v4f32 VR128:$src))))]>,
                     PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, VEX_WIG;
def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>,
                     PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, VEX_WIG;
}

let Predicates = [UseSSE2], Uses = [MXCSR], mayRaiseFPException = 1 in {
def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "cvtps2pd\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
                   PS, Sched<[WriteCvtPS2PD]>;
def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                   "cvtps2pd\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
                   PS, Sched<[WriteCvtPS2PD.Folded]>;
}

// Convert Packed DW Integers to Packed Double FP
let Predicates = [HasAVX, NoVLX] in {
let hasSideEffects = 0, mayLoad = 1 in
def VCVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v2f64 (X86any_VSintToFP
                                  (bc_v4i32
                                   (v2i64 (scalar_to_vector
                                           (loadi64 addr:$src)))))))]>,
                        VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG;
def VCVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
                        VEX, Sched<[WriteCvtI2PD]>, VEX_WIG;
def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (v4f64 (any_sint_to_fp (loadv4i32 addr:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>,
                        VEX_WIG;
def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (v4f64 (any_sint_to_fp (v4i32 VR128:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtI2PDY]>, VEX_WIG;
}

let hasSideEffects = 0, mayLoad = 1 in
def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                      "cvtdq2pd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v2f64 (X86any_VSintToFP
                                (bc_v4i32
                                 (v2i64 (scalar_to_vector
                                         (loadi64 addr:$src)))))))]>,
                      Sched<[WriteCvtI2PDLd]>;
def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvtdq2pd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
                      Sched<[WriteCvtI2PD]>;

// AVX register conversion intrinsics
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
            (VCVTDQ2PDrm addr:$src)>;
} // Predicates = [HasAVX, NoVLX]

// SSE2 register conversion intrinsics
let Predicates = [UseSSE2] in {
  def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
            (CVTDQ2PDrm addr:$src)>;
} // Predicates = [UseSSE2]

// Convert packed double to packed single
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
// XMM only
def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtpd2ps\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (X86any_vfpround (v2f64 VR128:$src)))]>,
                       VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG;
def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (X86any_vfpround (loadv2f64 addr:$src)))]>,
                       VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG;

def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                        "cvtpd2ps\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (X86any_vfpround VR256:$src))]>,
                        VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (X86any_vfpround (loadv4f64 addr:$src)))]>,
                        VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG;
} // Predicates = [HasAVX, NoVLX]

def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSrr VR128:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0, "att">;

def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (X86any_vfpround (v2f64 VR128:$src)))]>,
                     Sched<[WriteCvtPD2PS]>, SIMD_EXC;
def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (X86any_vfpround (memopv2f64 addr:$src)))]>,
                     Sched<[WriteCvtPD2PS.Folded]>, SIMD_EXC;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Compare Instructions
//===----------------------------------------------------------------------===//

// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
                            Operand memop, SDNode OpNode, ValueType VT,
                            PatFrag ld_frag, string asm,
                            X86FoldableSchedWrite sched,
                            PatFrags mem_frags> {
  def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2, u8imm:$cc), asm,
                    [(set VR128:$dst, (OpNode (VT VR128:$src1),
                                              VR128:$src2, timm:$cc))]>,
               Sched<[sched]>, SIMD_EXC;
  let mayLoad = 1 in
  def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, memop:$src2, u8imm:$cc), asm,
                    [(set VR128:$dst, (OpNode (VT VR128:$src1),
                                              (mem_frags addr:$src2), timm:$cc))]>,
               Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;

  let isCodeGenOnly = 1 in {
    let isCommutable = 1 in
    def rr : SIi8<0xC2, MRMSrcReg,
                  (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
                  [(set RC:$dst, (OpNode RC:$src1, RC:$src2, timm:$cc))]>,
             Sched<[sched]>, SIMD_EXC;
    def rm : SIi8<0xC2, MRMSrcMem,
                  (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
                  [(set RC:$dst, (OpNode RC:$src1,
                                         (ld_frag addr:$src2), timm:$cc))]>,
             Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
  }
}

let ExeDomain = SSEPackedSingle in
defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
                "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                SchedWriteFCmpSizes.PS.Scl, sse_load_f32>,
                XS, VEX_4V, VEX_LIG, VEX_WIG;
let ExeDomain = SSEPackedDouble in
defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
                "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
                XD, VEX_4V, VEX_LIG, VEX_WIG;

let Constraints = "$src1 = $dst" in {
  let ExeDomain = SSEPackedSingle in
  defm CMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
                 "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
  let ExeDomain = SSEPackedDouble in
  defm CMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
                 "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
}

// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
                         ValueType vt, X86MemOperand x86memop,
                         PatFrag ld_frag, string OpcodeStr, Domain d,
                         X86FoldableSchedWrite sched = WriteFComX> {
  let ExeDomain = d in {
  def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
             [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
          Sched<[sched]>, SIMD_EXC;
  let mayLoad = 1 in
  def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
             [(set EFLAGS, (OpNode (vt RC:$src1),
                                   (ld_frag addr:$src2)))]>,
          Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}
}

// sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
                             ValueType vt, Operand memop,
                             PatFrags mem_frags, string OpcodeStr,
                             Domain d,
                             X86FoldableSchedWrite sched = WriteFComX> {
let ExeDomain = d in {
  def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
                 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                 [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
              Sched<[sched]>, SIMD_EXC;
let mayLoad = 1 in
  def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
                 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                 [(set EFLAGS, (OpNode (vt RC:$src1),
                                       (mem_frags addr:$src2)))]>,
              Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}
}

let Defs = [EFLAGS] in {
  defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
                                "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
  defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
                                "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
  defm VCOMISS  : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
                                "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
  defm VCOMISD  : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
                                "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;

  let isCodeGenOnly = 1 in {
    defm VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
                      sse_load_f32, "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
    defm VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
                      sse_load_f64, "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;

    defm VCOMISS  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
                      sse_load_f32, "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
    defm VCOMISD  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
                      sse_load_f64, "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
  }
  defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
                               "ucomiss", SSEPackedSingle>, PS;
  defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
                               "ucomisd", SSEPackedDouble>, PD;
  defm COMISS  : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
                               "comiss", SSEPackedSingle>, PS;
  defm COMISD  : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
                               "comisd", SSEPackedDouble>, PD;

  let isCodeGenOnly = 1 in {
    defm UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
                     sse_load_f32, "ucomiss", SSEPackedSingle>, PS;
    defm UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
                     sse_load_f64, "ucomisd", SSEPackedDouble>, PD;

    defm COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
                    sse_load_f32, "comiss", SSEPackedSingle>, PS;
    defm COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
                    sse_load_f64, "comisd", SSEPackedDouble>, PD;
  }
} // Defs = [EFLAGS]

// sse12_cmp_packed - sse 1 & 2 compare packed instructions
multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
                            ValueType VT, string asm,
                            X86FoldableSchedWrite sched,
                            Domain d, PatFrag ld_frag> {
  let isCommutable = 1 in
  def rri : PIi8<0xC2, MRMSrcReg,
                 (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
                 [(set RC:$dst, (VT (X86any_cmpp RC:$src1, RC:$src2, timm:$cc)))], d>,
            Sched<[sched]>, SIMD_EXC;
  def rmi : PIi8<0xC2, MRMSrcMem,
                 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
                 [(set RC:$dst,
                   (VT (X86any_cmpp RC:$src1, (ld_frag addr:$src2), timm:$cc)))], d>,
            Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}

defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
                "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG;
defm VCMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
                "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG;
defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, v8f32,
                "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, v4f64,
                "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in {
  defm CMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
                 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS;
  defm CMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
                 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD;
}

def CommutableCMPCC : PatLeaf<(timm), [{
  uint64_t Imm = N->getZExtValue() & 0x7;
  return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07);
}]>;
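// Of the eight SSE compare predicates, only EQ (0x00), UNORD (0x03),
// NEQ (0x04) and ORD (0x07) are symmetric in their operands, so only these
// immediates permit commuting the compare's operands.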

// Patterns to select compares with loads in first operand.
let Predicates = [HasAVX] in {
  def : Pat<(v4f64 (X86any_cmpp (loadv4f64 addr:$src2), VR256:$src1,
                                CommutableCMPCC:$cc)),
            (VCMPPDYrmi VR256:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(v8f32 (X86any_cmpp (loadv8f32 addr:$src2), VR256:$src1,
                                CommutableCMPCC:$cc)),
            (VCMPPSYrmi VR256:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(v2f64 (X86any_cmpp (loadv2f64 addr:$src2), VR128:$src1,
                                CommutableCMPCC:$cc)),
            (VCMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(v4f32 (X86any_cmpp (loadv4f32 addr:$src2), VR128:$src1,
                                CommutableCMPCC:$cc)),
            (VCMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
                          CommutableCMPCC:$cc)),
            (VCMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
                          CommutableCMPCC:$cc)),
            (VCMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v2f64 (X86any_cmpp (memopv2f64 addr:$src2), VR128:$src1,
                                CommutableCMPCC:$cc)),
            (CMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
                          CommutableCMPCC:$cc)),
            (CMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
}

let Predicates = [UseSSE1] in {
  def : Pat<(v4f32 (X86any_cmpp (memopv4f32 addr:$src2), VR128:$src1,
                                CommutableCMPCC:$cc)),
            (CMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
                          CommutableCMPCC:$cc)),
            (CMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Shuffle Instructions
//===----------------------------------------------------------------------===//

/// sse12_shuffle - sse 1 & 2 fp shuffle instructions
multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
                         ValueType vt, string asm, PatFrag mem_frag,
                         X86FoldableSchedWrite sched, Domain d,
                         bit IsCommutable = 0> {
  def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
                 (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
                 [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
                                              (i8 timm:$src3))))], d>,
            Sched<[sched.Folded, sched.ReadAfterFold]>;
  let isCommutable = IsCommutable in
  def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
                 (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
                 [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
                                              (i8 timm:$src3))))], d>,
            Sched<[sched]>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm VSHUFPS  : sse12_shuffle<VR128, f128mem, v4f32,
         "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
         loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>,
         PS, VEX_4V, VEX_WIG;
  defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
         "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
         loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>,
         PS, VEX_4V, VEX_L, VEX_WIG;
  defm VSHUFPD  : sse12_shuffle<VR128, f128mem, v2f64,
         "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
         loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>,
         PD, VEX_4V, VEX_WIG;
  defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
         "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
         loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>,
         PD, VEX_4V, VEX_L, VEX_WIG;
}
Constraints = "$src1 = $dst" in { 2050 defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32, 2051 "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}", 2052 memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS; 2053 defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64, 2054 "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}", 2055 memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD; 2056} 2057 2058//===----------------------------------------------------------------------===// 2059// SSE 1 & 2 - Unpack FP Instructions 2060//===----------------------------------------------------------------------===// 2061 2062/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave 2063multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt, 2064 PatFrag mem_frag, RegisterClass RC, 2065 X86MemOperand x86memop, string asm, 2066 X86FoldableSchedWrite sched, Domain d, 2067 bit IsCommutable = 0> { 2068 let isCommutable = IsCommutable in 2069 def rr : PI<opc, MRMSrcReg, 2070 (outs RC:$dst), (ins RC:$src1, RC:$src2), 2071 asm, [(set RC:$dst, 2072 (vt (OpNode RC:$src1, RC:$src2)))], d>, 2073 Sched<[sched]>; 2074 def rm : PI<opc, MRMSrcMem, 2075 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 2076 asm, [(set RC:$dst, 2077 (vt (OpNode RC:$src1, 2078 (mem_frag addr:$src2))))], d>, 2079 Sched<[sched.Folded, sched.ReadAfterFold]>; 2080} 2081 2082let Predicates = [HasAVX, NoVLX] in { 2083defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load, 2084 VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2085 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG; 2086defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load, 2087 VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2088 SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG; 2089defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load, 2090 VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2091 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG; 2092defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load, 2093 VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2094 SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG; 2095 2096defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load, 2097 VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2098 SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG; 2099defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load, 2100 VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2101 SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG; 2102defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load, 2103 VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2104 SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG; 2105defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load, 2106 VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2107 SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG; 2108}// Predicates = [HasAVX, NoVLX] 2109 2110let Constraints = "$src1 = $dst" in { 2111 defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memop, 2112 VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}", 2113 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS; 2114 defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memop, 2115 VR128, f128mem, 
"unpckhpd\t{$src2, $dst|$dst, $src2}", 2116 SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD; 2117 defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop, 2118 VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}", 2119 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS; 2120 defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop, 2121 VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}", 2122 SchedWriteFShuffle.XMM, SSEPackedDouble>, PD; 2123} // Constraints = "$src1 = $dst" 2124 2125let Predicates = [HasAVX1Only] in { 2126 def : Pat<(v8i32 (X86Unpckl VR256:$src1, (loadv8i32 addr:$src2))), 2127 (VUNPCKLPSYrm VR256:$src1, addr:$src2)>; 2128 def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)), 2129 (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>; 2130 def : Pat<(v8i32 (X86Unpckh VR256:$src1, (loadv8i32 addr:$src2))), 2131 (VUNPCKHPSYrm VR256:$src1, addr:$src2)>; 2132 def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)), 2133 (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>; 2134 2135 def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))), 2136 (VUNPCKLPDYrm VR256:$src1, addr:$src2)>; 2137 def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)), 2138 (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>; 2139 def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))), 2140 (VUNPCKHPDYrm VR256:$src1, addr:$src2)>; 2141 def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)), 2142 (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>; 2143} 2144 2145let Predicates = [UseSSE2] in { 2146 // Use MOVHPD if the load isn't aligned enough for UNPCKLPD. 2147 def : Pat<(v2f64 (X86Unpckl VR128:$src1, 2148 (v2f64 (simple_load addr:$src2)))), 2149 (MOVHPDrm VR128:$src1, addr:$src2)>; 2150} 2151 2152//===----------------------------------------------------------------------===// 2153// SSE 1 & 2 - Extract Floating-Point Sign mask 2154//===----------------------------------------------------------------------===// 2155 2156/// sse12_extr_sign_mask - sse 1 & 2 unpack and interleave 2157multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt, 2158 string asm, Domain d> { 2159 def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src), 2160 !strconcat(asm, "\t{$src, $dst|$dst, $src}"), 2161 [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], d>, 2162 Sched<[WriteFMOVMSK]>; 2163} 2164 2165let Predicates = [HasAVX] in { 2166 defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps", 2167 SSEPackedSingle>, PS, VEX, VEX_WIG; 2168 defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd", 2169 SSEPackedDouble>, PD, VEX, VEX_WIG; 2170 defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps", 2171 SSEPackedSingle>, PS, VEX, VEX_L, VEX_WIG; 2172 defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd", 2173 SSEPackedDouble>, PD, VEX, VEX_L, VEX_WIG; 2174 2175 // Also support integer VTs to avoid a int->fp bitcast in the DAG. 2176 def : Pat<(X86movmsk (v4i32 VR128:$src)), 2177 (VMOVMSKPSrr VR128:$src)>; 2178 def : Pat<(X86movmsk (v2i64 VR128:$src)), 2179 (VMOVMSKPDrr VR128:$src)>; 2180 def : Pat<(X86movmsk (v8i32 VR256:$src)), 2181 (VMOVMSKPSYrr VR256:$src)>; 2182 def : Pat<(X86movmsk (v4i64 VR256:$src)), 2183 (VMOVMSKPDYrr VR256:$src)>; 2184} 2185 2186defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps", 2187 SSEPackedSingle>, PS; 2188defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd", 2189 SSEPackedDouble>, PD; 2190 2191let Predicates = [UseSSE2] in { 2192 // Also support integer VTs to avoid a int->fp bitcast in the DAG. 
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Extract Floating-Point Sign Mask
//===----------------------------------------------------------------------===//

/// sse12_extr_sign_mask - sse 1 & 2 extract fp sign mask
multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
                                string asm, Domain d> {
  def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], d>,
           Sched<[WriteFMOVMSK]>;
}

let Predicates = [HasAVX] in {
  defm VMOVMSKPS  : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
                                         SSEPackedSingle>, PS, VEX, VEX_WIG;
  defm VMOVMSKPD  : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
                                         SSEPackedDouble>, PD, VEX, VEX_WIG;
  defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
                                         SSEPackedSingle>, PS, VEX, VEX_L, VEX_WIG;
  defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
                                         SSEPackedDouble>, PD, VEX, VEX_L, VEX_WIG;

  // Also support integer VTs to avoid an int->fp bitcast in the DAG.
  def : Pat<(X86movmsk (v4i32 VR128:$src)),
            (VMOVMSKPSrr VR128:$src)>;
  def : Pat<(X86movmsk (v2i64 VR128:$src)),
            (VMOVMSKPDrr VR128:$src)>;
  def : Pat<(X86movmsk (v8i32 VR256:$src)),
            (VMOVMSKPSYrr VR256:$src)>;
  def : Pat<(X86movmsk (v4i64 VR256:$src)),
            (VMOVMSKPDYrr VR256:$src)>;
}

defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
                                     SSEPackedSingle>, PS;
defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
                                     SSEPackedDouble>, PD;

let Predicates = [UseSSE2] in {
  // Also support integer VTs to avoid an int->fp bitcast in the DAG.
  def : Pat<(X86movmsk (v4i32 VR128:$src)),
            (MOVMSKPSrr VR128:$src)>;
  def : Pat<(X86movmsk (v2i64 VR128:$src)),
            (MOVMSKPDrr VR128:$src)>;
}

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Logical Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in { // SSE integer instructions

/// PDI_binop_rm - Simple SSE2 binary operator.
multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                        ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                        X86MemOperand x86memop, X86FoldableSchedWrite sched,
                        bit IsCommutable, bit Is2Addr> {
  let isCommutable = IsCommutable in
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
               (ins RC:$src1, RC:$src2),
               !if(Is2Addr,
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
           Sched<[sched]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
               (ins RC:$src1, x86memop:$src2),
               !if(Is2Addr,
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst, (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
           Sched<[sched.Folded, sched.ReadAfterFold]>;
}
} // ExeDomain = SSEPackedInt

multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
                         ValueType OpVT128, ValueType OpVT256,
                         X86SchedWriteWidths sched, bit IsCommutable,
                         Predicate prd> {
let Predicates = [HasAVX, prd] in
  defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
                             VR128, load, i128mem, sched.XMM,
                             IsCommutable, 0>, VEX_4V, VEX_WIG;

let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
                           memop, i128mem, sched.XMM, IsCommutable, 1>;

let Predicates = [HasAVX2, prd] in
  defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
                               OpVT256, VR256, load, i256mem, sched.YMM,
                               IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG;
}

// These are ordered here for pattern ordering requirements with the fp versions

defm PAND  : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
                           SchedWriteVecLogic, 1, NoVLX>;
defm POR   : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
                           SchedWriteVecLogic, 1, NoVLX>;
defm PXOR  : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
                           SchedWriteVecLogic, 1, NoVLX>;
defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
                           SchedWriteVecLogic, 0, NoVLX>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Logical Instructions
//===----------------------------------------------------------------------===//

/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
///
/// There are no patterns here because isel prefers integer versions for SSE2
/// and later. There are SSE1 v4f32 patterns later.
multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
                                   SDNode OpNode, X86SchedWriteWidths sched> {
  let Predicates = [HasAVX, NoVLX] in {
  defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
        !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM,
        [], [], 0>, PS, VEX_4V, VEX_L, VEX_WIG;

  defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
        !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM,
        [], [], 0>, PD, VEX_4V, VEX_L, VEX_WIG;

  defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
        !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
        [], [], 0>, PS, VEX_4V, VEX_WIG;

  defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
        !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
        [], [], 0>, PD, VEX_4V, VEX_WIG;
  }

  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
          !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
          [], []>, PS;

    defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
          !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
          [], []>, PD;
  }
}

defm AND : sse12_fp_packed_logical<0x54, "and", and, SchedWriteFLogic>;
defm OR  : sse12_fp_packed_logical<0x56, "or", or, SchedWriteFLogic>;
defm XOR : sse12_fp_packed_logical<0x57, "xor", xor, SchedWriteFLogic>;
let isCommutable = 0 in
  defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp, SchedWriteFLogic>;

let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
            (VPANDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
            (VPANDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
            (VPANDYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
            (VPORYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
            (VPORYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
            (VPORYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
            (VPXORYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
            (VPXORYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
            (VPXORYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
            (VPANDNYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
            (VPANDNYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
            (VPANDNYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
            (VPANDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
            (VPANDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
            (VPANDYrm VR256:$src1, addr:$src2)>;

  def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
            (VPORYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
            (VPORYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
            (VPORYrm VR256:$src1, addr:$src2)>;

  def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
            (VPXORYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
            (VPXORYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
            (VPXORYrm VR256:$src1, addr:$src2)>;

  def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
            (VPANDNYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
            (VPANDNYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
            (VPANDNYrm VR256:$src1, addr:$src2)>;
}

// If only AVX1 is supported, we need to handle integer operations with
// floating point instructions since the integer versions aren't available.
let Predicates = [HasAVX1Only] in {
  def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
            (VANDPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
            (VANDPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
            (VANDPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)),
            (VANDPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
            (VORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
            (VORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
            (VORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)),
            (VORPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
            (VXORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
            (VXORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
            (VXORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)),
            (VXORPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)),
            (VANDNPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
            (VANDPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
            (VANDPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
            (VANDPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)),
            (VANDPSYrm VR256:$src1, addr:$src2)>;

  def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
            (VORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
            (VORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
            (VORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)),
            (VORPSYrm VR256:$src1, addr:$src2)>;

  def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
            (VXORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
            (VXORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
            (VXORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)),
            (VXORPSYrm VR256:$src1, addr:$src2)>;

  def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
}

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
            (VPANDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
            (VPANDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
            (VPANDrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
            (VPORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
            (VPORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
            (VPORrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
            (VPXORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
            (VPXORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
            (VPXORrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
            (VPANDNrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
            (VPANDNrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
            (VPANDNrr VR128:$src1, VR128:$src2)>;

  def : Pat<(and VR128:$src1, (loadv16i8 addr:$src2)),
            (VPANDrm VR128:$src1, addr:$src2)>;
  def : Pat<(and VR128:$src1, (loadv8i16 addr:$src2)),
            (VPANDrm VR128:$src1, addr:$src2)>;
  def : Pat<(and VR128:$src1, (loadv4i32 addr:$src2)),
            (VPANDrm VR128:$src1, addr:$src2)>;

  def : Pat<(or VR128:$src1, (loadv16i8 addr:$src2)),
            (VPORrm VR128:$src1, addr:$src2)>;
  def : Pat<(or VR128:$src1, (loadv8i16 addr:$src2)),
            (VPORrm VR128:$src1, addr:$src2)>;
  def : Pat<(or VR128:$src1, (loadv4i32 addr:$src2)),
            (VPORrm VR128:$src1, addr:$src2)>;

  def : Pat<(xor VR128:$src1, (loadv16i8 addr:$src2)),
            (VPXORrm VR128:$src1, addr:$src2)>;
  def : Pat<(xor VR128:$src1, (loadv8i16 addr:$src2)),
            (VPXORrm VR128:$src1, addr:$src2)>;
  def : Pat<(xor VR128:$src1, (loadv4i32 addr:$src2)),
            (VPXORrm VR128:$src1, addr:$src2)>;

  def : Pat<(X86andnp VR128:$src1, (loadv16i8 addr:$src2)),
            (VPANDNrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR128:$src1, (loadv8i16 addr:$src2)),
            (VPANDNrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR128:$src1, (loadv4i32 addr:$src2)),
            (VPANDNrm VR128:$src1, addr:$src2)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
            (PANDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
            (PANDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
            (PANDrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
            (PORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
            (PORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
            (PORrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
            (PXORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
            (PXORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
            (PXORrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
            (PANDNrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
            (PANDNrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
            (PANDNrr VR128:$src1, VR128:$src2)>;

  def : Pat<(and VR128:$src1, (memopv16i8 addr:$src2)),
            (PANDrm VR128:$src1, addr:$src2)>;
  def : Pat<(and VR128:$src1, (memopv8i16 addr:$src2)),
            (PANDrm VR128:$src1, addr:$src2)>;
  def : Pat<(and VR128:$src1, (memopv4i32 addr:$src2)),
            (PANDrm VR128:$src1, addr:$src2)>;

  def : Pat<(or VR128:$src1, (memopv16i8 addr:$src2)),
            (PORrm VR128:$src1, addr:$src2)>;
  def : Pat<(or VR128:$src1, (memopv8i16 addr:$src2)),
            (PORrm VR128:$src1, addr:$src2)>;
  def : Pat<(or VR128:$src1, (memopv4i32 addr:$src2)),
            (PORrm VR128:$src1, addr:$src2)>;

  def : Pat<(xor VR128:$src1, (memopv16i8 addr:$src2)),
            (PXORrm VR128:$src1, addr:$src2)>;
  def : Pat<(xor VR128:$src1, (memopv8i16 addr:$src2)),
            (PXORrm VR128:$src1, addr:$src2)>;
  def : Pat<(xor VR128:$src1, (memopv4i32 addr:$src2)),
            (PXORrm VR128:$src1, addr:$src2)>;

  def : Pat<(X86andnp VR128:$src1, (memopv16i8 addr:$src2)),
            (PANDNrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR128:$src1, (memopv8i16 addr:$src2)),
            (PANDNrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR128:$src1, (memopv4i32 addr:$src2)),
            (PANDNrm VR128:$src1, addr:$src2)>;
}

// Patterns for packed operations when we don't have integer type available.
def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
          (ANDPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)),
          (ORPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)),
          (XORPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)),
          (ANDNPSrr VR128:$src1, VR128:$src2)>;

def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)),
          (ANDPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)),
          (ORPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)),
          (XORPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
          (ANDNPSrm VR128:$src1, addr:$src2)>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Arithmetic Instructions
//===----------------------------------------------------------------------===//

/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
/// vector forms.
///
/// In addition, we also have a special variant of the scalar form here to
/// represent the associated intrinsic operation. This form is unlike the
/// plain scalar form, in that it takes an entire vector (instead of a scalar)
/// and leaves the top elements unmodified (therefore these cannot be commuted).
///
/// These three forms can each be reg+reg or reg+mem.
///

/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
/// classes below
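// For example, "addss %xmm1, %xmm0" writes only xmm0[31:0]; xmm0[127:32]
// pass through from the destination operand, which is why the vector-operand
// intrinsic forms described above cannot be marked commutable.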
multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
                                  SDNode OpNode, X86SchedWriteSizes sched> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  let Predicates = [HasAVX, NoVLX] in {
  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
                                   VR128, v4f32, f128mem, loadv4f32,
                                   SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX_4V, VEX_WIG;
  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
                                   VR128, v2f64, f128mem, loadv2f64,
                                   SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX_4V, VEX_WIG;

  defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
                                    OpNode, VR256, v8f32, f256mem, loadv8f32,
                                    SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX_4V, VEX_L, VEX_WIG;
  defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
                                    OpNode, VR256, v4f64, f256mem, loadv4f64,
                                    SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX_4V, VEX_L, VEX_WIG;
  }

  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
                              v4f32, f128mem, memopv4f32, SSEPackedSingle,
                              sched.PS.XMM>, PS;
    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
                              v2f64, f128mem, memopv2f64, SSEPackedDouble,
                              sched.PD.XMM>, PD;
  }
}
}

multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                                  X86SchedWriteSizes sched> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
                         OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>,
                         XS, VEX_4V, VEX_LIG, VEX_WIG;
  defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
                         OpNode, FR64, f64mem, SSEPackedDouble, sched.PD.Scl, 0>,
                         XD, VEX_4V, VEX_LIG, VEX_WIG;

  let Constraints = "$src1 = $dst" in {
    defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
                              OpNode, FR32, f32mem, SSEPackedSingle,
                              sched.PS.Scl>, XS;
    defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
                              OpNode, FR64, f64mem, SSEPackedDouble,
                              sched.PD.Scl>, XD;
  }
}
}

multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
                                      SDPatternOperator OpNode,
                                      X86SchedWriteSizes sched> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
                     !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
                     SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG;
  defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
                     !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
                     SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG;

  let Constraints = "$src1 = $dst" in {
    defm SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
                !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
                SSEPackedSingle, sched.PS.Scl>, XS;
    defm SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
                !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
                SSEPackedDouble, sched.PD.Scl>, XD;
  }
}
}

// Binary Arithmetic instructions
null_frag, SchedWriteFAddSizes>; 2666defm MUL : basic_sse12_fp_binop_p<0x59, "mul", any_fmul, SchedWriteFMulSizes>, 2667 basic_sse12_fp_binop_s<0x59, "mul", any_fmul, SchedWriteFMulSizes>, 2668 basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SchedWriteFMulSizes>; 2669let isCommutable = 0 in { 2670 defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", any_fsub, SchedWriteFAddSizes>, 2671 basic_sse12_fp_binop_s<0x5C, "sub", any_fsub, SchedWriteFAddSizes>, 2672 basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SchedWriteFAddSizes>; 2673 defm DIV : basic_sse12_fp_binop_p<0x5E, "div", any_fdiv, SchedWriteFDivSizes>, 2674 basic_sse12_fp_binop_s<0x5E, "div", any_fdiv, SchedWriteFDivSizes>, 2675 basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>; 2676 defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>, 2677 basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>, 2678 basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>; 2679 defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>, 2680 basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>, 2681 basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>; 2682} 2683 2684let isCodeGenOnly = 1 in { 2685 defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>, 2686 basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>; 2687 defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SchedWriteFCmpSizes>, 2688 basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SchedWriteFCmpSizes>; 2689} 2690 2691// Patterns used to select SSE scalar fp arithmetic instructions from 2692// either: 2693// 2694// (1) a scalar fp operation followed by a blend 2695// 2696// The effect is that the backend no longer emits unnecessary vector 2697// insert instructions immediately after SSE scalar fp instructions 2698// like addss or mulss. 2699// 2700// For example, given the following code: 2701// __m128 foo(__m128 A, __m128 B) { 2702// A[0] += B[0]; 2703// return A; 2704// } 2705// 2706// Previously we generated: 2707// addss %xmm0, %xmm1 2708// movss %xmm1, %xmm0 2709// 2710// We now generate: 2711// addss %xmm1, %xmm0 2712// 2713// (2) a vector packed single/double fp operation followed by a vector insert 2714// 2715// The effect is that the backend converts the packed fp instruction 2716// followed by a vector insert into a single SSE scalar fp instruction. 2717// 2718// For example, given the following code: 2719// __m128 foo(__m128 A, __m128 B) { 2720// __m128 C = A + B; 2721// return (__m128) {c[0], a[1], a[2], a[3]}; 2722// } 2723// 2724// Previously we generated: 2725// addps %xmm0, %xmm1 2726// movss %xmm1, %xmm0 2727// 2728// We now generate: 2729// addss %xmm1, %xmm0 2730 2731// TODO: Some canonicalization in lowering would simplify the number of 2732// patterns we have to try to match. 
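//
// As a rough sketch (not the exact SelectionDAG shape), with Op = any_fadd
// the first pattern in scalar_math_patterns below matches a DAG of the form
//   (v4f32 (X86Movss VR128:$dst,
//            (scalar_to_vector
//              (fadd (extractelt VR128:$dst, 0), FR32:$src))))
// and selects ADDSSrr_Int, leaving elements 1-3 of $dst untouched.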
2733multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move, 2734 ValueType VT, ValueType EltTy, 2735 RegisterClass RC, PatFrag ld_frag, 2736 Predicate BasePredicate> { 2737 let Predicates = [BasePredicate] in { 2738 // extracted scalar math op with insert via movss/movsd 2739 def : Pat<(VT (Move (VT VR128:$dst), 2740 (VT (scalar_to_vector 2741 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))), 2742 RC:$src))))), 2743 (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst, 2744 (VT (COPY_TO_REGCLASS RC:$src, VR128)))>; 2745 def : Pat<(VT (Move (VT VR128:$dst), 2746 (VT (scalar_to_vector 2747 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))), 2748 (ld_frag addr:$src)))))), 2749 (!cast<Instruction>(OpcPrefix#rm_Int) VT:$dst, addr:$src)>; 2750 } 2751 2752 // Repeat for AVX versions of the instructions. 2753 let Predicates = [UseAVX] in { 2754 // extracted scalar math op with insert via movss/movsd 2755 def : Pat<(VT (Move (VT VR128:$dst), 2756 (VT (scalar_to_vector 2757 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))), 2758 RC:$src))))), 2759 (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst, 2760 (VT (COPY_TO_REGCLASS RC:$src, VR128)))>; 2761 def : Pat<(VT (Move (VT VR128:$dst), 2762 (VT (scalar_to_vector 2763 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))), 2764 (ld_frag addr:$src)))))), 2765 (!cast<Instruction>("V"#OpcPrefix#rm_Int) VT:$dst, addr:$src)>; 2766 } 2767} 2768 2769defm : scalar_math_patterns<any_fadd, "ADDSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; 2770defm : scalar_math_patterns<any_fsub, "SUBSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; 2771defm : scalar_math_patterns<any_fmul, "MULSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; 2772defm : scalar_math_patterns<any_fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; 2773 2774defm : scalar_math_patterns<any_fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; 2775defm : scalar_math_patterns<any_fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; 2776defm : scalar_math_patterns<any_fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; 2777defm : scalar_math_patterns<any_fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; 2778 2779/// Unop Arithmetic 2780/// In addition, we also have a special variant of the scalar form here to 2781/// represent the associated intrinsic operation. This form is unlike the 2782/// plain scalar form, in that it takes an entire vector (instead of a 2783/// scalar) and leaves the top elements undefined. 2784/// 2785/// And, we have a special variant form for a full-vector intrinsic form. 2786 2787/// sse_fp_unop_s - SSE1 unops in scalar form 2788/// For the non-AVX defs, we need $src1 to be tied to $dst because 2789/// the HW instructions are 2 operand / destructive. 
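/// For example (using sqrtss as a representative case), "sqrtss %xmm1, %xmm0"
/// writes sqrt(%xmm1[0]) into %xmm0[0] but leaves %xmm0[1..3] unchanged, so
/// the destination register is also an input.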
2790multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, 2791 ValueType ScalarVT, X86MemOperand x86memop, 2792 Operand intmemop, SDNode OpNode, Domain d, 2793 X86FoldableSchedWrite sched, Predicate target> { 2794 let isCodeGenOnly = 1, hasSideEffects = 0 in { 2795 def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1), 2796 !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"), 2797 [(set RC:$dst, (OpNode RC:$src1))], d>, Sched<[sched]>, 2798 Requires<[target]>; 2799 let mayLoad = 1 in 2800 def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1), 2801 !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"), 2802 [(set RC:$dst, (OpNode (load addr:$src1)))], d>, 2803 Sched<[sched.Folded]>, 2804 Requires<[target, OptForSize]>; 2805 } 2806 2807 let hasSideEffects = 0, Constraints = "$src1 = $dst", ExeDomain = d in { 2808 def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), 2809 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>, 2810 Sched<[sched]>; 2811 let mayLoad = 1 in 2812 def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2), 2813 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>, 2814 Sched<[sched.Folded, sched.ReadAfterFold]>; 2815 } 2816 2817} 2818 2819multiclass sse_fp_unop_s_intr<RegisterClass RC, ValueType vt, 2820 PatFrags mem_frags, Intrinsic Intr, 2821 Predicate target, string Suffix> { 2822 let Predicates = [target] in { 2823 // These are unary operations, but they are modeled as having 2 source operands 2824 // because the high elements of the destination are unchanged in SSE. 2825 def : Pat<(Intr VR128:$src), 2826 (!cast<Instruction>(NAME#r_Int) VR128:$src, VR128:$src)>; 2827 } 2828 // We don't want to fold scalar loads into these instructions unless 2829 // optimizing for size. This is because the folded instruction will have a 2830 // partial register update, while the unfolded sequence will not, e.g. 2831 // movss mem, %xmm0 2832 // rcpss %xmm0, %xmm0 2833 // which has a clobber before the rcp, vs. 
2834 // rcpss mem, %xmm0 2835 let Predicates = [target, OptForSize] in { 2836 def : Pat<(Intr (mem_frags addr:$src2)), 2837 (!cast<Instruction>(NAME#m_Int) 2838 (vt (IMPLICIT_DEF)), addr:$src2)>; 2839 } 2840} 2841 2842multiclass avx_fp_unop_s_intr<RegisterClass RC, ValueType vt, PatFrags mem_frags, 2843 Intrinsic Intr, Predicate target> { 2844 let Predicates = [target] in { 2845 def : Pat<(Intr VR128:$src), 2846 (!cast<Instruction>(NAME#r_Int) VR128:$src, 2847 VR128:$src)>; 2848 } 2849 let Predicates = [target, OptForSize] in { 2850 def : Pat<(Intr (mem_frags addr:$src2)), 2851 (!cast<Instruction>(NAME#m_Int) 2852 (vt (IMPLICIT_DEF)), addr:$src2)>; 2853 } 2854} 2855 2856multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, 2857 ValueType ScalarVT, X86MemOperand x86memop, 2858 Operand intmemop, SDNode OpNode, Domain d, 2859 X86FoldableSchedWrite sched, Predicate target> { 2860 let isCodeGenOnly = 1, hasSideEffects = 0 in { 2861 def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), 2862 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 2863 [], d>, Sched<[sched]>; 2864 let mayLoad = 1 in 2865 def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 2866 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 2867 [], d>, Sched<[sched.Folded, sched.ReadAfterFold]>; 2868 } 2869 let hasSideEffects = 0, ExeDomain = d in { 2870 def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), 2871 (ins VR128:$src1, VR128:$src2), 2872 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 2873 []>, Sched<[sched]>; 2874 let mayLoad = 1 in 2875 def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), 2876 (ins VR128:$src1, intmemop:$src2), 2877 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 2878 []>, Sched<[sched.Folded, sched.ReadAfterFold]>; 2879 } 2880 2881 // We don't want to fold scalar loads into these instructions unless 2882 // optimizing for size. This is because the folded instruction will have a 2883 // partial register update, while the unfolded sequence will not, e.g. 2884 // vmovss mem, %xmm0 2885 // vrcpss %xmm0, %xmm0, %xmm0 2886 // which has a clobber before the rcp, vs. 2887 // vrcpss mem, %xmm0, %xmm0 2888 // TODO: In theory, we could fold the load, and avoid the stall caused by 2889 // the partial register store, either in BreakFalseDeps or with smarter RA. 2890 let Predicates = [target] in { 2891 def : Pat<(OpNode RC:$src), (!cast<Instruction>(NAME#r) 2892 (ScalarVT (IMPLICIT_DEF)), RC:$src)>; 2893 } 2894 let Predicates = [target, OptForSize] in { 2895 def : Pat<(ScalarVT (OpNode (load addr:$src))), 2896 (!cast<Instruction>(NAME#m) (ScalarVT (IMPLICIT_DEF)), 2897 addr:$src)>; 2898 } 2899} 2900 2901/// sse1_fp_unop_p - SSE1 unops in packed form. 
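/// For instance, instantiated with OpNode = any_fsqrt (as done for SQRT
/// below) this is expected to produce SQRTPSr/SQRTPSm plus the VEX forms
/// VSQRTPSr/VSQRTPSm and VSQRTPSYr/VSQRTPSYm.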
2902multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, 2903 X86SchedWriteWidths sched, list<Predicate> prds> { 2904let Predicates = prds in { 2905 def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2906 !strconcat("v", OpcodeStr, 2907 "ps\t{$src, $dst|$dst, $src}"), 2908 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>, 2909 VEX, Sched<[sched.XMM]>, VEX_WIG; 2910 def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 2911 !strconcat("v", OpcodeStr, 2912 "ps\t{$src, $dst|$dst, $src}"), 2913 [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>, 2914 VEX, Sched<[sched.XMM.Folded]>, VEX_WIG; 2915 def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 2916 !strconcat("v", OpcodeStr, 2917 "ps\t{$src, $dst|$dst, $src}"), 2918 [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>, 2919 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG; 2920 def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 2921 !strconcat("v", OpcodeStr, 2922 "ps\t{$src, $dst|$dst, $src}"), 2923 [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))]>, 2924 VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG; 2925} 2926 2927 def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2928 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), 2929 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>, 2930 Sched<[sched.XMM]>; 2931 def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 2932 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), 2933 [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>, 2934 Sched<[sched.XMM.Folded]>; 2935} 2936 2937/// sse2_fp_unop_p - SSE2 unops in vector forms. 2938multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr, 2939 SDNode OpNode, X86SchedWriteWidths sched> { 2940let Predicates = [HasAVX, NoVLX] in { 2941 def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2942 !strconcat("v", OpcodeStr, 2943 "pd\t{$src, $dst|$dst, $src}"), 2944 [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>, 2945 VEX, Sched<[sched.XMM]>, VEX_WIG; 2946 def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 2947 !strconcat("v", OpcodeStr, 2948 "pd\t{$src, $dst|$dst, $src}"), 2949 [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>, 2950 VEX, Sched<[sched.XMM.Folded]>, VEX_WIG; 2951 def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 2952 !strconcat("v", OpcodeStr, 2953 "pd\t{$src, $dst|$dst, $src}"), 2954 [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>, 2955 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG; 2956 def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 2957 !strconcat("v", OpcodeStr, 2958 "pd\t{$src, $dst|$dst, $src}"), 2959 [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))]>, 2960 VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG; 2961} 2962 2963 def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2964 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), 2965 [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>, 2966 Sched<[sched.XMM]>; 2967 def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 2968 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), 2969 [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>, 2970 Sched<[sched.XMM.Folded]>; 2971} 2972 2973multiclass sse1_fp_unop_s_intr<bits<8> opc, string OpcodeStr, SDNode OpNode, 2974 X86SchedWriteWidths sched, Predicate AVXTarget> { 2975 defm SS : sse_fp_unop_s_intr<FR32, v4f32, sse_load_f32, 2976 !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss), 2977 
UseSSE1, "SS">, XS; 2978 defm V#NAME#SS : avx_fp_unop_s_intr<FR32, v4f32, sse_load_f32, 2979 !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss), 2980 AVXTarget>, 2981 XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable; 2982} 2983 2984multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, 2985 X86SchedWriteWidths sched, Predicate AVXTarget> { 2986 defm SS : sse_fp_unop_s<opc, OpcodeStr#ss, FR32, f32, f32mem, 2987 ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS; 2988 defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr#ss, FR32, f32, 2989 f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>, 2990 XS, VEX_4V, VEX_LIG, VEX_WIG; 2991} 2992 2993multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, 2994 X86SchedWriteWidths sched, Predicate AVXTarget> { 2995 defm SD : sse_fp_unop_s<opc, OpcodeStr#sd, FR64, f64, f64mem, 2996 sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD; 2997 defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr#sd, FR64, f64, 2998 f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>, 2999 XD, VEX_4V, VEX_LIG, VEX_WIG; 3000} 3001 3002// Square root. 3003defm SQRT : sse1_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, UseAVX>, 3004 sse1_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>, 3005 sse2_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64, UseAVX>, 3006 sse2_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64>, SIMD_EXC; 3007 3008// Reciprocal approximations. Note that these typically require refinement 3009// in order to obtain suitable precision. 3010defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>, 3011 sse1_fp_unop_s_intr<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>, 3012 sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>; 3013defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>, 3014 sse1_fp_unop_s_intr<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>, 3015 sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>; 3016 3017// There is no f64 version of the reciprocal approximation instructions. 3018 3019multiclass scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix, SDNode Move, 3020 ValueType VT, Predicate BasePredicate> { 3021 let Predicates = [BasePredicate] in { 3022 def : Pat<(VT (Move VT:$dst, (scalar_to_vector 3023 (OpNode (extractelt VT:$src, 0))))), 3024 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>; 3025 } 3026 3027 // Repeat for AVX versions of the instructions. 3028 let Predicates = [UseAVX] in { 3029 def : Pat<(VT (Move VT:$dst, (scalar_to_vector 3030 (OpNode (extractelt VT:$src, 0))))), 3031 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>; 3032 } 3033} 3034 3035defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>; 3036defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>; 3037 3038multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix, 3039 SDNode Move, ValueType VT, 3040 Predicate BasePredicate> { 3041 let Predicates = [BasePredicate] in { 3042 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))), 3043 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>; 3044 } 3045 3046 // Repeat for AVX versions of the instructions. 
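// (e.g. selecting VRCPSSr_Int / VRSQRTSSr_Int instead of the SSE forms).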
3047 let Predicates = [HasAVX] in { 3048 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))), 3049 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>; 3050 } 3051} 3052 3053defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss, 3054 v4f32, UseSSE1>; 3055defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss, 3056 v4f32, UseSSE1>; 3057 3058 3059//===----------------------------------------------------------------------===// 3060// SSE 1 & 2 - Non-temporal stores 3061//===----------------------------------------------------------------------===// 3062 3063let AddedComplexity = 400 in { // Prefer non-temporal versions 3064let Predicates = [HasAVX, NoVLX] in { 3065let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in { 3066def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs), 3067 (ins f128mem:$dst, VR128:$src), 3068 "movntps\t{$src, $dst|$dst, $src}", 3069 [(alignednontemporalstore (v4f32 VR128:$src), 3070 addr:$dst)]>, VEX, VEX_WIG; 3071def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs), 3072 (ins f128mem:$dst, VR128:$src), 3073 "movntpd\t{$src, $dst|$dst, $src}", 3074 [(alignednontemporalstore (v2f64 VR128:$src), 3075 addr:$dst)]>, VEX, VEX_WIG; 3076} // SchedRW 3077 3078let SchedRW = [SchedWriteFMoveLSNT.YMM.MR] in { 3079def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs), 3080 (ins f256mem:$dst, VR256:$src), 3081 "movntps\t{$src, $dst|$dst, $src}", 3082 [(alignednontemporalstore (v8f32 VR256:$src), 3083 addr:$dst)]>, VEX, VEX_L, VEX_WIG; 3084def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs), 3085 (ins f256mem:$dst, VR256:$src), 3086 "movntpd\t{$src, $dst|$dst, $src}", 3087 [(alignednontemporalstore (v4f64 VR256:$src), 3088 addr:$dst)]>, VEX, VEX_L, VEX_WIG; 3089} // SchedRW 3090 3091let ExeDomain = SSEPackedInt in { 3092def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs), 3093 (ins i128mem:$dst, VR128:$src), 3094 "movntdq\t{$src, $dst|$dst, $src}", 3095 [(alignednontemporalstore (v2i64 VR128:$src), 3096 addr:$dst)]>, VEX, VEX_WIG, 3097 Sched<[SchedWriteVecMoveLSNT.XMM.MR]>; 3098def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs), 3099 (ins i256mem:$dst, VR256:$src), 3100 "movntdq\t{$src, $dst|$dst, $src}", 3101 [(alignednontemporalstore (v4i64 VR256:$src), 3102 addr:$dst)]>, VEX, VEX_L, VEX_WIG, 3103 Sched<[SchedWriteVecMoveLSNT.YMM.MR]>; 3104} // ExeDomain 3105} // Predicates 3106 3107let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in { 3108def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 3109 "movntps\t{$src, $dst|$dst, $src}", 3110 [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>; 3111def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 3112 "movntpd\t{$src, $dst|$dst, $src}", 3113 [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>; 3114} // SchedRW 3115 3116let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLSNT.XMM.MR] in 3117def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 3118 "movntdq\t{$src, $dst|$dst, $src}", 3119 [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)]>; 3120 3121let SchedRW = [WriteStoreNT] in { 3122// There is no AVX form for instructions below this point 3123def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), 3124 "movnti{l}\t{$src, $dst|$dst, $src}", 3125 [(nontemporalstore (i32 GR32:$src), addr:$dst)]>, 3126 PS, Requires<[HasSSE2]>; 3127def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), 3128 "movnti{q}\t{$src, $dst|$dst, $src}", 3129 [(nontemporalstore (i64 GR64:$src), addr:$dst)]>, 3130 
PS, Requires<[HasSSE2]>; 3131} // SchedRW = [WriteStoreNT] 3132 3133let Predicates = [HasAVX, NoVLX] in { 3134 def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst), 3135 (VMOVNTDQYmr addr:$dst, VR256:$src)>; 3136 def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst), 3137 (VMOVNTDQYmr addr:$dst, VR256:$src)>; 3138 def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst), 3139 (VMOVNTDQYmr addr:$dst, VR256:$src)>; 3140 3141 def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), 3142 (VMOVNTDQmr addr:$dst, VR128:$src)>; 3143 def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst), 3144 (VMOVNTDQmr addr:$dst, VR128:$src)>; 3145 def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst), 3146 (VMOVNTDQmr addr:$dst, VR128:$src)>; 3147} 3148 3149let Predicates = [UseSSE2] in { 3150 def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), 3151 (MOVNTDQmr addr:$dst, VR128:$src)>; 3152 def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst), 3153 (MOVNTDQmr addr:$dst, VR128:$src)>; 3154 def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst), 3155 (MOVNTDQmr addr:$dst, VR128:$src)>; 3156} 3157 3158} // AddedComplexity 3159 3160//===----------------------------------------------------------------------===// 3161// SSE 1 & 2 - Prefetch and memory fence 3162//===----------------------------------------------------------------------===// 3163 3164// Prefetch intrinsic. 3165let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in { 3166def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src), 3167 "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>, TB; 3168def PREFETCHT1 : I<0x18, MRM2m, (outs), (ins i8mem:$src), 3169 "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>, TB; 3170def PREFETCHT2 : I<0x18, MRM3m, (outs), (ins i8mem:$src), 3171 "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>, TB; 3172def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src), 3173 "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>, TB; 3174} 3175 3176// FIXME: How should flush instruction be modeled? 3177let SchedRW = [WriteLoad] in { 3178// Flush cache 3179def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src), 3180 "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>, 3181 PS, Requires<[HasSSE2]>; 3182} 3183 3184let SchedRW = [WriteNop] in { 3185// Pause. This "instruction" is encoded as "rep; nop", so even though it 3186// was introduced with SSE2, it's backward compatible. 3187def PAUSE : I<0x90, RawFrm, (outs), (ins), 3188 "pause", [(int_x86_sse2_pause)]>, OBXS; 3189} 3190 3191let SchedRW = [WriteFence] in { 3192// Load, store, and memory fence 3193// TODO: As with mfence, we may want to ease the availability of sfence/lfence 3194// to include any 64-bit target. 
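// For reference (a sketch of the usual C-level route, assuming clang's
// standard lowering of the _mm_*fence intrinsics to the LLVM intrinsics
// matched below):
//   #include <xmmintrin.h>
//   void publish(int *p, int v) {
//     *p = v;
//     _mm_sfence();   // llvm.x86.sse.sfence -> SFENCE
//   }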
def SFENCE : I<0xAE, MRM7X, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
             PS, Requires<[HasSSE1]>;
def LFENCE : I<0xAE, MRM5X, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>,
             PS, Requires<[HasSSE2]>;
def MFENCE : I<0xAE, MRM6X, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>,
             PS, Requires<[HasMFence]>;
} // SchedRW

def : Pat<(X86MFence), (MFENCE)>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Load/Store MXCSR register
//===----------------------------------------------------------------------===//

let mayLoad=1, hasSideEffects=1 in
def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
               "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
               VEX, Sched<[WriteLDMXCSR]>, VEX_WIG;
let mayStore=1, hasSideEffects=1 in
def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
               "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
               VEX, Sched<[WriteSTMXCSR]>, VEX_WIG;

let mayLoad=1, hasSideEffects=1 in
def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
                "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
                PS, Sched<[WriteLDMXCSR]>;
let mayStore=1, hasSideEffects=1 in
def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
               "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
               PS, Sched<[WriteSTMXCSR]>;

//===---------------------------------------------------------------------===//
// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in { // SSE integer instructions

let hasSideEffects = 0 in {
def VMOVDQArr  : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
def VMOVDQUrr  : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "movdqu\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG;
def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                      "movdqu\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG;
}

// For Disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def VMOVDQArr_REV  : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                          "movdqa\t{$src, $dst|$dst, $src}", []>,
                          Sched<[SchedWriteVecMoveLS.XMM.RR]>,
                          VEX, VEX_WIG, FoldGenData<"VMOVDQArr">;
def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                          "movdqa\t{$src, $dst|$dst, $src}", []>,
                          Sched<[SchedWriteVecMoveLS.YMM.RR]>,
                          VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQAYrr">;
def VMOVDQUrr_REV  : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                          "movdqu\t{$src, $dst|$dst, $src}", []>,
                          Sched<[SchedWriteVecMoveLS.XMM.RR]>,
                          VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">;
def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                          "movdqu\t{$src, $dst|$dst, $src}", []>,
                          Sched<[SchedWriteVecMoveLS.YMM.RR]>,
                          VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQUYrr">;
}

let canFoldAsLoad = 1, mayLoad = 1,
isReMaterializable = 1, 3269 hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in { 3270def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3271 "movdqa\t{$src, $dst|$dst, $src}", 3272 [(set VR128:$dst, (alignedloadv2i64 addr:$src))]>, 3273 Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG; 3274def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), 3275 "movdqa\t{$src, $dst|$dst, $src}", []>, 3276 Sched<[SchedWriteVecMoveLS.YMM.RM]>, 3277 VEX, VEX_L, VEX_WIG; 3278def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3279 "vmovdqu\t{$src, $dst|$dst, $src}", 3280 [(set VR128:$dst, (loadv2i64 addr:$src))]>, 3281 Sched<[SchedWriteVecMoveLS.XMM.RM]>, 3282 XS, VEX, VEX_WIG; 3283def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), 3284 "vmovdqu\t{$src, $dst|$dst, $src}", []>, 3285 Sched<[SchedWriteVecMoveLS.YMM.RM]>, 3286 XS, VEX, VEX_L, VEX_WIG; 3287} 3288 3289let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in { 3290def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), 3291 (ins i128mem:$dst, VR128:$src), 3292 "movdqa\t{$src, $dst|$dst, $src}", 3293 [(alignedstore (v2i64 VR128:$src), addr:$dst)]>, 3294 Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_WIG; 3295def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs), 3296 (ins i256mem:$dst, VR256:$src), 3297 "movdqa\t{$src, $dst|$dst, $src}", []>, 3298 Sched<[SchedWriteVecMoveLS.YMM.MR]>, VEX, VEX_L, VEX_WIG; 3299def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), 3300 "vmovdqu\t{$src, $dst|$dst, $src}", 3301 [(store (v2i64 VR128:$src), addr:$dst)]>, 3302 Sched<[SchedWriteVecMoveLS.XMM.MR]>, XS, VEX, VEX_WIG; 3303def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), 3304 "vmovdqu\t{$src, $dst|$dst, $src}",[]>, 3305 Sched<[SchedWriteVecMoveLS.YMM.MR]>, XS, VEX, VEX_L, VEX_WIG; 3306} 3307 3308let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in { 3309let hasSideEffects = 0 in { 3310def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3311 "movdqa\t{$src, $dst|$dst, $src}", []>; 3312 3313def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3314 "movdqu\t{$src, $dst|$dst, $src}", []>, 3315 XS, Requires<[UseSSE2]>; 3316} 3317 3318// For Disassembler 3319let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { 3320def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 3321 "movdqa\t{$src, $dst|$dst, $src}", []>, 3322 FoldGenData<"MOVDQArr">; 3323 3324def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 3325 "movdqu\t{$src, $dst|$dst, $src}", []>, 3326 XS, Requires<[UseSSE2]>, FoldGenData<"MOVDQUrr">; 3327} 3328} // SchedRW 3329 3330let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, 3331 hasSideEffects = 0, SchedRW = [SchedWriteVecMoveLS.XMM.RM] in { 3332def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3333 "movdqa\t{$src, $dst|$dst, $src}", 3334 [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>; 3335def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3336 "movdqu\t{$src, $dst|$dst, $src}", 3337 [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>, 3338 XS, Requires<[UseSSE2]>; 3339} 3340 3341let mayStore = 1, hasSideEffects = 0, 3342 SchedRW = [SchedWriteVecMoveLS.XMM.MR] in { 3343def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), 3344 "movdqa\t{$src, $dst|$dst, $src}", 3345 [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>; 3346def MOVDQUmr : I<0x7F, 
MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), 3347 "movdqu\t{$src, $dst|$dst, $src}", 3348 [/*(store (v2i64 VR128:$src), addr:$dst)*/]>, 3349 XS, Requires<[UseSSE2]>; 3350} 3351 3352} // ExeDomain = SSEPackedInt 3353 3354// Reversed version with ".s" suffix for GAS compatibility. 3355def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}", 3356 (VMOVDQArr_REV VR128:$dst, VR128:$src), 0>; 3357def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}", 3358 (VMOVDQAYrr_REV VR256:$dst, VR256:$src), 0>; 3359def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}", 3360 (VMOVDQUrr_REV VR128:$dst, VR128:$src), 0>; 3361def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}", 3362 (VMOVDQUYrr_REV VR256:$dst, VR256:$src), 0>; 3363 3364// Reversed version with ".s" suffix for GAS compatibility. 3365def : InstAlias<"movdqa.s\t{$src, $dst|$dst, $src}", 3366 (MOVDQArr_REV VR128:$dst, VR128:$src), 0>; 3367def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}", 3368 (MOVDQUrr_REV VR128:$dst, VR128:$src), 0>; 3369 3370let Predicates = [HasAVX, NoVLX] in { 3371 // Additional patterns for other integer sizes. 3372 def : Pat<(alignedloadv4i32 addr:$src), 3373 (VMOVDQArm addr:$src)>; 3374 def : Pat<(alignedloadv8i16 addr:$src), 3375 (VMOVDQArm addr:$src)>; 3376 def : Pat<(alignedloadv16i8 addr:$src), 3377 (VMOVDQArm addr:$src)>; 3378 def : Pat<(loadv4i32 addr:$src), 3379 (VMOVDQUrm addr:$src)>; 3380 def : Pat<(loadv8i16 addr:$src), 3381 (VMOVDQUrm addr:$src)>; 3382 def : Pat<(loadv16i8 addr:$src), 3383 (VMOVDQUrm addr:$src)>; 3384 3385 def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst), 3386 (VMOVDQAmr addr:$dst, VR128:$src)>; 3387 def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst), 3388 (VMOVDQAmr addr:$dst, VR128:$src)>; 3389 def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst), 3390 (VMOVDQAmr addr:$dst, VR128:$src)>; 3391 def : Pat<(store (v4i32 VR128:$src), addr:$dst), 3392 (VMOVDQUmr addr:$dst, VR128:$src)>; 3393 def : Pat<(store (v8i16 VR128:$src), addr:$dst), 3394 (VMOVDQUmr addr:$dst, VR128:$src)>; 3395 def : Pat<(store (v16i8 VR128:$src), addr:$dst), 3396 (VMOVDQUmr addr:$dst, VR128:$src)>; 3397} 3398 3399//===---------------------------------------------------------------------===// 3400// SSE2 - Packed Integer Arithmetic Instructions 3401//===---------------------------------------------------------------------===// 3402 3403let ExeDomain = SSEPackedInt in { // SSE integer instructions 3404 3405/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types 3406multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode, 3407 ValueType DstVT, ValueType SrcVT, RegisterClass RC, 3408 PatFrag memop_frag, X86MemOperand x86memop, 3409 X86FoldableSchedWrite sched, bit Is2Addr = 1> { 3410 let isCommutable = 1 in 3411 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), 3412 (ins RC:$src1, RC:$src2), 3413 !if(Is2Addr, 3414 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3415 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3416 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>, 3417 Sched<[sched]>; 3418 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), 3419 (ins RC:$src1, x86memop:$src2), 3420 !if(Is2Addr, 3421 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3422 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3423 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), 3424 (memop_frag addr:$src2))))]>, 3425 Sched<[sched.Folded, sched.ReadAfterFold]>; 3426} 3427} // ExeDomain = SSEPackedInt 3428 3429defm PADDB : PDI_binop_all<0xFC, 
"paddb", add, v16i8, v32i8, 3430 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3431defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16, 3432 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3433defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32, 3434 SchedWriteVecALU, 1, NoVLX>; 3435defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64, 3436 SchedWriteVecALU, 1, NoVLX>; 3437defm PADDSB : PDI_binop_all<0xEC, "paddsb", saddsat, v16i8, v32i8, 3438 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3439defm PADDSW : PDI_binop_all<0xED, "paddsw", saddsat, v8i16, v16i16, 3440 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3441defm PADDUSB : PDI_binop_all<0xDC, "paddusb", uaddsat, v16i8, v32i8, 3442 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3443defm PADDUSW : PDI_binop_all<0xDD, "paddusw", uaddsat, v8i16, v16i16, 3444 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3445defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16, 3446 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>; 3447defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16, 3448 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>; 3449defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16, 3450 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>; 3451defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8, 3452 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3453defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16, 3454 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3455defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32, 3456 SchedWriteVecALU, 0, NoVLX>; 3457defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64, 3458 SchedWriteVecALU, 0, NoVLX>; 3459defm PSUBSB : PDI_binop_all<0xE8, "psubsb", ssubsat, v16i8, v32i8, 3460 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3461defm PSUBSW : PDI_binop_all<0xE9, "psubsw", ssubsat, v8i16, v16i16, 3462 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3463defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", usubsat, v16i8, v32i8, 3464 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3465defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", usubsat, v8i16, v16i16, 3466 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3467defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8, 3468 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3469defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16, 3470 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3471defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8, 3472 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3473defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16, 3474 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3475defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8, 3476 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3477defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16, 3478 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3479defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64, 3480 SchedWriteVecIMul, 1, NoVLX>; 3481 3482let Predicates = [HasAVX, NoVLX_Or_NoBWI] in 3483defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128, 3484 load, i128mem, SchedWriteVecIMul.XMM, 0>, 3485 VEX_4V, VEX_WIG; 3486 3487let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in 3488defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16, 3489 VR256, load, i256mem, SchedWriteVecIMul.YMM, 3490 0>, VEX_4V, VEX_L, VEX_WIG; 3491let Constraints = "$src1 = $dst" in 3492defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128, 3493 memop, i128mem, SchedWriteVecIMul.XMM>; 3494 3495let Predicates = [HasAVX, NoVLX_Or_NoBWI] in 3496defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, 
v2i64, v16i8, VR128, 3497 load, i128mem, SchedWritePSADBW.XMM, 0>, 3498 VEX_4V, VEX_WIG; 3499let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in 3500defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256, 3501 load, i256mem, SchedWritePSADBW.YMM, 0>, 3502 VEX_4V, VEX_L, VEX_WIG; 3503let Constraints = "$src1 = $dst" in 3504defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128, 3505 memop, i128mem, SchedWritePSADBW.XMM>; 3506 3507//===---------------------------------------------------------------------===// 3508// SSE2 - Packed Integer Logical Instructions 3509//===---------------------------------------------------------------------===// 3510 3511multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm, 3512 string OpcodeStr, SDNode OpNode, 3513 SDNode OpNode2, RegisterClass RC, 3514 X86FoldableSchedWrite sched, 3515 X86FoldableSchedWrite schedImm, 3516 ValueType DstVT, ValueType SrcVT, 3517 PatFrag ld_frag, bit Is2Addr = 1> { 3518 // src2 is always 128-bit 3519 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), 3520 (ins RC:$src1, VR128:$src2), 3521 !if(Is2Addr, 3522 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3523 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3524 [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))]>, 3525 Sched<[sched]>; 3526 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), 3527 (ins RC:$src1, i128mem:$src2), 3528 !if(Is2Addr, 3529 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3530 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3531 [(set RC:$dst, (DstVT (OpNode RC:$src1, 3532 (SrcVT (ld_frag addr:$src2)))))]>, 3533 Sched<[sched.Folded, sched.ReadAfterFold]>; 3534 def ri : PDIi8<opc2, ImmForm, (outs RC:$dst), 3535 (ins RC:$src1, u8imm:$src2), 3536 !if(Is2Addr, 3537 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3538 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3539 [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 timm:$src2))))]>, 3540 Sched<[schedImm]>; 3541} 3542 3543multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm, 3544 string OpcodeStr, SDNode OpNode, 3545 SDNode OpNode2, ValueType DstVT128, 3546 ValueType DstVT256, ValueType SrcVT, 3547 X86SchedWriteWidths sched, 3548 X86SchedWriteWidths schedImm, Predicate prd> { 3549let Predicates = [HasAVX, prd] in 3550 defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr), 3551 OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM, 3552 DstVT128, SrcVT, load, 0>, VEX_4V, VEX_WIG; 3553let Predicates = [HasAVX2, prd] in 3554 defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr), 3555 OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM, 3556 DstVT256, SrcVT, load, 0>, VEX_4V, VEX_L, 3557 VEX_WIG; 3558let Constraints = "$src1 = $dst" in 3559 defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2, 3560 VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT, 3561 memop>; 3562} 3563 3564multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr, 3565 SDNode OpNode, RegisterClass RC, ValueType VT, 3566 X86FoldableSchedWrite sched, bit Is2Addr = 1> { 3567 def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2), 3568 !if(Is2Addr, 3569 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3570 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3571 [(set RC:$dst, (VT (OpNode RC:$src1, (i8 timm:$src2))))]>, 3572 Sched<[sched]>; 3573} 3574 3575multiclass PDI_binop_ri_all<bits<8> opc, Format 
ImmForm, string OpcodeStr, 3576 SDNode OpNode, X86SchedWriteWidths sched> { 3577let Predicates = [HasAVX, NoVLX_Or_NoBWI] in 3578 defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode, 3579 VR128, v16i8, sched.XMM, 0>, VEX_4V, VEX_WIG; 3580let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in 3581 defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode, 3582 VR256, v32i8, sched.YMM, 0>, 3583 VEX_4V, VEX_L, VEX_WIG; 3584let Constraints = "$src1 = $dst" in 3585 defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8, 3586 sched.XMM>; 3587} 3588 3589let ExeDomain = SSEPackedInt in { 3590 defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli, 3591 v8i16, v16i16, v8i16, SchedWriteVecShift, 3592 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; 3593 defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli, 3594 v4i32, v8i32, v4i32, SchedWriteVecShift, 3595 SchedWriteVecShiftImm, NoVLX>; 3596 defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli, 3597 v2i64, v4i64, v2i64, SchedWriteVecShift, 3598 SchedWriteVecShiftImm, NoVLX>; 3599 3600 defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli, 3601 v8i16, v16i16, v8i16, SchedWriteVecShift, 3602 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; 3603 defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli, 3604 v4i32, v8i32, v4i32, SchedWriteVecShift, 3605 SchedWriteVecShiftImm, NoVLX>; 3606 defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli, 3607 v2i64, v4i64, v2i64, SchedWriteVecShift, 3608 SchedWriteVecShiftImm, NoVLX>; 3609 3610 defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai, 3611 v8i16, v16i16, v8i16, SchedWriteVecShift, 3612 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; 3613 defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai, 3614 v4i32, v8i32, v4i32, SchedWriteVecShift, 3615 SchedWriteVecShiftImm, NoVLX>; 3616 3617 defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq, 3618 SchedWriteShuffle>; 3619 defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq, 3620 SchedWriteShuffle>; 3621} // ExeDomain = SSEPackedInt 3622 3623//===---------------------------------------------------------------------===// 3624// SSE2 - Packed Integer Comparison Instructions 3625//===---------------------------------------------------------------------===// 3626 3627defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8, 3628 SchedWriteVecALU, 1, TruePredicate>; 3629defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16, 3630 SchedWriteVecALU, 1, TruePredicate>; 3631defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32, 3632 SchedWriteVecALU, 1, TruePredicate>; 3633defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8, 3634 SchedWriteVecALU, 0, TruePredicate>; 3635defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16, 3636 SchedWriteVecALU, 0, TruePredicate>; 3637defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32, 3638 SchedWriteVecALU, 0, TruePredicate>; 3639 3640//===---------------------------------------------------------------------===// 3641// SSE2 - Packed Integer Shuffle Instructions 3642//===---------------------------------------------------------------------===// 3643 3644let ExeDomain = SSEPackedInt in { 3645multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256, 3646 SDNode OpNode, X86SchedWriteWidths 
sched, 3647 Predicate prd> { 3648let Predicates = [HasAVX, prd] in { 3649 def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst), 3650 (ins VR128:$src1, u8imm:$src2), 3651 !strconcat("v", OpcodeStr, 3652 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3653 [(set VR128:$dst, 3654 (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>, 3655 VEX, Sched<[sched.XMM]>, VEX_WIG; 3656 def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst), 3657 (ins i128mem:$src1, u8imm:$src2), 3658 !strconcat("v", OpcodeStr, 3659 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3660 [(set VR128:$dst, 3661 (vt128 (OpNode (load addr:$src1), 3662 (i8 timm:$src2))))]>, VEX, 3663 Sched<[sched.XMM.Folded]>, VEX_WIG; 3664} 3665 3666let Predicates = [HasAVX2, prd] in { 3667 def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst), 3668 (ins VR256:$src1, u8imm:$src2), 3669 !strconcat("v", OpcodeStr, 3670 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3671 [(set VR256:$dst, 3672 (vt256 (OpNode VR256:$src1, (i8 timm:$src2))))]>, 3673 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG; 3674 def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst), 3675 (ins i256mem:$src1, u8imm:$src2), 3676 !strconcat("v", OpcodeStr, 3677 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3678 [(set VR256:$dst, 3679 (vt256 (OpNode (load addr:$src1), 3680 (i8 timm:$src2))))]>, VEX, VEX_L, 3681 Sched<[sched.YMM.Folded]>, VEX_WIG; 3682} 3683 3684let Predicates = [UseSSE2] in { 3685 def ri : Ii8<0x70, MRMSrcReg, 3686 (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), 3687 !strconcat(OpcodeStr, 3688 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3689 [(set VR128:$dst, 3690 (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>, 3691 Sched<[sched.XMM]>; 3692 def mi : Ii8<0x70, MRMSrcMem, 3693 (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2), 3694 !strconcat(OpcodeStr, 3695 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3696 [(set VR128:$dst, 3697 (vt128 (OpNode (memop addr:$src1), 3698 (i8 timm:$src2))))]>, 3699 Sched<[sched.XMM.Folded]>; 3700} 3701} 3702} // ExeDomain = SSEPackedInt 3703 3704defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd, 3705 SchedWriteShuffle, NoVLX>, PD; 3706defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw, 3707 SchedWriteShuffle, NoVLX_Or_NoBWI>, XS; 3708defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw, 3709 SchedWriteShuffle, NoVLX_Or_NoBWI>, XD; 3710 3711//===---------------------------------------------------------------------===// 3712// Packed Integer Pack Instructions (SSE & AVX) 3713//===---------------------------------------------------------------------===// 3714 3715let ExeDomain = SSEPackedInt in { 3716multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, 3717 ValueType ArgVT, SDNode OpNode, RegisterClass RC, 3718 X86MemOperand x86memop, X86FoldableSchedWrite sched, 3719 PatFrag ld_frag, bit Is2Addr = 1> { 3720 def rr : PDI<opc, MRMSrcReg, 3721 (outs RC:$dst), (ins RC:$src1, RC:$src2), 3722 !if(Is2Addr, 3723 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3724 !strconcat(OpcodeStr, 3725 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3726 [(set RC:$dst, 3727 (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>, 3728 Sched<[sched]>; 3729 def rm : PDI<opc, MRMSrcMem, 3730 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 3731 !if(Is2Addr, 3732 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3733 !strconcat(OpcodeStr, 3734 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3735 [(set RC:$dst, 3736 (OutVT (OpNode (ArgVT RC:$src1), 3737 (ld_frag addr:$src2))))]>, 3738 Sched<[sched.Folded, 
sched.ReadAfterFold]>; 3739} 3740 3741multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, 3742 ValueType ArgVT, SDNode OpNode, RegisterClass RC, 3743 X86MemOperand x86memop, X86FoldableSchedWrite sched, 3744 PatFrag ld_frag, bit Is2Addr = 1> { 3745 def rr : SS48I<opc, MRMSrcReg, 3746 (outs RC:$dst), (ins RC:$src1, RC:$src2), 3747 !if(Is2Addr, 3748 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3749 !strconcat(OpcodeStr, 3750 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3751 [(set RC:$dst, 3752 (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>, 3753 Sched<[sched]>; 3754 def rm : SS48I<opc, MRMSrcMem, 3755 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 3756 !if(Is2Addr, 3757 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3758 !strconcat(OpcodeStr, 3759 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3760 [(set RC:$dst, 3761 (OutVT (OpNode (ArgVT RC:$src1), 3762 (ld_frag addr:$src2))))]>, 3763 Sched<[sched.Folded, sched.ReadAfterFold]>; 3764} 3765 3766let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 3767 defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128, 3768 i128mem, SchedWriteShuffle.XMM, load, 0>, 3769 VEX_4V, VEX_WIG; 3770 defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128, 3771 i128mem, SchedWriteShuffle.XMM, load, 0>, 3772 VEX_4V, VEX_WIG; 3773 3774 defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128, 3775 i128mem, SchedWriteShuffle.XMM, load, 0>, 3776 VEX_4V, VEX_WIG; 3777 defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128, 3778 i128mem, SchedWriteShuffle.XMM, load, 0>, 3779 VEX_4V; 3780} 3781 3782let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 3783 defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256, 3784 i256mem, SchedWriteShuffle.YMM, load, 0>, 3785 VEX_4V, VEX_L, VEX_WIG; 3786 defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256, 3787 i256mem, SchedWriteShuffle.YMM, load, 0>, 3788 VEX_4V, VEX_L, VEX_WIG; 3789 3790 defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256, 3791 i256mem, SchedWriteShuffle.YMM, load, 0>, 3792 VEX_4V, VEX_L, VEX_WIG; 3793 defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256, 3794 i256mem, SchedWriteShuffle.YMM, load, 0>, 3795 VEX_4V, VEX_L; 3796} 3797 3798let Constraints = "$src1 = $dst" in { 3799 defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128, 3800 i128mem, SchedWriteShuffle.XMM, memop>; 3801 defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128, 3802 i128mem, SchedWriteShuffle.XMM, memop>; 3803 3804 defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128, 3805 i128mem, SchedWriteShuffle.XMM, memop>; 3806 3807 defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128, 3808 i128mem, SchedWriteShuffle.XMM, memop>; 3809} 3810} // ExeDomain = SSEPackedInt 3811 3812//===---------------------------------------------------------------------===// 3813// SSE2 - Packed Integer Unpack Instructions 3814//===---------------------------------------------------------------------===// 3815 3816let ExeDomain = SSEPackedInt in { 3817multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt, 3818 SDNode OpNode, RegisterClass RC, X86MemOperand x86memop, 3819 X86FoldableSchedWrite sched, PatFrag ld_frag, 3820 bit Is2Addr = 1> { 3821 def rr : PDI<opc, MRMSrcReg, 3822 (outs RC:$dst), (ins RC:$src1, RC:$src2), 3823 !if(Is2Addr, 3824 
!strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), 3825 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3826 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>, 3827 Sched<[sched]>; 3828 def rm : PDI<opc, MRMSrcMem, 3829 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 3830 !if(Is2Addr, 3831 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), 3832 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3833 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>, 3834 Sched<[sched.Folded, sched.ReadAfterFold]>; 3835} 3836 3837let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 3838 defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128, 3839 i128mem, SchedWriteShuffle.XMM, load, 0>, 3840 VEX_4V, VEX_WIG; 3841 defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128, 3842 i128mem, SchedWriteShuffle.XMM, load, 0>, 3843 VEX_4V, VEX_WIG; 3844 defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128, 3845 i128mem, SchedWriteShuffle.XMM, load, 0>, 3846 VEX_4V, VEX_WIG; 3847 defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128, 3848 i128mem, SchedWriteShuffle.XMM, load, 0>, 3849 VEX_4V, VEX_WIG; 3850} 3851 3852let Predicates = [HasAVX, NoVLX] in { 3853 defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128, 3854 i128mem, SchedWriteShuffle.XMM, load, 0>, 3855 VEX_4V, VEX_WIG; 3856 defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128, 3857 i128mem, SchedWriteShuffle.XMM, load, 0>, 3858 VEX_4V, VEX_WIG; 3859 defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128, 3860 i128mem, SchedWriteShuffle.XMM, load, 0>, 3861 VEX_4V, VEX_WIG; 3862 defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128, 3863 i128mem, SchedWriteShuffle.XMM, load, 0>, 3864 VEX_4V, VEX_WIG; 3865} 3866 3867let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 3868 defm VPUNPCKLBWY : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256, 3869 i256mem, SchedWriteShuffle.YMM, load, 0>, 3870 VEX_4V, VEX_L, VEX_WIG; 3871 defm VPUNPCKLWDY : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256, 3872 i256mem, SchedWriteShuffle.YMM, load, 0>, 3873 VEX_4V, VEX_L, VEX_WIG; 3874 defm VPUNPCKHBWY : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256, 3875 i256mem, SchedWriteShuffle.YMM, load, 0>, 3876 VEX_4V, VEX_L, VEX_WIG; 3877 defm VPUNPCKHWDY : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256, 3878 i256mem, SchedWriteShuffle.YMM, load, 0>, 3879 VEX_4V, VEX_L, VEX_WIG; 3880} 3881 3882let Predicates = [HasAVX2, NoVLX] in { 3883 defm VPUNPCKLDQY : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256, 3884 i256mem, SchedWriteShuffle.YMM, load, 0>, 3885 VEX_4V, VEX_L, VEX_WIG; 3886 defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256, 3887 i256mem, SchedWriteShuffle.YMM, load, 0>, 3888 VEX_4V, VEX_L, VEX_WIG; 3889 defm VPUNPCKHDQY : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256, 3890 i256mem, SchedWriteShuffle.YMM, load, 0>, 3891 VEX_4V, VEX_L, VEX_WIG; 3892 defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256, 3893 i256mem, SchedWriteShuffle.YMM, load, 0>, 3894 VEX_4V, VEX_L, VEX_WIG; 3895} 3896 3897let Constraints = "$src1 = $dst" in { 3898 defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128, 3899 i128mem, SchedWriteShuffle.XMM, memop>; 3900 defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128, 3901 i128mem, SchedWriteShuffle.XMM, memop>; 3902 
defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128, 3903 i128mem, SchedWriteShuffle.XMM, memop>; 3904 defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128, 3905 i128mem, SchedWriteShuffle.XMM, memop>; 3906 3907 defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128, 3908 i128mem, SchedWriteShuffle.XMM, memop>; 3909 defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128, 3910 i128mem, SchedWriteShuffle.XMM, memop>; 3911 defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128, 3912 i128mem, SchedWriteShuffle.XMM, memop>; 3913 defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128, 3914 i128mem, SchedWriteShuffle.XMM, memop>; 3915} 3916} // ExeDomain = SSEPackedInt 3917 3918//===---------------------------------------------------------------------===// 3919// SSE2 - Packed Integer Extract and Insert 3920//===---------------------------------------------------------------------===// 3921 3922let ExeDomain = SSEPackedInt in { 3923multiclass sse2_pinsrw<bit Is2Addr = 1> { 3924 def rr : Ii8<0xC4, MRMSrcReg, 3925 (outs VR128:$dst), (ins VR128:$src1, 3926 GR32orGR64:$src2, u8imm:$src3), 3927 !if(Is2Addr, 3928 "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", 3929 "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 3930 [(set VR128:$dst, 3931 (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))]>, 3932 Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>; 3933 def rm : Ii8<0xC4, MRMSrcMem, 3934 (outs VR128:$dst), (ins VR128:$src1, 3935 i16mem:$src2, u8imm:$src3), 3936 !if(Is2Addr, 3937 "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", 3938 "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 3939 [(set VR128:$dst, 3940 (X86pinsrw VR128:$src1, (extloadi16 addr:$src2), 3941 imm:$src3))]>, 3942 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>; 3943} 3944 3945// Extract 3946let Predicates = [HasAVX, NoBWI] in 3947def VPEXTRWrr : Ii8<0xC5, MRMSrcReg, 3948 (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2), 3949 "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", 3950 [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), 3951 imm:$src2))]>, 3952 PD, VEX, VEX_WIG, Sched<[WriteVecExtract]>; 3953def PEXTRWrr : PDIi8<0xC5, MRMSrcReg, 3954 (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2), 3955 "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", 3956 [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), 3957 imm:$src2))]>, 3958 Sched<[WriteVecExtract]>; 3959 3960// Insert 3961let Predicates = [HasAVX, NoBWI] in 3962defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V, VEX_WIG; 3963 3964let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in 3965defm PINSRW : sse2_pinsrw, PD; 3966 3967} // ExeDomain = SSEPackedInt 3968 3969//===---------------------------------------------------------------------===// 3970// SSE2 - Packed Mask Creation 3971//===---------------------------------------------------------------------===// 3972 3973let ExeDomain = SSEPackedInt in { 3974 3975def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), 3976 (ins VR128:$src), 3977 "pmovmskb\t{$src, $dst|$dst, $src}", 3978 [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>, 3979 Sched<[WriteVecMOVMSK]>, VEX, VEX_WIG; 3980 3981let Predicates = [HasAVX2] in { 3982def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), 3983 (ins VR256:$src), 3984 "pmovmskb\t{$src, $dst|$dst, $src}", 3985 [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>, 3986 
Sched<[WriteVecMOVMSKY]>, VEX, VEX_L, VEX_WIG; 3987} 3988 3989def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src), 3990 "pmovmskb\t{$src, $dst|$dst, $src}", 3991 [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>, 3992 Sched<[WriteVecMOVMSK]>; 3993 3994} // ExeDomain = SSEPackedInt 3995 3996//===---------------------------------------------------------------------===// 3997// SSE2 - Conditional Store 3998//===---------------------------------------------------------------------===// 3999 4000let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in { 4001let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in 4002def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs), 4003 (ins VR128:$src, VR128:$mask), 4004 "maskmovdqu\t{$mask, $src|$src, $mask}", 4005 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>, 4006 VEX, VEX_WIG; 4007let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in 4008def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs), 4009 (ins VR128:$src, VR128:$mask), 4010 "maskmovdqu\t{$mask, $src|$src, $mask}", 4011 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>, 4012 VEX, VEX_WIG; 4013 4014let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in 4015def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), 4016 "maskmovdqu\t{$mask, $src|$src, $mask}", 4017 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>; 4018let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in 4019def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), 4020 "maskmovdqu\t{$mask, $src|$src, $mask}", 4021 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>; 4022 4023} // ExeDomain = SSEPackedInt 4024 4025//===---------------------------------------------------------------------===// 4026// SSE2 - Move Doubleword/Quadword 4027//===---------------------------------------------------------------------===// 4028 4029//===---------------------------------------------------------------------===// 4030// Move Int Doubleword to Packed Double Int 4031// 4032let ExeDomain = SSEPackedInt in { 4033def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), 4034 "movd\t{$src, $dst|$dst, $src}", 4035 [(set VR128:$dst, 4036 (v4i32 (scalar_to_vector GR32:$src)))]>, 4037 VEX, Sched<[WriteVecMoveFromGpr]>; 4038def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), 4039 "movd\t{$src, $dst|$dst, $src}", 4040 [(set VR128:$dst, 4041 (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>, 4042 VEX, Sched<[WriteVecLoad]>; 4043def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), 4044 "movq\t{$src, $dst|$dst, $src}", 4045 [(set VR128:$dst, 4046 (v2i64 (scalar_to_vector GR64:$src)))]>, 4047 VEX, Sched<[WriteVecMoveFromGpr]>; 4048let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in 4049def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 4050 "movq\t{$src, $dst|$dst, $src}", []>, 4051 VEX, Sched<[WriteVecLoad]>; 4052let isCodeGenOnly = 1 in 4053def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), 4054 "movq\t{$src, $dst|$dst, $src}", 4055 [(set FR64:$dst, (bitconvert GR64:$src))]>, 4056 VEX, Sched<[WriteVecMoveFromGpr]>; 4057 4058def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), 4059 "movd\t{$src, $dst|$dst, $src}", 4060 [(set VR128:$dst, 4061 (v4i32 (scalar_to_vector GR32:$src)))]>, 4062 Sched<[WriteVecMoveFromGpr]>; 4063def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs 
VR128:$dst), (ins i32mem:$src), 4064 "movd\t{$src, $dst|$dst, $src}", 4065 [(set VR128:$dst, 4066 (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>, 4067 Sched<[WriteVecLoad]>; 4068def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), 4069 "movq\t{$src, $dst|$dst, $src}", 4070 [(set VR128:$dst, 4071 (v2i64 (scalar_to_vector GR64:$src)))]>, 4072 Sched<[WriteVecMoveFromGpr]>; 4073let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in 4074def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 4075 "movq\t{$src, $dst|$dst, $src}", []>, 4076 Sched<[WriteVecLoad]>; 4077let isCodeGenOnly = 1 in 4078def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), 4079 "movq\t{$src, $dst|$dst, $src}", 4080 [(set FR64:$dst, (bitconvert GR64:$src))]>, 4081 Sched<[WriteVecMoveFromGpr]>; 4082} // ExeDomain = SSEPackedInt 4083 4084//===---------------------------------------------------------------------===// 4085// Move Int Doubleword to Single Scalar 4086// 4087let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { 4088 def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), 4089 "movd\t{$src, $dst|$dst, $src}", 4090 [(set FR32:$dst, (bitconvert GR32:$src))]>, 4091 VEX, Sched<[WriteVecMoveFromGpr]>; 4092 4093 def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), 4094 "movd\t{$src, $dst|$dst, $src}", 4095 [(set FR32:$dst, (bitconvert GR32:$src))]>, 4096 Sched<[WriteVecMoveFromGpr]>; 4097 4098} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 4099 4100//===---------------------------------------------------------------------===// 4101// Move Packed Doubleword Int to Packed Double Int 4102// 4103let ExeDomain = SSEPackedInt in { 4104def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), 4105 "movd\t{$src, $dst|$dst, $src}", 4106 [(set GR32:$dst, (extractelt (v4i32 VR128:$src), 4107 (iPTR 0)))]>, VEX, 4108 Sched<[WriteVecMoveToGpr]>; 4109def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs), 4110 (ins i32mem:$dst, VR128:$src), 4111 "movd\t{$src, $dst|$dst, $src}", 4112 [(store (i32 (extractelt (v4i32 VR128:$src), 4113 (iPTR 0))), addr:$dst)]>, 4114 VEX, Sched<[WriteVecStore]>; 4115def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), 4116 "movd\t{$src, $dst|$dst, $src}", 4117 [(set GR32:$dst, (extractelt (v4i32 VR128:$src), 4118 (iPTR 0)))]>, 4119 Sched<[WriteVecMoveToGpr]>; 4120def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), 4121 "movd\t{$src, $dst|$dst, $src}", 4122 [(store (i32 (extractelt (v4i32 VR128:$src), 4123 (iPTR 0))), addr:$dst)]>, 4124 Sched<[WriteVecStore]>; 4125} // ExeDomain = SSEPackedInt 4126 4127//===---------------------------------------------------------------------===// 4128// Move Packed Doubleword Int first element to Doubleword Int 4129// 4130let ExeDomain = SSEPackedInt in { 4131let SchedRW = [WriteVecMoveToGpr] in { 4132def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), 4133 "movq\t{$src, $dst|$dst, $src}", 4134 [(set GR64:$dst, (extractelt (v2i64 VR128:$src), 4135 (iPTR 0)))]>, 4136 VEX; 4137 4138def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), 4139 "movq\t{$src, $dst|$dst, $src}", 4140 [(set GR64:$dst, (extractelt (v2i64 VR128:$src), 4141 (iPTR 0)))]>; 4142} //SchedRW 4143 4144let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in 4145def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs), 4146 (ins i64mem:$dst, 
VR128:$src), 4147 "movq\t{$src, $dst|$dst, $src}", []>, 4148 VEX, Sched<[WriteVecStore]>; 4149let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in 4150def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), 4151 "movq\t{$src, $dst|$dst, $src}", []>, 4152 Sched<[WriteVecStore]>; 4153} // ExeDomain = SSEPackedInt 4154 4155//===---------------------------------------------------------------------===// 4156// Bitcast FR64 <-> GR64 4157// 4158let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { 4159 def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), 4160 "movq\t{$src, $dst|$dst, $src}", 4161 [(set GR64:$dst, (bitconvert FR64:$src))]>, 4162 VEX, Sched<[WriteVecMoveToGpr]>; 4163 4164 def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), 4165 "movq\t{$src, $dst|$dst, $src}", 4166 [(set GR64:$dst, (bitconvert FR64:$src))]>, 4167 Sched<[WriteVecMoveToGpr]>; 4168} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 4169 4170//===---------------------------------------------------------------------===// 4171// Move Scalar Single to Double Int 4172// 4173let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { 4174 def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), 4175 "movd\t{$src, $dst|$dst, $src}", 4176 [(set GR32:$dst, (bitconvert FR32:$src))]>, 4177 VEX, Sched<[WriteVecMoveToGpr]>; 4178 def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), 4179 "movd\t{$src, $dst|$dst, $src}", 4180 [(set GR32:$dst, (bitconvert FR32:$src))]>, 4181 Sched<[WriteVecMoveToGpr]>; 4182} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 4183 4184let Predicates = [UseAVX] in { 4185 def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), 4186 (VMOVDI2PDIrr GR32:$src)>; 4187 4188 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))), 4189 (VMOV64toPQIrr GR64:$src)>; 4190 4191 // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part. 4192 // These instructions also write zeros in the high part of a 256-bit register. 4193 def : Pat<(v4i32 (X86vzload32 addr:$src)), 4194 (VMOVDI2PDIrm addr:$src)>; 4195 def : Pat<(v8i32 (X86vzload32 addr:$src)), 4196 (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>; 4197} 4198 4199let Predicates = [UseSSE2] in { 4200 def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), 4201 (MOVDI2PDIrr GR32:$src)>; 4202 4203 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))), 4204 (MOV64toPQIrr GR64:$src)>; 4205 def : Pat<(v4i32 (X86vzload32 addr:$src)), 4206 (MOVDI2PDIrm addr:$src)>; 4207} 4208 4209// Before the MC layer of LLVM existed, clang emitted "movd" assembly instead of 4210// "movq" due to MacOS parsing limitation. In order to parse old assembly, we add 4211// these aliases. 4212def : InstAlias<"movd\t{$src, $dst|$dst, $src}", 4213 (MOV64toPQIrr VR128:$dst, GR64:$src), 0>; 4214def : InstAlias<"movd\t{$src, $dst|$dst, $src}", 4215 (MOVPQIto64rr GR64:$dst, VR128:$src), 0>; 4216// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX. 
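// Editor's note (illustrative sketch, not normative): with these aliases the
// asm parser accepts e.g.
//   movd %rax, %xmm0
// and assembles it as MOV64toPQIrr (the REX.W 0F 6E "movq" encoding); the
// trailing '0' marks each alias as parse-only, so the printer still emits
// "movq" for these encodings (and "vmovq" for the VEX forms below).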
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;

//===---------------------------------------------------------------------===//
// SSE2 - Move Quadword
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//
// Move Quadword Int to Packed Quadword Int
//

let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in {
def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "vmovq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
                     VEX, Requires<[UseAVX]>, VEX_WIG;
def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                    "movq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst,
                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
                    XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
} // ExeDomain, SchedRW

//===---------------------------------------------------------------------===//
// Move Packed Quadword Int to Quadword Int
//
let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in {
def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(store (i64 (extractelt (v2i64 VR128:$src),
                                      (iPTR 0))), addr:$dst)]>,
                        VEX, VEX_WIG;
def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(store (i64 (extractelt (v2i64 VR128:$src),
                                    (iPTR 0))), addr:$dst)]>;
} // ExeDomain, SchedRW

// For disassembler only
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [SchedWriteVecLogic.XMM] in {
def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_WIG;
def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}", []>;
}

def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
                (VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
                (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;

let Predicates = [UseAVX] in {
  def : Pat<(v2i64 (X86vzload64 addr:$src)),
            (VMOVQI2PQIrm addr:$src)>;
  def : Pat<(v4i64 (X86vzload64 addr:$src)),
            (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;

  def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
            (VMOVPQI2QImr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v2i64 (X86vzload64 addr:$src)), (MOVQI2PQIrm addr:$src)>;

  def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
            (MOVPQI2QImr addr:$dst, VR128:$src)>;
}

//===---------------------------------------------------------------------===//
// Moving from XMM to XMM while clearing the upper 64 bits. Note that there is
// a bug in the IA-32 documentation: movq xmm1, xmm2 does clear the high bits.
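// Editor's note (illustrative sketch): for
//   movq %xmm1, %xmm0
// the result is xmm0[63:0] = xmm1[63:0] and xmm0[127:64] = 0, which is
// exactly the (X86vzmovl (v2i64 ...)) node the definitions below select.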
4292// 4293let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in { 4294def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 4295 "vmovq\t{$src, $dst|$dst, $src}", 4296 [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>, 4297 XS, VEX, Requires<[UseAVX]>, VEX_WIG; 4298def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 4299 "movq\t{$src, $dst|$dst, $src}", 4300 [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>, 4301 XS, Requires<[UseSSE2]>; 4302} // ExeDomain, SchedRW 4303 4304let Predicates = [UseAVX] in { 4305 def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), 4306 (VMOVZPQILo2PQIrr VR128:$src)>; 4307} 4308let Predicates = [UseSSE2] in { 4309 def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), 4310 (MOVZPQILo2PQIrr VR128:$src)>; 4311} 4312 4313let Predicates = [UseAVX] in { 4314 def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))), 4315 (SUBREG_TO_REG (i32 0), 4316 (v2f64 (VMOVZPQILo2PQIrr 4317 (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))), 4318 sub_xmm)>; 4319 def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))), 4320 (SUBREG_TO_REG (i32 0), 4321 (v2i64 (VMOVZPQILo2PQIrr 4322 (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))), 4323 sub_xmm)>; 4324} 4325 4326//===---------------------------------------------------------------------===// 4327// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP 4328//===---------------------------------------------------------------------===// 4329 4330multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr, 4331 ValueType vt, RegisterClass RC, PatFrag mem_frag, 4332 X86MemOperand x86memop, X86FoldableSchedWrite sched> { 4333def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src), 4334 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4335 [(set RC:$dst, (vt (OpNode RC:$src)))]>, 4336 Sched<[sched]>; 4337def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), 4338 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4339 [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>, 4340 Sched<[sched.Folded]>; 4341} 4342 4343let Predicates = [HasAVX, NoVLX] in { 4344 defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", 4345 v4f32, VR128, loadv4f32, f128mem, 4346 SchedWriteFShuffle.XMM>, VEX, VEX_WIG; 4347 defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", 4348 v4f32, VR128, loadv4f32, f128mem, 4349 SchedWriteFShuffle.XMM>, VEX, VEX_WIG; 4350 defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", 4351 v8f32, VR256, loadv8f32, f256mem, 4352 SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG; 4353 defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", 4354 v8f32, VR256, loadv8f32, f256mem, 4355 SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG; 4356} 4357defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128, 4358 memopv4f32, f128mem, SchedWriteFShuffle.XMM>; 4359defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128, 4360 memopv4f32, f128mem, SchedWriteFShuffle.XMM>; 4361 4362let Predicates = [HasAVX, NoVLX] in { 4363 def : Pat<(v4i32 (X86Movshdup VR128:$src)), 4364 (VMOVSHDUPrr VR128:$src)>; 4365 def : Pat<(v4i32 (X86Movshdup (load addr:$src))), 4366 (VMOVSHDUPrm addr:$src)>; 4367 def : Pat<(v4i32 (X86Movsldup VR128:$src)), 4368 (VMOVSLDUPrr VR128:$src)>; 4369 def : Pat<(v4i32 (X86Movsldup (load addr:$src))), 4370 (VMOVSLDUPrm addr:$src)>; 4371 def : Pat<(v8i32 (X86Movshdup VR256:$src)), 4372 (VMOVSHDUPYrr VR256:$src)>; 4373 def : Pat<(v8i32 (X86Movshdup (load 
addr:$src))), 4374 (VMOVSHDUPYrm addr:$src)>; 4375 def : Pat<(v8i32 (X86Movsldup VR256:$src)), 4376 (VMOVSLDUPYrr VR256:$src)>; 4377 def : Pat<(v8i32 (X86Movsldup (load addr:$src))), 4378 (VMOVSLDUPYrm addr:$src)>; 4379} 4380 4381let Predicates = [UseSSE3] in { 4382 def : Pat<(v4i32 (X86Movshdup VR128:$src)), 4383 (MOVSHDUPrr VR128:$src)>; 4384 def : Pat<(v4i32 (X86Movshdup (memop addr:$src))), 4385 (MOVSHDUPrm addr:$src)>; 4386 def : Pat<(v4i32 (X86Movsldup VR128:$src)), 4387 (MOVSLDUPrr VR128:$src)>; 4388 def : Pat<(v4i32 (X86Movsldup (memop addr:$src))), 4389 (MOVSLDUPrm addr:$src)>; 4390} 4391 4392//===---------------------------------------------------------------------===// 4393// SSE3 - Replicate Double FP - MOVDDUP 4394//===---------------------------------------------------------------------===// 4395 4396multiclass sse3_replicate_dfp<string OpcodeStr, X86SchedWriteWidths sched> { 4397def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 4398 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4399 [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))]>, 4400 Sched<[sched.XMM]>; 4401def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), 4402 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4403 [(set VR128:$dst, 4404 (v2f64 (X86Movddup 4405 (scalar_to_vector (loadf64 addr:$src)))))]>, 4406 Sched<[sched.XMM.Folded]>; 4407} 4408 4409// FIXME: Merge with above classes when there are patterns for the ymm version 4410multiclass sse3_replicate_dfp_y<string OpcodeStr, X86SchedWriteWidths sched> { 4411def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 4412 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4413 [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>, 4414 Sched<[sched.YMM]>; 4415def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 4416 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4417 [(set VR256:$dst, 4418 (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>, 4419 Sched<[sched.YMM.Folded]>; 4420} 4421 4422let Predicates = [HasAVX, NoVLX] in { 4423 defm VMOVDDUP : sse3_replicate_dfp<"vmovddup", SchedWriteFShuffle>, 4424 VEX, VEX_WIG; 4425 defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup", SchedWriteFShuffle>, 4426 VEX, VEX_L, VEX_WIG; 4427} 4428 4429defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>; 4430 4431 4432let Predicates = [HasAVX, NoVLX] in { 4433 def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))), 4434 (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; 4435} 4436 4437let Predicates = [UseSSE3] in { 4438 def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))), 4439 (MOVDDUPrm addr:$src)>; 4440} 4441 4442//===---------------------------------------------------------------------===// 4443// SSE3 - Move Unaligned Integer 4444//===---------------------------------------------------------------------===// 4445 4446let Predicates = [HasAVX] in { 4447 def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 4448 "vlddqu\t{$src, $dst|$dst, $src}", 4449 [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, 4450 Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG; 4451 def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), 4452 "vlddqu\t{$src, $dst|$dst, $src}", 4453 [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>, 4454 Sched<[SchedWriteVecMoveLS.YMM.RM]>, VEX, VEX_L, VEX_WIG; 4455} // Predicates 4456 4457def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 4458 "lddqu\t{$src, $dst|$dst, $src}", 4459 [(set VR128:$dst, 
(int_x86_sse3_ldu_dq addr:$src))]>, 4460 Sched<[SchedWriteVecMoveLS.XMM.RM]>; 4461 4462//===---------------------------------------------------------------------===// 4463// SSE3 - Arithmetic 4464//===---------------------------------------------------------------------===// 4465 4466multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC, 4467 X86MemOperand x86memop, X86FoldableSchedWrite sched, 4468 PatFrag ld_frag, bit Is2Addr = 1> { 4469let Uses = [MXCSR], mayRaiseFPException = 1 in { 4470 def rr : I<0xD0, MRMSrcReg, 4471 (outs RC:$dst), (ins RC:$src1, RC:$src2), 4472 !if(Is2Addr, 4473 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4474 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4475 [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))]>, 4476 Sched<[sched]>; 4477 def rm : I<0xD0, MRMSrcMem, 4478 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 4479 !if(Is2Addr, 4480 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4481 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4482 [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>, 4483 Sched<[sched.Folded, sched.ReadAfterFold]>; 4484} 4485} 4486 4487let Predicates = [HasAVX] in { 4488 let ExeDomain = SSEPackedSingle in { 4489 defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem, 4490 SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>, 4491 XD, VEX_4V, VEX_WIG; 4492 defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem, 4493 SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>, 4494 XD, VEX_4V, VEX_L, VEX_WIG; 4495 } 4496 let ExeDomain = SSEPackedDouble in { 4497 defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem, 4498 SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>, 4499 PD, VEX_4V, VEX_WIG; 4500 defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem, 4501 SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>, 4502 PD, VEX_4V, VEX_L, VEX_WIG; 4503 } 4504} 4505let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in { 4506 let ExeDomain = SSEPackedSingle in 4507 defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem, 4508 SchedWriteFAddSizes.PS.XMM, memopv4f32>, XD; 4509 let ExeDomain = SSEPackedDouble in 4510 defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem, 4511 SchedWriteFAddSizes.PD.XMM, memopv2f64>, PD; 4512} 4513 4514//===---------------------------------------------------------------------===// 4515// SSE3 Instructions 4516//===---------------------------------------------------------------------===// 4517 4518// Horizontal ops 4519multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, 4520 X86MemOperand x86memop, SDNode OpNode, 4521 X86FoldableSchedWrite sched, PatFrag ld_frag, 4522 bit Is2Addr = 1> { 4523let Uses = [MXCSR], mayRaiseFPException = 1 in { 4524 def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), 4525 !if(Is2Addr, 4526 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4527 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4528 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>, 4529 Sched<[sched]>; 4530 4531 def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 4532 !if(Is2Addr, 4533 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4534 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4535 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>, 4536 Sched<[sched.Folded, sched.ReadAfterFold]>; 4537} 4538} 4539multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, 
4540 X86MemOperand x86memop, SDNode OpNode, 4541 X86FoldableSchedWrite sched, PatFrag ld_frag, 4542 bit Is2Addr = 1> { 4543let Uses = [MXCSR], mayRaiseFPException = 1 in { 4544 def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), 4545 !if(Is2Addr, 4546 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4547 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4548 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>, 4549 Sched<[sched]>; 4550 4551 def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 4552 !if(Is2Addr, 4553 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4554 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4555 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>, 4556 Sched<[sched.Folded, sched.ReadAfterFold]>; 4557} 4558} 4559 4560let Predicates = [HasAVX] in { 4561 let ExeDomain = SSEPackedSingle in { 4562 defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem, 4563 X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG; 4564 defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem, 4565 X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG; 4566 defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem, 4567 X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG; 4568 defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem, 4569 X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG; 4570 } 4571 let ExeDomain = SSEPackedDouble in { 4572 defm VHADDPD : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem, 4573 X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG; 4574 defm VHSUBPD : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem, 4575 X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG; 4576 defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem, 4577 X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG; 4578 defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem, 4579 X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG; 4580 } 4581} 4582 4583let Constraints = "$src1 = $dst" in { 4584 let ExeDomain = SSEPackedSingle in { 4585 defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd, 4586 WriteFHAdd, memopv4f32>; 4587 defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub, 4588 WriteFHAdd, memopv4f32>; 4589 } 4590 let ExeDomain = SSEPackedDouble in { 4591 defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd, 4592 WriteFHAdd, memopv2f64>; 4593 defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub, 4594 WriteFHAdd, memopv2f64>; 4595 } 4596} 4597 4598//===---------------------------------------------------------------------===// 4599// SSSE3 - Packed Absolute Instructions 4600//===---------------------------------------------------------------------===// 4601 4602/// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}. 
4603multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt, 4604 SDNode OpNode, X86SchedWriteWidths sched, PatFrag ld_frag> { 4605 def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst), 4606 (ins VR128:$src), 4607 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4608 [(set VR128:$dst, (vt (OpNode VR128:$src)))]>, 4609 Sched<[sched.XMM]>; 4610 4611 def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst), 4612 (ins i128mem:$src), 4613 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4614 [(set VR128:$dst, 4615 (vt (OpNode (ld_frag addr:$src))))]>, 4616 Sched<[sched.XMM.Folded]>; 4617} 4618 4619/// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}. 4620multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt, 4621 SDNode OpNode, X86SchedWriteWidths sched> { 4622 def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst), 4623 (ins VR256:$src), 4624 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4625 [(set VR256:$dst, (vt (OpNode VR256:$src)))]>, 4626 Sched<[sched.YMM]>; 4627 4628 def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst), 4629 (ins i256mem:$src), 4630 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4631 [(set VR256:$dst, 4632 (vt (OpNode (load addr:$src))))]>, 4633 Sched<[sched.YMM.Folded]>; 4634} 4635 4636let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 4637 defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU, 4638 load>, VEX, VEX_WIG; 4639 defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU, 4640 load>, VEX, VEX_WIG; 4641} 4642let Predicates = [HasAVX, NoVLX] in { 4643 defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU, 4644 load>, VEX, VEX_WIG; 4645} 4646let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 4647 defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>, 4648 VEX, VEX_L, VEX_WIG; 4649 defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SchedWriteVecALU>, 4650 VEX, VEX_L, VEX_WIG; 4651} 4652let Predicates = [HasAVX2, NoVLX] in { 4653 defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SchedWriteVecALU>, 4654 VEX, VEX_L, VEX_WIG; 4655} 4656 4657defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU, 4658 memop>; 4659defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU, 4660 memop>; 4661defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU, 4662 memop>; 4663 4664//===---------------------------------------------------------------------===// 4665// SSSE3 - Packed Binary Operator Instructions 4666//===---------------------------------------------------------------------===// 4667 4668/// SS3I_binop_rm - Simple SSSE3 bin op 4669multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, 4670 ValueType DstVT, ValueType OpVT, RegisterClass RC, 4671 PatFrag memop_frag, X86MemOperand x86memop, 4672 X86FoldableSchedWrite sched, bit Is2Addr = 1> { 4673 let isCommutable = 1 in 4674 def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst), 4675 (ins RC:$src1, RC:$src2), 4676 !if(Is2Addr, 4677 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4678 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4679 [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))]>, 4680 Sched<[sched]>; 4681 def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst), 4682 (ins RC:$src1, x86memop:$src2), 4683 !if(Is2Addr, 4684 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4685 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4686 [(set RC:$dst, 4687 (DstVT (OpNode (OpVT RC:$src1), (memop_frag 
addr:$src2))))]>, 4688 Sched<[sched.Folded, sched.ReadAfterFold]>; 4689} 4690 4691/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}. 4692multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, 4693 Intrinsic IntId128, X86FoldableSchedWrite sched, 4694 PatFrag ld_frag, bit Is2Addr = 1> { 4695 let isCommutable = 1 in 4696 def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst), 4697 (ins VR128:$src1, VR128:$src2), 4698 !if(Is2Addr, 4699 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4700 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4701 [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, 4702 Sched<[sched]>; 4703 def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst), 4704 (ins VR128:$src1, i128mem:$src2), 4705 !if(Is2Addr, 4706 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4707 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4708 [(set VR128:$dst, 4709 (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>, 4710 Sched<[sched.Folded, sched.ReadAfterFold]>; 4711} 4712 4713multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr, 4714 Intrinsic IntId256, 4715 X86FoldableSchedWrite sched> { 4716 let isCommutable = 1 in 4717 def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst), 4718 (ins VR256:$src1, VR256:$src2), 4719 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 4720 [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>, 4721 Sched<[sched]>; 4722 def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst), 4723 (ins VR256:$src1, i256mem:$src2), 4724 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 4725 [(set VR256:$dst, 4726 (IntId256 VR256:$src1, (load addr:$src2)))]>, 4727 Sched<[sched.Folded, sched.ReadAfterFold]>; 4728} 4729 4730let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 4731let isCommutable = 0 in { 4732 defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8, 4733 VR128, load, i128mem, 4734 SchedWriteVarShuffle.XMM, 0>, VEX_4V, VEX_WIG; 4735 defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16, 4736 v16i8, VR128, load, i128mem, 4737 SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG; 4738} 4739defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16, 4740 VR128, load, i128mem, 4741 SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG; 4742} 4743 4744let ImmT = NoImm, Predicates = [HasAVX] in { 4745let isCommutable = 0 in { 4746 defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128, 4747 load, i128mem, 4748 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG; 4749 defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128, 4750 load, i128mem, 4751 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG; 4752 defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128, 4753 load, i128mem, 4754 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG; 4755 defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128, 4756 load, i128mem, 4757 SchedWritePHAdd.XMM, 0>, VEX_4V; 4758 defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb", 4759 int_x86_ssse3_psign_b_128, 4760 SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG; 4761 defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw", 4762 int_x86_ssse3_psign_w_128, 4763 SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG; 4764 defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd", 4765 int_x86_ssse3_psign_d_128, 4766 SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG; 4767 defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw", 4768 int_x86_ssse3_phadd_sw_128, 4769 SchedWritePHAdd.XMM, 
load, 0>, VEX_4V, VEX_WIG; 4770 defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw", 4771 int_x86_ssse3_phsub_sw_128, 4772 SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG; 4773} 4774} 4775 4776let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 4777let isCommutable = 0 in { 4778 defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8, 4779 VR256, load, i256mem, 4780 SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; 4781 defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16, 4782 v32i8, VR256, load, i256mem, 4783 SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; 4784} 4785defm VPMULHRSWY : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16, 4786 VR256, load, i256mem, 4787 SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; 4788} 4789 4790let ImmT = NoImm, Predicates = [HasAVX2] in { 4791let isCommutable = 0 in { 4792 defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16, 4793 VR256, load, i256mem, 4794 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; 4795 defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256, 4796 load, i256mem, 4797 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; 4798 defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16, 4799 VR256, load, i256mem, 4800 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; 4801 defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256, 4802 load, i256mem, 4803 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L; 4804 defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b, 4805 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG; 4806 defm VPSIGNW : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w, 4807 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG; 4808 defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d, 4809 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG; 4810 defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw", 4811 int_x86_avx2_phadd_sw, 4812 SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG; 4813 defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw", 4814 int_x86_avx2_phsub_sw, 4815 SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG; 4816} 4817} 4818 4819// None of these have i8 immediate fields. 
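// Editor's note (illustrative sketch of the ISA semantics, since two of the
// ops in this group are easy to misread):
//   pmaddubsw multiplies the unsigned bytes of the first operand by the
//   corresponding signed bytes of the second, then adds adjacent pairs of
//   word products with signed saturation:
//     dst.word[i] = satsw(a.byte[2i]*b.byte[2i] + a.byte[2i+1]*b.byte[2i+1])
//   pmulhrsw is a rounding, scaled signed word multiply:
//     dst.word[i] = (((a.word[i] * b.word[i]) >> 14) + 1) >> 1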
4820let ImmT = NoImm, Constraints = "$src1 = $dst" in { 4821let isCommutable = 0 in { 4822 defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128, 4823 memop, i128mem, SchedWritePHAdd.XMM>; 4824 defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128, 4825 memop, i128mem, SchedWritePHAdd.XMM>; 4826 defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128, 4827 memop, i128mem, SchedWritePHAdd.XMM>; 4828 defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128, 4829 memop, i128mem, SchedWritePHAdd.XMM>; 4830 defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128, 4831 SchedWriteVecALU.XMM, memop>; 4832 defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128, 4833 SchedWriteVecALU.XMM, memop>; 4834 defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128, 4835 SchedWriteVecALU.XMM, memop>; 4836 defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128, 4837 memop, i128mem, SchedWriteVarShuffle.XMM>; 4838 defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw", 4839 int_x86_ssse3_phadd_sw_128, 4840 SchedWritePHAdd.XMM, memop>; 4841 defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw", 4842 int_x86_ssse3_phsub_sw_128, 4843 SchedWritePHAdd.XMM, memop>; 4844 defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16, 4845 v16i8, VR128, memop, i128mem, 4846 SchedWriteVecIMul.XMM>; 4847} 4848defm PMULHRSW : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16, 4849 VR128, memop, i128mem, SchedWriteVecIMul.XMM>; 4850} 4851 4852//===---------------------------------------------------------------------===// 4853// SSSE3 - Packed Align Instruction Patterns 4854//===---------------------------------------------------------------------===// 4855 4856multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC, 4857 PatFrag memop_frag, X86MemOperand x86memop, 4858 X86FoldableSchedWrite sched, bit Is2Addr = 1> { 4859 let hasSideEffects = 0 in { 4860 def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst), 4861 (ins RC:$src1, RC:$src2, u8imm:$src3), 4862 !if(Is2Addr, 4863 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 4864 !strconcat(asm, 4865 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 4866 [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 timm:$src3))))]>, 4867 Sched<[sched]>; 4868 let mayLoad = 1 in 4869 def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst), 4870 (ins RC:$src1, x86memop:$src2, u8imm:$src3), 4871 !if(Is2Addr, 4872 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 4873 !strconcat(asm, 4874 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 4875 [(set RC:$dst, (VT (X86PAlignr RC:$src1, 4876 (memop_frag addr:$src2), 4877 (i8 timm:$src3))))]>, 4878 Sched<[sched.Folded, sched.ReadAfterFold]>; 4879 } 4880} 4881 4882let Predicates = [HasAVX, NoVLX_Or_NoBWI] in 4883 defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem, 4884 SchedWriteShuffle.XMM, 0>, VEX_4V, VEX_WIG; 4885let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in 4886 defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem, 4887 SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; 4888let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in 4889 defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem, 4890 SchedWriteShuffle.XMM>; 4891 4892//===---------------------------------------------------------------------===// 4893// SSSE3 - Thread synchronization 
4894//===---------------------------------------------------------------------===// 4895 4896let SchedRW = [WriteSystem] in { 4897let Uses = [EAX, ECX, EDX] in 4898def MONITOR32rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>, 4899 TB, Requires<[HasSSE3, Not64BitMode]>; 4900let Uses = [RAX, ECX, EDX] in 4901def MONITOR64rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>, 4902 TB, Requires<[HasSSE3, In64BitMode]>; 4903 4904let Uses = [ECX, EAX] in 4905def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait", 4906 [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>; 4907} // SchedRW 4908 4909def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>; 4910def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>; 4911 4912def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITOR32rrr)>, 4913 Requires<[Not64BitMode]>; 4914def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITOR64rrr)>, 4915 Requires<[In64BitMode]>; 4916 4917//===----------------------------------------------------------------------===// 4918// SSE4.1 - Packed Move with Sign/Zero Extend 4919// NOTE: Any Extend is promoted to Zero Extend in X86ISelDAGToDAG.cpp 4920//===----------------------------------------------------------------------===// 4921 4922multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp, 4923 RegisterClass OutRC, RegisterClass InRC, 4924 X86FoldableSchedWrite sched> { 4925 def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src), 4926 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>, 4927 Sched<[sched]>; 4928 4929 def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src), 4930 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>, 4931 Sched<[sched.Folded]>; 4932} 4933 4934multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr, 4935 X86MemOperand MemOp, X86MemOperand MemYOp, 4936 Predicate prd> { 4937 defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128, 4938 SchedWriteShuffle.XMM>; 4939 let Predicates = [HasAVX, prd] in 4940 defm V#NAME : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp, 4941 VR128, VR128, SchedWriteShuffle.XMM>, 4942 VEX, VEX_WIG; 4943 let Predicates = [HasAVX2, prd] in 4944 defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp, 4945 VR256, VR128, WriteShuffle256>, 4946 VEX, VEX_L, VEX_WIG; 4947} 4948 4949multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp, 4950 X86MemOperand MemYOp, Predicate prd> { 4951 defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr), 4952 MemOp, MemYOp, prd>; 4953 defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10), 4954 !strconcat("pmovzx", OpcodeStr), 4955 MemOp, MemYOp, prd>; 4956} 4957 4958defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>; 4959defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>; 4960defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>; 4961 4962defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>; 4963defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>; 4964 4965defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>; 4966 4967// AVX2 Patterns 4968multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, 4969 SDNode ExtOp, SDNode InVecOp> { 4970 // Register-Register patterns 4971 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 4972 def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))), 4973 (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>; 4974 } 4975 let Predicates = 
[HasAVX2, NoVLX] in { 4976 def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))), 4977 (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>; 4978 def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))), 4979 (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>; 4980 4981 def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))), 4982 (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>; 4983 def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))), 4984 (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>; 4985 4986 def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))), 4987 (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>; 4988 } 4989 4990 // Simple Register-Memory patterns 4991 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 4992 def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), 4993 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; 4994 4995 def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))), 4996 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; 4997 } 4998 4999 let Predicates = [HasAVX2, NoVLX] in { 5000 def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), 5001 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; 5002 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), 5003 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; 5004 5005 def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), 5006 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; 5007 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), 5008 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; 5009 5010 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)), 5011 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; 5012 } 5013 5014 // AVX2 Register-Memory patterns 5015 let Predicates = [HasAVX2, NoVLX] in { 5016 def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))), 5017 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; 5018 5019 def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 5020 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; 5021 def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), 5022 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; 5023 def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))), 5024 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; 5025 5026 def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))), 5027 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; 5028 5029 def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), 5030 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; 5031 def : Pat<(v4i64 (InVecOp (bc_v16i8 (v2i64 (X86vzload32 addr:$src))))), 5032 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; 5033 5034 def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 5035 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; 5036 def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), 5037 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; 5038 def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))), 5039 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; 5040 } 5041} 5042 5043defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>; 5044defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>; 5045 5046// SSE4.1/AVX patterns. 
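// Editor's note (illustrative sketch): these multiclass instantiations are
// fed sext_invec/zext_invec (see the defms after the multiclass), so a node
// like
//   (v8i16 (sext_invec (v16i8 VR128:$src)))
// selects VPMOVSXBWrr (or PMOVSXBWrr for the UseSSE41 copies), i.e. the low
// eight bytes of $src sign-extended to eight words.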
5047multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy, 5048 SDNode ExtOp> { 5049 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 5050 def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))), 5051 (!cast<I>(OpcPrefix#BWrr) VR128:$src)>; 5052 } 5053 let Predicates = [HasAVX, NoVLX] in { 5054 def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))), 5055 (!cast<I>(OpcPrefix#BDrr) VR128:$src)>; 5056 def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))), 5057 (!cast<I>(OpcPrefix#BQrr) VR128:$src)>; 5058 5059 def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))), 5060 (!cast<I>(OpcPrefix#WDrr) VR128:$src)>; 5061 def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))), 5062 (!cast<I>(OpcPrefix#WQrr) VR128:$src)>; 5063 5064 def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))), 5065 (!cast<I>(OpcPrefix#DQrr) VR128:$src)>; 5066 } 5067 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 5068 def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), 5069 (!cast<I>(OpcPrefix#BWrm) addr:$src)>; 5070 } 5071 let Predicates = [HasAVX, NoVLX] in { 5072 def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), 5073 (!cast<I>(OpcPrefix#BDrm) addr:$src)>; 5074 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), 5075 (!cast<I>(OpcPrefix#BQrm) addr:$src)>; 5076 5077 def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), 5078 (!cast<I>(OpcPrefix#WDrm) addr:$src)>; 5079 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), 5080 (!cast<I>(OpcPrefix#WQrm) addr:$src)>; 5081 5082 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)), 5083 (!cast<I>(OpcPrefix#DQrm) addr:$src)>; 5084 } 5085 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 5086 def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 5087 (!cast<I>(OpcPrefix#BWrm) addr:$src)>; 5088 def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), 5089 (!cast<I>(OpcPrefix#BWrm) addr:$src)>; 5090 def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))), 5091 (!cast<I>(OpcPrefix#BWrm) addr:$src)>; 5092 def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))), 5093 (!cast<I>(OpcPrefix#BWrm) addr:$src)>; 5094 } 5095 let Predicates = [HasAVX, NoVLX] in { 5096 def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), 5097 (!cast<I>(OpcPrefix#BDrm) addr:$src)>; 5098 def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))), 5099 (!cast<I>(OpcPrefix#BDrm) addr:$src)>; 5100 def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))), 5101 (!cast<I>(OpcPrefix#BDrm) addr:$src)>; 5102 5103 def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))), 5104 (!cast<I>(OpcPrefix#BQrm) addr:$src)>; 5105 def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))), 5106 (!cast<I>(OpcPrefix#BQrm) addr:$src)>; 5107 5108 def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 5109 (!cast<I>(OpcPrefix#WDrm) addr:$src)>; 5110 def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), 5111 (!cast<I>(OpcPrefix#WDrm) addr:$src)>; 5112 def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))), 5113 (!cast<I>(OpcPrefix#WDrm) addr:$src)>; 5114 def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))), 5115 (!cast<I>(OpcPrefix#WDrm) addr:$src)>; 5116 5117 def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), 5118 (!cast<I>(OpcPrefix#WQrm) addr:$src)>; 5119 def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))), 5120 (!cast<I>(OpcPrefix#WQrm) addr:$src)>; 5121 def : Pat<(v2i64 (ExtOp 
(loadv8i16 addr:$src))), 5122 (!cast<I>(OpcPrefix#WQrm) addr:$src)>; 5123 5124 def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 5125 (!cast<I>(OpcPrefix#DQrm) addr:$src)>; 5126 def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), 5127 (!cast<I>(OpcPrefix#DQrm) addr:$src)>; 5128 def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))), 5129 (!cast<I>(OpcPrefix#DQrm) addr:$src)>; 5130 def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))), 5131 (!cast<I>(OpcPrefix#DQrm) addr:$src)>; 5132 } 5133} 5134 5135defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec>; 5136defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec>; 5137 5138let Predicates = [UseSSE41] in { 5139 defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec>; 5140 defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec>; 5141} 5142 5143//===----------------------------------------------------------------------===// 5144// SSE4.1 - Extract Instructions 5145//===----------------------------------------------------------------------===// 5146 5147/// SS41I_binop_ext8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem 5148multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> { 5149 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst), 5150 (ins VR128:$src1, u8imm:$src2), 5151 !strconcat(OpcodeStr, 5152 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5153 [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1), 5154 imm:$src2))]>, 5155 Sched<[WriteVecExtract]>; 5156 let hasSideEffects = 0, mayStore = 1 in 5157 def mr : SS4AIi8<opc, MRMDestMem, (outs), 5158 (ins i8mem:$dst, VR128:$src1, u8imm:$src2), 5159 !strconcat(OpcodeStr, 5160 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5161 [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), 5162 addr:$dst)]>, Sched<[WriteVecExtractSt]>; 5163} 5164 5165let Predicates = [HasAVX, NoBWI] in 5166 defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX, VEX_WIG; 5167 5168defm PEXTRB : SS41I_extract8<0x14, "pextrb">; 5169 5170 5171/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination 5172multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> { 5173 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in 5174 def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst), 5175 (ins VR128:$src1, u8imm:$src2), 5176 !strconcat(OpcodeStr, 5177 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, 5178 Sched<[WriteVecExtract]>, FoldGenData<NAME#rr>; 5179 5180 let hasSideEffects = 0, mayStore = 1 in 5181 def mr : SS4AIi8<opc, MRMDestMem, (outs), 5182 (ins i16mem:$dst, VR128:$src1, u8imm:$src2), 5183 !strconcat(OpcodeStr, 5184 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5185 [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), imm:$src2))), 5186 addr:$dst)]>, Sched<[WriteVecExtractSt]>; 5187} 5188 5189let Predicates = [HasAVX, NoBWI] in 5190 defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX, VEX_WIG; 5191 5192defm PEXTRW : SS41I_extract16<0x15, "pextrw">; 5193 5194 5195/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination 5196multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> { 5197 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst), 5198 (ins VR128:$src1, u8imm:$src2), 5199 !strconcat(OpcodeStr, 5200 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5201 [(set GR32:$dst, 5202 (extractelt (v4i32 VR128:$src1), imm:$src2))]>, 5203 Sched<[WriteVecExtract]>; 5204 def mr : SS4AIi8<opc, MRMDestMem, (outs), 5205 (ins i32mem:$dst, VR128:$src1, u8imm:$src2), 5206 
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;

defm PEXTRD : SS41I_extract32<0x16, "pextrd">;

/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR64:$dst,
                     (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
                   Sched<[WriteVecExtract]>;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
                            addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;

defm PEXTRQ : SS41I_extract64<0x16, "pextrq">, REX_W;

/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
/// destination
multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR32orGR64:$dst,
                     (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
                   Sched<[WriteVecExtract]>;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                    "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
                            addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let ExeDomain = SSEPackedSingle in {
  let Predicates = [UseAVX] in
    defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, VEX_WIG;
  defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">;
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Insert Instructions
//===----------------------------------------------------------------------===//

multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), imm:$src3))]>,
      Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoBWI] in
  defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V, VEX_WIG;
let Constraints = "$src1 = $dst" in
  defm PINSRB : SS41I_insert8<0x20, "pinsrb">;

//===----------------------------------------------------------------------===//
// SSE4.1 - Insert Instructions
//===----------------------------------------------------------------------===//

multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), imm:$src3))]>,
      Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoBWI] in
  defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V, VEX_WIG;
let Constraints = "$src1 = $dst" in
  defm PINSRB : SS41I_insert8<0x20, "pinsrb">;

multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), imm:$src3)))]>,
      Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
let Constraints = "$src1 = $dst" in
  defm PINSRD : SS41I_insert32<0x22, "pinsrd">;

multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR64:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), imm:$src3)))]>,
      Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
let Constraints = "$src1 = $dst" in
  defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;

// insertps has a few different modes. The first two below are optimized
// inserts that won't zero arbitrary elements in the destination vector; the
// next one matches the intrinsic and may zero arbitrary elements in the
// target vector.
multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, VR128:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86insertps VR128:$src1, VR128:$src2, timm:$src3))]>,
      Sched<[SchedWriteFShuffle.XMM]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, f32mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86insertps VR128:$src1,
                     (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
                     timm:$src3))]>,
      Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
}

let ExeDomain = SSEPackedSingle in {
  let Predicates = [UseAVX] in
    defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>,
                     VEX_4V, VEX_WIG;
  let Constraints = "$src1 = $dst" in
    defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>;
}
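
// Illustrative usage sketch (exposition only): the insertps immediate packs a
// source-lane selector (bits 7:6), a destination-lane selector (bits 5:4) and
// a zero mask (bits 3:0). In C++, assuming -msse4.1:
//
//   #include <smmintrin.h>
//   __m128 insert_demo(__m128 dst, __m128 src) {
//     // Copy src lane 3 into dst lane 1, and zero dst lane 0.
//     return _mm_insert_ps(dst, src, _MM_MK_INSERTPS_NDX(3, 1, 0x1));
//   }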

//===----------------------------------------------------------------------===//
// SSE4.1 - Round Instructions
//===----------------------------------------------------------------------===//

multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
                           X86MemOperand x86memop, RegisterClass RC,
                           ValueType VT, PatFrag mem_frag, SDNode OpNode,
                           X86FoldableSchedWrite sched> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  // Vector intrinsic operation, reg
  def r : SS4AIi8<opc, MRMSrcReg,
                  (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
                  !strconcat(OpcodeStr,
                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set RC:$dst, (VT (OpNode RC:$src1, timm:$src2)))]>,
                  Sched<[sched]>;

  // Vector intrinsic operation, mem
  def m : SS4AIi8<opc, MRMSrcMem,
                  (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
                  !strconcat(OpcodeStr,
                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set RC:$dst,
                        (VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>,
                  Sched<[sched.Folded]>;
}
}

multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
                          string OpcodeStr, X86FoldableSchedWrite sched> {
let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
  def SSr : SS4AIi8<opcss, MRMSrcReg,
        (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
            "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SSm : SS4AIi8<opcss, MRMSrcMem,
        (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
            "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle, hasSideEffects = 0

let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
  def SDr : SS4AIi8<opcsd, MRMSrcReg,
        (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
            "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SDm : SS4AIi8<opcsd, MRMSrcMem,
        (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
            "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedDouble, hasSideEffects = 0
}

multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
                           string OpcodeStr, X86FoldableSchedWrite sched> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
  def SSr : SS4AIi8<opcss, MRMSrcReg,
        (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
        !strconcat(OpcodeStr,
                   "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
        []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SSm : SS4AIi8<opcss, MRMSrcMem,
        (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
        !strconcat(OpcodeStr,
                   "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
        []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle, hasSideEffects = 0

let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
  def SDr : SS4AIi8<opcsd, MRMSrcReg,
        (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
        !strconcat(OpcodeStr,
                   "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
        []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SDm : SS4AIi8<opcsd, MRMSrcMem,
        (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
        !strconcat(OpcodeStr,
                   "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
        []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedDouble, hasSideEffects = 0
}
}

multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
                            string OpcodeStr, X86FoldableSchedWrite sched,
                            ValueType VT32, ValueType VT64,
                            SDNode OpNode, bit Is2Addr = 1> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
let ExeDomain = SSEPackedSingle in {
  def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                       "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                       "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
        Sched<[sched]>;

  def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                       "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                       "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
              (OpNode VR128:$src1, (sse_load_f32 addr:$src2), timm:$src3))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle

let ExeDomain = SSEPackedDouble in {
  def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                       "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                       "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
        Sched<[sched]>;

  def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                       "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                       "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
              (OpNode VR128:$src1, (sse_load_f64 addr:$src2), timm:$src3))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedDouble
}
}
$src2, $src3}"), 5522 !strconcat(OpcodeStr, 5523 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5524 [(set VR128:$dst, 5525 (OpNode VR128:$src1, (sse_load_f64 addr:$src2), timm:$src3))]>, 5526 Sched<[sched.Folded, sched.ReadAfterFold]>; 5527} // ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 5528} 5529} 5530 5531// FP round - roundss, roundps, roundsd, roundpd 5532let Predicates = [HasAVX, NoVLX] in { 5533 let ExeDomain = SSEPackedSingle, Uses = [MXCSR], mayRaiseFPException = 1 in { 5534 // Intrinsic form 5535 defm VROUNDPS : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32, 5536 loadv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>, 5537 VEX, VEX_WIG; 5538 defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32, 5539 loadv8f32, X86any_VRndScale, SchedWriteFRnd.YMM>, 5540 VEX, VEX_L, VEX_WIG; 5541 } 5542 5543 let ExeDomain = SSEPackedDouble, Uses = [MXCSR], mayRaiseFPException = 1 in { 5544 defm VROUNDPD : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64, 5545 loadv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>, 5546 VEX, VEX_WIG; 5547 defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64, 5548 loadv4f64, X86any_VRndScale, SchedWriteFRnd.YMM>, 5549 VEX, VEX_L, VEX_WIG; 5550 } 5551} 5552let Predicates = [UseAVX] in { 5553 defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl, 5554 v4f32, v2f64, X86RndScales, 0>, 5555 VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC; 5556 defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>, 5557 VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC; 5558} 5559 5560let Predicates = [UseAVX] in { 5561 def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2), 5562 (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>; 5563 def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2), 5564 (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>; 5565} 5566 5567let Predicates = [UseAVX, OptForSize] in { 5568 def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2), 5569 (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>; 5570 def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2), 5571 (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>; 5572} 5573 5574let ExeDomain = SSEPackedSingle in 5575defm ROUNDPS : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32, 5576 memopv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>; 5577let ExeDomain = SSEPackedDouble in 5578defm ROUNDPD : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64, 5579 memopv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>; 5580 5581defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>; 5582 5583let Constraints = "$src1 = $dst" in 5584defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl, 5585 v4f32, v2f64, X86RndScales>; 5586 5587let Predicates = [UseSSE41] in { 5588 def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2), 5589 (ROUNDSSr FR32:$src1, timm:$src2)>; 5590 def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2), 5591 (ROUNDSDr FR64:$src1, timm:$src2)>; 5592} 5593 5594let Predicates = [UseSSE41, OptForSize] in { 5595 def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2), 5596 (ROUNDSSm addr:$src1, timm:$src2)>; 5597 def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2), 5598 (ROUNDSDm addr:$src1, timm:$src2)>; 5599} 5600 5601//===----------------------------------------------------------------------===// 5602// SSE4.1 - Packed Bit Test 5603//===----------------------------------------------------------------------===// 5604 5605// ptest instruction we'll lower to this in 

//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Bit Test
//===----------------------------------------------------------------------===//

// We lower the corresponding Intel intrinsic to the ptest instruction in
// X86ISelLowering.
let Defs = [EFLAGS], Predicates = [HasAVX] in {
def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
                     "vptest\t{$src2, $src1|$src1, $src2}",
                     [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
                     Sched<[SchedWriteVecTest.XMM]>, VEX, VEX_WIG;
def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
                     "vptest\t{$src2, $src1|$src1, $src2}",
                     [(set EFLAGS, (X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
                     Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>,
                     VEX, VEX_WIG;

def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
                      "vptest\t{$src2, $src1|$src1, $src2}",
                      [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
                      Sched<[SchedWriteVecTest.YMM]>, VEX, VEX_L, VEX_WIG;
def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
                      "vptest\t{$src2, $src1|$src1, $src2}",
                      [(set EFLAGS, (X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
                      Sched<[SchedWriteVecTest.YMM.Folded, SchedWriteVecTest.YMM.ReadAfterFold]>,
                      VEX, VEX_L, VEX_WIG;
}

let Defs = [EFLAGS] in {
def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
                    "ptest\t{$src2, $src1|$src1, $src2}",
                    [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
                    Sched<[SchedWriteVecTest.XMM]>;
def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
                    "ptest\t{$src2, $src1|$src1, $src2}",
                    [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
                    Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>;
}
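
// Illustrative usage sketch (exposition only): ptest sets ZF from the AND of
// its operands and CF from the ANDN, which the testz/testc intrinsics read
// back. In C++:
//
//   #include <smmintrin.h>
//   bool is_all_zero(__m128i v) {
//     return _mm_testz_si128(v, v);   // ptest %xmm0, %xmm0; sete %al
//   }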

// The bit test instructions below are AVX only.
multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, PatFrag mem_frag, ValueType vt,
                       X86FoldableSchedWrite sched> {
  def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
                 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                 [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
                 Sched<[sched]>, VEX;
  def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
                 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                 [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
                 Sched<[sched.Folded, sched.ReadAfterFold]>, VEX;
}

let Defs = [EFLAGS], Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32,
                            SchedWriteFTest.XMM>;
defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32,
                            SchedWriteFTest.YMM>, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64,
                            SchedWriteFTest.XMM>;
defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64,
                            SchedWriteFTest.YMM>, VEX_L;
}
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Misc Instructions
//===----------------------------------------------------------------------===//

let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
  def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
                     Sched<[WritePOPCNT]>, OpSize16, XS;
  def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
                      (implicit EFLAGS)]>,
                     Sched<[WritePOPCNT.Folded]>, OpSize16, XS;

  def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>,
                     Sched<[WritePOPCNT]>, OpSize32, XS;

  def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
                      (implicit EFLAGS)]>,
                     Sched<[WritePOPCNT.Folded]>, OpSize32, XS;

  def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>,
                      Sched<[WritePOPCNT]>, XS;
  def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
                       (implicit EFLAGS)]>,
                      Sched<[WritePOPCNT.Folded]>, XS;
}
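
// Illustrative usage sketch (exposition only), in C++:
//
//   #include <nmmintrin.h>
//   int bits_set(unsigned x) {
//     return _mm_popcnt_u32(x);   // popcntl %edi, %eax
//   }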

// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
                                 SDNode OpNode, PatFrag ld_frag,
                                 X86FoldableSchedWrite Sched> {
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>,
                 Sched<[Sched]>;
  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                 (ins i128mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst,
                   (v8i16 (OpNode (ld_frag addr:$src))))]>,
                 Sched<[Sched.Folded]>;
}

// PHMIN has the same profile as PSAD, thus we use the same scheduling
// model, although the naming is misleading.
let Predicates = [HasAVX] in
defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw",
                                         X86phminpos, load,
                                         WritePHMINPOS>, VEX, VEX_WIG;
defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
                                        X86phminpos, memop,
                                        WritePHMINPOS>;
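
// Illustrative usage sketch (exposition only): phminposuw returns the minimum
// unsigned 16-bit lane in bits 15:0 of the result and its index in bits
// 18:16. In C++:
//
//   #include <smmintrin.h>
//   unsigned min_u16_lane(__m128i v) {
//     __m128i r = _mm_minpos_epu16(v);
//     return (unsigned)_mm_extract_epi16(r, 0);   // the minimum value
//   }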

/// SS48I_binop_rm - Simple SSE41 binary operator.
multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, X86FoldableSchedWrite sched,
                          bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
                 (ins RC:$src1, RC:$src2),
                 !if(Is2Addr,
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
                 Sched<[sched]>;
  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
                 (ins RC:$src1, x86memop:$src2),
                 !if(Is2Addr,
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set RC:$dst,
                   (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
                 Sched<[sched.Folded, sched.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
                                load, i128mem, SchedWriteVecALU.XMM, 0>,
                                VEX_4V, VEX_WIG;
  defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
                                load, i128mem, SchedWriteVecALU.XMM, 0>,
                                VEX_4V, VEX_WIG;
  defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
                                load, i128mem, SchedWriteVecALU.XMM, 0>,
                                VEX_4V, VEX_WIG;
  defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
                                load, i128mem, SchedWriteVecALU.XMM, 0>,
                                VEX_4V, VEX_WIG;
  defm VPMULDQ : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128,
                                load, i128mem, SchedWriteVecIMul.XMM, 0>,
                                VEX_4V, VEX_WIG;
}
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
                                load, i128mem, SchedWriteVecALU.XMM, 0>,
                                VEX_4V, VEX_WIG;
  defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
                                load, i128mem, SchedWriteVecALU.XMM, 0>,
                                VEX_4V, VEX_WIG;
  defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
                                load, i128mem, SchedWriteVecALU.XMM, 0>,
                                VEX_4V, VEX_WIG;
  defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
                                load, i128mem, SchedWriteVecALU.XMM, 0>,
                                VEX_4V, VEX_WIG;
}

let Predicates = [HasAVX2, NoVLX] in {
  defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
  defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
  defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
  defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
  defm VPMULDQY : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256,
                                 load, i256mem, SchedWriteVecIMul.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
  defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
  defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
  defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
}

let Constraints = "$src1 = $dst" in {
  defm PMINSB : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
                               memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMINSD : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
                               memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMINUD : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
                               memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
                               memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
                               memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
                               memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
                               memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
                               memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMULDQ : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128,
                               memop, i128mem, SchedWriteVecIMul.XMM, 1>;
}

let Predicates = [HasAVX, NoVLX] in
  defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
                                load, i128mem, SchedWritePMULLD.XMM, 0>,
                                VEX_4V, VEX_WIG;
let Predicates = [HasAVX] in
  defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX_4V, VEX_WIG;

let Predicates = [HasAVX2, NoVLX] in
  defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
                                 load, i256mem, SchedWritePMULLD.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
let Predicates = [HasAVX2] in
  defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;

let Constraints = "$src1 = $dst" in {
  defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
                               memop, i128mem, SchedWritePMULLD.XMM, 1>;
  defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
                                memop, i128mem, SchedWriteVecALU.XMM, 1>;
}

/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
                               Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
                               X86MemOperand x86memop, bit Is2Addr,
                               X86FoldableSchedWrite sched> {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
                    (ins RC:$src1, RC:$src2, u8imm:$src3),
                    !if(Is2Addr,
                        !strconcat(OpcodeStr,
                                   "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                        !strconcat(OpcodeStr,
                                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                    [(set RC:$dst, (IntId RC:$src1, RC:$src2, timm:$src3))]>,
                    Sched<[sched]>;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
                    (ins RC:$src1, x86memop:$src2, u8imm:$src3),
                    !if(Is2Addr,
                        !strconcat(OpcodeStr,
                                   "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                        !strconcat(OpcodeStr,
                                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                    [(set RC:$dst,
                      (IntId RC:$src1, (memop_frag addr:$src2), timm:$src3))]>,
                    Sched<[sched.Folded, sched.ReadAfterFold]>;
}

/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                           X86MemOperand x86memop, bit Is2Addr,
                           X86FoldableSchedWrite sched> {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
                    (ins RC:$src1, RC:$src2, u8imm:$src3),
                    !if(Is2Addr,
                        !strconcat(OpcodeStr,
                                   "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                        !strconcat(OpcodeStr,
                                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                    [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
                    Sched<[sched]>;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
                    (ins RC:$src1, x86memop:$src2, u8imm:$src3),
                    !if(Is2Addr,
                        !strconcat(OpcodeStr,
                                   "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                        !strconcat(OpcodeStr,
                                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                    [(set RC:$dst,
                      (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
                    Sched<[sched.Folded, sched.ReadAfterFold]>;
}

def BlendCommuteImm2 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue() & 0x03;
  return getI8Imm(Imm ^ 0x03, SDLoc(N));
}]>;

def BlendCommuteImm4 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue() & 0x0f;
  return getI8Imm(Imm ^ 0x0f, SDLoc(N));
}]>;

def BlendCommuteImm8 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue() & 0xff;
  return getI8Imm(Imm ^ 0xff, SDLoc(N));
}]>;

// Turn a 4-bit blendi immediate to 8-bit for use with pblendw.
def BlendScaleImm4 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 4; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0x3 << (i * 2);
  }
  return getI8Imm(NewImm, SDLoc(N));
}]>;

// Turn a 2-bit blendi immediate to 8-bit for use with pblendw.
def BlendScaleImm2 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 2; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0xf << (i * 4);
  }
  return getI8Imm(NewImm, SDLoc(N));
}]>;

// Turn a 2-bit blendi immediate to 4-bit for use with pblendd.
def BlendScaleImm2to4 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 2; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0x3 << (i * 2);
  }
  return getI8Imm(NewImm, SDLoc(N));
}]>;

// Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it.
def BlendScaleCommuteImm4 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 4; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0x3 << (i * 2);
  }
  return getI8Imm(NewImm ^ 0xff, SDLoc(N));
}]>;

// Turn a 2-bit blendi immediate to 8-bit for use with pblendw and invert it.
def BlendScaleCommuteImm2 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 2; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0xf << (i * 4);
  }
  return getI8Imm(NewImm ^ 0xff, SDLoc(N));
}]>;

// Turn a 2-bit blendi immediate to 4-bit for use with pblendd and invert it.
def BlendScaleCommuteImm2to4 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 2; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0x3 << (i * 2);
  }
  return getI8Imm(NewImm ^ 0xf, SDLoc(N));
}]>;
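
// Illustrative worked example (exposition only): the same scaling and commute
// logic as the transforms above, written as plain C++. A 2-lane blendpd mask
// widens each bit to four pblendw bits; commuting the sources inverts the
// scaled mask.
//
//   #include <cstdint>
//   uint8_t blend_scale_imm2(uint8_t imm) {
//     uint8_t out = 0;
//     for (unsigned i = 0; i != 2; ++i)
//       if (imm & (1 << i))
//         out |= 0xf << (i * 4);
//     return out;                            // 0b01 -> 0x0f
//   }
//   uint8_t blend_scale_commute_imm2(uint8_t imm) {
//     return blend_scale_imm2(imm) ^ 0xff;   // 0b01 -> 0xf0
//   }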

let Predicates = [HasAVX] in {
  let isCommutable = 0 in {
    defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
                                        VR128, load, i128mem, 0,
                                        SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG;
  }

let Uses = [MXCSR], mayRaiseFPException = 1 in {
  let ExeDomain = SSEPackedSingle in
  defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
                                   VR128, load, f128mem, 0,
                                   SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG;
  let ExeDomain = SSEPackedDouble in
  defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
                                   VR128, load, f128mem, 0,
                                   SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG;
  let ExeDomain = SSEPackedSingle in
  defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
                                    VR256, load, i256mem, 0,
                                    SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG;
}
}

let Predicates = [HasAVX2] in {
  let isCommutable = 0 in {
    defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
                                         VR256, load, i256mem, 0,
                                         SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, VEX_WIG;
  }
}

let Constraints = "$src1 = $dst" in {
  let isCommutable = 0 in {
    defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
                                       VR128, memop, i128mem, 1,
                                       SchedWriteMPSAD.XMM>;
  }

  let ExeDomain = SSEPackedSingle in
  defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
                                  VR128, memop, f128mem, 1,
                                  SchedWriteDPPS.XMM>, SIMD_EXC;
  let ExeDomain = SSEPackedDouble in
  defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
                                  VR128, memop, f128mem, 1,
                                  SchedWriteDPPD.XMM>, SIMD_EXC;
}

/// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate
multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                           X86MemOperand x86memop, bit Is2Addr, Domain d,
                           X86FoldableSchedWrite sched, SDNodeXForm commuteXForm> {
let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
                    (ins RC:$src1, RC:$src2, u8imm:$src3),
                    !if(Is2Addr,
                        !strconcat(OpcodeStr,
                                   "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                        !strconcat(OpcodeStr,
                                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                    [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
                    Sched<[sched]>;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
                    (ins RC:$src1, x86memop:$src2, u8imm:$src3),
                    !if(Is2Addr,
                        !strconcat(OpcodeStr,
                                   "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                        !strconcat(OpcodeStr,
                                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                    [(set RC:$dst,
                      (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
                    Sched<[sched.Folded, sched.ReadAfterFold]>;
}

  // Pattern to commute if load is in first source.
  def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, timm:$src3)),
            (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
                                            (commuteXForm timm:$src3))>;
}

let Predicates = [HasAVX] in {
  defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
                                  VR128, load, f128mem, 0, SSEPackedSingle,
                                  SchedWriteFBlend.XMM, BlendCommuteImm4>,
                                  VEX_4V, VEX_WIG;
  defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
                                   VR256, load, f256mem, 0, SSEPackedSingle,
                                   SchedWriteFBlend.YMM, BlendCommuteImm8>,
                                   VEX_4V, VEX_L, VEX_WIG;
  defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
                                  VR128, load, f128mem, 0, SSEPackedDouble,
                                  SchedWriteFBlend.XMM, BlendCommuteImm2>,
                                  VEX_4V, VEX_WIG;
  defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
                                   VR256, load, f256mem, 0, SSEPackedDouble,
                                   SchedWriteFBlend.YMM, BlendCommuteImm4>,
                                   VEX_4V, VEX_L, VEX_WIG;
  defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
                                  VR128, load, i128mem, 0, SSEPackedInt,
                                  SchedWriteBlend.XMM, BlendCommuteImm8>,
                                  VEX_4V, VEX_WIG;
}

let Predicates = [HasAVX2] in {
  defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
                                   VR256, load, i256mem, 0, SSEPackedInt,
                                   SchedWriteBlend.YMM, BlendCommuteImm8>,
                                   VEX_4V, VEX_L, VEX_WIG;
}

// Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw.
// ExecutionDomainFixPass will clean up the domains later on.
let Predicates = [HasAVX1Only] in {
def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
          (VBLENDPDYrri VR256:$src1, VR256:$src2, timm:$src3)>;
def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
          (VBLENDPDYrmi VR256:$src1, addr:$src2, timm:$src3)>;
def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
          (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 timm:$src3))>;

// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
// it from becoming movsd via commuting under optsize.
def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
          (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;

def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), timm:$src3),
          (VBLENDPSYrri VR256:$src1, VR256:$src2, timm:$src3)>;
def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), timm:$src3),
          (VBLENDPSYrmi VR256:$src1, addr:$src2, timm:$src3)>;
def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, timm:$src3),
          (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 timm:$src3))>;

// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
// it from becoming movss via commuting under optsize.
def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
          (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), timm:$src3),
          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, timm:$src3),
          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
}

defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
                               VR128, memop, f128mem, 1, SSEPackedSingle,
                               SchedWriteFBlend.XMM, BlendCommuteImm4>;
defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64,
                               VR128, memop, f128mem, 1, SSEPackedDouble,
                               SchedWriteFBlend.XMM, BlendCommuteImm2>;
defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
                               VR128, memop, i128mem, 1, SSEPackedInt,
                               SchedWriteBlend.XMM, BlendCommuteImm8>;

let Predicates = [UseSSE41] in {
// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
// it from becoming movss/movsd via commuting under optsize.
def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
          (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), timm:$src3),
          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, timm:$src3),
          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;

def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
          (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), timm:$src3),
          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, timm:$src3),
          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
}
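
// Illustrative usage sketch (exposition only): immediate blends take each
// result lane from the second source where the corresponding immediate bit is
// set. In C++:
//
//   #include <smmintrin.h>
//   __m128 mix(__m128 a, __m128 b) {
//     return _mm_blend_ps(a, b, 0x5);   // lanes 0,2 from b; lanes 1,3 from a
//   }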

// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
let Predicates = [HasAVX] in {
def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)),
          (VBLENDPDYrri VR256:$src1,
                        (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0x3)>;
def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;

def : Pat<(insert_subvector (loadv4f64 addr:$src2), (v2f64 VR128:$src1), (iPTR 0)),
          (VBLENDPDYrmi (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xc)>;
def : Pat<(insert_subvector (loadv8f32 addr:$src2), (v4f32 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
}

/// SS41I_quaternary_avx - AVX SSE 4.1 with 4 operands
multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC,
                                X86MemOperand x86memop, ValueType VT,
                                PatFrag mem_frag, SDNode OpNode,
                                X86FoldableSchedWrite sched> {
  def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2, RC:$src3),
                  !strconcat(OpcodeStr,
                             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                  [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))],
                  SSEPackedInt>, TAPD, VEX_4V,
                  Sched<[sched]>;

  def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, x86memop:$src2, RC:$src3),
                  !strconcat(OpcodeStr,
                             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                  [(set RC:$dst,
                    (OpNode RC:$src3, (mem_frag addr:$src2),
                            RC:$src1))], SSEPackedInt>, TAPD, VEX_4V,
                  Sched<[sched.Folded, sched.ReadAfterFold,
                         // x86memop:$src2
                         ReadDefault, ReadDefault, ReadDefault, ReadDefault,
                         ReadDefault,
                         // RC::$src3
                         sched.ReadAfterFold]>;
}

let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedDouble in {
defm VBLENDVPD  : SS41I_quaternary_avx<0x4B, "vblendvpd", VR128, f128mem,
                                       v2f64, loadv2f64, X86Blendv,
                                       SchedWriteFVarBlend.XMM>;
defm VBLENDVPDY : SS41I_quaternary_avx<0x4B, "vblendvpd", VR256, f256mem,
                                       v4f64, loadv4f64, X86Blendv,
                                       SchedWriteFVarBlend.YMM>, VEX_L;
} // ExeDomain = SSEPackedDouble
let ExeDomain = SSEPackedSingle in {
defm VBLENDVPS  : SS41I_quaternary_avx<0x4A, "vblendvps", VR128, f128mem,
                                       v4f32, loadv4f32, X86Blendv,
                                       SchedWriteFVarBlend.XMM>;
defm VBLENDVPSY : SS41I_quaternary_avx<0x4A, "vblendvps", VR256, f256mem,
                                       v8f32, loadv8f32, X86Blendv,
                                       SchedWriteFVarBlend.YMM>, VEX_L;
} // ExeDomain = SSEPackedSingle
defm VPBLENDVB : SS41I_quaternary_avx<0x4C, "vpblendvb", VR128, i128mem,
                                      v16i8, loadv16i8, X86Blendv,
                                      SchedWriteVarBlend.XMM>;
}

let Predicates = [HasAVX2] in {
defm VPBLENDVBY : SS41I_quaternary_avx<0x4C, "vpblendvb", VR256, i256mem,
                                       v32i8, loadv32i8, X86Blendv,
                                       SchedWriteVarBlend.YMM>, VEX_L;
}

let Predicates = [HasAVX] in {
  def : Pat<(v4i32 (X86Blendv (v4i32 VR128:$mask), (v4i32 VR128:$src1),
                              (v4i32 VR128:$src2))),
            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v2i64 (X86Blendv (v2i64 VR128:$mask), (v2i64 VR128:$src1),
                              (v2i64 VR128:$src2))),
            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v8i32 (X86Blendv (v8i32 VR256:$mask), (v8i32 VR256:$src1),
                              (v8i32 VR256:$src2))),
            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v4i64 (X86Blendv (v4i64 VR256:$mask), (v4i64 VR256:$src1),
                              (v4i64 VR256:$src2))),
            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
}
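
// Illustrative usage sketch (exposition only): variable blends take the mask
// as a third register operand; the sign bit of each mask lane selects the
// source. In C++:
//
//   #include <smmintrin.h>
//   __m128 select_ps(__m128 mask, __m128 a, __m128 b) {
//     return _mm_blendv_ps(a, b, mask);   // sign bit set -> lane from b
//   }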

// Prefer a movss or movsd over a blendps when optimizing for size. These
// patterns were changed to use blends because blends have better throughput
// on Sandy Bridge and Haswell, but movs[s/d] are 1-2 bytes shorter.
let Predicates = [HasAVX, OptForSpeed] in {
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;

  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;

  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;

  // Move low f32 and clear high bits.
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
                    (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
                    (i8 1))), sub_xmm)>;
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
                    (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
                    (i8 3))), sub_xmm)>;
}

// Prefer a movss or movsd over a blendps when optimizing for size. These
// patterns were changed to use blends because blends have better throughput
// on Sandy Bridge and Haswell, but movs[s/d] are 1-2 bytes shorter.
let Predicates = [UseSSE41, OptForSpeed] in {
  // With SSE41 we can use blends for these patterns.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;

  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))),
            (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
            (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;

  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))),
            (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
            (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
}

/// SS41I_ternary - SSE 4.1 ternary operator
let Uses = [XMM0], Constraints = "$src1 = $dst" in {
  multiclass SS41I_ternary<bits<8> opc, string OpcodeStr, ValueType VT,
                           PatFrag mem_frag, X86MemOperand x86memop,
                           SDNode OpNode, X86FoldableSchedWrite sched> {
    def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2),
                    !strconcat(OpcodeStr,
                               "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
                    [(set VR128:$dst,
                      (VT (OpNode XMM0, VR128:$src2, VR128:$src1)))]>,
                    Sched<[sched]>;

    def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, x86memop:$src2),
                    !strconcat(OpcodeStr,
                               "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
                    [(set VR128:$dst,
                      (OpNode XMM0, (mem_frag addr:$src2), VR128:$src1))]>,
                    Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}

let ExeDomain = SSEPackedDouble in
defm BLENDVPD : SS41I_ternary<0x15, "blendvpd", v2f64, memopv2f64, f128mem,
                              X86Blendv, SchedWriteFVarBlend.XMM>;
let ExeDomain = SSEPackedSingle in
defm BLENDVPS : SS41I_ternary<0x14, "blendvps", v4f32, memopv4f32, f128mem,
                              X86Blendv, SchedWriteFVarBlend.XMM>;
defm PBLENDVB : SS41I_ternary<0x10, "pblendvb", v16i8, memopv16i8, i128mem,
                              X86Blendv, SchedWriteVarBlend.XMM>;

// Aliases with the implicit xmm0 argument
def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
                (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>;
def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
                (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>;
def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
                (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>;
def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
                (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>;
def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
                (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>;
def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
                (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>;

let Predicates = [UseSSE41] in {
  def : Pat<(v4i32 (X86Blendv (v4i32 XMM0), (v4i32 VR128:$src1),
                              (v4i32 VR128:$src2))),
            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v2i64 (X86Blendv (v2i64 XMM0), (v2i64 VR128:$src1),
                              (v2i64 VR128:$src2))),
            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
}
$dst|$dst, $src}", []>, 6390 Sched<[SchedWriteVecMoveLSNT.XMM.RM]>, VEX, VEX_WIG; 6391let Predicates = [HasAVX2, NoVLX] in 6392def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), 6393 "vmovntdqa\t{$src, $dst|$dst, $src}", []>, 6394 Sched<[SchedWriteVecMoveLSNT.YMM.RM]>, VEX, VEX_L, VEX_WIG; 6395def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 6396 "movntdqa\t{$src, $dst|$dst, $src}", []>, 6397 Sched<[SchedWriteVecMoveLSNT.XMM.RM]>; 6398 6399let Predicates = [HasAVX2, NoVLX] in { 6400 def : Pat<(v8f32 (alignednontemporalload addr:$src)), 6401 (VMOVNTDQAYrm addr:$src)>; 6402 def : Pat<(v4f64 (alignednontemporalload addr:$src)), 6403 (VMOVNTDQAYrm addr:$src)>; 6404 def : Pat<(v4i64 (alignednontemporalload addr:$src)), 6405 (VMOVNTDQAYrm addr:$src)>; 6406 def : Pat<(v8i32 (alignednontemporalload addr:$src)), 6407 (VMOVNTDQAYrm addr:$src)>; 6408 def : Pat<(v16i16 (alignednontemporalload addr:$src)), 6409 (VMOVNTDQAYrm addr:$src)>; 6410 def : Pat<(v32i8 (alignednontemporalload addr:$src)), 6411 (VMOVNTDQAYrm addr:$src)>; 6412} 6413 6414let Predicates = [HasAVX, NoVLX] in { 6415 def : Pat<(v4f32 (alignednontemporalload addr:$src)), 6416 (VMOVNTDQArm addr:$src)>; 6417 def : Pat<(v2f64 (alignednontemporalload addr:$src)), 6418 (VMOVNTDQArm addr:$src)>; 6419 def : Pat<(v2i64 (alignednontemporalload addr:$src)), 6420 (VMOVNTDQArm addr:$src)>; 6421 def : Pat<(v4i32 (alignednontemporalload addr:$src)), 6422 (VMOVNTDQArm addr:$src)>; 6423 def : Pat<(v8i16 (alignednontemporalload addr:$src)), 6424 (VMOVNTDQArm addr:$src)>; 6425 def : Pat<(v16i8 (alignednontemporalload addr:$src)), 6426 (VMOVNTDQArm addr:$src)>; 6427} 6428 6429let Predicates = [UseSSE41] in { 6430 def : Pat<(v4f32 (alignednontemporalload addr:$src)), 6431 (MOVNTDQArm addr:$src)>; 6432 def : Pat<(v2f64 (alignednontemporalload addr:$src)), 6433 (MOVNTDQArm addr:$src)>; 6434 def : Pat<(v2i64 (alignednontemporalload addr:$src)), 6435 (MOVNTDQArm addr:$src)>; 6436 def : Pat<(v4i32 (alignednontemporalload addr:$src)), 6437 (MOVNTDQArm addr:$src)>; 6438 def : Pat<(v8i16 (alignednontemporalload addr:$src)), 6439 (MOVNTDQArm addr:$src)>; 6440 def : Pat<(v16i8 (alignednontemporalload addr:$src)), 6441 (MOVNTDQArm addr:$src)>; 6442} 6443 6444} // AddedComplexity 6445 6446//===----------------------------------------------------------------------===// 6447// SSE4.2 - Compare Instructions 6448//===----------------------------------------------------------------------===// 6449 6450/// SS42I_binop_rm - Simple SSE 4.2 binary operator 6451multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, 6452 ValueType OpVT, RegisterClass RC, PatFrag memop_frag, 6453 X86MemOperand x86memop, X86FoldableSchedWrite sched, 6454 bit Is2Addr = 1> { 6455 def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst), 6456 (ins RC:$src1, RC:$src2), 6457 !if(Is2Addr, 6458 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 6459 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 6460 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, 6461 Sched<[sched]>; 6462 def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst), 6463 (ins RC:$src1, x86memop:$src2), 6464 !if(Is2Addr, 6465 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 6466 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 6467 [(set RC:$dst, 6468 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>, 6469 Sched<[sched.Folded, sched.ReadAfterFold]>; 6470} 6471 6472let Predicates = [HasAVX] in 6473 defm VPCMPGTQ : 

//===----------------------------------------------------------------------===//
// SSE4.2 - Compare Instructions
//===----------------------------------------------------------------------===//

/// SS42I_binop_rm - Simple SSE 4.2 binary operator
multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, X86FoldableSchedWrite sched,
                          bit Is2Addr = 1> {
  def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2),
                  !if(Is2Addr,
                      !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                      !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                  [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
                  Sched<[sched]>;
  def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, x86memop:$src2),
                  !if(Is2Addr,
                      !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                      !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                  [(set RC:$dst,
                    (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
                  Sched<[sched.Folded, sched.ReadAfterFold]>;
}

let Predicates = [HasAVX] in
  defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX_4V, VEX_WIG;

let Predicates = [HasAVX2] in
  defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;

let Constraints = "$src1 = $dst" in
  defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
                                memop, i128mem, SchedWriteVecALU.XMM>;

//===----------------------------------------------------------------------===//
// SSE4.2 - String/text Processing Instructions
//===----------------------------------------------------------------------===//

multiclass pcmpistrm_SS42AI<string asm> {
  def rr : SS42AI<0x62, MRMSrcReg, (outs),
                  (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                  !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
                  []>, Sched<[WritePCmpIStrM]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x62, MRMSrcMem, (outs),
                  (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
                  !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
                  []>, Sched<[WritePCmpIStrM.Folded, WritePCmpIStrM.ReadAfterFold]>;
}

let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
  defm PCMPISTRM : pcmpistrm_SS42AI<"pcmpistrm">;
}

multiclass SS42AI_pcmpestrm<string asm> {
  def rr : SS42AI<0x60, MRMSrcReg, (outs),
                  (ins VR128:$src1, VR128:$src3, u8imm:$src5),
                  !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
                  []>, Sched<[WritePCmpEStrM]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x60, MRMSrcMem, (outs),
                  (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
                  !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
                  []>, Sched<[WritePCmpEStrM.Folded, WritePCmpEStrM.ReadAfterFold]>;
}

let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
  defm PCMPESTRM : SS42AI_pcmpestrm<"pcmpestrm">;
}

multiclass SS42AI_pcmpistri<string asm> {
  def rr : SS42AI<0x63, MRMSrcReg, (outs),
                  (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                  !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
                  []>, Sched<[WritePCmpIStrI]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x63, MRMSrcMem, (outs),
                  (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
                  !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
                  []>, Sched<[WritePCmpIStrI.Folded, WritePCmpIStrI.ReadAfterFold]>;
}

let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
  defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">;
}

multiclass SS42AI_pcmpestri<string asm> {
  def rr : SS42AI<0x61, MRMSrcReg, (outs),
                  (ins VR128:$src1, VR128:$src3, u8imm:$src5),
                  !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
                  []>, Sched<[WritePCmpEStrI]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x61, MRMSrcMem, (outs),
                  (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
                  !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
                  []>, Sched<[WritePCmpEStrI.Folded, WritePCmpEStrI.ReadAfterFold]>;
}

let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
  defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">;
}
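
// Illustrative usage sketch (exposition only): the implicit-length string
// compares encode their mode in the immediate. In C++ with -msse4.2:
//
//   #include <nmmintrin.h>
//   int first_match(__m128i needle, __m128i haystack) {
//     // pcmpistri: index of the first haystack byte equal to any needle byte.
//     return _mm_cmpistri(needle, haystack,
//                         _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY);
//   }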

//===----------------------------------------------------------------------===//
// SSE4.2 - CRC Instructions
//===----------------------------------------------------------------------===//

// No CRC instructions have AVX equivalents.

// crc intrinsic instruction
// These instructions come only in r and m source forms; the only difference
// between the variants is the size of r and m.
class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
                   RegisterClass RCIn, SDPatternOperator Int> :
  SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
         [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))]>,
         Sched<[WriteCRC32]>;

class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
                   X86MemOperand x86memop, SDPatternOperator Int> :
  SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
         [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))]>,
         Sched<[WriteCRC32.Folded, WriteCRC32.ReadAfterFold]>;

let Constraints = "$src1 = $dst" in {
  def CRC32r32m8  : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
                                 int_x86_sse42_crc32_32_8>;
  def CRC32r32r8  : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
                                 int_x86_sse42_crc32_32_8>;
  def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
                                 int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
                                 int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
                                 int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
                                 int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
                                 int_x86_sse42_crc32_64_64>, REX_W;
  def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
                                 int_x86_sse42_crc32_64_64>, REX_W;
  let hasSideEffects = 0 in {
    let mayLoad = 1 in
    def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
                                  null_frag>, REX_W;
    def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
                                  null_frag>, REX_W;
  }
}
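
// Illustrative usage sketch (exposition only): crc32 accumulates a CRC-32C
// (Castagnoli) checksum, not the zlib CRC-32 polynomial. In C++:
//
//   #include <nmmintrin.h>
//   unsigned crc32c(unsigned crc, const unsigned char *p, unsigned n) {
//     for (unsigned i = 0; i != n; ++i)
//       crc = _mm_crc32_u8(crc, p[i]);   // crc32{b}
//     return crc;
//   }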
multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
                      X86FoldableSchedWrite sched, bit UsesXMM0 = 0> {
  def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2),
             !if(UsesXMM0,
                 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
             [!if(UsesXMM0,
                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>,
             T8PS, Sched<[sched]>;

  def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !if(UsesXMM0,
                 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
             [!if(UsesXMM0,
                  (set VR128:$dst, (IntId VR128:$src1,
                                    (memop addr:$src2), XMM0)),
                  (set VR128:$dst, (IntId VR128:$src1,
                                    (memop addr:$src2))))]>, T8PS,
             Sched<[sched.Folded, sched.ReadAfterFold]>;
}

let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
  def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
                         (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         [(set VR128:$dst,
                            (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
                                               (i8 timm:$src3)))]>, TAPS,
                         Sched<[SchedWriteVecIMul.XMM]>;
  def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
                         (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         [(set VR128:$dst,
                            (int_x86_sha1rnds4 VR128:$src1,
                                               (memop addr:$src2),
                                               (i8 timm:$src3)))]>, TAPS,
                         Sched<[SchedWriteVecIMul.XMM.Folded,
                                SchedWriteVecIMul.XMM.ReadAfterFold]>;

  defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte,
                              SchedWriteVecIMul.XMM>;
  defm SHA1MSG1  : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1,
                              SchedWriteVecIMul.XMM>;
  defm SHA1MSG2  : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2,
                              SchedWriteVecIMul.XMM>;

  let Uses=[XMM0] in
  defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2,
                                SchedWriteVecIMul.XMM, 1>;

  defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1,
                               SchedWriteVecIMul.XMM>;
  defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2,
                               SchedWriteVecIMul.XMM>;
}

// Aliases that accept the two-operand form, leaving the implicit %xmm0
// operand unwritten.
def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
                (SHA256RNDS2rr VR128:$dst, VR128:$src2), 0>;
def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
                (SHA256RNDS2rm VR128:$dst, i128mem:$src2), 0>;

//===----------------------------------------------------------------------===//
// AES-NI Instructions
//===----------------------------------------------------------------------===//

multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
                             Intrinsic IntId, PatFrag ld_frag,
                             bit Is2Addr = 0, RegisterClass RC = VR128,
                             X86MemOperand MemOp = i128mem> {
  let AsmString = OpcodeStr#
                  !if(Is2Addr, "\t{$src2, $dst|$dst, $src2}",
                               "\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
    def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst),
                   (ins RC:$src1, RC:$src2), "",
                   [(set RC:$dst, (IntId RC:$src1, RC:$src2))]>,
                   Sched<[WriteAESDecEnc]>;
    def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst),
                   (ins RC:$src1, MemOp:$src2), "",
                   [(set RC:$dst, (IntId RC:$src1, (ld_frag addr:$src2)))]>,
                   Sched<[WriteAESDecEnc.Folded, WriteAESDecEnc.ReadAfterFold]>;
  }
}

// Perform One Round of an AES Encryption/Decryption Flow
let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in {
  defm VAESENC     : AESI_binop_rm_int<0xDC, "vaesenc",
                                       int_x86_aesni_aesenc, load>, VEX_4V, VEX_WIG;
  defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast",
                                       int_x86_aesni_aesenclast, load>, VEX_4V, VEX_WIG;
  defm VAESDEC     : AESI_binop_rm_int<0xDE, "vaesdec",
                                       int_x86_aesni_aesdec, load>, VEX_4V, VEX_WIG;
  defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast",
                                       int_x86_aesni_aesdeclast, load>, VEX_4V, VEX_WIG;
}

let Predicates = [NoVLX, HasVAES] in {
  defm VAESENCY     : AESI_binop_rm_int<0xDC, "vaesenc",
                                        int_x86_aesni_aesenc_256, load, 0, VR256,
                                        i256mem>, VEX_4V, VEX_L, VEX_WIG;
  defm VAESENCLASTY : AESI_binop_rm_int<0xDD, "vaesenclast",
                                        int_x86_aesni_aesenclast_256, load, 0, VR256,
                                        i256mem>, VEX_4V, VEX_L, VEX_WIG;
  defm VAESDECY     : AESI_binop_rm_int<0xDE, "vaesdec",
                                        int_x86_aesni_aesdec_256, load, 0, VR256,
                                        i256mem>, VEX_4V, VEX_L, VEX_WIG;
  defm VAESDECLASTY : AESI_binop_rm_int<0xDF, "vaesdeclast",
                                        int_x86_aesni_aesdeclast_256, load, 0, VR256,
                                        i256mem>, VEX_4V, VEX_L, VEX_WIG;
}

let Constraints = "$src1 = $dst" in {
  defm AESENC     : AESI_binop_rm_int<0xDC, "aesenc",
                                      int_x86_aesni_aesenc, memop, 1>;
  defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast",
                                      int_x86_aesni_aesenclast, memop, 1>;
  defm AESDEC     : AESI_binop_rm_int<0xDE, "aesdec",
                                      int_x86_aesni_aesdec, memop, 1>;
  defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast",
                                      int_x86_aesni_aesdeclast, memop, 1>;
}

// Perform the AES InvMixColumn Transformation
let Predicates = [HasAVX, HasAES] in {
  def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
                        (ins VR128:$src1),
                        "vaesimc\t{$src1, $dst|$dst, $src1}",
                        [(set VR128:$dst,
                           (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
                        VEX, VEX_WIG;
  def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
                        (ins i128mem:$src1),
                        "vaesimc\t{$src1, $dst|$dst, $src1}",
                        [(set VR128:$dst, (int_x86_aesni_aesimc (load addr:$src1)))]>,
                        Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG;
}
def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
                     (ins VR128:$src1),
                     "aesimc\t{$src1, $dst|$dst, $src1}",
                     [(set VR128:$dst,
                        (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>;
def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
                     (ins i128mem:$src1),
                     "aesimc\t{$src1, $dst|$dst, $src1}",
                     [(set VR128:$dst, (int_x86_aesni_aesimc (memop addr:$src1)))]>,
                     Sched<[WriteAESIMC.Folded]>;

// AES Round Key Generation Assist
let Predicates = [HasAVX, HasAES] in {
  def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
                                    (ins VR128:$src1, u8imm:$src2),
                                    "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                                    [(set VR128:$dst,
                                       (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
                                    Sched<[WriteAESKeyGen]>, VEX, VEX_WIG;
  def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
                                    (ins i128mem:$src1, u8imm:$src2),
                                    "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                                    [(set VR128:$dst,
                                       (int_x86_aesni_aeskeygenassist (load addr:$src1), timm:$src2))]>,
                                    Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG;
}
def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
                                 (ins VR128:$src1, u8imm:$src2),
                                 "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                                 [(set VR128:$dst,
                                    (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
                                 Sched<[WriteAESKeyGen]>;
def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
                                 (ins i128mem:$src1, u8imm:$src2),
                                 "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                                 [(set VR128:$dst,
                                    (int_x86_aesni_aeskeygenassist (memop addr:$src1), timm:$src2))]>,
                                 Sched<[WriteAESKeyGen.Folded]>;

//===----------------------------------------------------------------------===//
// PCLMUL Instructions
//===----------------------------------------------------------------------===//
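// pclmulqdq multiplies one quadword selected from each source: imm[0] picks
// the low or high quadword of the first source and imm[4] picks from the
// second. Swapping the two nibbles ((Imm >> 4) | (Imm << 4)) therefore
// exchanges the selectors, e.g. 0x01 <-> 0x10, which is exactly what
// commuting the sources requires.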
// Immediate transform to help with commuting.
def PCLMULCommuteImm : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N));
}]>;

// SSE carry-less Multiplication instructions
let Predicates = [NoAVX, HasPCLMUL] in {
  let Constraints = "$src1 = $dst" in {
    let isCommutable = 1 in
    def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
                                (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                                "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                                [(set VR128:$dst,
                                   (int_x86_pclmulqdq VR128:$src1, VR128:$src2, timm:$src3))]>,
                                Sched<[WriteCLMul]>;

    def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
                                (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
                                "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                                [(set VR128:$dst,
                                   (int_x86_pclmulqdq VR128:$src1, (memop addr:$src2),
                                                      timm:$src3))]>,
                                Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
  } // Constraints = "$src1 = $dst"

  def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1,
                               (i8 timm:$src3)),
            (PCLMULQDQrm VR128:$src1, addr:$src2,
                         (PCLMULCommuteImm timm:$src3))>;
} // Predicates = [NoAVX, HasPCLMUL]

// SSE aliases
foreach HI = ["hq","lq"] in
foreach LO = ["hq","lq"] in {
  def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
                  (PCLMULQDQrr VR128:$dst, VR128:$src,
                   !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
  def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
                  (PCLMULQDQrm VR128:$dst, i128mem:$src,
                   !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
}
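// The alias mnemonics spell out the two selectors: the first "hq"/"lq" names
// the quadword taken from the destination source (imm bit 0) and the second
// names the quadword taken from the other source (imm bit 4), so e.g.
// pclmulhqlqdq assembles to pclmulqdq with an immediate of 0x01.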
// AVX carry-less Multiplication instructions
multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
                      PatFrag LdFrag, Intrinsic IntId> {
  let isCommutable = 1 in
  def rr : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst),
                     (ins RC:$src1, RC:$src2, u8imm:$src3),
                     "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                     [(set RC:$dst,
                        (IntId RC:$src1, RC:$src2, timm:$src3))]>,
                     Sched<[WriteCLMul]>;

  def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst),
                     (ins RC:$src1, MemOp:$src2, u8imm:$src3),
                     "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                     [(set RC:$dst,
                        (IntId RC:$src1, (LdFrag addr:$src2), timm:$src3))]>,
                     Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;

  // We can commute a load in the first operand by swapping the sources and
  // rotating the immediate.
  def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 timm:$src3)),
            (!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2,
             (PCLMULCommuteImm timm:$src3))>;
}

let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in
defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, load,
                             int_x86_pclmulqdq>, VEX_4V, VEX_WIG;

let Predicates = [NoVLX, HasVPCLMULQDQ] in
defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load,
                              int_x86_pclmulqdq_256>, VEX_4V, VEX_L, VEX_WIG;

multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC,
                                   X86MemOperand MemOp, string Hi, string Lo> {
  def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  (!cast<Instruction>(InstStr # "rr") RC:$dst, RC:$src1, RC:$src2,
                   !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
  def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  (!cast<Instruction>(InstStr # "rm") RC:$dst, RC:$src1, MemOp:$src2,
                   !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
}

multiclass vpclmulqdq_aliases<string InstStr, RegisterClass RC,
                              X86MemOperand MemOp> {
  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "hq">;
  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "lq">;
  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "hq">;
  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "lq">;
}

// AVX aliases
defm : vpclmulqdq_aliases<"VPCLMULQDQ", VR128, i128mem>;
defm : vpclmulqdq_aliases<"VPCLMULQDQY", VR256, i256mem>;

//===----------------------------------------------------------------------===//
// SSE4A Instructions
//===----------------------------------------------------------------------===//

let Predicates = [HasSSE4A] in {

let ExeDomain = SSEPackedInt in {
let Constraints = "$src = $dst" in {
def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
                 (ins VR128:$src, u8imm:$len, u8imm:$idx),
                 "extrq\t{$idx, $len, $src|$src, $len, $idx}",
                 [(set VR128:$dst, (X86extrqi VR128:$src, timm:$len,
                                              timm:$idx))]>,
                 PD, Sched<[SchedWriteVecALU.XMM]>;
def EXTRQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
               (ins VR128:$src, VR128:$mask),
               "extrq\t{$mask, $src|$src, $mask}",
               [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
                                                      VR128:$mask))]>,
               PD, Sched<[SchedWriteVecALU.XMM]>;

def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
                   "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
                   [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
                                                  timm:$len, timm:$idx))]>,
                   XD, Sched<[SchedWriteVecALU.XMM]>;
def INSERTQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src, VR128:$mask),
                 "insertq\t{$mask, $src|$src, $mask}",
                 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
                                                          VR128:$mask))]>,
                 XD, Sched<[SchedWriteVecALU.XMM]>;
}
} // ExeDomain = SSEPackedInt
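// MOVNTSS/MOVNTSD take an XMM register source, so the scalar store patterns
// below first move the FR32/FR64 value into VR128 with COPY_TO_REGCLASS.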
// Non-temporal (unaligned) scalar stores.
let AddedComplexity = 400 in { // Prefer non-temporal versions
let hasSideEffects = 0, mayStore = 1, SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in {
def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
                "movntss\t{$src, $dst|$dst, $src}", []>, XS;

def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                "movntsd\t{$src, $dst|$dst, $src}", []>, XD;
} // SchedRW

def : Pat<(nontemporalstore FR32:$src, addr:$dst),
          (MOVNTSS addr:$dst, (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;

def : Pat<(nontemporalstore FR64:$src, addr:$dst),
          (MOVNTSD addr:$dst, (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;

} // AddedComplexity
} // HasSSE4A

//===----------------------------------------------------------------------===//
// AVX Instructions
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// VBROADCAST - Load from memory and broadcast to all elements of the
// destination operand
//
class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, ValueType VT,
                       PatFrag bcast_frag, SchedWrite Sched> :
  AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
        !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
        [(set RC:$dst, (VT (bcast_frag addr:$src)))]>,
        Sched<[Sched]>, VEX;

// AVX2 adds register forms
class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
                        ValueType ResVT, ValueType OpVT, SchedWrite Sched> :
  AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
         [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
         Sched<[Sched]>, VEX;

let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in {
  def VBROADCASTSSrm  : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
                                         f32mem, v4f32, X86VBroadcastld32,
                                         SchedWriteFShuffle.XMM.Folded>;
  def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
                                         f32mem, v8f32, X86VBroadcastld32,
                                         SchedWriteFShuffle.XMM.Folded>, VEX_L;
}
let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in
def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
                                       v4f64, X86VBroadcastld64,
                                       SchedWriteFShuffle.XMM.Folded>, VEX_L;

let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in {
  def VBROADCASTSSrr  : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
                                          v4f32, v4f32, SchedWriteFShuffle.XMM>;
  def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
                                          v8f32, v4f32, WriteFShuffle256>, VEX_L;
}
let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in
def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
                                        v4f64, v2f64, WriteFShuffle256>, VEX_L;
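// AVX1 only provides the load forms of vbroadcastss/vbroadcastsd; the
// register forms above require AVX2, which is why the fallback patterns
// further down that broadcast from a register are gated on HasAVX2.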
//===----------------------------------------------------------------------===//
// VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both
// halves of a 256-bit vector.
//
let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in
def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
                           (ins i128mem:$src),
                           "vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
                           Sched<[WriteShuffleLd]>, VEX, VEX_L;

let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX],
    ExeDomain = SSEPackedSingle in
def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
                           (ins f128mem:$src),
                           "vbroadcastf128\t{$src, $dst|$dst, $src}", []>,
                           Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L;

let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
          (VBROADCASTF128 addr:$src)>;
def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))),
          (VBROADCASTF128 addr:$src)>;
}

// NOTE: We're using FP instructions here, but execution domain fixing can
// convert to integer when profitable.
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
          (VBROADCASTF128 addr:$src)>;
def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))),
          (VBROADCASTF128 addr:$src)>;
def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))),
          (VBROADCASTF128 addr:$src)>;
def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
          (VBROADCASTF128 addr:$src)>;
}

//===----------------------------------------------------------------------===//
// VINSERTF128 - Insert packed floating-point values
//
let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
                            (ins VR256:$src1, VR128:$src2, u8imm:$src3),
                            "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                            []>, Sched<[WriteFShuffle256]>, VEX_4V, VEX_L;
let mayLoad = 1 in
def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
                            (ins VR256:$src1, f128mem:$src2, u8imm:$src3),
                            "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                            []>, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
}

// To create a 256-bit all ones value, we should produce VCMPTRUEPS
// with YMM register containing zero.
// FIXME: Avoid producing vxorps to clear the fake inputs.
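// Comparison predicate 0xf is TRUE_UQ: vcmpps then sets every destination
// bit regardless of what the sources hold, so the zeroed inputs exist only
// to satisfy the register operands.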
let Predicates = [HasAVX1Only] in {
def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>;
}

multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To,
                            PatFrag memop_frag> {
  def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2),
                                    (iPTR imm)),
            (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2,
             (INSERT_get_vinsert128_imm VR256:$ins))>;
  def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
                                    (From (memop_frag addr:$src2)),
                                    (iPTR imm)),
            (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
             (INSERT_get_vinsert128_imm VR256:$ins))>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm : vinsert_lowering<"VINSERTF128", v4f32, v8f32, loadv4f32>;
  defm : vinsert_lowering<"VINSERTF128", v2f64, v4f64, loadv2f64>;
}

let Predicates = [HasAVX1Only] in {
  defm : vinsert_lowering<"VINSERTF128", v2i64, v4i64, loadv2i64>;
  defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32, loadv4i32>;
  defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv8i16>;
  defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8, loadv16i8>;
}

//===----------------------------------------------------------------------===//
// VEXTRACTF128 - Extract packed floating-point values
//
let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
                             (ins VR256:$src1, u8imm:$src2),
                             "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                             []>, Sched<[WriteFShuffle256]>, VEX, VEX_L;
let mayStore = 1 in
def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
                             (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
                             "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                             []>, Sched<[WriteFStoreX]>, VEX, VEX_L;
}

multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
  def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
            (To (!cast<Instruction>(InstrStr#rr)
                 (From VR256:$src1),
                 (EXTRACT_get_vextract128_imm VR128:$ext)))>;
  def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1),
                                                 (iPTR imm))), addr:$dst),
            (!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1,
             (EXTRACT_get_vextract128_imm VR128:$ext))>;
}

// AVX1 patterns
let Predicates = [HasAVX, NoVLX] in {
  defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>;
  defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>;
}

let Predicates = [HasAVX1Only] in {
  defm : vextract_lowering<"VEXTRACTF128", v4i64, v2i64>;
  defm : vextract_lowering<"VEXTRACTF128", v8i32, v4i32>;
  defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
  defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>;
}
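// With AVX2 the integer cases go through VINSERTI128/VEXTRACTI128 instead
// (see further down), which is why the integer lowerings above are limited
// to HasAVX1Only.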
//===----------------------------------------------------------------------===//
// VMASKMOV - Conditional SIMD Packed Loads and Stores
//
multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
                          Intrinsic IntLd, Intrinsic IntLd256,
                          Intrinsic IntSt, Intrinsic IntSt256,
                          X86SchedWriteMaskMove schedX,
                          X86SchedWriteMaskMove schedY> {
  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
                  (ins VR128:$src1, f128mem:$src2),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
                  VEX_4V, Sched<[schedX.RM]>;
  def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
                  (ins VR256:$src1, f256mem:$src2),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
                  VEX_4V, VEX_L, Sched<[schedY.RM]>;
  def mr  : AVX8I<opc_mr, MRMDestMem, (outs),
                  (ins f128mem:$dst, VR128:$src1, VR128:$src2),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>,
                  VEX_4V, Sched<[schedX.MR]>;
  def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
                  (ins f256mem:$dst, VR256:$src1, VR256:$src2),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
                  VEX_4V, VEX_L, Sched<[schedY.MR]>;
}

let ExeDomain = SSEPackedSingle in
defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
                                 int_x86_avx_maskload_ps,
                                 int_x86_avx_maskload_ps_256,
                                 int_x86_avx_maskstore_ps,
                                 int_x86_avx_maskstore_ps_256,
                                 WriteFMaskMove32, WriteFMaskMove32Y>;
let ExeDomain = SSEPackedDouble in
defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
                                 int_x86_avx_maskload_pd,
                                 int_x86_avx_maskload_pd_256,
                                 int_x86_avx_maskstore_pd,
                                 int_x86_avx_maskstore_pd_256,
                                 WriteFMaskMove64, WriteFMaskMove64Y>;

//===----------------------------------------------------------------------===//
// VPERMIL - Permute Single and Double Floating-Point Values
//

multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
                      RegisterClass RC, X86MemOperand x86memop_f,
                      X86MemOperand x86memop_i,
                      ValueType f_vt, ValueType i_vt,
                      X86FoldableSchedWrite sched,
                      X86FoldableSchedWrite varsched> {
  let Predicates = [HasAVX, NoVLX] in {
    def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
                   (ins RC:$src1, RC:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V,
                   Sched<[varsched]>;
    def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
                   (ins RC:$src1, x86memop_i:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
                                         (i_vt (load addr:$src2)))))]>, VEX_4V,
                   Sched<[varsched.Folded, sched.ReadAfterFold]>;

    def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
                     (ins RC:$src1, u8imm:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 timm:$src2))))]>, VEX,
                     Sched<[sched]>;
    def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
                     (ins x86memop_f:$src1, u8imm:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set RC:$dst,
                        (f_vt (X86VPermilpi (load addr:$src1), (i8 timm:$src2))))]>, VEX,
                     Sched<[sched.Folded]>;
  } // Predicates = [HasAVX, NoVLX]
}

let ExeDomain = SSEPackedSingle in {
  defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
                               v4f32, v4i32, SchedWriteFShuffle.XMM,
                               SchedWriteFVarShuffle.XMM>;
  defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
                               v8f32, v8i32, SchedWriteFShuffle.YMM,
                               SchedWriteFVarShuffle.YMM>, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
  defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
                               v2f64, v2i64, SchedWriteFShuffle.XMM,
                               SchedWriteFVarShuffle.XMM>;
  defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
                               v4f64, v4i64, SchedWriteFShuffle.YMM,
                               SchedWriteFVarShuffle.YMM>, VEX_L;
}

//===----------------------------------------------------------------------===//
// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
//

let ExeDomain = SSEPackedSingle in {
let isCommutable = 1 in
def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
                           (ins VR256:$src1, VR256:$src2, u8imm:$src3),
                           "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                           [(set VR256:$dst, (v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
                                                     (i8 timm:$src3))))]>, VEX_4V, VEX_L,
                           Sched<[WriteFShuffle256]>;
def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
                           (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
                           "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                           [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4f64 addr:$src2),
                                              (i8 timm:$src3)))]>, VEX_4V, VEX_L,
                           Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
}
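// vperm2f128's immediate is two 2-bit selectors: imm[1:0] picks the 128-bit
// half written to the low lane (0/1 = halves of src1, 2/3 = halves of src2)
// and imm[5:4] does the same for the high lane. XORing with 0x22 flips the
// source bit in each selector, e.g. 0x20 <-> 0x02, which commutes the two
// sources.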
// Immediate transform to help with commuting.
def Perm2XCommuteImm : SDNodeXForm<timm, [{
  return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N));
}]>;

let Predicates = [HasAVX] in {
// Pattern with load in other operand.
def : Pat<(v4f64 (X86VPerm2x128 (loadv4f64 addr:$src2),
                                VR256:$src1, (i8 timm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
}

let Predicates = [HasAVX1Only] in {
def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, timm:$imm)>;
def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
                                (loadv4i64 addr:$src2), (i8 timm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, timm:$imm)>;
// Pattern with load in other operand.
def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
                                VR256:$src1, (i8 timm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;
}

//===----------------------------------------------------------------------===//
// VZERO - Zero YMM registers
// Note: These instructions do not affect YMM16-YMM31.
//

let SchedRW = [WriteSystem] in {
let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
            YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
  // Zero All YMM registers
  def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
                   [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L,
                   Requires<[HasAVX]>, VEX_WIG;

  // Zero Upper bits of YMM registers
  def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
                     [(int_x86_avx_vzeroupper)]>, PS, VEX,
                     Requires<[HasAVX]>, VEX_WIG;
} // Defs
} // SchedRW

//===----------------------------------------------------------------------===//
// Half precision conversion instructions
//

multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop,
                      X86FoldableSchedWrite sched> {
  def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
             "vcvtph2ps\t{$src, $dst|$dst, $src}",
             [(set RC:$dst, (X86any_cvtph2ps VR128:$src))]>,
             T8PD, VEX, Sched<[sched]>;
  let hasSideEffects = 0, mayLoad = 1 in
  def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
             "vcvtph2ps\t{$src, $dst|$dst, $src}",
             []>, T8PD, VEX, Sched<[sched.Folded]>;
}

multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
                      SchedWrite RR, SchedWrite MR> {
  def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
               (ins RC:$src1, i32u8imm:$src2),
               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               [(set VR128:$dst, (X86any_cvtps2ph RC:$src1, timm:$src2))]>,
               TAPD, VEX, Sched<[RR]>;
  let hasSideEffects = 0, mayStore = 1 in
  def mr : Ii8<0x1D, MRMDestMem, (outs),
               (ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
               TAPD, VEX, Sched<[MR]>;
}
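// The 128-bit vcvtph2ps converts four halves loaded from 64 bits of memory
// (f64mem), while the 256-bit form reads 128 bits; the vcvtps2ph stores
// below are the mirror image.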
let Predicates = [HasF16C, NoVLX] in {
  defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>, SIMD_EXC;
  defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L, SIMD_EXC;
  defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH,
                               WriteCvtPS2PHSt>, SIMD_EXC;
  defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY,
                               WriteCvtPS2PHYSt>, VEX_L, SIMD_EXC;

  // Pattern match vcvtph2ps of a scalar i64 load.
  def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
            (VCVTPH2PSrm addr:$src)>;
  def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16
                    (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (VCVTPH2PSrm addr:$src)>;
  def : Pat<(v8f32 (X86any_cvtph2ps (loadv8i16 addr:$src))),
            (VCVTPH2PSYrm addr:$src)>;

  def : Pat<(store (f64 (extractelt
                         (bc_v2f64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))),
                         (iPTR 0))), addr:$dst),
            (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
  def : Pat<(store (i64 (extractelt
                         (bc_v2i64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))),
                         (iPTR 0))), addr:$dst),
            (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
  def : Pat<(store (v8i16 (X86any_cvtps2ph VR256:$src1, timm:$src2)), addr:$dst),
            (VCVTPS2PHYmr addr:$dst, VR256:$src1, timm:$src2)>;
}

//===----------------------------------------------------------------------===//
// AVX2 Instructions
//===----------------------------------------------------------------------===//

/// AVX2_blend_rmi - AVX2 blend with 8-bit immediate
multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, X86FoldableSchedWrite sched,
                          RegisterClass RC,
                          X86MemOperand x86memop, SDNodeXForm commuteXForm> {
  let isCommutable = 1 in
  def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
                     (ins RC:$src1, RC:$src2, u8imm:$src3),
                     !strconcat(OpcodeStr,
                                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                     [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
                     Sched<[sched]>, VEX_4V;
  def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
                     (ins RC:$src1, x86memop:$src2, u8imm:$src3),
                     !strconcat(OpcodeStr,
                                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                     [(set RC:$dst,
                        (OpVT (OpNode RC:$src1, (load addr:$src2), timm:$src3)))]>,
                     Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V;

  // Pattern to commute if load is in first source.
  def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, timm:$src3)),
            (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
             (commuteXForm timm:$src3))>;
}
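// X86Blendi on 64-bit elements carries one immediate bit per qword, while
// vpblendd wants one bit per dword, so the patterns below widen each qword
// bit into a pair of dword bits (the BlendScale*Imm transforms) before
// reusing the dword blend.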
let Predicates = [HasAVX2] in {
defm VPBLENDD  : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
                                SchedWriteBlend.XMM, VR128, i128mem,
                                BlendCommuteImm4>;
defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
                                SchedWriteBlend.YMM, VR256, i256mem,
                                BlendCommuteImm8>, VEX_L;

def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
          (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 timm:$src3))>;
def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
          (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
          (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;

def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
          (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 timm:$src3))>;
def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
          (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 timm:$src3))>;
def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
          (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 timm:$src3))>;
}

// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
// NOTE: We're using FP instructions here, but execution domain fixing should
// take care of using integer instructions when profitable.
let Predicates = [HasAVX] in {
def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;

def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
}
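// The 0xf immediate selects the low four dwords from the freshly inserted
// subvector (the blend's second operand); the load patterns use 0xf0 because
// there the inserted subvector sits in the blend's first, register operand.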
7436// 7437multiclass avx2_broadcast<bits<8> opc, string OpcodeStr, 7438 X86MemOperand x86memop, PatFrag bcast_frag, 7439 ValueType OpVT128, ValueType OpVT256, Predicate prd> { 7440 let Predicates = [HasAVX2, prd] in { 7441 def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 7442 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7443 [(set VR128:$dst, 7444 (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>, 7445 Sched<[SchedWriteShuffle.XMM]>, VEX; 7446 def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), 7447 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7448 [(set VR128:$dst, 7449 (OpVT128 (bcast_frag addr:$src)))]>, 7450 Sched<[SchedWriteShuffle.XMM.Folded]>, VEX; 7451 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), 7452 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7453 [(set VR256:$dst, 7454 (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>, 7455 Sched<[WriteShuffle256]>, VEX, VEX_L; 7456 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src), 7457 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7458 [(set VR256:$dst, 7459 (OpVT256 (bcast_frag addr:$src)))]>, 7460 Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L; 7461 7462 // Provide aliases for broadcast from the same register class that 7463 // automatically does the extract. 7464 def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))), 7465 (!cast<Instruction>(NAME#"Yrr") 7466 (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>; 7467 } 7468} 7469 7470defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, X86VBroadcastld8, 7471 v16i8, v32i8, NoVLX_Or_NoBWI>; 7472defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, X86VBroadcastld16, 7473 v8i16, v16i16, NoVLX_Or_NoBWI>; 7474defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, X86VBroadcastld32, 7475 v4i32, v8i32, NoVLX>; 7476defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, X86VBroadcastld64, 7477 v2i64, v4i64, NoVLX>; 7478 7479let Predicates = [HasAVX2, NoVLX] in { 7480 // Provide fallback in case the load node that is used in the patterns above 7481 // is used by additional users, which prevents the pattern selection. 
let Predicates = [HasAVX2, NoVLX] in {
  // Provide fallback in case the load node that is used in the patterns above
  // is used by additional users, which prevents the pattern selection.
  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
            (VBROADCASTSSrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
            (VBROADCASTSSYrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
            (VBROADCASTSDYrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
}

let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
            (VPBROADCASTBrr (VMOVDI2PDIrr
                             (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                                 GR8:$src, sub_8bit))))>;
  def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
            (VPBROADCASTBYrr (VMOVDI2PDIrr
                              (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                                  GR8:$src, sub_8bit))))>;

  def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
            (VPBROADCASTWrr (VMOVDI2PDIrr
                             (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                                 GR16:$src, sub_16bit))))>;
  def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
            (VPBROADCASTWYrr (VMOVDI2PDIrr
                              (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                                  GR16:$src, sub_16bit))))>;
}
let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
            (VPBROADCASTDrr (VMOVDI2PDIrr GR32:$src))>;
  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
            (VPBROADCASTDYrr (VMOVDI2PDIrr GR32:$src))>;
  def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
            (VPBROADCASTQrr (VMOV64toPQIrr GR64:$src))>;
  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
            (VPBROADCASTQYrr (VMOV64toPQIrr GR64:$src))>;
}

// AVX1 broadcast patterns
let Predicates = [HasAVX1Only] in {
def : Pat<(v8i32 (X86VBroadcastld32 addr:$src)),
          (VBROADCASTSSYrm addr:$src)>;
def : Pat<(v4i64 (X86VBroadcastld64 addr:$src)),
          (VBROADCASTSDYrm addr:$src)>;
def : Pat<(v4i32 (X86VBroadcastld32 addr:$src)),
          (VBROADCASTSSrm addr:$src)>;
}
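// There is no direct move from a GR8/GR16 into an XMM register, so the
// byte/word patterns above first widen the value into a GR32 with
// INSERT_SUBREG and move that with VMOVDI2PDIrr; the broadcast only reads
// the low byte/word, so the undefined upper bits are harmless.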
// Provide fallback in case the load node that is used in the patterns above
// is used by additional users, which prevents the pattern selection.
let Predicates = [HasAVX, NoVLX] in {
  // 128bit broadcasts:
  def : Pat<(v2f64 (X86VBroadcast f64:$src)),
            (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
  def : Pat<(v2f64 (X86VBroadcastld64 addr:$src)),
            (VMOVDDUPrm addr:$src)>;

  def : Pat<(v2f64 (X86VBroadcast v2f64:$src)),
            (VMOVDDUPrr VR128:$src)>;
}

let Predicates = [HasAVX1Only] in {
  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
            (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>;
  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
                            (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm),
                           (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>;
  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
                            (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm),
                           (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>;

  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
            (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)>;
  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                            (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), sub_xmm),
                           (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), 1)>;
  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
                            (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), sub_xmm),
                           (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), 1)>;

  def : Pat<(v2i64 (X86VBroadcast i64:$src)),
            (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)>;
  def : Pat<(v2i64 (X86VBroadcastld64 addr:$src)),
            (VMOVDDUPrm addr:$src)>;
}

//===----------------------------------------------------------------------===//
// VPERM - Permute instructions
//

multiclass avx2_perm<bits<8> opc, string OpcodeStr,
                     ValueType OpVT, X86FoldableSchedWrite Sched,
                     X86MemOperand memOp> {
  let Predicates = [HasAVX2, NoVLX] in {
    def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
                     (ins VR256:$src1, VR256:$src2),
                     !strconcat(OpcodeStr,
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR256:$dst,
                        (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
                     Sched<[Sched]>, VEX_4V, VEX_L;
    def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
                     (ins VR256:$src1, memOp:$src2),
                     !strconcat(OpcodeStr,
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR256:$dst,
                        (OpVT (X86VPermv VR256:$src1,
                               (load addr:$src2))))]>,
                     Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX_4V, VEX_L;
  }
}

defm VPERMD : avx2_perm<0x36, "vpermd", v8i32, WriteVarShuffle256, i256mem>;
let ExeDomain = SSEPackedSingle in
defm VPERMPS : avx2_perm<0x16, "vpermps", v8f32, WriteFVarShuffle256, f256mem>;
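// vpermd/vpermps take their lane indices in a register; vpermq/vpermpd below
// encode them as four 2-bit fields in an 8-bit immediate instead.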
multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                         ValueType OpVT, X86FoldableSchedWrite Sched,
                         X86MemOperand memOp> {
  let Predicates = [HasAVX2, NoVLX] in {
    def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
                       (ins VR256:$src1, u8imm:$src2),
                       !strconcat(OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
                          (OpVT (X86VPermi VR256:$src1, (i8 timm:$src2))))]>,
                       Sched<[Sched]>, VEX, VEX_L;
    def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
                       (ins memOp:$src1, u8imm:$src2),
                       !strconcat(OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
                          (OpVT (X86VPermi (mem_frag addr:$src1),
                                 (i8 timm:$src2))))]>,
                       Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VEX_L;
  }
}

defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
                            WriteShuffle256, i256mem>, VEX_W;
let ExeDomain = SSEPackedDouble in
defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
                             WriteFShuffle256, f256mem>, VEX_W;

//===----------------------------------------------------------------------===//
// VPERM2I128 - Permute Integer Values in 128-bit chunks
//
let isCommutable = 1 in
def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
                            (ins VR256:$src1, VR256:$src2, u8imm:$src3),
                            "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                            [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
                                                      (i8 timm:$src3))))]>, Sched<[WriteShuffle256]>,
                            VEX_4V, VEX_L;
def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
                            (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
                            "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                            [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
                                               (i8 timm:$src3)))]>,
                            Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;

let Predicates = [HasAVX2] in
def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
                                VR256:$src1, (i8 timm:$imm))),
          (VPERM2I128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>;


//===----------------------------------------------------------------------===//
// VINSERTI128 - Insert packed integer values
//
let hasSideEffects = 0 in {
def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
                             (ins VR256:$src1, VR128:$src2, u8imm:$src3),
                             "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                             []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
let mayLoad = 1 in
def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
                             (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
                             "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                             []>, Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
}

let Predicates = [HasAVX2, NoVLX] in {
  defm : vinsert_lowering<"VINSERTI128", v2i64, v4i64, loadv2i64>;
  defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32, loadv4i32>;
  defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv8i16>;
  defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8, loadv16i8>;
}

//===----------------------------------------------------------------------===//
// VEXTRACTI128 - Extract packed integer values
//
def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
                              (ins VR256:$src1, u8imm:$src2),
                              "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                              Sched<[WriteShuffle256]>, VEX, VEX_L;
let hasSideEffects = 0, mayStore = 1 in
def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
                              (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
                              "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                              Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_L;
let Predicates = [HasAVX2, NoVLX] in {
  defm : vextract_lowering<"VEXTRACTI128", v4i64, v2i64>;
  defm : vextract_lowering<"VEXTRACTI128", v8i32, v4i32>;
  defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
  defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>;
}

//===----------------------------------------------------------------------===//
// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
//
multiclass avx2_pmovmask<string OpcodeStr,
                         Intrinsic IntLd128, Intrinsic IntLd256,
                         Intrinsic IntSt128, Intrinsic IntSt256,
                         X86SchedWriteMaskMove schedX,
                         X86SchedWriteMaskMove schedY> {
  def rm  : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
                   (ins VR128:$src1, i128mem:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>,
                   VEX_4V, Sched<[schedX.RM]>;
  def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
                   (ins VR256:$src1, i256mem:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
                   VEX_4V, VEX_L, Sched<[schedY.RM]>;
  def mr  : AVX28I<0x8e, MRMDestMem, (outs),
                   (ins i128mem:$dst, VR128:$src1, VR128:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>,
                   VEX_4V, Sched<[schedX.MR]>;
  def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
                   (ins i256mem:$dst, VR256:$src1, VR256:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
                   VEX_4V, VEX_L, Sched<[schedY.MR]>;
}

defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
                                int_x86_avx2_maskload_d,
                                int_x86_avx2_maskload_d_256,
                                int_x86_avx2_maskstore_d,
                                int_x86_avx2_maskstore_d_256,
                                WriteVecMaskMove32, WriteVecMaskMove32Y>;
defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
                                int_x86_avx2_maskload_q,
                                int_x86_avx2_maskload_q_256,
                                int_x86_avx2_maskstore_q,
                                int_x86_avx2_maskstore_q_256,
                                WriteVecMaskMove64, WriteVecMaskMove64Y>, VEX_W;

multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
                            ValueType MaskVT> {
  // masked store
  def : Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)),
            (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
  // masked load
  def : Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)),
            (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
  def : Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask),
                             (VT immAllZerosV))),
            (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
}
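// The maskmov loads zero every element whose mask bit is clear, so a
// masked_load whose passthru is undef or all-zeros can be selected directly
// to the plain instruction.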
let Predicates = [HasAVX] in {
  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64>;
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64>;
}
let Predicates = [HasAVX1Only] in {
  // Integer masked load/store is not supported on AVX1; use the ps/pd
  // versions instead.
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64>;
  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64>;
}
let Predicates = [HasAVX2] in {
  defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32>;
  defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64>;
  defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32>;
  defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64>;
}

//===----------------------------------------------------------------------===//
// SubVector Broadcasts
// Provide fallback in case the load node that is used in the patterns above
// is used by additional users, which prevents the pattern selection.

let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v2f64 VR128:$src), 1)>;
def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v4f32 VR128:$src), 1)>;
}

// NOTE: We're using FP instructions here, but execution domain fixing can
// convert to integer when profitable.
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v2i64 VR128:$src), 1)>;
def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v4i32 VR128:$src), 1)>;
def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v8i16 VR128:$src), 1)>;
def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
          (VINSERTF128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
                         (v16i8 VR128:$src), 1)>;
}

//===----------------------------------------------------------------------===//
// Variable Bit Shifts
//
multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType vt128, ValueType vt256> {
  def rr  : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src1, VR128:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR128:$dst,
                      (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
                   VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>;
  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
                   (ins VR128:$src1, i128mem:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR128:$dst,
                      (vt128 (OpNode VR128:$src1,
                              (vt128 (load addr:$src2)))))]>,
                   VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded,
                                  SchedWriteVarVecShift.XMM.ReadAfterFold]>;
  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
                   (ins VR256:$src1, VR256:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR256:$dst,
                      (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
                   VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>;
  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
                   (ins VR256:$src1, i256mem:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR256:$dst,
                      (vt256 (OpNode VR256:$src1,
                              (vt256 (load addr:$src2)))))]>,
                   VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded,
                                         SchedWriteVarVecShift.YMM.ReadAfterFold]>;
}
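// Unlike the legacy psll/psrl forms, these shift each element by its own
// per-element count; counts not smaller than the element width produce zero
// (or a sign-fill for vpsravd).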
let Predicates = [HasAVX2, NoVLX] in {
  defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", X86vshlv, v4i32, v8i32>;
  defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", X86vshlv, v2i64, v4i64>, VEX_W;
  defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", X86vsrlv, v4i32, v8i32>;
  defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", X86vsrlv, v2i64, v4i64>, VEX_W;
  defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", X86vsrav, v4i32, v8i32>;
}

//===----------------------------------------------------------------------===//
// VGATHER - GATHER Operations

// FIXME: Improve scheduling of gather instructions.
multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx,
                       ValueType VTy, RegisterClass RC256,
                       X86MemOperand memop128, X86MemOperand memop256,
                       ValueType MTx = VTx, ValueType MTy = VTy> {
let mayLoad = 1, hasSideEffects = 0 in {
  def rm  : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
                   (ins VR128:$src1, memop128:$src2, VR128:$mask),
                   !strconcat(OpcodeStr,
                              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
                   []>, VEX, Sched<[WriteLoad]>;
  def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
                   (ins RC256:$src1, memop256:$src2, RC256:$mask),
                   !strconcat(OpcodeStr,
                              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
                   []>, VEX, VEX_L, Sched<[WriteLoad]>;
}
}

let Predicates = [HasAVX2] in {
  let mayLoad = 1, hasSideEffects = 0, Constraints
    = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
    in {
    defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64,
                                  VR256, vx128mem, vx256mem>, VEX_W;
    defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64,
                                  VR256, vx128mem, vy256mem>, VEX_W;
    defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32,
                                  VR256, vx128mem, vy256mem>;
    defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32,
                                  VR128, vx64mem, vy128mem>;

    let ExeDomain = SSEPackedDouble in {
      defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64,
                                    VR256, vx128mem, vx256mem, v2i64, v4i64>, VEX_W;
      defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64,
                                    VR256, vx128mem, vy256mem, v2i64, v4i64>, VEX_W;
    }

    let ExeDomain = SSEPackedSingle in {
      defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32,
                                    VR256, vx128mem, vy256mem, v4i32, v8i32>;
      defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32,
                                    VR128, vx64mem, vy128mem, v4i32, v4i32>;
    }
  }
}

//===----------------------------------------------------------------------===//
// GFNI instructions
//===----------------------------------------------------------------------===//

multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
                        RegisterClass RC, PatFrag MemOpFrag,
                        X86MemOperand X86MemOp, bit Is2Addr = 0> {
  let ExeDomain = SSEPackedInt,
      AsmString = !if(Is2Addr,
                      OpcodeStr#"\t{$src2, $dst|$dst, $src2}",
                      OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
    let isCommutable = 1 in
    def rr : PDI<0xCF, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
                 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
                 Sched<[SchedWriteVecALU.XMM]>, T8PD;

    def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
                 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1,
                                       (MemOpFrag addr:$src2))))]>,
                 Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>, T8PD;
  }
}
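// gf2p8mulb multiplies bytes as elements of GF(2^8) reduced by the AES
// polynomial x^8 + x^4 + x^3 + x + 1; the affine instructions below apply a
// bit-matrix transform plus a constant to each byte, optionally composed
// with the field inverse (the "inv" form).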
multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
                           SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag,
                           X86MemOperand X86MemOp, bit Is2Addr = 0> {
  let AsmString = !if(Is2Addr,
                      OpStr#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                      OpStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in {
    def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2, u8imm:$src3), "",
                  [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))],
                  SSEPackedInt>, Sched<[SchedWriteVecALU.XMM]>;
    def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
                  [(set RC:$dst, (OpVT (OpNode RC:$src1,
                                        (MemOpFrag addr:$src2),
                                        timm:$src3)))], SSEPackedInt>,
                  Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>;
  }
}

multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
  let Constraints = "$src1 = $dst",
      Predicates  = [HasGFNI, UseSSE2] in
  defm NAME : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
                              VR128, load, i128mem, 1>;
  let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
    defm V#NAME   : GF2P8AFFINE_rmi<Op, "v"#OpStr, v16i8, OpNode, VR128,
                                    load, i128mem>, VEX_4V, VEX_W;
    defm V#NAME#Y : GF2P8AFFINE_rmi<Op, "v"#OpStr, v32i8, OpNode, VR256,
                                    load, i256mem>, VEX_4V, VEX_L, VEX_W;
  }
}

// GF2P8MULB
let Constraints = "$src1 = $dst",
    Predicates  = [HasGFNI, UseSSE2] in
defm GF2P8MULB : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop,
                              i128mem, 1>;
let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
  defm VGF2P8MULB  : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load,
                                  i128mem>, VEX_4V;
  defm VGF2P8MULBY : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load,
                                  i256mem>, VEX_4V, VEX_L;
}
// GF2P8AFFINEINVQB, GF2P8AFFINEQB
let isCommutable = 0 in {
  defm GF2P8AFFINEINVQB : GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb",
                                             X86GF2P8affineinvqb>, TAPD;
  defm GF2P8AFFINEQB    : GF2P8AFFINE_common<0xCE, "gf2p8affineqb",
                                             X86GF2P8affineqb>, TAPD;
}