//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file describes the X86 SSE instruction set, defining the instructions,
// and properties of the instructions which are needed for code generation,
// machine code emission, and analysis.
//
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// SSE 1 & 2 Instructions Classes
//===----------------------------------------------------------------------===//

/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, X86MemOperand x86memop,
                           Domain d, X86FoldableSchedWrite sched,
                           bit Is2Addr = 1> {
let isCodeGenOnly = 1 in {
  let isCommutable = 1 in {
    def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
                !if(Is2Addr,
                    !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                    !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], d>,
                Sched<[sched]>;
  }
  def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>,
              Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}

/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
                               SDPatternOperator OpNode, RegisterClass RC,
                               ValueType VT, string asm, Operand memopr,
                               ComplexPattern mem_cpat, Domain d,
                               X86FoldableSchedWrite sched, bit Is2Addr = 1> {
let hasSideEffects = 0 in {
  def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
               !if(Is2Addr,
                   !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], d>,
               Sched<[sched]>;
  let mayLoad = 1 in
  def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
               !if(Is2Addr,
                   !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst, (VT (OpNode RC:$src1, mem_cpat:$src2)))], d>,
               Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}

/// sse12_fp_packed - SSE 1 & 2 packed instructions class
multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, ValueType vt,
                           X86MemOperand x86memop, PatFrag mem_frag,
                           Domain d, X86FoldableSchedWrite sched,
                           bit Is2Addr = 1> {
  let isCommutable = 1 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
                !if(Is2Addr,
                    !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                    !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>,
                Sched<[sched]>;
  let mayLoad = 1 in
    def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                !if(Is2Addr,
                    !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                    !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
                d>,
                Sched<[sched.Folded, sched.ReadAfterFold]>;
}

/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
                                      string OpcodeStr, X86MemOperand x86memop,
                                      X86FoldableSchedWrite sched,
                                      list<dag> pat_rr, list<dag> pat_rm,
                                      bit Is2Addr = 1> {
  let isCommutable = 1, hasSideEffects = 0 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              pat_rr, d>,
              Sched<[sched]>;
  let hasSideEffects = 0, mayLoad = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              pat_rm, d>,
              Sched<[sched.Folded, sched.ReadAfterFold]>;
}


// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
// This is expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
  def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
                   [(set FR64:$dst, fp64imm0)]>, Requires<[HasSSE2, NoAVX512]>;
  def FsFLD0F128 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                     [(set VR128:$dst, fp128imm0)]>, Requires<[HasSSE1, NoAVX512]>;
}

//===----------------------------------------------------------------------===//
// AVX & SSE - Zero/One Vectors
//===----------------------------------------------------------------------===//

// Alias instruction that maps zero vector to pxor / xorp* for sse.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
// swizzled by ExecutionDomainFix to pxor.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
               [(set VR128:$dst, (v4f32 immAllZerosV))]>;
}

let Predicates = [NoAVX512] in {
def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
}


// The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support PI,
// and doesn't need it because on Sandy Bridge the register is set to zero
// at the rename stage without using any execution unit, so SET0PSY
// and SET0PDY can be used for vector int instructions without penalty.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                 [(set VR256:$dst, (v8i32 immAllZerosV))]>;
}

let Predicates = [NoAVX512] in {
def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
def : Pat<(v8f32 immAllZerosV), (AVX_SET0)>;
def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
}

// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-ones value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
  let Predicates = [HasAVX1Only, OptForMinSize] in {
  def AVX1_SETALLONES: I<0, Pseudo, (outs VR256:$dst), (ins), "",
                         [(set VR256:$dst, (v8i32 immAllOnesV))]>;
  }
  let Predicates = [HasAVX2] in
  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move FP Scalar Instructions
//
// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
// register copies because it's a partial register update; Register-to-register
// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
// that the insert be implementable in terms of a copy, and as just mentioned,
// we don't use movss/movsd for copies.
188//===----------------------------------------------------------------------===// 189 190multiclass sse12_move_rr<SDNode OpNode, ValueType vt, 191 X86MemOperand x86memop, string base_opc, 192 string asm_opr, Domain d, string Name> { 193 let isCommutable = 1 in 194 def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst), 195 (ins VR128:$src1, VR128:$src2), 196 !strconcat(base_opc, asm_opr), 197 [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], d>, 198 Sched<[SchedWriteFShuffle.XMM]>; 199 200 // For the disassembler 201 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in 202 def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst), 203 (ins VR128:$src1, VR128:$src2), 204 !strconcat(base_opc, asm_opr), []>, 205 Sched<[SchedWriteFShuffle.XMM]>, FoldGenData<Name#rr>; 206} 207 208multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt, 209 X86MemOperand x86memop, string OpcodeStr, 210 Domain d, string Name, Predicate pred> { 211 // AVX 212 let Predicates = [UseAVX, OptForSize] in 213 defm V#NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr, 214 "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d, 215 "V"#Name>, 216 VEX_4V, VEX_LIG, VEX_WIG; 217 218 def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), 219 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 220 [(store RC:$src, addr:$dst)], d>, 221 VEX, VEX_LIG, Sched<[WriteFStore]>, VEX_WIG; 222 // SSE1 & 2 223 let Constraints = "$src1 = $dst" in { 224 let Predicates = [pred, NoSSE41_Or_OptForSize] in 225 defm NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr, 226 "\t{$src2, $dst|$dst, $src2}", d, Name>; 227 } 228 229 def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), 230 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 231 [(store RC:$src, addr:$dst)], d>, 232 Sched<[WriteFStore]>; 233 234 def : InstAlias<"v"#OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}", 235 (!cast<Instruction>("V"#NAME#"rr_REV") 236 VR128:$dst, VR128:$src1, VR128:$src2), 0>; 237 def : InstAlias<OpcodeStr#".s\t{$src2, $dst|$dst, $src2}", 238 (!cast<Instruction>(NAME#"rr_REV") 239 VR128:$dst, VR128:$src2), 0>; 240} 241 242// Loading from memory automatically zeroing upper bits. 243multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop, 244 PatFrag mem_pat, PatFrag vzloadfrag, string OpcodeStr, 245 Domain d> { 246 def V#NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), 247 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 248 [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>, 249 VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG; 250 def NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), 251 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 252 [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>, 253 Sched<[WriteFLoad]>; 254 255 // _alt version uses FR32/FR64 register class. 
  let isCodeGenOnly = 1 in {
  def V#NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                         [(set RC:$dst, (mem_pat addr:$src))], d>,
                         VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
  def NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                       !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                       [(set RC:$dst, (mem_pat addr:$src))], d>,
                       Sched<[WriteFLoad]>;
  }
}

defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
                        SSEPackedSingle, "MOVSS", UseSSE1>, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
                        SSEPackedDouble, "MOVSD", UseSSE2>, XD;

let canFoldAsLoad = 1, isReMaterializable = 1 in {
  defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss",
                             SSEPackedSingle>, XS;
  defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, X86vzload64, "movsd",
                             SSEPackedDouble>, XD;
}

// Patterns
let Predicates = [UseAVX] in {
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (VMOVSSrm addr:$src)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (VMOVSDrm addr:$src)>;

  // Represent the same patterns above but in the form they appear for
  // 256-bit types
  def : Pat<(v8f32 (X86vzload32 addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
  def : Pat<(v4f64 (X86vzload64 addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
}

let Predicates = [UseAVX, OptForSize] in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVSS to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VMOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VMOVSSrr (v4i32 (V_SET0)), VR128:$src)>;

  // Move low f32 and clear high bits.
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4f32 (VMOVSSrr (v4f32 (V_SET0)),
                    (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))), sub_xmm)>;
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4i32 (VMOVSSrr (v4i32 (V_SET0)),
                    (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
}

let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
// Move scalar to XMM zero-extended, zeroing a VR128 then do a
// MOVSS to the lower bits.
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
          (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
          (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
}

let Predicates = [UseSSE2] in
def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
          (MOVSDrm addr:$src)>;

let Predicates = [UseSSE1] in
def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
          (MOVSSrm addr:$src)>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
                            X86MemOperand x86memop, PatFrag ld_frag,
                            string asm, Domain d,
                            X86SchedWriteMoveLS sched> {
let hasSideEffects = 0, isMoveReg = 1 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
              Sched<[sched.RR]>;
let canFoldAsLoad = 1, isReMaterializable = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set RC:$dst, (ld_frag addr:$src))], d>,
              Sched<[sched.RM]>;
}

let Predicates = [HasAVX, NoVLX] in {
defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                                PS, VEX, VEX_WIG;
defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                                PD, VEX, VEX_WIG;
defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                                PS, VEX, VEX_WIG;
defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                                PD, VEX, VEX_WIG;

defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps",
                                 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
                                 PS, VEX, VEX_L, VEX_WIG;
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd",
                                 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
                                 PD, VEX, VEX_L, VEX_WIG;
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups",
                                 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
                                 PS, VEX, VEX_L, VEX_WIG;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
                                 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
                                 PD, VEX, VEX_L, VEX_WIG;
}

let Predicates = [UseSSE1] in {
defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
                               SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                               PS;
defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
                               SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                               PS;
}
let Predicates = [UseSSE2] in {
defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
                               SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                               PD;
defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
                               SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                               PD;
}

let Predicates = [HasAVX, NoVLX] in {
let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movaps\t{$src, $dst|$dst, $src}",
                     [(alignedstore (v4f32 VR128:$src), addr:$dst)]>,
                     VEX, VEX_WIG;
def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movapd\t{$src, $dst|$dst, $src}",
                     [(alignedstore (v2f64 VR128:$src), addr:$dst)]>,
                     VEX, VEX_WIG;
def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movups\t{$src, $dst|$dst, $src}",
                     [(store (v4f32 VR128:$src), addr:$dst)]>,
                     VEX, VEX_WIG;
def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movupd\t{$src, $dst|$dst, $src}",
                     [(store (v2f64 VR128:$src), addr:$dst)]>,
                     VEX, VEX_WIG;
} // SchedRW

let SchedRW = [SchedWriteFMoveLS.YMM.MR] in {
def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                      "movaps\t{$src, $dst|$dst, $src}",
                      [(alignedstore (v8f32 VR256:$src), addr:$dst)]>,
                      VEX, VEX_L, VEX_WIG;
def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                      "movapd\t{$src, $dst|$dst, $src}",
                      [(alignedstore (v4f64 VR256:$src), addr:$dst)]>,
                      VEX, VEX_L, VEX_WIG;
def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                      "movups\t{$src, $dst|$dst, $src}",
                      [(store (v8f32 VR256:$src), addr:$dst)]>,
                      VEX, VEX_L, VEX_WIG;
def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                      "movupd\t{$src, $dst|$dst, $src}",
                      [(store (v4f64 VR256:$src), addr:$dst)]>,
                      VEX, VEX_L, VEX_WIG;
} // SchedRW
} // Predicate

// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    isMoveReg = 1 in {
let SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
  def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movaps\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVAPSrr">;
  def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movapd\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVAPDrr">;
  def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movups\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVUPSrr">;
  def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movupd\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVUPDrr">;
} // SchedRW

let SchedRW = [SchedWriteFMoveLS.YMM.RR] in {
  def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movaps\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPSYrr">;
  def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movapd\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPDYrr">;
  def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movups\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPSYrr">;
  def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movupd\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPDYrr">;
} // SchedRW
} // Predicate

// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                (VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                (VMOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                (VMOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                (VMOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                (VMOVAPSYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                (VMOVAPDYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                (VMOVUPSYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                (VMOVUPDYrr_REV VR256:$dst, VR256:$src), 0>;

let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)]>;
def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)]>;
} // SchedRW

// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movaps\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVAPSrr">;
  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movapd\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVAPDrr">;
  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movups\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVUPSrr">;
  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movupd\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVUPDrr">;
}

// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"movaps.s\t{$src, $dst|$dst, $src}",
                (MOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movapd.s\t{$src, $dst|$dst, $src}",
                (MOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movups.s\t{$src, $dst|$dst, $src}",
                (MOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movupd.s\t{$src, $dst|$dst, $src}",
                (MOVUPDrr_REV VR128:$dst, VR128:$src), 0>;

let Predicates = [HasAVX, NoVLX] in {
  // 256-bit load/store need to use floating point load/store in case we don't
  // have AVX2. Execution domain fixing will convert to integer if AVX2 is
  // available and changing the domain is beneficial.
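  // For example, an aligned v4i64 load selects VMOVAPSYrm here; the execution
  // domain fix pass can later rewrite it to the integer-domain form when AVX2
  // is available.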
  def : Pat<(alignedloadv4i64 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv8i32 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv16i16 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv32i8 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(loadv4i64 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv8i32 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv16i16 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv32i8 addr:$src),
            (VMOVUPSYrm addr:$src)>;

  def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v16i16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v32i8 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
}

// Use movaps / movups for SSE integer load / store (one byte shorter).
// The instructions selected below are then converted to MOVDQA/MOVDQU
// during the SSE domain pass.
let Predicates = [UseSSE1] in {
  def : Pat<(alignedloadv2i64 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv4i32 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv8i16 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv16i8 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv4i32 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv8i16 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv16i8 addr:$src),
            (MOVUPSrm addr:$src)>;

  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low packed FP Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode pdnode,
                                      string base_opc, string asm_opr> {
  // No pattern as they need to be special cased between high and low.
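  // (The packed-single forms are instead matched by explicit Pat definitions
  // further below.)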
  let hasSideEffects = 0, mayLoad = 1 in
  def PSrm : PI<opc, MRMSrcMem,
                (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                !strconcat(base_opc, "s", asm_opr),
                [], SSEPackedSingle>, PS,
                Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;

  def PDrm : PI<opc, MRMSrcMem,
                (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                !strconcat(base_opc, "d", asm_opr),
                [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
                                  (scalar_to_vector (loadf64 addr:$src2)))))],
                SSEPackedDouble>, PD,
                Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
}

multiclass sse12_mov_hilo_packed<bits<8>opc, SDPatternOperator pdnode,
                                 string base_opc> {
  let Predicates = [UseAVX] in
  defm V#NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
                                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
                                           VEX_4V, VEX_WIG;

  let Constraints = "$src1 = $dst" in
  defm NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
                                         "\t{$src2, $dst|$dst, $src2}">;
}

defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;

let SchedRW = [WriteFStore] in {
let Predicates = [UseAVX] in {
let mayStore = 1, hasSideEffects = 0 in
def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movlps\t{$src, $dst|$dst, $src}",
                     []>,
                     VEX, VEX_WIG;
def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movlpd\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt (v2f64 VR128:$src),
                                   (iPTR 0))), addr:$dst)]>,
                     VEX, VEX_WIG;
} // UseAVX
let mayStore = 1, hasSideEffects = 0 in
def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlps\t{$src, $dst|$dst, $src}",
                   []>;
def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt (v2f64 VR128:$src),
                                 (iPTR 0))), addr:$dst)]>;
} // SchedRW

let Predicates = [UseSSE1] in {
  // This pattern helps select MOVLPS on SSE1 only targets. With SSE2 we'll
  // end up with a movsd or blend instead of shufp.
  // No need for aligned load, we're only loading 64-bits.
  def : Pat<(X86Shufp (v4f32 (simple_load addr:$src2)), VR128:$src1,
                      (i8 -28)),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)), VR128:$src1, (i8 -28)),
            (MOVLPSrm VR128:$src1, addr:$src2)>;

  def : Pat<(v4f32 (X86vzload64 addr:$src)),
            (MOVLPSrm (v4f32 (V_SET0)), addr:$src)>;
  def : Pat<(X86vextractstore64 (v4f32 VR128:$src), addr:$dst),
            (MOVLPSmr addr:$dst, VR128:$src)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Hi packed FP Instructions
//===----------------------------------------------------------------------===//

defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">;

let SchedRW = [WriteFStore] in {
// v2f64 extract element 1 is always custom lowered to unpack high to low
// and extract element 0 so the non-store version isn't too horrible.
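// MOVHPS/MOVHPD store the upper 64 bits of the source XMM register to memory.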
let Predicates = [UseAVX] in {
let mayStore = 1, hasSideEffects = 0 in
def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movhps\t{$src, $dst|$dst, $src}",
                     []>, VEX, VEX_WIG;
def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movhpd\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt
                                   (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                   (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
} // UseAVX
let mayStore = 1, hasSideEffects = 0 in
def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhps\t{$src, $dst|$dst, $src}",
                   []>;
def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt
                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                 (iPTR 0))), addr:$dst)]>;
} // SchedRW

let Predicates = [UseAVX] in {
  // Also handle an i64 load because that may get selected as a faster way to
  // load the data.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                    (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;

  def : Pat<(store (f64 (extractelt
                         (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
                         (iPTR 0))), addr:$dst),
            (VMOVHPDmr addr:$dst, VR128:$src)>;

  // MOVLPD patterns
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;
}

let Predicates = [UseSSE1] in {
  // This pattern helps select MOVHPS on SSE1 only targets. With SSE2 we'll
  // end up with a movsd or blend instead of shufp.
  // No need for aligned load, we're only loading 64-bits.
  def : Pat<(X86Movlhps VR128:$src1, (v4f32 (simple_load addr:$src2))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Movlhps VR128:$src1, (v4f32 (X86vzload64 addr:$src2))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;

  def : Pat<(X86vextractstore64 (v4f32 (X86Movhlps VR128:$src, VR128:$src)),
                                addr:$dst),
            (MOVHPSmr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE2] in {
  // MOVHPD patterns

  // Also handle an i64 load because that may get selected as a faster way to
  // load the data.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                    (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;

  def : Pat<(store (f64 (extractelt
                         (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
                         (iPTR 0))), addr:$dst),
            (MOVHPDmr addr:$dst, VR128:$src)>;

  // MOVLPD patterns
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
}

let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in {
  // Use MOVLPD to load into the low bits from a full vector unless we can use
  // BLENDPD.
  def : Pat<(X86Movsd VR128:$src1, (v2f64 (simple_load addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
//===----------------------------------------------------------------------===//

let Predicates = [UseAVX] in {
  def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
                        (ins VR128:$src1, VR128:$src2),
                        "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                          (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
                        VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG;
  let isCommutable = 1 in
  def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
                        (ins VR128:$src1, VR128:$src2),
                        "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                          (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
                        VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG,
                        NotMemoryFoldable;
}
let Constraints = "$src1 = $dst" in {
  def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
                      Sched<[SchedWriteFShuffle.XMM]>;
  let isCommutable = 1 in
  def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
                      Sched<[SchedWriteFShuffle.XMM]>, NotMemoryFoldable;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Conversion Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                       SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
                       string asm, string mem, X86FoldableSchedWrite sched,
                       Domain d,
                       SchedRead Int2Fpu = ReadDefault> {
  let ExeDomain = d in {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
              !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
              [(set DstRC:$dst, (OpNode SrcRC:$src))]>,
              Sched<[sched, Int2Fpu]>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
              mem#"\t{$src, $dst|$dst, $src}",
              [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
              Sched<[sched.Folded]>;
  }
}

multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
                       ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
                       string asm, Domain d, X86FoldableSchedWrite sched> {
let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in {
  def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
             [(set RC:$dst, (DstTy (any_sint_to_fp (SrcTy RC:$src))))], d>,
             Sched<[sched]>;
  let mayLoad = 1 in
  def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
             [(set RC:$dst, (DstTy (any_sint_to_fp
                                    (SrcTy (ld_frag addr:$src)))))], d>,
             Sched<[sched.Folded]>;
}
}

multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          X86MemOperand x86memop, string asm, string mem,
                          X86FoldableSchedWrite sched, Domain d> {
let hasSideEffects = 0, Predicates = [UseAVX], ExeDomain = d in {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
              Sched<[sched, ReadDefault, ReadInt2Fpu]>;
  let mayLoad = 1 in
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
              (ins DstRC:$src1, x86memop:$src),
              asm#"{"#mem#"}\t{$src, $src1, $dst|$dst, $src1, $src}", []>,
              Sched<[sched.Folded, sched.ReadAfterFold]>;
} // hasSideEffects = 0
}

let isCodeGenOnly = 1, Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
                              "cvttss2si", "cvttss2si",
                              WriteCvtSS2I, SSEPackedSingle>,
                              XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
                                "cvttss2si", "cvttss2si",
                                WriteCvtSS2I, SSEPackedSingle>,
                                XS, VEX, VEX_W, VEX_LIG;
defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
                              "cvttsd2si", "cvttsd2si",
                              WriteCvtSD2I, SSEPackedDouble>,
                              XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
                                "cvttsd2si", "cvttsd2si",
                                WriteCvtSD2I, SSEPackedDouble>,
                                XD, VEX, VEX_W, VEX_LIG;
}

// The assembler can recognize rr 64-bit instructions by seeing a rxx
// register, but the same isn't true when only using memory operands;
// provide other assembly "l" and "q" forms to address this explicitly
// where appropriate to do so.
let isCodeGenOnly = 1 in {
defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss", "l",
                                WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V,
                                VEX_LIG, SIMD_EXC;
defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss", "q",
                                  WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V,
                                  VEX_W, VEX_LIG, SIMD_EXC;
defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd", "l",
                                WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V,
                                VEX_LIG;
defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd", "q",
                                  WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V,
                                  VEX_W, VEX_LIG, SIMD_EXC;
} // isCodeGenOnly = 1

let Predicates = [UseAVX] in {
  def : Pat<(f32 (any_sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f32 (any_sint_to_fp (loadi64 addr:$src))),
            (VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (any_sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (any_sint_to_fp (loadi64 addr:$src))),
            (VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;

  def : Pat<(f32 (any_sint_to_fp GR32:$src)),
            (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f32 (any_sint_to_fp GR64:$src)),
            (VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
  def : Pat<(f64 (any_sint_to_fp GR32:$src)),
            (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f64 (any_sint_to_fp GR64:$src)),
            (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
}

let isCodeGenOnly = 1 in {
defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
                             "cvttss2si", "cvttss2si",
                             WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC;
defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
                               "cvttss2si", "cvttss2si",
                               WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC;
defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
                             "cvttsd2si", "cvttsd2si",
                             WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC;
defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
                               "cvttsd2si", "cvttsd2si",
                               WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC;
defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, any_sint_to_fp, i32mem, loadi32,
                            "cvtsi2ss", "cvtsi2ss{l}",
                            WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, SIMD_EXC;
defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, any_sint_to_fp, i64mem, loadi64,
                              "cvtsi2ss", "cvtsi2ss{q}",
                              WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, REX_W, SIMD_EXC;
defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, any_sint_to_fp, i32mem, loadi32,
                            "cvtsi2sd", "cvtsi2sd{l}",
                            WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD;
defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, any_sint_to_fp, i64mem, loadi64,
                              "cvtsi2sd", "cvtsi2sd{q}",
                              WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD, REX_W, SIMD_EXC;
} // isCodeGenOnly = 1

// Conversion Instructions Intrinsics - Match intrinsics which expect MM
// and/or XMM operand(s).

multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          ValueType DstVT, ValueType SrcVT, SDNode OpNode,
                          Operand memop, ComplexPattern mem_cpat, string asm,
                          X86FoldableSchedWrite sched, Domain d> {
let ExeDomain = d in {
  def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
                  !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                  [(set DstRC:$dst, (DstVT (OpNode (SrcVT SrcRC:$src))))]>,
                  Sched<[sched]>;
  def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
                  !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                  [(set DstRC:$dst, (DstVT (OpNode (SrcVT mem_cpat:$src))))]>,
                  Sched<[sched.Folded]>;
}
}

multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
                                RegisterClass DstRC, X86MemOperand x86memop,
                                string asm, string mem, X86FoldableSchedWrite sched,
                                Domain d, bit Is2Addr = 1> {
let hasSideEffects = 0, ExeDomain = d in {
  def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
                  !if(Is2Addr,
                      !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                      !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                  []>, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
  let mayLoad = 1 in
  def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst),
                  (ins DstRC:$src1, x86memop:$src2),
                  !if(Is2Addr,
                      asm#"{"#mem#"}\t{$src2, $dst|$dst, $src2}",
                      asm#"{"#mem#"}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}

let Uses = [MXCSR], mayRaiseFPException = 1 in {
let Predicates = [UseAVX] in {
defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64,
                                X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
                                WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64,
                                  X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
                                  WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_W, VEX_LIG;
}
defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, X86cvts2si,
                               sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I,
                               SSEPackedDouble>, XD;
defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si,
                                 sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I,
                                 SSEPackedDouble>, XD, REX_W;
}

let Predicates = [UseAVX] in {
defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                                      i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle, 0>,
                                      XS, VEX_4V, VEX_LIG, SIMD_EXC;
defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                                        i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle, 0>,
                                        XS, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC;
defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                                      i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble, 0>,
                                      XD, VEX_4V, VEX_LIG;
defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                                        i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble, 0>,
                                        XD, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC;
}
let Constraints = "$src1 = $dst" in {
  defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                                       i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle>,
                                       XS, SIMD_EXC;
  defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                                         i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle>,
                                         XS, REX_W, SIMD_EXC;
  defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                                       i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble>,
                                       XD;
  defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                                         i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble>,
                                         XD, REX_W, SIMD_EXC;
}

def : InstAlias<"vcvtsi2ss{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VCVTSI2SSrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
def : InstAlias<"vcvtsi2ss{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VCVTSI642SSrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
def : InstAlias<"vcvtsi2sd{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VCVTSI2SDrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
def : InstAlias<"vcvtsi2sd{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VCVTSI642SDrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;

def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
                (VCVTSI2SSrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
                (VCVTSI2SDrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;

def : InstAlias<"cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
                (CVTSI2SSrr_Int VR128:$dst, GR32:$src), 0, "att">;
def : InstAlias<"cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
                (CVTSI642SSrr_Int VR128:$dst, GR64:$src), 0, "att">;
def : InstAlias<"cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
                (CVTSI2SDrr_Int VR128:$dst, GR32:$src), 0, "att">;
def : InstAlias<"cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
                (CVTSI642SDrr_Int VR128:$dst, GR64:$src), 0, "att">;

def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
                (CVTSI2SSrm_Int VR128:$dst, i32mem:$src), 0, "att">;
def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
                (CVTSI2SDrm_Int VR128:$dst, i32mem:$src), 0, "att">;

/// SSE 1 Only

// Aliases for intrinsics
let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
                                 ssmem, sse_load_f32, "cvttss2si",
                                 WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
                                   X86cvtts2Int, ssmem, sse_load_f32,
                                   "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
                                   XS, VEX, VEX_LIG, VEX_W;
defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
                                 sdmem, sse_load_f64, "cvttsd2si",
                                 WriteCvtSS2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
                                   X86cvtts2Int, sdmem, sse_load_f64,
                                   "cvttsd2si", WriteCvtSS2I, SSEPackedDouble>,
                                   XD, VEX, VEX_LIG, VEX_W;
}
let Uses = [MXCSR], mayRaiseFPException = 1 in {
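// Non-VEX (SSE) encodings of the same truncating scalar conversions.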
defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
                                ssmem, sse_load_f32, "cvttss2si",
                                WriteCvtSS2I, SSEPackedSingle>, XS;
defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
                                  X86cvtts2Int, ssmem, sse_load_f32,
                                  "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
                                  XS, REX_W;
defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
                                sdmem, sse_load_f64, "cvttsd2si",
                                WriteCvtSD2I, SSEPackedDouble>, XD;
defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
                                  X86cvtts2Int, sdmem, sse_load_f64,
                                  "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>,
                                  XD, REX_W;
}

def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;

def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;

let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
                                ssmem, sse_load_f32, "cvtss2si",
                                WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_W, VEX_LIG;
}
let Uses = [MXCSR], mayRaiseFPException = 1 in {
defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
                               ssmem, sse_load_f32, "cvtss2si",
                               WriteCvtSS2I, SSEPackedSingle>, XS;
defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
                                 ssmem, sse_load_f32, "cvtss2si",
                                 WriteCvtSS2I, SSEPackedSingle>, XS, REX_W;

defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load,
                             "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                             SSEPackedSingle, WriteCvtI2PS>,
                             PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load,
                              "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                              SSEPackedSingle, WriteCvtI2PSY>,
                              PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;

defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop,
                            "cvtdq2ps\t{$src, $dst|$dst, $src}",
                            SSEPackedSingle, WriteCvtI2PS>,
                            PS, Requires<[UseSSE2]>;
}

// AVX aliases
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;

// SSE aliases
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;

/// SSE 2 Only

// Convert scalar double to scalar single
let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX] in {
def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
                       (ins FR32:$src1, FR64:$src2),
                       "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                       VEX_4V, VEX_LIG, VEX_WIG,
                       Sched<[WriteCvtSD2SS]>, SIMD_EXC;
let mayLoad = 1 in
def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
                    (ins FR32:$src1, f64mem:$src2),
                    "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                    XD, VEX_4V, VEX_LIG, VEX_WIG,
                    Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC;
}

def : Pat<(f32 (any_fpround FR64:$src)),
          (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
          Requires<[UseAVX]>;

let isCodeGenOnly = 1 in {
def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
                     "cvtsd2ss\t{$src, $dst|$dst, $src}",
                     [(set FR32:$dst, (any_fpround FR64:$src))]>,
                     Sched<[WriteCvtSD2SS]>, SIMD_EXC;
def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
                   "cvtsd2ss\t{$src, $dst|$dst, $src}",
                   [(set FR32:$dst, (any_fpround (loadf64 addr:$src)))]>,
                   XD, Requires<[UseSSE2, OptForSize]>,
                   Sched<[WriteCvtSD2SS.Folded]>, SIMD_EXC;
}

let Uses = [MXCSR], mayRaiseFPException = 1 in {
def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
                       XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
                       Sched<[WriteCvtSD2SS]>;
def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (v4f32 (X86frounds VR128:$src1, sse_load_f64:$src2)))]>,
                       XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
                       Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
let Constraints = "$src1 = $dst" in {
def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                      "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
                      XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;
def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                      "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86frounds VR128:$src1, sse_load_f64:$src2)))]>,
                      XD, Requires<[UseSSE2]>,
                      Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
}
}

// Convert scalar single to scalar double
// SSE2 instructions with XS prefix
let isCodeGenOnly = 1, hasSideEffects = 0 in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
                    (ins FR64:$src1, FR32:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                    XS, VEX_4V, VEX_LIG, VEX_WIG,
                    Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>, SIMD_EXC;
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
                    (ins FR64:$src1, f32mem:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                    XS, VEX_4V, VEX_LIG, VEX_WIG,
                    Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>,
                    Requires<[UseAVX, OptForSize]>, SIMD_EXC;
} // isCodeGenOnly = 1, hasSideEffects = 0

def : Pat<(f64 (any_fpextend FR32:$src)),
          (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
def : Pat<(any_fpextend (loadf32 addr:$src)),
          (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;

let isCodeGenOnly = 1 in {
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (any_fpextend FR32:$src))]>,
                   XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>, SIMD_EXC;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (any_fpextend (loadf32 addr:$src)))]>,
                   XS, Requires<[UseSSE2, OptForSize]>,
                   Sched<[WriteCvtSS2SD.Folded]>, SIMD_EXC;
} // isCodeGenOnly = 1

let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in {
def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       []>, XS, VEX_4V, VEX_LIG, VEX_WIG,
                       Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;
let mayLoad = 1 in
1311def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem, 1312 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), 1313 "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 1314 []>, XS, VEX_4V, VEX_LIG, VEX_WIG, Requires<[HasAVX]>, 1315 Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>; 1316let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix 1317def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg, 1318 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), 1319 "cvtss2sd\t{$src2, $dst|$dst, $src2}", 1320 []>, XS, Requires<[UseSSE2]>, 1321 Sched<[WriteCvtSS2SD]>; 1322let mayLoad = 1 in 1323def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem, 1324 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), 1325 "cvtss2sd\t{$src2, $dst|$dst, $src2}", 1326 []>, XS, Requires<[UseSSE2]>, 1327 Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>; 1328} 1329} // hasSideEffects = 0 1330 1331// Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and 1332// (v)cvtss2sd intrinsic sequences from clang which produce unnecessary 1333// vmovs{s,d} instructions 1334let Predicates = [UseAVX] in { 1335def : Pat<(v4f32 (X86Movss 1336 (v4f32 VR128:$dst), 1337 (v4f32 (scalar_to_vector 1338 (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))), 1339 (VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>; 1340 1341def : Pat<(v2f64 (X86Movsd 1342 (v2f64 VR128:$dst), 1343 (v2f64 (scalar_to_vector 1344 (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))), 1345 (VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>; 1346 1347def : Pat<(v4f32 (X86Movss 1348 (v4f32 VR128:$dst), 1349 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))), 1350 (VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>; 1351 1352def : Pat<(v4f32 (X86Movss 1353 (v4f32 VR128:$dst), 1354 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))), 1355 (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>; 1356 1357def : Pat<(v4f32 (X86Movss 1358 (v4f32 VR128:$dst), 1359 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))), 1360 (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>; 1361 1362def : Pat<(v4f32 (X86Movss 1363 (v4f32 VR128:$dst), 1364 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))), 1365 (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>; 1366 1367def : Pat<(v2f64 (X86Movsd 1368 (v2f64 VR128:$dst), 1369 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))), 1370 (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>; 1371 1372def : Pat<(v2f64 (X86Movsd 1373 (v2f64 VR128:$dst), 1374 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))), 1375 (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>; 1376 1377def : Pat<(v2f64 (X86Movsd 1378 (v2f64 VR128:$dst), 1379 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))), 1380 (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>; 1381 1382def : Pat<(v2f64 (X86Movsd 1383 (v2f64 VR128:$dst), 1384 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))), 1385 (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>; 1386} // Predicates = [UseAVX] 1387 1388let Predicates = [UseSSE2] in { 1389def : Pat<(v4f32 (X86Movss 1390 (v4f32 VR128:$dst), 1391 (v4f32 (scalar_to_vector 1392 (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))), 1393 (CVTSD2SSrr_Int VR128:$dst, VR128:$src)>; 1394 1395def : Pat<(v2f64 (X86Movsd 1396 (v2f64 VR128:$dst), 1397 (v2f64 (scalar_to_vector 1398 (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))), 1399 (CVTSS2SDrr_Int VR128:$dst, VR128:$src)>; 1400 1401def : Pat<(v2f64 (X86Movsd 1402 (v2f64 VR128:$dst), 1403 (v2f64 (scalar_to_vector (f64 
(any_sint_to_fp GR64:$src)))))), 1404 (CVTSI642SDrr_Int VR128:$dst, GR64:$src)>; 1405 1406def : Pat<(v2f64 (X86Movsd 1407 (v2f64 VR128:$dst), 1408 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))), 1409 (CVTSI642SDrm_Int VR128:$dst, addr:$src)>; 1410 1411def : Pat<(v2f64 (X86Movsd 1412 (v2f64 VR128:$dst), 1413 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))), 1414 (CVTSI2SDrr_Int VR128:$dst, GR32:$src)>; 1415 1416def : Pat<(v2f64 (X86Movsd 1417 (v2f64 VR128:$dst), 1418 (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))), 1419 (CVTSI2SDrm_Int VR128:$dst, addr:$src)>; 1420} // Predicates = [UseSSE2] 1421 1422let Predicates = [UseSSE1] in { 1423def : Pat<(v4f32 (X86Movss 1424 (v4f32 VR128:$dst), 1425 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))), 1426 (CVTSI642SSrr_Int VR128:$dst, GR64:$src)>; 1427 1428def : Pat<(v4f32 (X86Movss 1429 (v4f32 VR128:$dst), 1430 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))), 1431 (CVTSI642SSrm_Int VR128:$dst, addr:$src)>; 1432 1433def : Pat<(v4f32 (X86Movss 1434 (v4f32 VR128:$dst), 1435 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))), 1436 (CVTSI2SSrr_Int VR128:$dst, GR32:$src)>; 1437 1438def : Pat<(v4f32 (X86Movss 1439 (v4f32 VR128:$dst), 1440 (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))), 1441 (CVTSI2SSrm_Int VR128:$dst, addr:$src)>; 1442} // Predicates = [UseSSE1] 1443 1444let Predicates = [HasAVX, NoVLX] in { 1445// Convert packed single/double fp to doubleword 1446def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1447 "cvtps2dq\t{$src, $dst|$dst, $src}", 1448 [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>, 1449 VEX, Sched<[WriteCvtPS2I]>, VEX_WIG, SIMD_EXC; 1450def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1451 "cvtps2dq\t{$src, $dst|$dst, $src}", 1452 [(set VR128:$dst, 1453 (v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>, 1454 VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG, SIMD_EXC; 1455def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 1456 "cvtps2dq\t{$src, $dst|$dst, $src}", 1457 [(set VR256:$dst, 1458 (v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>, 1459 VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG, SIMD_EXC; 1460def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 1461 "cvtps2dq\t{$src, $dst|$dst, $src}", 1462 [(set VR256:$dst, 1463 (v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>, 1464 VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, VEX_WIG, SIMD_EXC; 1465} 1466def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1467 "cvtps2dq\t{$src, $dst|$dst, $src}", 1468 [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>, 1469 Sched<[WriteCvtPS2I]>, SIMD_EXC; 1470def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1471 "cvtps2dq\t{$src, $dst|$dst, $src}", 1472 [(set VR128:$dst, 1473 (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>, 1474 Sched<[WriteCvtPS2ILd]>, SIMD_EXC; 1475 1476 1477// Convert Packed Double FP to Packed DW Integers 1478let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in { 1479// The assembler can recognize rr 256-bit instructions by seeing a ymm 1480// register, but the same isn't true when using memory operands instead. 1481// Provide other assembly rr and rm forms to address this explicitly. 
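// For example, in AT&T syntax "vcvtpd2dq %ymm0, %xmm0" is unambiguous because
// the ymm source selects the 256-bit form, but "vcvtpd2dq (%rax), %xmm0" could
// name either a 128-bit or a 256-bit memory source. The {x}/{y} suffixed asm
// strings and the vcvtpd2dqx/vcvtpd2dqy aliases below give assembly writers an
// unambiguous spelling for each form.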
1482def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1483 "vcvtpd2dq\t{$src, $dst|$dst, $src}", 1484 [(set VR128:$dst, 1485 (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>, 1486 VEX, Sched<[WriteCvtPD2I]>, VEX_WIG; 1487 1488// XMM only 1489def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1490 "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}", 1491 [(set VR128:$dst, 1492 (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX, 1493 Sched<[WriteCvtPD2ILd]>, VEX_WIG; 1494 1495// YMM only 1496def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), 1497 "vcvtpd2dq\t{$src, $dst|$dst, $src}", 1498 [(set VR128:$dst, 1499 (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>, 1500 VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG; 1501def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), 1502 "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}", 1503 [(set VR128:$dst, 1504 (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>, 1505 VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG; 1506} 1507 1508def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}", 1509 (VCVTPD2DQrr VR128:$dst, VR128:$src), 0, "att">; 1510def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}", 1511 (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">; 1512 1513def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1514 "cvtpd2dq\t{$src, $dst|$dst, $src}", 1515 [(set VR128:$dst, 1516 (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>, 1517 Sched<[WriteCvtPD2ILd]>, SIMD_EXC; 1518def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1519 "cvtpd2dq\t{$src, $dst|$dst, $src}", 1520 [(set VR128:$dst, 1521 (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>, 1522 Sched<[WriteCvtPD2I]>, SIMD_EXC; 1523 1524// Convert with truncation packed single/double fp to doubleword 1525// SSE2 packed instructions with XS prefix 1526let Uses = [MXCSR], mayRaiseFPException = 1 in { 1527let Predicates = [HasAVX, NoVLX] in { 1528def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1529 "cvttps2dq\t{$src, $dst|$dst, $src}", 1530 [(set VR128:$dst, 1531 (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>, 1532 VEX, Sched<[WriteCvtPS2I]>, VEX_WIG; 1533def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1534 "cvttps2dq\t{$src, $dst|$dst, $src}", 1535 [(set VR128:$dst, 1536 (v4i32 (X86any_cvttp2si (loadv4f32 addr:$src))))]>, 1537 VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG; 1538def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 1539 "cvttps2dq\t{$src, $dst|$dst, $src}", 1540 [(set VR256:$dst, 1541 (v8i32 (X86any_cvttp2si (v8f32 VR256:$src))))]>, 1542 VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG; 1543def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 1544 "cvttps2dq\t{$src, $dst|$dst, $src}", 1545 [(set VR256:$dst, 1546 (v8i32 (X86any_cvttp2si (loadv8f32 addr:$src))))]>, 1547 VEX, VEX_L, 1548 Sched<[WriteCvtPS2IYLd]>, VEX_WIG; 1549} 1550 1551def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1552 "cvttps2dq\t{$src, $dst|$dst, $src}", 1553 [(set VR128:$dst, 1554 (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>, 1555 Sched<[WriteCvtPS2I]>; 1556def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1557 "cvttps2dq\t{$src, $dst|$dst, $src}", 1558 [(set VR128:$dst, 1559 (v4i32 (X86any_cvttp2si (memopv4f32 addr:$src))))]>, 1560 Sched<[WriteCvtPS2ILd]>; 1561} 1562 1563// The assembler can recognize rr 256-bit instructions by seeing a ymm 1564// 
register, but the same isn't true when using memory operands instead. 1565// Provide other assembly rr and rm forms to address this explicitly. 1566let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in { 1567// XMM only 1568def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1569 "cvttpd2dq\t{$src, $dst|$dst, $src}", 1570 [(set VR128:$dst, 1571 (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>, 1572 VEX, Sched<[WriteCvtPD2I]>, VEX_WIG; 1573def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1574 "cvttpd2dq{x}\t{$src, $dst|$dst, $src}", 1575 [(set VR128:$dst, 1576 (v4i32 (X86any_cvttp2si (loadv2f64 addr:$src))))]>, 1577 VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG; 1578 1579// YMM only 1580def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), 1581 "cvttpd2dq\t{$src, $dst|$dst, $src}", 1582 [(set VR128:$dst, 1583 (v4i32 (X86any_cvttp2si (v4f64 VR256:$src))))]>, 1584 VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG; 1585def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), 1586 "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", 1587 [(set VR128:$dst, 1588 (v4i32 (X86any_cvttp2si (loadv4f64 addr:$src))))]>, 1589 VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG; 1590} // Predicates = [HasAVX, NoVLX] 1591 1592def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}", 1593 (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0, "att">; 1594def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}", 1595 (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">; 1596 1597let Predicates = [HasAVX, NoVLX] in { 1598 def : Pat<(v4i32 (any_fp_to_sint (v4f64 VR256:$src))), 1599 (VCVTTPD2DQYrr VR256:$src)>; 1600 def : Pat<(v4i32 (any_fp_to_sint (loadv4f64 addr:$src))), 1601 (VCVTTPD2DQYrm addr:$src)>; 1602} 1603 1604def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1605 "cvttpd2dq\t{$src, $dst|$dst, $src}", 1606 [(set VR128:$dst, 1607 (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>, 1608 Sched<[WriteCvtPD2I]>, SIMD_EXC; 1609def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src), 1610 "cvttpd2dq\t{$src, $dst|$dst, $src}", 1611 [(set VR128:$dst, 1612 (v4i32 (X86any_cvttp2si (memopv2f64 addr:$src))))]>, 1613 Sched<[WriteCvtPD2ILd]>, SIMD_EXC; 1614 1615// Convert packed single to packed double 1616let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in { 1617 // SSE2 instructions without OpSize prefix 1618def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1619 "vcvtps2pd\t{$src, $dst|$dst, $src}", 1620 [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>, 1621 PS, VEX, Sched<[WriteCvtPS2PD]>, VEX_WIG; 1622def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), 1623 "vcvtps2pd\t{$src, $dst|$dst, $src}", 1624 [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>, 1625 PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, VEX_WIG; 1626def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), 1627 "vcvtps2pd\t{$src, $dst|$dst, $src}", 1628 [(set VR256:$dst, (v4f64 (any_fpextend (v4f32 VR128:$src))))]>, 1629 PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, VEX_WIG; 1630def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), 1631 "vcvtps2pd\t{$src, $dst|$dst, $src}", 1632 [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>, 1633 PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, VEX_WIG; 1634} 1635 1636let Predicates = [UseSSE2], Uses = [MXCSR], mayRaiseFPException = 1 in { 1637def CVTPS2PDrr : I<0x5A, 
MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1638 "cvtps2pd\t{$src, $dst|$dst, $src}", 1639 [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>, 1640 PS, Sched<[WriteCvtPS2PD]>; 1641def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), 1642 "cvtps2pd\t{$src, $dst|$dst, $src}", 1643 [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>, 1644 PS, Sched<[WriteCvtPS2PD.Folded]>; 1645} 1646 1647// Convert Packed DW Integers to Packed Double FP 1648let Predicates = [HasAVX, NoVLX] in { 1649let hasSideEffects = 0, mayLoad = 1 in 1650def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 1651 "vcvtdq2pd\t{$src, $dst|$dst, $src}", 1652 [(set VR128:$dst, 1653 (v2f64 (X86any_VSintToFP 1654 (bc_v4i32 1655 (v2i64 (scalar_to_vector 1656 (loadi64 addr:$src)))))))]>, 1657 VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG; 1658def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1659 "vcvtdq2pd\t{$src, $dst|$dst, $src}", 1660 [(set VR128:$dst, 1661 (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>, 1662 VEX, Sched<[WriteCvtI2PD]>, VEX_WIG; 1663def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src), 1664 "vcvtdq2pd\t{$src, $dst|$dst, $src}", 1665 [(set VR256:$dst, 1666 (v4f64 (any_sint_to_fp (loadv4i32 addr:$src))))]>, 1667 VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>, 1668 VEX_WIG; 1669def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), 1670 "vcvtdq2pd\t{$src, $dst|$dst, $src}", 1671 [(set VR256:$dst, 1672 (v4f64 (any_sint_to_fp (v4i32 VR128:$src))))]>, 1673 VEX, VEX_L, Sched<[WriteCvtI2PDY]>, VEX_WIG; 1674} 1675 1676let hasSideEffects = 0, mayLoad = 1 in 1677def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 1678 "cvtdq2pd\t{$src, $dst|$dst, $src}", 1679 [(set VR128:$dst, 1680 (v2f64 (X86any_VSintToFP 1681 (bc_v4i32 1682 (v2i64 (scalar_to_vector 1683 (loadi64 addr:$src)))))))]>, 1684 Sched<[WriteCvtI2PDLd]>; 1685def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1686 "cvtdq2pd\t{$src, $dst|$dst, $src}", 1687 [(set VR128:$dst, 1688 (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>, 1689 Sched<[WriteCvtI2PD]>; 1690 1691// AVX register conversion intrinsics 1692let Predicates = [HasAVX, NoVLX] in { 1693 def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))), 1694 (VCVTDQ2PDrm addr:$src)>; 1695} // Predicates = [HasAVX, NoVLX] 1696 1697// SSE2 register conversion intrinsics 1698let Predicates = [UseSSE2] in { 1699 def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))), 1700 (CVTDQ2PDrm addr:$src)>; 1701} // Predicates = [UseSSE2] 1702 1703// Convert packed double to packed single 1704// The assembler can recognize rr 256-bit instructions by seeing a ymm 1705// register, but the same isn't true when using memory operands instead. 1706// Provide other assembly rr and rm forms to address this explicitly. 
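// As with (v)cvtpd2dq above, a memory source leaves the operand size
// ambiguous in AT&T syntax, hence the {x}/{y} suffixed asm strings and the
// vcvtpd2psx/vcvtpd2psy aliases below. Note that even the 256-bit source
// form writes only an xmm result: four f64 elements narrow to four f32
// elements, which fit in the low 128 bits.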
1707let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in { 1708// XMM only 1709def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1710 "cvtpd2ps\t{$src, $dst|$dst, $src}", 1711 [(set VR128:$dst, (X86any_vfpround (v2f64 VR128:$src)))]>, 1712 VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG; 1713def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1714 "cvtpd2ps{x}\t{$src, $dst|$dst, $src}", 1715 [(set VR128:$dst, (X86any_vfpround (loadv2f64 addr:$src)))]>, 1716 VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG; 1717 1718def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), 1719 "cvtpd2ps\t{$src, $dst|$dst, $src}", 1720 [(set VR128:$dst, (X86any_vfpround VR256:$src))]>, 1721 VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG; 1722def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), 1723 "cvtpd2ps{y}\t{$src, $dst|$dst, $src}", 1724 [(set VR128:$dst, (X86any_vfpround (loadv4f64 addr:$src)))]>, 1725 VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG; 1726} // Predicates = [HasAVX, NoVLX] 1727 1728def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}", 1729 (VCVTPD2PSrr VR128:$dst, VR128:$src), 0, "att">; 1730def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}", 1731 (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0, "att">; 1732 1733def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1734 "cvtpd2ps\t{$src, $dst|$dst, $src}", 1735 [(set VR128:$dst, (X86any_vfpround (v2f64 VR128:$src)))]>, 1736 Sched<[WriteCvtPD2PS]>, SIMD_EXC; 1737def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1738 "cvtpd2ps\t{$src, $dst|$dst, $src}", 1739 [(set VR128:$dst, (X86any_vfpround (memopv2f64 addr:$src)))]>, 1740 Sched<[WriteCvtPD2PS.Folded]>, SIMD_EXC; 1741 1742//===----------------------------------------------------------------------===// 1743// SSE 1 & 2 - Compare Instructions 1744//===----------------------------------------------------------------------===// 1745 1746// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions 1747multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, 1748 SDNode OpNode, ValueType VT, 1749 PatFrag ld_frag, string asm, 1750 X86FoldableSchedWrite sched> { 1751let Uses = [MXCSR], mayRaiseFPException = 1 in { 1752 let isCommutable = 1 in 1753 def rr : SIi8<0xC2, MRMSrcReg, 1754 (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm, 1755 [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, timm:$cc))]>, 1756 Sched<[sched]>; 1757 def rm : SIi8<0xC2, MRMSrcMem, 1758 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm, 1759 [(set RC:$dst, (OpNode (VT RC:$src1), 1760 (ld_frag addr:$src2), timm:$cc))]>, 1761 Sched<[sched.Folded, sched.ReadAfterFold]>; 1762} 1763} 1764 1765let isCodeGenOnly = 1 in { 1766 let ExeDomain = SSEPackedSingle in 1767 defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmps, f32, loadf32, 1768 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 1769 SchedWriteFCmpSizes.PS.Scl>, XS, VEX_4V, VEX_LIG, VEX_WIG; 1770 let ExeDomain = SSEPackedDouble in 1771 defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmps, f64, loadf64, 1772 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 1773 SchedWriteFCmpSizes.PD.Scl>, 1774 XD, VEX_4V, VEX_LIG, VEX_WIG; 1775 1776 let Constraints = "$src1 = $dst" in { 1777 let ExeDomain = SSEPackedSingle in 1778 defm CMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmps, f32, loadf32, 1779 "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", 1780 SchedWriteFCmpSizes.PS.Scl>, XS; 
1781 let ExeDomain = SSEPackedDouble in 1782 defm CMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmps, f64, loadf64, 1783 "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}", 1784 SchedWriteFCmpSizes.PD.Scl>, XD; 1785 } 1786} 1787 1788multiclass sse12_cmp_scalar_int<Operand memop, 1789 Intrinsic Int, string asm, X86FoldableSchedWrite sched, 1790 ComplexPattern mem_cpat> { 1791let Uses = [MXCSR], mayRaiseFPException = 1 in { 1792 def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst), 1793 (ins VR128:$src1, VR128:$src, u8imm:$cc), asm, 1794 [(set VR128:$dst, (Int VR128:$src1, 1795 VR128:$src, timm:$cc))]>, 1796 Sched<[sched]>; 1797let mayLoad = 1 in 1798 def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst), 1799 (ins VR128:$src1, memop:$src, u8imm:$cc), asm, 1800 [(set VR128:$dst, (Int VR128:$src1, 1801 mem_cpat:$src, timm:$cc))]>, 1802 Sched<[sched.Folded, sched.ReadAfterFold]>; 1803} 1804} 1805 1806// Aliases to match intrinsics which expect XMM operand(s). 1807let ExeDomain = SSEPackedSingle in 1808defm VCMPSS : sse12_cmp_scalar_int<ssmem, int_x86_sse_cmp_ss, 1809 "cmpss\t{$cc, $src, $src1, $dst|$dst, $src1, $src, $cc}", 1810 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, 1811 XS, VEX_4V, VEX_LIG, VEX_WIG; 1812let ExeDomain = SSEPackedDouble in 1813defm VCMPSD : sse12_cmp_scalar_int<sdmem, int_x86_sse2_cmp_sd, 1814 "cmpsd\t{$cc, $src, $src1, $dst|$dst, $src1, $src, $cc}", 1815 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, 1816 XD, VEX_4V, VEX_LIG, VEX_WIG; 1817let Constraints = "$src1 = $dst" in { 1818 let ExeDomain = SSEPackedSingle in 1819 defm CMPSS : sse12_cmp_scalar_int<ssmem, int_x86_sse_cmp_ss, 1820 "cmpss\t{$cc, $src, $dst|$dst, $src, $cc}", 1821 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS; 1822 let ExeDomain = SSEPackedDouble in 1823 defm CMPSD : sse12_cmp_scalar_int<sdmem, int_x86_sse2_cmp_sd, 1824 "cmpsd\t{$cc, $src, $dst|$dst, $src, $cc}", 1825 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD; 1826} 1827 1828 1829// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS 1830multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode, 1831 ValueType vt, X86MemOperand x86memop, 1832 PatFrag ld_frag, string OpcodeStr, Domain d, 1833 X86FoldableSchedWrite sched = WriteFCom> { 1834let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1, 1835 ExeDomain = d in { 1836 def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), 1837 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), 1838 [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>, 1839 Sched<[sched]>; 1840let mayLoad = 1 in 1841 def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2), 1842 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), 1843 [(set EFLAGS, (OpNode (vt RC:$src1), 1844 (ld_frag addr:$src2)))]>, 1845 Sched<[sched.Folded, sched.ReadAfterFold]>; 1846} 1847} 1848 1849// sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp 1850multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode, 1851 ValueType vt, Operand memop, 1852 ComplexPattern mem_cpat, string OpcodeStr, 1853 Domain d, 1854 X86FoldableSchedWrite sched = WriteFCom> { 1855let Uses = [MXCSR], mayRaiseFPException = 1, ExeDomain = d in { 1856 def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), 1857 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), 1858 [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>, 1859 Sched<[sched]>; 1860let mayLoad = 1 in 1861 def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2), 1862 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), 1863 [(set EFLAGS, 
(OpNode (vt RC:$src1), 1864 mem_cpat:$src2))]>, 1865 Sched<[sched.Folded, sched.ReadAfterFold]>; 1866} 1867} 1868 1869let Defs = [EFLAGS] in { 1870 defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32, 1871 "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG; 1872 defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64, 1873 "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG; 1874 defm VCOMISS : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32, 1875 "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG; 1876 defm VCOMISD : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64, 1877 "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG; 1878 1879 let isCodeGenOnly = 1 in { 1880 defm VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem, 1881 sse_load_f32, "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG; 1882 defm VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem, 1883 sse_load_f64, "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG; 1884 1885 defm VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem, 1886 sse_load_f32, "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG; 1887 defm VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem, 1888 sse_load_f64, "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG; 1889 } 1890 defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32, 1891 "ucomiss", SSEPackedSingle>, PS; 1892 defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64, 1893 "ucomisd", SSEPackedDouble>, PD; 1894 defm COMISS : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32, 1895 "comiss", SSEPackedSingle>, PS; 1896 defm COMISD : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64, 1897 "comisd", SSEPackedDouble>, PD; 1898 1899 let isCodeGenOnly = 1 in { 1900 defm UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem, 1901 sse_load_f32, "ucomiss", SSEPackedSingle>, PS; 1902 defm UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem, 1903 sse_load_f64, "ucomisd", SSEPackedDouble>, PD; 1904 1905 defm COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem, 1906 sse_load_f32, "comiss", SSEPackedSingle>, PS; 1907 defm COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem, 1908 sse_load_f64, "comisd", SSEPackedDouble>, PD; 1909 } 1910} // Defs = [EFLAGS] 1911 1912// sse12_cmp_packed - sse 1 & 2 compare packed instructions 1913multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, 1914 ValueType VT, string asm, 1915 X86FoldableSchedWrite sched, 1916 Domain d, PatFrag ld_frag> { 1917let Uses = [MXCSR], mayRaiseFPException = 1 in { 1918 let isCommutable = 1 in 1919 def rri : PIi8<0xC2, MRMSrcReg, 1920 (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm, 1921 [(set RC:$dst, (VT (X86any_cmpp RC:$src1, RC:$src2, timm:$cc)))], d>, 1922 Sched<[sched]>; 1923 def rmi : PIi8<0xC2, MRMSrcMem, 1924 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm, 1925 [(set RC:$dst, 1926 (VT (X86any_cmpp RC:$src1, (ld_frag addr:$src2), timm:$cc)))], d>, 1927 Sched<[sched.Folded, sched.ReadAfterFold]>; 1928} 1929} 1930 1931defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32, 1932 "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 1933 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG; 1934defm VCMPPD : sse12_cmp_packed<VR128, f128mem, v2f64, 1935 "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 1936 
SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG; 1937defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, v8f32, 1938 "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 1939 SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, VEX_WIG; 1940defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, v4f64, 1941 "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 1942 SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, VEX_WIG; 1943let Constraints = "$src1 = $dst" in { 1944 defm CMPPS : sse12_cmp_packed<VR128, f128mem, v4f32, 1945 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}", 1946 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS; 1947 defm CMPPD : sse12_cmp_packed<VR128, f128mem, v2f64, 1948 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}", 1949 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD; 1950} 1951 1952def CommutableCMPCC : PatLeaf<(timm), [{ 1953 uint64_t Imm = N->getZExtValue() & 0x7; 1954 return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07); 1955}]>; 1956 1957// Patterns to select compares with loads in first operand. 1958let Predicates = [HasAVX] in { 1959 def : Pat<(v4f64 (X86any_cmpp (loadv4f64 addr:$src2), VR256:$src1, 1960 CommutableCMPCC:$cc)), 1961 (VCMPPDYrmi VR256:$src1, addr:$src2, timm:$cc)>; 1962 1963 def : Pat<(v8f32 (X86any_cmpp (loadv8f32 addr:$src2), VR256:$src1, 1964 CommutableCMPCC:$cc)), 1965 (VCMPPSYrmi VR256:$src1, addr:$src2, timm:$cc)>; 1966 1967 def : Pat<(v2f64 (X86any_cmpp (loadv2f64 addr:$src2), VR128:$src1, 1968 CommutableCMPCC:$cc)), 1969 (VCMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>; 1970 1971 def : Pat<(v4f32 (X86any_cmpp (loadv4f32 addr:$src2), VR128:$src1, 1972 CommutableCMPCC:$cc)), 1973 (VCMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>; 1974 1975 def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1, 1976 CommutableCMPCC:$cc)), 1977 (VCMPSDrm FR64:$src1, addr:$src2, timm:$cc)>; 1978 1979 def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1, 1980 CommutableCMPCC:$cc)), 1981 (VCMPSSrm FR32:$src1, addr:$src2, timm:$cc)>; 1982} 1983 1984let Predicates = [UseSSE2] in { 1985 def : Pat<(v2f64 (X86any_cmpp (memopv2f64 addr:$src2), VR128:$src1, 1986 CommutableCMPCC:$cc)), 1987 (CMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>; 1988 1989 def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1, 1990 CommutableCMPCC:$cc)), 1991 (CMPSDrm FR64:$src1, addr:$src2, timm:$cc)>; 1992} 1993 1994let Predicates = [UseSSE1] in { 1995 def : Pat<(v4f32 (X86any_cmpp (memopv4f32 addr:$src2), VR128:$src1, 1996 CommutableCMPCC:$cc)), 1997 (CMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>; 1998 1999 def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1, 2000 CommutableCMPCC:$cc)), 2001 (CMPSSrm FR32:$src1, addr:$src2, timm:$cc)>; 2002} 2003 2004//===----------------------------------------------------------------------===// 2005// SSE 1 & 2 - Shuffle Instructions 2006//===----------------------------------------------------------------------===// 2007 2008/// sse12_shuffle - sse 1 & 2 fp shuffle instructions 2009multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop, 2010 ValueType vt, string asm, PatFrag mem_frag, 2011 X86FoldableSchedWrite sched, Domain d, 2012 bit IsCommutable = 0> { 2013 def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst), 2014 (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm, 2015 [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2), 2016 (i8 timm:$src3))))], d>, 2017 Sched<[sched.Folded, sched.ReadAfterFold]>; 2018 let isCommutable 
= IsCommutable in 2019 def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst), 2020 (ins RC:$src1, RC:$src2, u8imm:$src3), asm, 2021 [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, 2022 (i8 timm:$src3))))], d>, 2023 Sched<[sched]>; 2024} 2025 2026let Predicates = [HasAVX, NoVLX] in { 2027 defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, 2028 "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 2029 loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, 2030 PS, VEX_4V, VEX_WIG; 2031 defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, 2032 "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 2033 loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>, 2034 PS, VEX_4V, VEX_L, VEX_WIG; 2035 defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, 2036 "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 2037 loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>, 2038 PD, VEX_4V, VEX_WIG; 2039 defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, 2040 "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 2041 loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>, 2042 PD, VEX_4V, VEX_L, VEX_WIG; 2043} 2044let Constraints = "$src1 = $dst" in { 2045 defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32, 2046 "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}", 2047 memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS; 2048 defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64, 2049 "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}", 2050 memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD; 2051} 2052 2053//===----------------------------------------------------------------------===// 2054// SSE 1 & 2 - Unpack FP Instructions 2055//===----------------------------------------------------------------------===// 2056 2057/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave 2058multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt, 2059 PatFrag mem_frag, RegisterClass RC, 2060 X86MemOperand x86memop, string asm, 2061 X86FoldableSchedWrite sched, Domain d, 2062 bit IsCommutable = 0> { 2063 let isCommutable = IsCommutable in 2064 def rr : PI<opc, MRMSrcReg, 2065 (outs RC:$dst), (ins RC:$src1, RC:$src2), 2066 asm, [(set RC:$dst, 2067 (vt (OpNode RC:$src1, RC:$src2)))], d>, 2068 Sched<[sched]>; 2069 def rm : PI<opc, MRMSrcMem, 2070 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 2071 asm, [(set RC:$dst, 2072 (vt (OpNode RC:$src1, 2073 (mem_frag addr:$src2))))], d>, 2074 Sched<[sched.Folded, sched.ReadAfterFold]>; 2075} 2076 2077let Predicates = [HasAVX, NoVLX] in { 2078defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load, 2079 VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2080 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG; 2081defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load, 2082 VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2083 SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG; 2084defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load, 2085 VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2086 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG; 2087defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load, 2088 VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2089 SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG; 2090 2091defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load, 2092 VR256, f256mem, "unpckhps\t{$src2, $src1, 
$dst|$dst, $src1, $src2}", 2093 SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG; 2094defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load, 2095 VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2096 SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG; 2097defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load, 2098 VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2099 SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG; 2100defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load, 2101 VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2102 SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG; 2103}// Predicates = [HasAVX, NoVLX] 2104 2105let Constraints = "$src1 = $dst" in { 2106 defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memop, 2107 VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}", 2108 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS; 2109 defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memop, 2110 VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}", 2111 SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD; 2112 defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop, 2113 VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}", 2114 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS; 2115 defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop, 2116 VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}", 2117 SchedWriteFShuffle.XMM, SSEPackedDouble>, PD; 2118} // Constraints = "$src1 = $dst" 2119 2120let Predicates = [HasAVX1Only] in { 2121 def : Pat<(v8i32 (X86Unpckl VR256:$src1, (loadv8i32 addr:$src2))), 2122 (VUNPCKLPSYrm VR256:$src1, addr:$src2)>; 2123 def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)), 2124 (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>; 2125 def : Pat<(v8i32 (X86Unpckh VR256:$src1, (loadv8i32 addr:$src2))), 2126 (VUNPCKHPSYrm VR256:$src1, addr:$src2)>; 2127 def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)), 2128 (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>; 2129 2130 def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))), 2131 (VUNPCKLPDYrm VR256:$src1, addr:$src2)>; 2132 def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)), 2133 (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>; 2134 def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))), 2135 (VUNPCKHPDYrm VR256:$src1, addr:$src2)>; 2136 def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)), 2137 (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>; 2138} 2139 2140let Predicates = [UseSSE2] in { 2141 // Use MOVHPD if the load isn't aligned enough for UNPCKLPD. 
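  // Folding the load into unpcklpd would use memopv2f64, which normally
  // requires a 16-byte aligned address; movhpd loads just the 8 bytes it
  // needs into the high half of $src1, producing the same {src1[0], mem[0]}
  // result without the alignment requirement.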
 def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                             (v2f64 (simple_load addr:$src2)))),
           (MOVHPDrm VR128:$src1, addr:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Extract Floating-Point Sign Mask
//===----------------------------------------------------------------------===//

/// sse12_extr_sign_mask - sse 1 & 2 fp sign mask extraction
multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
                                string asm, Domain d> {
  def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], d>,
           Sched<[WriteFMOVMSK]>;
}

let Predicates = [HasAVX] in {
  defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
                                        SSEPackedSingle>, PS, VEX, VEX_WIG;
  defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
                                        SSEPackedDouble>, PD, VEX, VEX_WIG;
  defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
                                         SSEPackedSingle>, PS, VEX, VEX_L, VEX_WIG;
  defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
                                         SSEPackedDouble>, PD, VEX, VEX_L, VEX_WIG;

  // Also support integer VTs to avoid an int->fp bitcast in the DAG.
  def : Pat<(X86movmsk (v4i32 VR128:$src)),
            (VMOVMSKPSrr VR128:$src)>;
  def : Pat<(X86movmsk (v2i64 VR128:$src)),
            (VMOVMSKPDrr VR128:$src)>;
  def : Pat<(X86movmsk (v8i32 VR256:$src)),
            (VMOVMSKPSYrr VR256:$src)>;
  def : Pat<(X86movmsk (v4i64 VR256:$src)),
            (VMOVMSKPDYrr VR256:$src)>;
}

defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
                                     SSEPackedSingle>, PS;
defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
                                     SSEPackedDouble>, PD;

let Predicates = [UseSSE2] in {
  // Also support integer VTs to avoid an int->fp bitcast in the DAG.
  def : Pat<(X86movmsk (v4i32 VR128:$src)),
            (MOVMSKPSrr VR128:$src)>;
  def : Pat<(X86movmsk (v2i64 VR128:$src)),
            (MOVMSKPDrr VR128:$src)>;
}

//===----------------------------------------------------------------------===//
// SSE2 - Packed Integer Logical Instructions
//===----------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in { // SSE integer instructions

/// PDI_binop_rm - Simple SSE2 binary operator.
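/// Each instantiation produces a register-register (rr) and a register-memory
/// (rm) form; the Is2Addr flag selects between the two-operand SSE assembly
/// string and the three-operand AVX one.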
2201multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, 2202 ValueType OpVT, RegisterClass RC, PatFrag memop_frag, 2203 X86MemOperand x86memop, X86FoldableSchedWrite sched, 2204 bit IsCommutable, bit Is2Addr> { 2205 let isCommutable = IsCommutable in 2206 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), 2207 (ins RC:$src1, RC:$src2), 2208 !if(Is2Addr, 2209 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 2210 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 2211 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, 2212 Sched<[sched]>; 2213 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), 2214 (ins RC:$src1, x86memop:$src2), 2215 !if(Is2Addr, 2216 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 2217 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 2218 [(set RC:$dst, (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>, 2219 Sched<[sched.Folded, sched.ReadAfterFold]>; 2220} 2221} // ExeDomain = SSEPackedInt 2222 2223multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode, 2224 ValueType OpVT128, ValueType OpVT256, 2225 X86SchedWriteWidths sched, bit IsCommutable, 2226 Predicate prd> { 2227let Predicates = [HasAVX, prd] in 2228 defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128, 2229 VR128, load, i128mem, sched.XMM, 2230 IsCommutable, 0>, VEX_4V, VEX_WIG; 2231 2232let Constraints = "$src1 = $dst" in 2233 defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128, 2234 memop, i128mem, sched.XMM, IsCommutable, 1>; 2235 2236let Predicates = [HasAVX2, prd] in 2237 defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, 2238 OpVT256, VR256, load, i256mem, sched.YMM, 2239 IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG; 2240} 2241 2242// These are ordered here for pattern ordering requirements with the fp versions 2243 2244defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64, 2245 SchedWriteVecLogic, 1, NoVLX>; 2246defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64, 2247 SchedWriteVecLogic, 1, NoVLX>; 2248defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64, 2249 SchedWriteVecLogic, 1, NoVLX>; 2250defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64, 2251 SchedWriteVecLogic, 0, NoVLX>; 2252 2253//===----------------------------------------------------------------------===// 2254// SSE 1 & 2 - Logical Instructions 2255//===----------------------------------------------------------------------===// 2256 2257/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops 2258/// 2259/// There are no patterns here because isel prefers integer versions for SSE2 2260/// and later. There are SSE1 v4f32 patterns later. 
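///
/// The pat_rr/pat_rm lists passed in below are empty, so these defs mainly
/// provide the assembly strings and encodings; SSE1 v4f32 logic operations
/// are selected through the explicit X86fand/X86for/X86fxor/X86fandn
/// patterns that appear after the integer logic patterns below.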
2261multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, 2262 SDNode OpNode, X86SchedWriteWidths sched> { 2263 let Predicates = [HasAVX, NoVLX] in { 2264 defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle, 2265 !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM, 2266 [], [], 0>, PS, VEX_4V, VEX_L, VEX_WIG; 2267 2268 defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble, 2269 !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM, 2270 [], [], 0>, PD, VEX_4V, VEX_L, VEX_WIG; 2271 2272 defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, 2273 !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM, 2274 [], [], 0>, PS, VEX_4V, VEX_WIG; 2275 2276 defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, 2277 !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM, 2278 [], [], 0>, PD, VEX_4V, VEX_WIG; 2279 } 2280 2281 let Constraints = "$src1 = $dst" in { 2282 defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, 2283 !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM, 2284 [], []>, PS; 2285 2286 defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, 2287 !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM, 2288 [], []>, PD; 2289 } 2290} 2291 2292defm AND : sse12_fp_packed_logical<0x54, "and", and, SchedWriteFLogic>; 2293defm OR : sse12_fp_packed_logical<0x56, "or", or, SchedWriteFLogic>; 2294defm XOR : sse12_fp_packed_logical<0x57, "xor", xor, SchedWriteFLogic>; 2295let isCommutable = 0 in 2296 defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp, SchedWriteFLogic>; 2297 2298let Predicates = [HasAVX2, NoVLX] in { 2299 def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)), 2300 (VPANDYrr VR256:$src1, VR256:$src2)>; 2301 def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)), 2302 (VPANDYrr VR256:$src1, VR256:$src2)>; 2303 def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)), 2304 (VPANDYrr VR256:$src1, VR256:$src2)>; 2305 2306 def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)), 2307 (VPORYrr VR256:$src1, VR256:$src2)>; 2308 def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)), 2309 (VPORYrr VR256:$src1, VR256:$src2)>; 2310 def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)), 2311 (VPORYrr VR256:$src1, VR256:$src2)>; 2312 2313 def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)), 2314 (VPXORYrr VR256:$src1, VR256:$src2)>; 2315 def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)), 2316 (VPXORYrr VR256:$src1, VR256:$src2)>; 2317 def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)), 2318 (VPXORYrr VR256:$src1, VR256:$src2)>; 2319 2320 def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)), 2321 (VPANDNYrr VR256:$src1, VR256:$src2)>; 2322 def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)), 2323 (VPANDNYrr VR256:$src1, VR256:$src2)>; 2324 def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)), 2325 (VPANDNYrr VR256:$src1, VR256:$src2)>; 2326 2327 def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)), 2328 (VPANDYrm VR256:$src1, addr:$src2)>; 2329 def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)), 2330 (VPANDYrm VR256:$src1, addr:$src2)>; 2331 def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)), 2332 (VPANDYrm VR256:$src1, addr:$src2)>; 2333 2334 def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)), 2335 (VPORYrm VR256:$src1, addr:$src2)>; 2336 def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)), 2337 (VPORYrm VR256:$src1, addr:$src2)>; 2338 def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)), 2339 (VPORYrm VR256:$src1, addr:$src2)>; 2340 2341 def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)), 2342 (VPXORYrm VR256:$src1, addr:$src2)>; 2343 def : 
Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)), 2344 (VPXORYrm VR256:$src1, addr:$src2)>; 2345 def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)), 2346 (VPXORYrm VR256:$src1, addr:$src2)>; 2347 2348 def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)), 2349 (VPANDNYrm VR256:$src1, addr:$src2)>; 2350 def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)), 2351 (VPANDNYrm VR256:$src1, addr:$src2)>; 2352 def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)), 2353 (VPANDNYrm VR256:$src1, addr:$src2)>; 2354} 2355 2356// If only AVX1 is supported, we need to handle integer operations with 2357// floating point instructions since the integer versions aren't available. 2358let Predicates = [HasAVX1Only] in { 2359 def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)), 2360 (VANDPSYrr VR256:$src1, VR256:$src2)>; 2361 def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)), 2362 (VANDPSYrr VR256:$src1, VR256:$src2)>; 2363 def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)), 2364 (VANDPSYrr VR256:$src1, VR256:$src2)>; 2365 def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)), 2366 (VANDPSYrr VR256:$src1, VR256:$src2)>; 2367 2368 def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)), 2369 (VORPSYrr VR256:$src1, VR256:$src2)>; 2370 def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)), 2371 (VORPSYrr VR256:$src1, VR256:$src2)>; 2372 def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)), 2373 (VORPSYrr VR256:$src1, VR256:$src2)>; 2374 def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)), 2375 (VORPSYrr VR256:$src1, VR256:$src2)>; 2376 2377 def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)), 2378 (VXORPSYrr VR256:$src1, VR256:$src2)>; 2379 def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)), 2380 (VXORPSYrr VR256:$src1, VR256:$src2)>; 2381 def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)), 2382 (VXORPSYrr VR256:$src1, VR256:$src2)>; 2383 def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)), 2384 (VXORPSYrr VR256:$src1, VR256:$src2)>; 2385 2386 def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)), 2387 (VANDNPSYrr VR256:$src1, VR256:$src2)>; 2388 def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)), 2389 (VANDNPSYrr VR256:$src1, VR256:$src2)>; 2390 def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)), 2391 (VANDNPSYrr VR256:$src1, VR256:$src2)>; 2392 def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)), 2393 (VANDNPSYrr VR256:$src1, VR256:$src2)>; 2394 2395 def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)), 2396 (VANDPSYrm VR256:$src1, addr:$src2)>; 2397 def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)), 2398 (VANDPSYrm VR256:$src1, addr:$src2)>; 2399 def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)), 2400 (VANDPSYrm VR256:$src1, addr:$src2)>; 2401 def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)), 2402 (VANDPSYrm VR256:$src1, addr:$src2)>; 2403 2404 def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)), 2405 (VORPSYrm VR256:$src1, addr:$src2)>; 2406 def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)), 2407 (VORPSYrm VR256:$src1, addr:$src2)>; 2408 def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)), 2409 (VORPSYrm VR256:$src1, addr:$src2)>; 2410 def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)), 2411 (VORPSYrm VR256:$src1, addr:$src2)>; 2412 2413 def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)), 2414 (VXORPSYrm VR256:$src1, addr:$src2)>; 2415 def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)), 2416 (VXORPSYrm VR256:$src1, addr:$src2)>; 2417 def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)), 2418 (VXORPSYrm VR256:$src1, addr:$src2)>; 2419 def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)), 2420 (VXORPSYrm VR256:$src1, 
addr:$src2)>; 2421 2422 def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)), 2423 (VANDNPSYrm VR256:$src1, addr:$src2)>; 2424 def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)), 2425 (VANDNPSYrm VR256:$src1, addr:$src2)>; 2426 def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)), 2427 (VANDNPSYrm VR256:$src1, addr:$src2)>; 2428 def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)), 2429 (VANDNPSYrm VR256:$src1, addr:$src2)>; 2430} 2431 2432let Predicates = [HasAVX, NoVLX] in { 2433 def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)), 2434 (VPANDrr VR128:$src1, VR128:$src2)>; 2435 def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)), 2436 (VPANDrr VR128:$src1, VR128:$src2)>; 2437 def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)), 2438 (VPANDrr VR128:$src1, VR128:$src2)>; 2439 2440 def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)), 2441 (VPORrr VR128:$src1, VR128:$src2)>; 2442 def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)), 2443 (VPORrr VR128:$src1, VR128:$src2)>; 2444 def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)), 2445 (VPORrr VR128:$src1, VR128:$src2)>; 2446 2447 def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)), 2448 (VPXORrr VR128:$src1, VR128:$src2)>; 2449 def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)), 2450 (VPXORrr VR128:$src1, VR128:$src2)>; 2451 def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)), 2452 (VPXORrr VR128:$src1, VR128:$src2)>; 2453 2454 def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)), 2455 (VPANDNrr VR128:$src1, VR128:$src2)>; 2456 def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)), 2457 (VPANDNrr VR128:$src1, VR128:$src2)>; 2458 def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)), 2459 (VPANDNrr VR128:$src1, VR128:$src2)>; 2460 2461 def : Pat<(and VR128:$src1, (loadv16i8 addr:$src2)), 2462 (VPANDrm VR128:$src1, addr:$src2)>; 2463 def : Pat<(and VR128:$src1, (loadv8i16 addr:$src2)), 2464 (VPANDrm VR128:$src1, addr:$src2)>; 2465 def : Pat<(and VR128:$src1, (loadv4i32 addr:$src2)), 2466 (VPANDrm VR128:$src1, addr:$src2)>; 2467 2468 def : Pat<(or VR128:$src1, (loadv16i8 addr:$src2)), 2469 (VPORrm VR128:$src1, addr:$src2)>; 2470 def : Pat<(or VR128:$src1, (loadv8i16 addr:$src2)), 2471 (VPORrm VR128:$src1, addr:$src2)>; 2472 def : Pat<(or VR128:$src1, (loadv4i32 addr:$src2)), 2473 (VPORrm VR128:$src1, addr:$src2)>; 2474 2475 def : Pat<(xor VR128:$src1, (loadv16i8 addr:$src2)), 2476 (VPXORrm VR128:$src1, addr:$src2)>; 2477 def : Pat<(xor VR128:$src1, (loadv8i16 addr:$src2)), 2478 (VPXORrm VR128:$src1, addr:$src2)>; 2479 def : Pat<(xor VR128:$src1, (loadv4i32 addr:$src2)), 2480 (VPXORrm VR128:$src1, addr:$src2)>; 2481 2482 def : Pat<(X86andnp VR128:$src1, (loadv16i8 addr:$src2)), 2483 (VPANDNrm VR128:$src1, addr:$src2)>; 2484 def : Pat<(X86andnp VR128:$src1, (loadv8i16 addr:$src2)), 2485 (VPANDNrm VR128:$src1, addr:$src2)>; 2486 def : Pat<(X86andnp VR128:$src1, (loadv4i32 addr:$src2)), 2487 (VPANDNrm VR128:$src1, addr:$src2)>; 2488} 2489 2490let Predicates = [UseSSE2] in { 2491 def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)), 2492 (PANDrr VR128:$src1, VR128:$src2)>; 2493 def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)), 2494 (PANDrr VR128:$src1, VR128:$src2)>; 2495 def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)), 2496 (PANDrr VR128:$src1, VR128:$src2)>; 2497 2498 def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)), 2499 (PORrr VR128:$src1, VR128:$src2)>; 2500 def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)), 2501 (PORrr VR128:$src1, VR128:$src2)>; 2502 def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)), 2503 (PORrr VR128:$src1, VR128:$src2)>; 2504 2505 def : 
Pat<(v16i8 (xor VR128:$src1, VR128:$src2)), 2506 (PXORrr VR128:$src1, VR128:$src2)>; 2507 def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)), 2508 (PXORrr VR128:$src1, VR128:$src2)>; 2509 def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)), 2510 (PXORrr VR128:$src1, VR128:$src2)>; 2511 2512 def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)), 2513 (PANDNrr VR128:$src1, VR128:$src2)>; 2514 def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)), 2515 (PANDNrr VR128:$src1, VR128:$src2)>; 2516 def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)), 2517 (PANDNrr VR128:$src1, VR128:$src2)>; 2518 2519 def : Pat<(and VR128:$src1, (memopv16i8 addr:$src2)), 2520 (PANDrm VR128:$src1, addr:$src2)>; 2521 def : Pat<(and VR128:$src1, (memopv8i16 addr:$src2)), 2522 (PANDrm VR128:$src1, addr:$src2)>; 2523 def : Pat<(and VR128:$src1, (memopv4i32 addr:$src2)), 2524 (PANDrm VR128:$src1, addr:$src2)>; 2525 2526 def : Pat<(or VR128:$src1, (memopv16i8 addr:$src2)), 2527 (PORrm VR128:$src1, addr:$src2)>; 2528 def : Pat<(or VR128:$src1, (memopv8i16 addr:$src2)), 2529 (PORrm VR128:$src1, addr:$src2)>; 2530 def : Pat<(or VR128:$src1, (memopv4i32 addr:$src2)), 2531 (PORrm VR128:$src1, addr:$src2)>; 2532 2533 def : Pat<(xor VR128:$src1, (memopv16i8 addr:$src2)), 2534 (PXORrm VR128:$src1, addr:$src2)>; 2535 def : Pat<(xor VR128:$src1, (memopv8i16 addr:$src2)), 2536 (PXORrm VR128:$src1, addr:$src2)>; 2537 def : Pat<(xor VR128:$src1, (memopv4i32 addr:$src2)), 2538 (PXORrm VR128:$src1, addr:$src2)>; 2539 2540 def : Pat<(X86andnp VR128:$src1, (memopv16i8 addr:$src2)), 2541 (PANDNrm VR128:$src1, addr:$src2)>; 2542 def : Pat<(X86andnp VR128:$src1, (memopv8i16 addr:$src2)), 2543 (PANDNrm VR128:$src1, addr:$src2)>; 2544 def : Pat<(X86andnp VR128:$src1, (memopv4i32 addr:$src2)), 2545 (PANDNrm VR128:$src1, addr:$src2)>; 2546} 2547 2548// Patterns for packed operations when we don't have integer type available. 2549def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)), 2550 (ANDPSrr VR128:$src1, VR128:$src2)>; 2551def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)), 2552 (ORPSrr VR128:$src1, VR128:$src2)>; 2553def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)), 2554 (XORPSrr VR128:$src1, VR128:$src2)>; 2555def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)), 2556 (ANDNPSrr VR128:$src1, VR128:$src2)>; 2557 2558def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)), 2559 (ANDPSrm VR128:$src1, addr:$src2)>; 2560def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)), 2561 (ORPSrm VR128:$src1, addr:$src2)>; 2562def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)), 2563 (XORPSrm VR128:$src1, addr:$src2)>; 2564def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)), 2565 (ANDNPSrm VR128:$src1, addr:$src2)>; 2566 2567//===----------------------------------------------------------------------===// 2568// SSE 1 & 2 - Arithmetic Instructions 2569//===----------------------------------------------------------------------===// 2570 2571/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and 2572/// vector forms. 2573/// 2574/// In addition, we also have a special variant of the scalar form here to 2575/// represent the associated intrinsic operation. This form is unlike the 2576/// plain scalar form, in that it takes an entire vector (instead of a scalar) 2577/// and leaves the top elements unmodified (therefore these cannot be commuted). 2578/// 2579/// These three forms can each be reg+reg or reg+mem. 
2580/// 2581 2582/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those 2583/// classes below 2584multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, 2585 SDNode OpNode, X86SchedWriteSizes sched> { 2586let Uses = [MXCSR], mayRaiseFPException = 1 in { 2587 let Predicates = [HasAVX, NoVLX] in { 2588 defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, 2589 VR128, v4f32, f128mem, loadv4f32, 2590 SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX_4V, VEX_WIG; 2591 defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, 2592 VR128, v2f64, f128mem, loadv2f64, 2593 SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX_4V, VEX_WIG; 2594 2595 defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), 2596 OpNode, VR256, v8f32, f256mem, loadv8f32, 2597 SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX_4V, VEX_L, VEX_WIG; 2598 defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), 2599 OpNode, VR256, v4f64, f256mem, loadv4f64, 2600 SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX_4V, VEX_L, VEX_WIG; 2601 } 2602 2603 let Constraints = "$src1 = $dst" in { 2604 defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, 2605 v4f32, f128mem, memopv4f32, SSEPackedSingle, 2606 sched.PS.XMM>, PS; 2607 defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, 2608 v2f64, f128mem, memopv2f64, SSEPackedDouble, 2609 sched.PD.XMM>, PD; 2610 } 2611} 2612} 2613 2614multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, 2615 X86SchedWriteSizes sched> { 2616let Uses = [MXCSR], mayRaiseFPException = 1 in { 2617 defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), 2618 OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>, 2619 XS, VEX_4V, VEX_LIG, VEX_WIG; 2620 defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), 2621 OpNode, FR64, f64mem, SSEPackedDouble, sched.PD.Scl, 0>, 2622 XD, VEX_4V, VEX_LIG, VEX_WIG; 2623 2624 let Constraints = "$src1 = $dst" in { 2625 defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), 2626 OpNode, FR32, f32mem, SSEPackedSingle, 2627 sched.PS.Scl>, XS; 2628 defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), 2629 OpNode, FR64, f64mem, SSEPackedDouble, 2630 sched.PD.Scl>, XD; 2631 } 2632} 2633} 2634 2635multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr, 2636 SDPatternOperator OpNode, 2637 X86SchedWriteSizes sched> { 2638let Uses = [MXCSR], mayRaiseFPException = 1 in { 2639 defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32, 2640 !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32, 2641 SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG; 2642 defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64, 2643 !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64, 2644 SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG; 2645 2646 let Constraints = "$src1 = $dst" in { 2647 defm SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32, 2648 !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32, 2649 SSEPackedSingle, sched.PS.Scl>, XS; 2650 defm SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64, 2651 !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64, 2652 SSEPackedDouble, sched.PD.Scl>, XD; 2653 } 2654} 2655} 2656 2657// Binary Arithmetic instructions 2658defm ADD : basic_sse12_fp_binop_p<0x58, "add", any_fadd, SchedWriteFAddSizes>, 2659 basic_sse12_fp_binop_s<0x58, "add", any_fadd, SchedWriteFAddSizes>, 2660 basic_sse12_fp_binop_s_int<0x58, "add", 
null_frag, SchedWriteFAddSizes>; 2661defm MUL : basic_sse12_fp_binop_p<0x59, "mul", any_fmul, SchedWriteFMulSizes>, 2662 basic_sse12_fp_binop_s<0x59, "mul", any_fmul, SchedWriteFMulSizes>, 2663 basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SchedWriteFMulSizes>; 2664let isCommutable = 0 in { 2665 defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", any_fsub, SchedWriteFAddSizes>, 2666 basic_sse12_fp_binop_s<0x5C, "sub", any_fsub, SchedWriteFAddSizes>, 2667 basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SchedWriteFAddSizes>; 2668 defm DIV : basic_sse12_fp_binop_p<0x5E, "div", any_fdiv, SchedWriteFDivSizes>, 2669 basic_sse12_fp_binop_s<0x5E, "div", any_fdiv, SchedWriteFDivSizes>, 2670 basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>; 2671 defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>, 2672 basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>, 2673 basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>; 2674 defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>, 2675 basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>, 2676 basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>; 2677} 2678 2679let isCodeGenOnly = 1 in { 2680 defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>, 2681 basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>; 2682 defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SchedWriteFCmpSizes>, 2683 basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SchedWriteFCmpSizes>; 2684} 2685 2686// Patterns used to select SSE scalar fp arithmetic instructions from 2687// either: 2688// 2689// (1) a scalar fp operation followed by a blend 2690// 2691// The effect is that the backend no longer emits unnecessary vector 2692// insert instructions immediately after SSE scalar fp instructions 2693// like addss or mulss. 2694// 2695// For example, given the following code: 2696// __m128 foo(__m128 A, __m128 B) { 2697// A[0] += B[0]; 2698// return A; 2699// } 2700// 2701// Previously we generated: 2702// addss %xmm0, %xmm1 2703// movss %xmm1, %xmm0 2704// 2705// We now generate: 2706// addss %xmm1, %xmm0 2707// 2708// (2) a vector packed single/double fp operation followed by a vector insert 2709// 2710// The effect is that the backend converts the packed fp instruction 2711// followed by a vector insert into a single SSE scalar fp instruction. 2712// 2713// For example, given the following code: 2714// __m128 foo(__m128 A, __m128 B) { 2715// __m128 C = A + B; 2716// return (__m128) {c[0], a[1], a[2], a[3]}; 2717// } 2718// 2719// Previously we generated: 2720// addps %xmm0, %xmm1 2721// movss %xmm1, %xmm0 2722// 2723// We now generate: 2724// addss %xmm1, %xmm0 2725 2726// TODO: Some canonicalization in lowering would simplify the number of 2727// patterns we have to try to match. 
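// As a rough intrinsics-level sketch of case (1) (illustrative only, not
// taken from an existing test), an explicit extract/op/re-insert sequence
// such as:
//   __m128 foo(__m128 A, __m128 B) {
//     float r = _mm_cvtss_f32(A) + _mm_cvtss_f32(B); // scalar fadd on lane 0
//     return _mm_move_ss(A, _mm_set_ss(r));          // blend result back into A
//   }
// is the kind of input the scalar_math_patterns multiclass below is intended
// to collapse into a single addss/vaddss.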
2728multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move, 2729 ValueType VT, ValueType EltTy, 2730 RegisterClass RC, PatFrag ld_frag, 2731 Predicate BasePredicate> { 2732 let Predicates = [BasePredicate] in { 2733 // extracted scalar math op with insert via movss/movsd 2734 def : Pat<(VT (Move (VT VR128:$dst), 2735 (VT (scalar_to_vector 2736 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))), 2737 RC:$src))))), 2738 (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst, 2739 (VT (COPY_TO_REGCLASS RC:$src, VR128)))>; 2740 def : Pat<(VT (Move (VT VR128:$dst), 2741 (VT (scalar_to_vector 2742 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))), 2743 (ld_frag addr:$src)))))), 2744 (!cast<Instruction>(OpcPrefix#rm_Int) VT:$dst, addr:$src)>; 2745 } 2746 2747 // Repeat for AVX versions of the instructions. 2748 let Predicates = [UseAVX] in { 2749 // extracted scalar math op with insert via movss/movsd 2750 def : Pat<(VT (Move (VT VR128:$dst), 2751 (VT (scalar_to_vector 2752 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))), 2753 RC:$src))))), 2754 (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst, 2755 (VT (COPY_TO_REGCLASS RC:$src, VR128)))>; 2756 def : Pat<(VT (Move (VT VR128:$dst), 2757 (VT (scalar_to_vector 2758 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))), 2759 (ld_frag addr:$src)))))), 2760 (!cast<Instruction>("V"#OpcPrefix#rm_Int) VT:$dst, addr:$src)>; 2761 } 2762} 2763 2764defm : scalar_math_patterns<any_fadd, "ADDSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; 2765defm : scalar_math_patterns<any_fsub, "SUBSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; 2766defm : scalar_math_patterns<any_fmul, "MULSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; 2767defm : scalar_math_patterns<any_fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; 2768 2769defm : scalar_math_patterns<any_fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; 2770defm : scalar_math_patterns<any_fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; 2771defm : scalar_math_patterns<any_fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; 2772defm : scalar_math_patterns<any_fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; 2773 2774/// Unop Arithmetic 2775/// In addition, we also have a special variant of the scalar form here to 2776/// represent the associated intrinsic operation. This form is unlike the 2777/// plain scalar form, in that it takes an entire vector (instead of a 2778/// scalar) and leaves the top elements undefined. 2779/// 2780/// And, we have a special variant form for a full-vector intrinsic form. 2781 2782/// sse_fp_unop_s - SSE1 unops in scalar form 2783/// For the non-AVX defs, we need $src1 to be tied to $dst because 2784/// the HW instructions are 2 operand / destructive. 
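// In C intrinsic terms the two forms described above differ roughly as
// follows (a sketch only, using sqrt as the example operation):
//   float  s = sqrtf(x);         // plain scalar form: FR32 -> FR32, selected
//                                // from the isCodeGenOnly defs below
//   __m128 v = _mm_sqrt_ss(a);   // *_Int form: whole vector in and out; only
//                                // element 0 carries the computed result, the
//                                // upper elements are as described above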
2785multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, 2786 ValueType ScalarVT, X86MemOperand x86memop, 2787 Operand intmemop, SDNode OpNode, Domain d, 2788 X86FoldableSchedWrite sched, Predicate target> { 2789 let isCodeGenOnly = 1, hasSideEffects = 0 in { 2790 def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1), 2791 !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"), 2792 [(set RC:$dst, (OpNode RC:$src1))], d>, Sched<[sched]>, 2793 Requires<[target]>; 2794 let mayLoad = 1 in 2795 def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1), 2796 !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"), 2797 [(set RC:$dst, (OpNode (load addr:$src1)))], d>, 2798 Sched<[sched.Folded]>, 2799 Requires<[target, OptForSize]>; 2800 } 2801 2802 let hasSideEffects = 0, Constraints = "$src1 = $dst", ExeDomain = d in { 2803 def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), 2804 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>, 2805 Sched<[sched]>; 2806 let mayLoad = 1 in 2807 def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2), 2808 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>, 2809 Sched<[sched.Folded, sched.ReadAfterFold]>; 2810 } 2811 2812} 2813 2814multiclass sse_fp_unop_s_intr<RegisterClass RC, ValueType vt, 2815 ComplexPattern int_cpat, Intrinsic Intr, 2816 Predicate target, string Suffix> { 2817 let Predicates = [target] in { 2818 // These are unary operations, but they are modeled as having 2 source operands 2819 // because the high elements of the destination are unchanged in SSE. 2820 def : Pat<(Intr VR128:$src), 2821 (!cast<Instruction>(NAME#r_Int) VR128:$src, VR128:$src)>; 2822 } 2823 // We don't want to fold scalar loads into these instructions unless 2824 // optimizing for size. This is because the folded instruction will have a 2825 // partial register update, while the unfolded sequence will not, e.g. 2826 // movss mem, %xmm0 2827 // rcpss %xmm0, %xmm0 2828 // which has a clobber before the rcp, vs. 
2829 // rcpss mem, %xmm0 2830 let Predicates = [target, OptForSize] in { 2831 def : Pat<(Intr int_cpat:$src2), 2832 (!cast<Instruction>(NAME#m_Int) 2833 (vt (IMPLICIT_DEF)), addr:$src2)>; 2834 } 2835} 2836 2837multiclass avx_fp_unop_s_intr<RegisterClass RC, ValueType vt, ComplexPattern int_cpat, 2838 Intrinsic Intr, Predicate target> { 2839 let Predicates = [target] in { 2840 def : Pat<(Intr VR128:$src), 2841 (!cast<Instruction>(NAME#r_Int) VR128:$src, 2842 VR128:$src)>; 2843 } 2844 let Predicates = [target, OptForSize] in { 2845 def : Pat<(Intr int_cpat:$src2), 2846 (!cast<Instruction>(NAME#m_Int) 2847 (vt (IMPLICIT_DEF)), addr:$src2)>; 2848 } 2849} 2850 2851multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, 2852 ValueType ScalarVT, X86MemOperand x86memop, 2853 Operand intmemop, SDNode OpNode, Domain d, 2854 X86FoldableSchedWrite sched, Predicate target> { 2855 let isCodeGenOnly = 1, hasSideEffects = 0 in { 2856 def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), 2857 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 2858 [], d>, Sched<[sched]>; 2859 let mayLoad = 1 in 2860 def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 2861 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 2862 [], d>, Sched<[sched.Folded, sched.ReadAfterFold]>; 2863 } 2864 let hasSideEffects = 0, ExeDomain = d in { 2865 def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), 2866 (ins VR128:$src1, VR128:$src2), 2867 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 2868 []>, Sched<[sched]>; 2869 let mayLoad = 1 in 2870 def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), 2871 (ins VR128:$src1, intmemop:$src2), 2872 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 2873 []>, Sched<[sched.Folded, sched.ReadAfterFold]>; 2874 } 2875 2876 // We don't want to fold scalar loads into these instructions unless 2877 // optimizing for size. This is because the folded instruction will have a 2878 // partial register update, while the unfolded sequence will not, e.g. 2879 // vmovss mem, %xmm0 2880 // vrcpss %xmm0, %xmm0, %xmm0 2881 // which has a clobber before the rcp, vs. 2882 // vrcpss mem, %xmm0, %xmm0 2883 // TODO: In theory, we could fold the load, and avoid the stall caused by 2884 // the partial register store, either in BreakFalseDeps or with smarter RA. 2885 let Predicates = [target] in { 2886 def : Pat<(OpNode RC:$src), (!cast<Instruction>(NAME#r) 2887 (ScalarVT (IMPLICIT_DEF)), RC:$src)>; 2888 } 2889 let Predicates = [target, OptForSize] in { 2890 def : Pat<(ScalarVT (OpNode (load addr:$src))), 2891 (!cast<Instruction>(NAME#m) (ScalarVT (IMPLICIT_DEF)), 2892 addr:$src)>; 2893 } 2894} 2895 2896/// sse1_fp_unop_p - SSE1 unops in packed form. 
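/// For example (an intrinsics sketch, illustrative only), the packed form
/// corresponds to operations such as:
///   __m128 r = _mm_sqrt_ps(v);   // sqrtps/vsqrtps: all four lanes at once
/// in contrast to the scalar forms above, which only produce element 0.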
2897multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, 2898 X86SchedWriteWidths sched, list<Predicate> prds> { 2899let Predicates = prds in { 2900 def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2901 !strconcat("v", OpcodeStr, 2902 "ps\t{$src, $dst|$dst, $src}"), 2903 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>, 2904 VEX, Sched<[sched.XMM]>, VEX_WIG; 2905 def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 2906 !strconcat("v", OpcodeStr, 2907 "ps\t{$src, $dst|$dst, $src}"), 2908 [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>, 2909 VEX, Sched<[sched.XMM.Folded]>, VEX_WIG; 2910 def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 2911 !strconcat("v", OpcodeStr, 2912 "ps\t{$src, $dst|$dst, $src}"), 2913 [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>, 2914 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG; 2915 def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 2916 !strconcat("v", OpcodeStr, 2917 "ps\t{$src, $dst|$dst, $src}"), 2918 [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))]>, 2919 VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG; 2920} 2921 2922 def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2923 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), 2924 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>, 2925 Sched<[sched.XMM]>; 2926 def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 2927 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), 2928 [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>, 2929 Sched<[sched.XMM.Folded]>; 2930} 2931 2932/// sse2_fp_unop_p - SSE2 unops in vector forms. 2933multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr, 2934 SDNode OpNode, X86SchedWriteWidths sched> { 2935let Predicates = [HasAVX, NoVLX] in { 2936 def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2937 !strconcat("v", OpcodeStr, 2938 "pd\t{$src, $dst|$dst, $src}"), 2939 [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>, 2940 VEX, Sched<[sched.XMM]>, VEX_WIG; 2941 def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 2942 !strconcat("v", OpcodeStr, 2943 "pd\t{$src, $dst|$dst, $src}"), 2944 [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>, 2945 VEX, Sched<[sched.XMM.Folded]>, VEX_WIG; 2946 def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 2947 !strconcat("v", OpcodeStr, 2948 "pd\t{$src, $dst|$dst, $src}"), 2949 [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>, 2950 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG; 2951 def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 2952 !strconcat("v", OpcodeStr, 2953 "pd\t{$src, $dst|$dst, $src}"), 2954 [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))]>, 2955 VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG; 2956} 2957 2958 def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2959 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), 2960 [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>, 2961 Sched<[sched.XMM]>; 2962 def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 2963 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), 2964 [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>, 2965 Sched<[sched.XMM.Folded]>; 2966} 2967 2968multiclass sse1_fp_unop_s_intr<bits<8> opc, string OpcodeStr, SDNode OpNode, 2969 X86SchedWriteWidths sched, Predicate AVXTarget> { 2970 defm SS : sse_fp_unop_s_intr<FR32, v4f32, sse_load_f32, 2971 !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), 2972 
UseSSE1, "SS">, XS; 2973 defm V#NAME#SS : avx_fp_unop_s_intr<FR32, v4f32, sse_load_f32, 2974 !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), 2975 AVXTarget>, 2976 XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable; 2977} 2978 2979multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, 2980 X86SchedWriteWidths sched, Predicate AVXTarget> { 2981 defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, f32, f32mem, 2982 ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS; 2983 defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, f32, 2984 f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>, 2985 XS, VEX_4V, VEX_LIG, VEX_WIG; 2986} 2987 2988multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, 2989 X86SchedWriteWidths sched, Predicate AVXTarget> { 2990 defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, f64, f64mem, 2991 sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD; 2992 defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, f64, 2993 f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>, 2994 XD, VEX_4V, VEX_LIG, VEX_WIG; 2995} 2996 2997// Square root. 2998defm SQRT : sse1_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, UseAVX>, 2999 sse1_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>, 3000 sse2_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64, UseAVX>, 3001 sse2_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64>, SIMD_EXC; 3002 3003// Reciprocal approximations. Note that these typically require refinement 3004// in order to obtain suitable precision. 3005defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>, 3006 sse1_fp_unop_s_intr<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>, 3007 sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>; 3008defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>, 3009 sse1_fp_unop_s_intr<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>, 3010 sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>; 3011 3012// There is no f64 version of the reciprocal approximation instructions. 3013 3014multiclass scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix, SDNode Move, 3015 ValueType VT, Predicate BasePredicate> { 3016 let Predicates = [BasePredicate] in { 3017 def : Pat<(VT (Move VT:$dst, (scalar_to_vector 3018 (OpNode (extractelt VT:$src, 0))))), 3019 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>; 3020 } 3021 3022 // Repeat for AVX versions of the instructions. 3023 let Predicates = [UseAVX] in { 3024 def : Pat<(VT (Move VT:$dst, (scalar_to_vector 3025 (OpNode (extractelt VT:$src, 0))))), 3026 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>; 3027 } 3028} 3029 3030defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>; 3031defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>; 3032 3033multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix, 3034 SDNode Move, ValueType VT, 3035 Predicate BasePredicate> { 3036 let Predicates = [BasePredicate] in { 3037 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))), 3038 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>; 3039 } 3040 3041 // Repeat for AVX versions of the instructions. 
3042 let Predicates = [HasAVX] in { 3043 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))), 3044 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>; 3045 } 3046} 3047 3048defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss, 3049 v4f32, UseSSE1>; 3050defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss, 3051 v4f32, UseSSE1>; 3052 3053 3054//===----------------------------------------------------------------------===// 3055// SSE 1 & 2 - Non-temporal stores 3056//===----------------------------------------------------------------------===// 3057 3058let AddedComplexity = 400 in { // Prefer non-temporal versions 3059let Predicates = [HasAVX, NoVLX] in { 3060let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in { 3061def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs), 3062 (ins f128mem:$dst, VR128:$src), 3063 "movntps\t{$src, $dst|$dst, $src}", 3064 [(alignednontemporalstore (v4f32 VR128:$src), 3065 addr:$dst)]>, VEX, VEX_WIG; 3066def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs), 3067 (ins f128mem:$dst, VR128:$src), 3068 "movntpd\t{$src, $dst|$dst, $src}", 3069 [(alignednontemporalstore (v2f64 VR128:$src), 3070 addr:$dst)]>, VEX, VEX_WIG; 3071} // SchedRW 3072 3073let SchedRW = [SchedWriteFMoveLSNT.YMM.MR] in { 3074def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs), 3075 (ins f256mem:$dst, VR256:$src), 3076 "movntps\t{$src, $dst|$dst, $src}", 3077 [(alignednontemporalstore (v8f32 VR256:$src), 3078 addr:$dst)]>, VEX, VEX_L, VEX_WIG; 3079def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs), 3080 (ins f256mem:$dst, VR256:$src), 3081 "movntpd\t{$src, $dst|$dst, $src}", 3082 [(alignednontemporalstore (v4f64 VR256:$src), 3083 addr:$dst)]>, VEX, VEX_L, VEX_WIG; 3084} // SchedRW 3085 3086let ExeDomain = SSEPackedInt in { 3087def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs), 3088 (ins i128mem:$dst, VR128:$src), 3089 "movntdq\t{$src, $dst|$dst, $src}", 3090 [(alignednontemporalstore (v2i64 VR128:$src), 3091 addr:$dst)]>, VEX, VEX_WIG, 3092 Sched<[SchedWriteVecMoveLSNT.XMM.MR]>; 3093def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs), 3094 (ins i256mem:$dst, VR256:$src), 3095 "movntdq\t{$src, $dst|$dst, $src}", 3096 [(alignednontemporalstore (v4i64 VR256:$src), 3097 addr:$dst)]>, VEX, VEX_L, VEX_WIG, 3098 Sched<[SchedWriteVecMoveLSNT.YMM.MR]>; 3099} // ExeDomain 3100} // Predicates 3101 3102let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in { 3103def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 3104 "movntps\t{$src, $dst|$dst, $src}", 3105 [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>; 3106def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 3107 "movntpd\t{$src, $dst|$dst, $src}", 3108 [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>; 3109} // SchedRW 3110 3111let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLSNT.XMM.MR] in 3112def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 3113 "movntdq\t{$src, $dst|$dst, $src}", 3114 [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)]>; 3115 3116let SchedRW = [WriteStoreNT] in { 3117// There is no AVX form for instructions below this point 3118def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), 3119 "movnti{l}\t{$src, $dst|$dst, $src}", 3120 [(nontemporalstore (i32 GR32:$src), addr:$dst)]>, 3121 PS, Requires<[HasSSE2]>; 3122def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), 3123 "movnti{q}\t{$src, $dst|$dst, $src}", 3124 [(nontemporalstore (i64 GR64:$src), addr:$dst)]>, 3125 
                 PS, Requires<[HasSSE2]>;
} // SchedRW = [WriteStoreNT]

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;

  def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
            (MOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
            (MOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
            (MOVNTDQmr addr:$dst, VR128:$src)>;
}

} // AddedComplexity

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Prefetch and memory fence
//===----------------------------------------------------------------------===//

// Prefetch intrinsic.
let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in {
def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src),
    "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>, TB;
def PREFETCHT1 : I<0x18, MRM2m, (outs), (ins i8mem:$src),
    "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>, TB;
def PREFETCHT2 : I<0x18, MRM3m, (outs), (ins i8mem:$src),
    "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>, TB;
def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src),
    "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>, TB;
}

// FIXME: How should flush instruction be modeled?
let SchedRW = [WriteLoad] in {
// Flush cache
def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
               "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
               PS, Requires<[HasSSE2]>;
}

let SchedRW = [WriteNop] in {
// Pause. This "instruction" is encoded as "rep; nop", so even though it
// was introduced with SSE2, it's backward compatible.
def PAUSE : I<0x90, RawFrm, (outs), (ins),
              "pause", [(int_x86_sse2_pause)]>, OBXS;
}

let SchedRW = [WriteFence] in {
// Load, store, and memory fence
// TODO: As with mfence, we may want to ease the availability of sfence/lfence
// to include any 64-bit target.
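// Usage sketch (illustrative only, not tied to the definitions below): a
// common pairing of the non-temporal stores defined earlier in this file with
// a store fence looks like:
//   _mm_stream_si128((__m128i *)dst, v);  // movntdq - non-temporal store
//   _mm_sfence();                         // order the NT store before any
//                                         // later store that publishes it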
3190def SFENCE : I<0xAE, MRM_F8, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>, 3191 PS, Requires<[HasSSE1]>; 3192def LFENCE : I<0xAE, MRM_E8, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>, 3193 PS, Requires<[HasSSE2]>; 3194def MFENCE : I<0xAE, MRM_F0, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>, 3195 PS, Requires<[HasMFence]>; 3196} // SchedRW 3197 3198def : Pat<(X86MFence), (MFENCE)>; 3199 3200//===----------------------------------------------------------------------===// 3201// SSE 1 & 2 - Load/Store XCSR register 3202//===----------------------------------------------------------------------===// 3203 3204let mayLoad=1, hasSideEffects=1 in 3205def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src), 3206 "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>, 3207 VEX, Sched<[WriteLDMXCSR]>, VEX_WIG; 3208let mayStore=1, hasSideEffects=1 in 3209def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), 3210 "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>, 3211 VEX, Sched<[WriteSTMXCSR]>, VEX_WIG; 3212 3213let mayLoad=1, hasSideEffects=1 in 3214def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src), 3215 "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>, 3216 TB, Sched<[WriteLDMXCSR]>; 3217let mayStore=1, hasSideEffects=1 in 3218def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst), 3219 "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>, 3220 TB, Sched<[WriteSTMXCSR]>; 3221 3222//===---------------------------------------------------------------------===// 3223// SSE2 - Move Aligned/Unaligned Packed Integer Instructions 3224//===---------------------------------------------------------------------===// 3225 3226let ExeDomain = SSEPackedInt in { // SSE integer instructions 3227 3228let hasSideEffects = 0 in { 3229def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3230 "movdqa\t{$src, $dst|$dst, $src}", []>, 3231 Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG; 3232def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3233 "movdqu\t{$src, $dst|$dst, $src}", []>, 3234 Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG; 3235def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 3236 "movdqa\t{$src, $dst|$dst, $src}", []>, 3237 Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG; 3238def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 3239 "movdqu\t{$src, $dst|$dst, $src}", []>, 3240 Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG; 3241} 3242 3243// For Disassembler 3244let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { 3245def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 3246 "movdqa\t{$src, $dst|$dst, $src}", []>, 3247 Sched<[SchedWriteVecMoveLS.XMM.RR]>, 3248 VEX, VEX_WIG, FoldGenData<"VMOVDQArr">; 3249def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), 3250 "movdqa\t{$src, $dst|$dst, $src}", []>, 3251 Sched<[SchedWriteVecMoveLS.YMM.RR]>, 3252 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQAYrr">; 3253def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 3254 "movdqu\t{$src, $dst|$dst, $src}", []>, 3255 Sched<[SchedWriteVecMoveLS.XMM.RR]>, 3256 VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">; 3257def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), 3258 "movdqu\t{$src, $dst|$dst, $src}", []>, 3259 Sched<[SchedWriteVecMoveLS.YMM.RR]>, 3260 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQUYrr">; 3261} 3262 3263let canFoldAsLoad = 1, mayLoad = 1, 
isReMaterializable = 1, 3264 hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in { 3265def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3266 "movdqa\t{$src, $dst|$dst, $src}", 3267 [(set VR128:$dst, (alignedloadv2i64 addr:$src))]>, 3268 Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG; 3269def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), 3270 "movdqa\t{$src, $dst|$dst, $src}", []>, 3271 Sched<[SchedWriteVecMoveLS.YMM.RM]>, 3272 VEX, VEX_L, VEX_WIG; 3273def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3274 "vmovdqu\t{$src, $dst|$dst, $src}", 3275 [(set VR128:$dst, (loadv2i64 addr:$src))]>, 3276 Sched<[SchedWriteVecMoveLS.XMM.RM]>, 3277 XS, VEX, VEX_WIG; 3278def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), 3279 "vmovdqu\t{$src, $dst|$dst, $src}", []>, 3280 Sched<[SchedWriteVecMoveLS.YMM.RM]>, 3281 XS, VEX, VEX_L, VEX_WIG; 3282} 3283 3284let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in { 3285def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), 3286 (ins i128mem:$dst, VR128:$src), 3287 "movdqa\t{$src, $dst|$dst, $src}", 3288 [(alignedstore (v2i64 VR128:$src), addr:$dst)]>, 3289 Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_WIG; 3290def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs), 3291 (ins i256mem:$dst, VR256:$src), 3292 "movdqa\t{$src, $dst|$dst, $src}", []>, 3293 Sched<[SchedWriteVecMoveLS.YMM.MR]>, VEX, VEX_L, VEX_WIG; 3294def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), 3295 "vmovdqu\t{$src, $dst|$dst, $src}", 3296 [(store (v2i64 VR128:$src), addr:$dst)]>, 3297 Sched<[SchedWriteVecMoveLS.XMM.MR]>, XS, VEX, VEX_WIG; 3298def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), 3299 "vmovdqu\t{$src, $dst|$dst, $src}",[]>, 3300 Sched<[SchedWriteVecMoveLS.YMM.MR]>, XS, VEX, VEX_L, VEX_WIG; 3301} 3302 3303let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in { 3304let hasSideEffects = 0 in { 3305def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3306 "movdqa\t{$src, $dst|$dst, $src}", []>; 3307 3308def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3309 "movdqu\t{$src, $dst|$dst, $src}", []>, 3310 XS, Requires<[UseSSE2]>; 3311} 3312 3313// For Disassembler 3314let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { 3315def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 3316 "movdqa\t{$src, $dst|$dst, $src}", []>, 3317 FoldGenData<"MOVDQArr">; 3318 3319def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 3320 "movdqu\t{$src, $dst|$dst, $src}", []>, 3321 XS, Requires<[UseSSE2]>, FoldGenData<"MOVDQUrr">; 3322} 3323} // SchedRW 3324 3325let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, 3326 hasSideEffects = 0, SchedRW = [SchedWriteVecMoveLS.XMM.RM] in { 3327def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3328 "movdqa\t{$src, $dst|$dst, $src}", 3329 [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>; 3330def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3331 "movdqu\t{$src, $dst|$dst, $src}", 3332 [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>, 3333 XS, Requires<[UseSSE2]>; 3334} 3335 3336let mayStore = 1, hasSideEffects = 0, 3337 SchedRW = [SchedWriteVecMoveLS.XMM.MR] in { 3338def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), 3339 "movdqa\t{$src, $dst|$dst, $src}", 3340 [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>; 3341def MOVDQUmr : I<0x7F, 
MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), 3342 "movdqu\t{$src, $dst|$dst, $src}", 3343 [/*(store (v2i64 VR128:$src), addr:$dst)*/]>, 3344 XS, Requires<[UseSSE2]>; 3345} 3346 3347} // ExeDomain = SSEPackedInt 3348 3349// Reversed version with ".s" suffix for GAS compatibility. 3350def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}", 3351 (VMOVDQArr_REV VR128:$dst, VR128:$src), 0>; 3352def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}", 3353 (VMOVDQAYrr_REV VR256:$dst, VR256:$src), 0>; 3354def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}", 3355 (VMOVDQUrr_REV VR128:$dst, VR128:$src), 0>; 3356def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}", 3357 (VMOVDQUYrr_REV VR256:$dst, VR256:$src), 0>; 3358 3359// Reversed version with ".s" suffix for GAS compatibility. 3360def : InstAlias<"movdqa.s\t{$src, $dst|$dst, $src}", 3361 (MOVDQArr_REV VR128:$dst, VR128:$src), 0>; 3362def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}", 3363 (MOVDQUrr_REV VR128:$dst, VR128:$src), 0>; 3364 3365let Predicates = [HasAVX, NoVLX] in { 3366 // Additional patterns for other integer sizes. 3367 def : Pat<(alignedloadv4i32 addr:$src), 3368 (VMOVDQArm addr:$src)>; 3369 def : Pat<(alignedloadv8i16 addr:$src), 3370 (VMOVDQArm addr:$src)>; 3371 def : Pat<(alignedloadv16i8 addr:$src), 3372 (VMOVDQArm addr:$src)>; 3373 def : Pat<(loadv4i32 addr:$src), 3374 (VMOVDQUrm addr:$src)>; 3375 def : Pat<(loadv8i16 addr:$src), 3376 (VMOVDQUrm addr:$src)>; 3377 def : Pat<(loadv16i8 addr:$src), 3378 (VMOVDQUrm addr:$src)>; 3379 3380 def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst), 3381 (VMOVDQAmr addr:$dst, VR128:$src)>; 3382 def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst), 3383 (VMOVDQAmr addr:$dst, VR128:$src)>; 3384 def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst), 3385 (VMOVDQAmr addr:$dst, VR128:$src)>; 3386 def : Pat<(store (v4i32 VR128:$src), addr:$dst), 3387 (VMOVDQUmr addr:$dst, VR128:$src)>; 3388 def : Pat<(store (v8i16 VR128:$src), addr:$dst), 3389 (VMOVDQUmr addr:$dst, VR128:$src)>; 3390 def : Pat<(store (v16i8 VR128:$src), addr:$dst), 3391 (VMOVDQUmr addr:$dst, VR128:$src)>; 3392} 3393 3394//===---------------------------------------------------------------------===// 3395// SSE2 - Packed Integer Arithmetic Instructions 3396//===---------------------------------------------------------------------===// 3397 3398let ExeDomain = SSEPackedInt in { // SSE integer instructions 3399 3400/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types 3401multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode, 3402 ValueType DstVT, ValueType SrcVT, RegisterClass RC, 3403 PatFrag memop_frag, X86MemOperand x86memop, 3404 X86FoldableSchedWrite sched, bit Is2Addr = 1> { 3405 let isCommutable = 1 in 3406 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), 3407 (ins RC:$src1, RC:$src2), 3408 !if(Is2Addr, 3409 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3410 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3411 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>, 3412 Sched<[sched]>; 3413 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), 3414 (ins RC:$src1, x86memop:$src2), 3415 !if(Is2Addr, 3416 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3417 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3418 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), 3419 (memop_frag addr:$src2))))]>, 3420 Sched<[sched.Folded, sched.ReadAfterFold]>; 3421} 3422} // ExeDomain = SSEPackedInt 3423 3424defm PADDB : PDI_binop_all<0xFC, 
"paddb", add, v16i8, v32i8, 3425 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3426defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16, 3427 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3428defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32, 3429 SchedWriteVecALU, 1, NoVLX>; 3430defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64, 3431 SchedWriteVecALU, 1, NoVLX>; 3432defm PADDSB : PDI_binop_all<0xEC, "paddsb", saddsat, v16i8, v32i8, 3433 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3434defm PADDSW : PDI_binop_all<0xED, "paddsw", saddsat, v8i16, v16i16, 3435 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3436defm PADDUSB : PDI_binop_all<0xDC, "paddusb", uaddsat, v16i8, v32i8, 3437 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3438defm PADDUSW : PDI_binop_all<0xDD, "paddusw", uaddsat, v8i16, v16i16, 3439 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3440defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16, 3441 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>; 3442defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16, 3443 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>; 3444defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16, 3445 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>; 3446defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8, 3447 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3448defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16, 3449 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3450defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32, 3451 SchedWriteVecALU, 0, NoVLX>; 3452defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64, 3453 SchedWriteVecALU, 0, NoVLX>; 3454defm PSUBSB : PDI_binop_all<0xE8, "psubsb", ssubsat, v16i8, v32i8, 3455 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3456defm PSUBSW : PDI_binop_all<0xE9, "psubsw", ssubsat, v8i16, v16i16, 3457 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3458defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", usubsat, v16i8, v32i8, 3459 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3460defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", usubsat, v8i16, v16i16, 3461 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3462defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8, 3463 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3464defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16, 3465 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3466defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8, 3467 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3468defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16, 3469 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3470defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8, 3471 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3472defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16, 3473 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3474defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64, 3475 SchedWriteVecIMul, 1, NoVLX>; 3476 3477let Predicates = [HasAVX, NoVLX_Or_NoBWI] in 3478defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128, 3479 load, i128mem, SchedWriteVecIMul.XMM, 0>, 3480 VEX_4V, VEX_WIG; 3481 3482let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in 3483defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16, 3484 VR256, load, i256mem, SchedWriteVecIMul.YMM, 3485 0>, VEX_4V, VEX_L, VEX_WIG; 3486let Constraints = "$src1 = $dst" in 3487defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128, 3488 memop, i128mem, SchedWriteVecIMul.XMM>; 3489 3490let Predicates = [HasAVX, NoVLX_Or_NoBWI] in 3491defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, 
v2i64, v16i8, VR128, 3492 load, i128mem, SchedWritePSADBW.XMM, 0>, 3493 VEX_4V, VEX_WIG; 3494let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in 3495defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256, 3496 load, i256mem, SchedWritePSADBW.YMM, 0>, 3497 VEX_4V, VEX_L, VEX_WIG; 3498let Constraints = "$src1 = $dst" in 3499defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128, 3500 memop, i128mem, SchedWritePSADBW.XMM>; 3501 3502//===---------------------------------------------------------------------===// 3503// SSE2 - Packed Integer Logical Instructions 3504//===---------------------------------------------------------------------===// 3505 3506multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm, 3507 string OpcodeStr, SDNode OpNode, 3508 SDNode OpNode2, RegisterClass RC, 3509 X86FoldableSchedWrite sched, 3510 X86FoldableSchedWrite schedImm, 3511 ValueType DstVT, ValueType SrcVT, 3512 PatFrag ld_frag, bit Is2Addr = 1> { 3513 // src2 is always 128-bit 3514 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), 3515 (ins RC:$src1, VR128:$src2), 3516 !if(Is2Addr, 3517 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3518 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3519 [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))]>, 3520 Sched<[sched]>; 3521 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), 3522 (ins RC:$src1, i128mem:$src2), 3523 !if(Is2Addr, 3524 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3525 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3526 [(set RC:$dst, (DstVT (OpNode RC:$src1, 3527 (SrcVT (ld_frag addr:$src2)))))]>, 3528 Sched<[sched.Folded, sched.ReadAfterFold]>; 3529 def ri : PDIi8<opc2, ImmForm, (outs RC:$dst), 3530 (ins RC:$src1, u8imm:$src2), 3531 !if(Is2Addr, 3532 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3533 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3534 [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 timm:$src2))))]>, 3535 Sched<[schedImm]>; 3536} 3537 3538multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm, 3539 string OpcodeStr, SDNode OpNode, 3540 SDNode OpNode2, ValueType DstVT128, 3541 ValueType DstVT256, ValueType SrcVT, 3542 X86SchedWriteWidths sched, 3543 X86SchedWriteWidths schedImm, Predicate prd> { 3544let Predicates = [HasAVX, prd] in 3545 defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr), 3546 OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM, 3547 DstVT128, SrcVT, load, 0>, VEX_4V, VEX_WIG; 3548let Predicates = [HasAVX2, prd] in 3549 defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr), 3550 OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM, 3551 DstVT256, SrcVT, load, 0>, VEX_4V, VEX_L, 3552 VEX_WIG; 3553let Constraints = "$src1 = $dst" in 3554 defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2, 3555 VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT, 3556 memop>; 3557} 3558 3559multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr, 3560 SDNode OpNode, RegisterClass RC, ValueType VT, 3561 X86FoldableSchedWrite sched, bit Is2Addr = 1> { 3562 def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2), 3563 !if(Is2Addr, 3564 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3565 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3566 [(set RC:$dst, (VT (OpNode RC:$src1, (i8 timm:$src2))))]>, 3567 Sched<[sched]>; 3568} 3569 3570multiclass PDI_binop_ri_all<bits<8> opc, Format 
ImmForm, string OpcodeStr, 3571 SDNode OpNode, X86SchedWriteWidths sched> { 3572let Predicates = [HasAVX, NoVLX_Or_NoBWI] in 3573 defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode, 3574 VR128, v16i8, sched.XMM, 0>, VEX_4V, VEX_WIG; 3575let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in 3576 defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode, 3577 VR256, v32i8, sched.YMM, 0>, 3578 VEX_4V, VEX_L, VEX_WIG; 3579let Constraints = "$src1 = $dst" in 3580 defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8, 3581 sched.XMM>; 3582} 3583 3584let ExeDomain = SSEPackedInt in { 3585 defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli, 3586 v8i16, v16i16, v8i16, SchedWriteVecShift, 3587 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; 3588 defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli, 3589 v4i32, v8i32, v4i32, SchedWriteVecShift, 3590 SchedWriteVecShiftImm, NoVLX>; 3591 defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli, 3592 v2i64, v4i64, v2i64, SchedWriteVecShift, 3593 SchedWriteVecShiftImm, NoVLX>; 3594 3595 defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli, 3596 v8i16, v16i16, v8i16, SchedWriteVecShift, 3597 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; 3598 defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli, 3599 v4i32, v8i32, v4i32, SchedWriteVecShift, 3600 SchedWriteVecShiftImm, NoVLX>; 3601 defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli, 3602 v2i64, v4i64, v2i64, SchedWriteVecShift, 3603 SchedWriteVecShiftImm, NoVLX>; 3604 3605 defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai, 3606 v8i16, v16i16, v8i16, SchedWriteVecShift, 3607 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; 3608 defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai, 3609 v4i32, v8i32, v4i32, SchedWriteVecShift, 3610 SchedWriteVecShiftImm, NoVLX>; 3611 3612 defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq, 3613 SchedWriteShuffle>; 3614 defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq, 3615 SchedWriteShuffle>; 3616} // ExeDomain = SSEPackedInt 3617 3618//===---------------------------------------------------------------------===// 3619// SSE2 - Packed Integer Comparison Instructions 3620//===---------------------------------------------------------------------===// 3621 3622defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8, 3623 SchedWriteVecALU, 1, TruePredicate>; 3624defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16, 3625 SchedWriteVecALU, 1, TruePredicate>; 3626defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32, 3627 SchedWriteVecALU, 1, TruePredicate>; 3628defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8, 3629 SchedWriteVecALU, 0, TruePredicate>; 3630defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16, 3631 SchedWriteVecALU, 0, TruePredicate>; 3632defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32, 3633 SchedWriteVecALU, 0, TruePredicate>; 3634 3635//===---------------------------------------------------------------------===// 3636// SSE2 - Packed Integer Shuffle Instructions 3637//===---------------------------------------------------------------------===// 3638 3639let ExeDomain = SSEPackedInt in { 3640multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256, 3641 SDNode OpNode, X86SchedWriteWidths 
sched, 3642 Predicate prd> { 3643let Predicates = [HasAVX, prd] in { 3644 def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst), 3645 (ins VR128:$src1, u8imm:$src2), 3646 !strconcat("v", OpcodeStr, 3647 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3648 [(set VR128:$dst, 3649 (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>, 3650 VEX, Sched<[sched.XMM]>, VEX_WIG; 3651 def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst), 3652 (ins i128mem:$src1, u8imm:$src2), 3653 !strconcat("v", OpcodeStr, 3654 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3655 [(set VR128:$dst, 3656 (vt128 (OpNode (load addr:$src1), 3657 (i8 timm:$src2))))]>, VEX, 3658 Sched<[sched.XMM.Folded]>, VEX_WIG; 3659} 3660 3661let Predicates = [HasAVX2, prd] in { 3662 def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst), 3663 (ins VR256:$src1, u8imm:$src2), 3664 !strconcat("v", OpcodeStr, 3665 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3666 [(set VR256:$dst, 3667 (vt256 (OpNode VR256:$src1, (i8 timm:$src2))))]>, 3668 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG; 3669 def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst), 3670 (ins i256mem:$src1, u8imm:$src2), 3671 !strconcat("v", OpcodeStr, 3672 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3673 [(set VR256:$dst, 3674 (vt256 (OpNode (load addr:$src1), 3675 (i8 timm:$src2))))]>, VEX, VEX_L, 3676 Sched<[sched.YMM.Folded]>, VEX_WIG; 3677} 3678 3679let Predicates = [UseSSE2] in { 3680 def ri : Ii8<0x70, MRMSrcReg, 3681 (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), 3682 !strconcat(OpcodeStr, 3683 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3684 [(set VR128:$dst, 3685 (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>, 3686 Sched<[sched.XMM]>; 3687 def mi : Ii8<0x70, MRMSrcMem, 3688 (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2), 3689 !strconcat(OpcodeStr, 3690 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3691 [(set VR128:$dst, 3692 (vt128 (OpNode (memop addr:$src1), 3693 (i8 timm:$src2))))]>, 3694 Sched<[sched.XMM.Folded]>; 3695} 3696} 3697} // ExeDomain = SSEPackedInt 3698 3699defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd, 3700 SchedWriteShuffle, NoVLX>, PD; 3701defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw, 3702 SchedWriteShuffle, NoVLX_Or_NoBWI>, XS; 3703defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw, 3704 SchedWriteShuffle, NoVLX_Or_NoBWI>, XD; 3705 3706//===---------------------------------------------------------------------===// 3707// Packed Integer Pack Instructions (SSE & AVX) 3708//===---------------------------------------------------------------------===// 3709 3710let ExeDomain = SSEPackedInt in { 3711multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, 3712 ValueType ArgVT, SDNode OpNode, RegisterClass RC, 3713 X86MemOperand x86memop, X86FoldableSchedWrite sched, 3714 PatFrag ld_frag, bit Is2Addr = 1> { 3715 def rr : PDI<opc, MRMSrcReg, 3716 (outs RC:$dst), (ins RC:$src1, RC:$src2), 3717 !if(Is2Addr, 3718 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3719 !strconcat(OpcodeStr, 3720 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3721 [(set RC:$dst, 3722 (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>, 3723 Sched<[sched]>; 3724 def rm : PDI<opc, MRMSrcMem, 3725 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 3726 !if(Is2Addr, 3727 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3728 !strconcat(OpcodeStr, 3729 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3730 [(set RC:$dst, 3731 (OutVT (OpNode (ArgVT RC:$src1), 3732 (ld_frag addr:$src2))))]>, 3733 Sched<[sched.Folded, 
sched.ReadAfterFold]>; 3734} 3735 3736multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, 3737 ValueType ArgVT, SDNode OpNode, RegisterClass RC, 3738 X86MemOperand x86memop, X86FoldableSchedWrite sched, 3739 PatFrag ld_frag, bit Is2Addr = 1> { 3740 def rr : SS48I<opc, MRMSrcReg, 3741 (outs RC:$dst), (ins RC:$src1, RC:$src2), 3742 !if(Is2Addr, 3743 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3744 !strconcat(OpcodeStr, 3745 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3746 [(set RC:$dst, 3747 (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>, 3748 Sched<[sched]>; 3749 def rm : SS48I<opc, MRMSrcMem, 3750 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 3751 !if(Is2Addr, 3752 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3753 !strconcat(OpcodeStr, 3754 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3755 [(set RC:$dst, 3756 (OutVT (OpNode (ArgVT RC:$src1), 3757 (ld_frag addr:$src2))))]>, 3758 Sched<[sched.Folded, sched.ReadAfterFold]>; 3759} 3760 3761let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 3762 defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128, 3763 i128mem, SchedWriteShuffle.XMM, load, 0>, 3764 VEX_4V, VEX_WIG; 3765 defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128, 3766 i128mem, SchedWriteShuffle.XMM, load, 0>, 3767 VEX_4V, VEX_WIG; 3768 3769 defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128, 3770 i128mem, SchedWriteShuffle.XMM, load, 0>, 3771 VEX_4V, VEX_WIG; 3772 defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128, 3773 i128mem, SchedWriteShuffle.XMM, load, 0>, 3774 VEX_4V; 3775} 3776 3777let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 3778 defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256, 3779 i256mem, SchedWriteShuffle.YMM, load, 0>, 3780 VEX_4V, VEX_L, VEX_WIG; 3781 defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256, 3782 i256mem, SchedWriteShuffle.YMM, load, 0>, 3783 VEX_4V, VEX_L, VEX_WIG; 3784 3785 defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256, 3786 i256mem, SchedWriteShuffle.YMM, load, 0>, 3787 VEX_4V, VEX_L, VEX_WIG; 3788 defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256, 3789 i256mem, SchedWriteShuffle.YMM, load, 0>, 3790 VEX_4V, VEX_L; 3791} 3792 3793let Constraints = "$src1 = $dst" in { 3794 defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128, 3795 i128mem, SchedWriteShuffle.XMM, memop>; 3796 defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128, 3797 i128mem, SchedWriteShuffle.XMM, memop>; 3798 3799 defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128, 3800 i128mem, SchedWriteShuffle.XMM, memop>; 3801 3802 defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128, 3803 i128mem, SchedWriteShuffle.XMM, memop>; 3804} 3805} // ExeDomain = SSEPackedInt 3806 3807//===---------------------------------------------------------------------===// 3808// SSE2 - Packed Integer Unpack Instructions 3809//===---------------------------------------------------------------------===// 3810 3811let ExeDomain = SSEPackedInt in { 3812multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt, 3813 SDNode OpNode, RegisterClass RC, X86MemOperand x86memop, 3814 X86FoldableSchedWrite sched, PatFrag ld_frag, 3815 bit Is2Addr = 1> { 3816 def rr : PDI<opc, MRMSrcReg, 3817 (outs RC:$dst), (ins RC:$src1, RC:$src2), 3818 !if(Is2Addr, 3819 
!strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), 3820 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3821 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>, 3822 Sched<[sched]>; 3823 def rm : PDI<opc, MRMSrcMem, 3824 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 3825 !if(Is2Addr, 3826 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), 3827 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3828 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>, 3829 Sched<[sched.Folded, sched.ReadAfterFold]>; 3830} 3831 3832let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 3833 defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128, 3834 i128mem, SchedWriteShuffle.XMM, load, 0>, 3835 VEX_4V, VEX_WIG; 3836 defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128, 3837 i128mem, SchedWriteShuffle.XMM, load, 0>, 3838 VEX_4V, VEX_WIG; 3839 defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128, 3840 i128mem, SchedWriteShuffle.XMM, load, 0>, 3841 VEX_4V, VEX_WIG; 3842 defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128, 3843 i128mem, SchedWriteShuffle.XMM, load, 0>, 3844 VEX_4V, VEX_WIG; 3845} 3846 3847let Predicates = [HasAVX, NoVLX] in { 3848 defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128, 3849 i128mem, SchedWriteShuffle.XMM, load, 0>, 3850 VEX_4V, VEX_WIG; 3851 defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128, 3852 i128mem, SchedWriteShuffle.XMM, load, 0>, 3853 VEX_4V, VEX_WIG; 3854 defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128, 3855 i128mem, SchedWriteShuffle.XMM, load, 0>, 3856 VEX_4V, VEX_WIG; 3857 defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128, 3858 i128mem, SchedWriteShuffle.XMM, load, 0>, 3859 VEX_4V, VEX_WIG; 3860} 3861 3862let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 3863 defm VPUNPCKLBWY : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256, 3864 i256mem, SchedWriteShuffle.YMM, load, 0>, 3865 VEX_4V, VEX_L, VEX_WIG; 3866 defm VPUNPCKLWDY : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256, 3867 i256mem, SchedWriteShuffle.YMM, load, 0>, 3868 VEX_4V, VEX_L, VEX_WIG; 3869 defm VPUNPCKHBWY : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256, 3870 i256mem, SchedWriteShuffle.YMM, load, 0>, 3871 VEX_4V, VEX_L, VEX_WIG; 3872 defm VPUNPCKHWDY : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256, 3873 i256mem, SchedWriteShuffle.YMM, load, 0>, 3874 VEX_4V, VEX_L, VEX_WIG; 3875} 3876 3877let Predicates = [HasAVX2, NoVLX] in { 3878 defm VPUNPCKLDQY : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256, 3879 i256mem, SchedWriteShuffle.YMM, load, 0>, 3880 VEX_4V, VEX_L, VEX_WIG; 3881 defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256, 3882 i256mem, SchedWriteShuffle.YMM, load, 0>, 3883 VEX_4V, VEX_L, VEX_WIG; 3884 defm VPUNPCKHDQY : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256, 3885 i256mem, SchedWriteShuffle.YMM, load, 0>, 3886 VEX_4V, VEX_L, VEX_WIG; 3887 defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256, 3888 i256mem, SchedWriteShuffle.YMM, load, 0>, 3889 VEX_4V, VEX_L, VEX_WIG; 3890} 3891 3892let Constraints = "$src1 = $dst" in { 3893 defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128, 3894 i128mem, SchedWriteShuffle.XMM, memop>; 3895 defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128, 3896 i128mem, SchedWriteShuffle.XMM, memop>; 3897 
defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128, 3898 i128mem, SchedWriteShuffle.XMM, memop>; 3899 defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128, 3900 i128mem, SchedWriteShuffle.XMM, memop>; 3901 3902 defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128, 3903 i128mem, SchedWriteShuffle.XMM, memop>; 3904 defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128, 3905 i128mem, SchedWriteShuffle.XMM, memop>; 3906 defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128, 3907 i128mem, SchedWriteShuffle.XMM, memop>; 3908 defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128, 3909 i128mem, SchedWriteShuffle.XMM, memop>; 3910} 3911} // ExeDomain = SSEPackedInt 3912 3913//===---------------------------------------------------------------------===// 3914// SSE2 - Packed Integer Extract and Insert 3915//===---------------------------------------------------------------------===// 3916 3917let ExeDomain = SSEPackedInt in { 3918multiclass sse2_pinsrw<bit Is2Addr = 1> { 3919 def rr : Ii8<0xC4, MRMSrcReg, 3920 (outs VR128:$dst), (ins VR128:$src1, 3921 GR32orGR64:$src2, u8imm:$src3), 3922 !if(Is2Addr, 3923 "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", 3924 "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 3925 [(set VR128:$dst, 3926 (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))]>, 3927 Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>; 3928 def rm : Ii8<0xC4, MRMSrcMem, 3929 (outs VR128:$dst), (ins VR128:$src1, 3930 i16mem:$src2, u8imm:$src3), 3931 !if(Is2Addr, 3932 "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", 3933 "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 3934 [(set VR128:$dst, 3935 (X86pinsrw VR128:$src1, (extloadi16 addr:$src2), 3936 imm:$src3))]>, 3937 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>; 3938} 3939 3940// Extract 3941let Predicates = [HasAVX, NoBWI] in 3942def VPEXTRWrr : Ii8<0xC5, MRMSrcReg, 3943 (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2), 3944 "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", 3945 [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), 3946 imm:$src2))]>, 3947 PD, VEX, VEX_WIG, Sched<[WriteVecExtract]>; 3948def PEXTRWrr : PDIi8<0xC5, MRMSrcReg, 3949 (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2), 3950 "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", 3951 [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), 3952 imm:$src2))]>, 3953 Sched<[WriteVecExtract]>; 3954 3955// Insert 3956let Predicates = [HasAVX, NoBWI] in 3957defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V, VEX_WIG; 3958 3959let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in 3960defm PINSRW : sse2_pinsrw, PD; 3961 3962} // ExeDomain = SSEPackedInt 3963 3964//===---------------------------------------------------------------------===// 3965// SSE2 - Packed Mask Creation 3966//===---------------------------------------------------------------------===// 3967 3968let ExeDomain = SSEPackedInt in { 3969 3970def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), 3971 (ins VR128:$src), 3972 "pmovmskb\t{$src, $dst|$dst, $src}", 3973 [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>, 3974 Sched<[WriteVecMOVMSK]>, VEX, VEX_WIG; 3975 3976let Predicates = [HasAVX2] in { 3977def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), 3978 (ins VR256:$src), 3979 "pmovmskb\t{$src, $dst|$dst, $src}", 3980 [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>, 3981 
Sched<[WriteVecMOVMSKY]>, VEX, VEX_L, VEX_WIG; 3982} 3983 3984def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src), 3985 "pmovmskb\t{$src, $dst|$dst, $src}", 3986 [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>, 3987 Sched<[WriteVecMOVMSK]>; 3988 3989} // ExeDomain = SSEPackedInt 3990 3991//===---------------------------------------------------------------------===// 3992// SSE2 - Conditional Store 3993//===---------------------------------------------------------------------===// 3994 3995let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in { 3996let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in 3997def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs), 3998 (ins VR128:$src, VR128:$mask), 3999 "maskmovdqu\t{$mask, $src|$src, $mask}", 4000 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>, 4001 VEX, VEX_WIG; 4002let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in 4003def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs), 4004 (ins VR128:$src, VR128:$mask), 4005 "maskmovdqu\t{$mask, $src|$src, $mask}", 4006 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>, 4007 VEX, VEX_WIG; 4008 4009let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in 4010def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), 4011 "maskmovdqu\t{$mask, $src|$src, $mask}", 4012 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>; 4013let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in 4014def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), 4015 "maskmovdqu\t{$mask, $src|$src, $mask}", 4016 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>; 4017 4018} // ExeDomain = SSEPackedInt 4019 4020//===---------------------------------------------------------------------===// 4021// SSE2 - Move Doubleword/Quadword 4022//===---------------------------------------------------------------------===// 4023 4024//===---------------------------------------------------------------------===// 4025// Move Int Doubleword to Packed Double Int 4026// 4027let ExeDomain = SSEPackedInt in { 4028def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), 4029 "movd\t{$src, $dst|$dst, $src}", 4030 [(set VR128:$dst, 4031 (v4i32 (scalar_to_vector GR32:$src)))]>, 4032 VEX, Sched<[WriteVecMoveFromGpr]>; 4033def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), 4034 "movd\t{$src, $dst|$dst, $src}", 4035 [(set VR128:$dst, 4036 (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>, 4037 VEX, Sched<[WriteVecLoad]>; 4038def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), 4039 "movq\t{$src, $dst|$dst, $src}", 4040 [(set VR128:$dst, 4041 (v2i64 (scalar_to_vector GR64:$src)))]>, 4042 VEX, Sched<[WriteVecMoveFromGpr]>; 4043let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in 4044def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 4045 "movq\t{$src, $dst|$dst, $src}", []>, 4046 VEX, Sched<[WriteVecLoad]>; 4047let isCodeGenOnly = 1 in 4048def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), 4049 "movq\t{$src, $dst|$dst, $src}", 4050 [(set FR64:$dst, (bitconvert GR64:$src))]>, 4051 VEX, Sched<[WriteVecMoveFromGpr]>; 4052 4053def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), 4054 "movd\t{$src, $dst|$dst, $src}", 4055 [(set VR128:$dst, 4056 (v4i32 (scalar_to_vector GR32:$src)))]>, 4057 Sched<[WriteVecMoveFromGpr]>; 4058def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs 
VR128:$dst), (ins i32mem:$src), 4059 "movd\t{$src, $dst|$dst, $src}", 4060 [(set VR128:$dst, 4061 (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>, 4062 Sched<[WriteVecLoad]>; 4063def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), 4064 "movq\t{$src, $dst|$dst, $src}", 4065 [(set VR128:$dst, 4066 (v2i64 (scalar_to_vector GR64:$src)))]>, 4067 Sched<[WriteVecMoveFromGpr]>; 4068let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in 4069def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 4070 "movq\t{$src, $dst|$dst, $src}", []>, 4071 Sched<[WriteVecLoad]>; 4072let isCodeGenOnly = 1 in 4073def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), 4074 "movq\t{$src, $dst|$dst, $src}", 4075 [(set FR64:$dst, (bitconvert GR64:$src))]>, 4076 Sched<[WriteVecMoveFromGpr]>; 4077} // ExeDomain = SSEPackedInt 4078 4079//===---------------------------------------------------------------------===// 4080// Move Int Doubleword to Single Scalar 4081// 4082let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { 4083 def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), 4084 "movd\t{$src, $dst|$dst, $src}", 4085 [(set FR32:$dst, (bitconvert GR32:$src))]>, 4086 VEX, Sched<[WriteVecMoveFromGpr]>; 4087 4088 def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), 4089 "movd\t{$src, $dst|$dst, $src}", 4090 [(set FR32:$dst, (bitconvert GR32:$src))]>, 4091 Sched<[WriteVecMoveFromGpr]>; 4092 4093} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 4094 4095//===---------------------------------------------------------------------===// 4096// Move Packed Doubleword Int to Packed Double Int 4097// 4098let ExeDomain = SSEPackedInt in { 4099def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), 4100 "movd\t{$src, $dst|$dst, $src}", 4101 [(set GR32:$dst, (extractelt (v4i32 VR128:$src), 4102 (iPTR 0)))]>, VEX, 4103 Sched<[WriteVecMoveToGpr]>; 4104def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs), 4105 (ins i32mem:$dst, VR128:$src), 4106 "movd\t{$src, $dst|$dst, $src}", 4107 [(store (i32 (extractelt (v4i32 VR128:$src), 4108 (iPTR 0))), addr:$dst)]>, 4109 VEX, Sched<[WriteVecStore]>; 4110def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), 4111 "movd\t{$src, $dst|$dst, $src}", 4112 [(set GR32:$dst, (extractelt (v4i32 VR128:$src), 4113 (iPTR 0)))]>, 4114 Sched<[WriteVecMoveToGpr]>; 4115def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), 4116 "movd\t{$src, $dst|$dst, $src}", 4117 [(store (i32 (extractelt (v4i32 VR128:$src), 4118 (iPTR 0))), addr:$dst)]>, 4119 Sched<[WriteVecStore]>; 4120} // ExeDomain = SSEPackedInt 4121 4122//===---------------------------------------------------------------------===// 4123// Move Packed Doubleword Int first element to Doubleword Int 4124// 4125let ExeDomain = SSEPackedInt in { 4126let SchedRW = [WriteVecMoveToGpr] in { 4127def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), 4128 "movq\t{$src, $dst|$dst, $src}", 4129 [(set GR64:$dst, (extractelt (v2i64 VR128:$src), 4130 (iPTR 0)))]>, 4131 VEX; 4132 4133def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), 4134 "movq\t{$src, $dst|$dst, $src}", 4135 [(set GR64:$dst, (extractelt (v2i64 VR128:$src), 4136 (iPTR 0)))]>; 4137} //SchedRW 4138 4139let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in 4140def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs), 4141 (ins i64mem:$dst, 
VR128:$src), 4142 "movq\t{$src, $dst|$dst, $src}", []>, 4143 VEX, Sched<[WriteVecStore]>; 4144let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in 4145def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), 4146 "movq\t{$src, $dst|$dst, $src}", []>, 4147 Sched<[WriteVecStore]>; 4148} // ExeDomain = SSEPackedInt 4149 4150//===---------------------------------------------------------------------===// 4151// Bitcast FR64 <-> GR64 4152// 4153let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { 4154 def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), 4155 "movq\t{$src, $dst|$dst, $src}", 4156 [(set GR64:$dst, (bitconvert FR64:$src))]>, 4157 VEX, Sched<[WriteVecMoveToGpr]>; 4158 4159 def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), 4160 "movq\t{$src, $dst|$dst, $src}", 4161 [(set GR64:$dst, (bitconvert FR64:$src))]>, 4162 Sched<[WriteVecMoveToGpr]>; 4163} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 4164 4165//===---------------------------------------------------------------------===// 4166// Move Scalar Single to Double Int 4167// 4168let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { 4169 def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), 4170 "movd\t{$src, $dst|$dst, $src}", 4171 [(set GR32:$dst, (bitconvert FR32:$src))]>, 4172 VEX, Sched<[WriteVecMoveToGpr]>; 4173 def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), 4174 "movd\t{$src, $dst|$dst, $src}", 4175 [(set GR32:$dst, (bitconvert FR32:$src))]>, 4176 Sched<[WriteVecMoveToGpr]>; 4177} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 4178 4179let Predicates = [UseAVX] in { 4180 def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), 4181 (VMOVDI2PDIrr GR32:$src)>; 4182 4183 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))), 4184 (VMOV64toPQIrr GR64:$src)>; 4185 4186 // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part. 4187 // These instructions also write zeros in the high part of a 256-bit register. 4188 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))), 4189 (VMOVDI2PDIrm addr:$src)>; 4190 def : Pat<(v4i32 (X86vzload32 addr:$src)), 4191 (VMOVDI2PDIrm addr:$src)>; 4192 def : Pat<(v8i32 (X86vzload32 addr:$src)), 4193 (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>; 4194} 4195 4196let Predicates = [UseSSE2] in { 4197 def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), 4198 (MOVDI2PDIrr GR32:$src)>; 4199 4200 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))), 4201 (MOV64toPQIrr GR64:$src)>; 4202 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))), 4203 (MOVDI2PDIrm addr:$src)>; 4204 def : Pat<(v4i32 (X86vzload32 addr:$src)), 4205 (MOVDI2PDIrm addr:$src)>; 4206} 4207 4208// Before the MC layer of LLVM existed, clang emitted "movd" assembly instead of 4209// "movq" due to MacOS parsing limitation. In order to parse old assembly, we add 4210// these aliases. 4211def : InstAlias<"movd\t{$src, $dst|$dst, $src}", 4212 (MOV64toPQIrr VR128:$dst, GR64:$src), 0>; 4213def : InstAlias<"movd\t{$src, $dst|$dst, $src}", 4214 (MOVPQIto64rr GR64:$dst, VR128:$src), 0>; 4215// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX. 
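// For example, "movd %rax, %xmm0" still assembles (to MOV64toPQIrr), but the
// trailing 0 gives these aliases emit priority 0, so the printer keeps using
// "movq"/"vmovq" for the 64-bit forms.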
4216def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}", 4217 (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>; 4218def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}", 4219 (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>; 4220 4221//===---------------------------------------------------------------------===// 4222// SSE2 - Move Quadword 4223//===---------------------------------------------------------------------===// 4224 4225//===---------------------------------------------------------------------===// 4226// Move Quadword Int to Packed Quadword Int 4227// 4228 4229let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in { 4230def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 4231 "vmovq\t{$src, $dst|$dst, $src}", 4232 [(set VR128:$dst, 4233 (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS, 4234 VEX, Requires<[UseAVX]>, VEX_WIG; 4235def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 4236 "movq\t{$src, $dst|$dst, $src}", 4237 [(set VR128:$dst, 4238 (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, 4239 XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix 4240} // ExeDomain, SchedRW 4241 4242//===---------------------------------------------------------------------===// 4243// Move Packed Quadword Int to Quadword Int 4244// 4245let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in { 4246def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), 4247 "movq\t{$src, $dst|$dst, $src}", 4248 [(store (i64 (extractelt (v2i64 VR128:$src), 4249 (iPTR 0))), addr:$dst)]>, 4250 VEX, VEX_WIG; 4251def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), 4252 "movq\t{$src, $dst|$dst, $src}", 4253 [(store (i64 (extractelt (v2i64 VR128:$src), 4254 (iPTR 0))), addr:$dst)]>; 4255} // ExeDomain, SchedRW 4256 4257// For disassembler only 4258let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, 4259 SchedRW = [SchedWriteVecLogic.XMM] in { 4260def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 4261 "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_WIG; 4262def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 4263 "movq\t{$src, $dst|$dst, $src}", []>; 4264} 4265 4266def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}", 4267 (VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>; 4268def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}", 4269 (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>; 4270 4271let Predicates = [UseAVX] in { 4272 def : Pat<(v2i64 (X86vzload64 addr:$src)), 4273 (VMOVQI2PQIrm addr:$src)>; 4274 def : Pat<(v4i64 (X86vzload64 addr:$src)), 4275 (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>; 4276 4277 def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst), 4278 (VMOVPQI2QImr addr:$dst, VR128:$src)>; 4279} 4280 4281let Predicates = [UseSSE2] in { 4282 def : Pat<(v2i64 (X86vzload64 addr:$src)), (MOVQI2PQIrm addr:$src)>; 4283 4284 def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst), 4285 (MOVPQI2QImr addr:$dst, VR128:$src)>; 4286} 4287 4288//===---------------------------------------------------------------------===// 4289// Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in 4290// IA32 document. movq xmm1, xmm2 does clear the high bits. 
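// For illustration: if xmm1 = <a, b> (two i64 elements), then
// "movq %xmm1, %xmm0" leaves xmm0 = <a, 0>, which is why the defs below
// match the X86vzmovl (move-low-and-zero) node.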
4291// 4292let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in { 4293def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 4294 "vmovq\t{$src, $dst|$dst, $src}", 4295 [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>, 4296 XS, VEX, Requires<[UseAVX]>, VEX_WIG; 4297def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 4298 "movq\t{$src, $dst|$dst, $src}", 4299 [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>, 4300 XS, Requires<[UseSSE2]>; 4301} // ExeDomain, SchedRW 4302 4303let Predicates = [UseAVX] in { 4304 def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), 4305 (VMOVZPQILo2PQIrr VR128:$src)>; 4306} 4307let Predicates = [UseSSE2] in { 4308 def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), 4309 (MOVZPQILo2PQIrr VR128:$src)>; 4310} 4311 4312let Predicates = [UseAVX] in { 4313 def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))), 4314 (SUBREG_TO_REG (i32 0), 4315 (v2f64 (VMOVZPQILo2PQIrr 4316 (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))), 4317 sub_xmm)>; 4318 def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))), 4319 (SUBREG_TO_REG (i32 0), 4320 (v2i64 (VMOVZPQILo2PQIrr 4321 (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))), 4322 sub_xmm)>; 4323} 4324 4325//===---------------------------------------------------------------------===// 4326// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP 4327//===---------------------------------------------------------------------===// 4328 4329multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr, 4330 ValueType vt, RegisterClass RC, PatFrag mem_frag, 4331 X86MemOperand x86memop, X86FoldableSchedWrite sched> { 4332def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src), 4333 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4334 [(set RC:$dst, (vt (OpNode RC:$src)))]>, 4335 Sched<[sched]>; 4336def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), 4337 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4338 [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>, 4339 Sched<[sched.Folded]>; 4340} 4341 4342let Predicates = [HasAVX, NoVLX] in { 4343 defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", 4344 v4f32, VR128, loadv4f32, f128mem, 4345 SchedWriteFShuffle.XMM>, VEX, VEX_WIG; 4346 defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", 4347 v4f32, VR128, loadv4f32, f128mem, 4348 SchedWriteFShuffle.XMM>, VEX, VEX_WIG; 4349 defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", 4350 v8f32, VR256, loadv8f32, f256mem, 4351 SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG; 4352 defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", 4353 v8f32, VR256, loadv8f32, f256mem, 4354 SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG; 4355} 4356defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128, 4357 memopv4f32, f128mem, SchedWriteFShuffle.XMM>; 4358defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128, 4359 memopv4f32, f128mem, SchedWriteFShuffle.XMM>; 4360 4361let Predicates = [HasAVX, NoVLX] in { 4362 def : Pat<(v4i32 (X86Movshdup VR128:$src)), 4363 (VMOVSHDUPrr VR128:$src)>; 4364 def : Pat<(v4i32 (X86Movshdup (load addr:$src))), 4365 (VMOVSHDUPrm addr:$src)>; 4366 def : Pat<(v4i32 (X86Movsldup VR128:$src)), 4367 (VMOVSLDUPrr VR128:$src)>; 4368 def : Pat<(v4i32 (X86Movsldup (load addr:$src))), 4369 (VMOVSLDUPrm addr:$src)>; 4370 def : Pat<(v8i32 (X86Movshdup VR256:$src)), 4371 (VMOVSHDUPYrr VR256:$src)>; 4372 def : Pat<(v8i32 (X86Movshdup (load 
addr:$src))), 4373 (VMOVSHDUPYrm addr:$src)>; 4374 def : Pat<(v8i32 (X86Movsldup VR256:$src)), 4375 (VMOVSLDUPYrr VR256:$src)>; 4376 def : Pat<(v8i32 (X86Movsldup (load addr:$src))), 4377 (VMOVSLDUPYrm addr:$src)>; 4378} 4379 4380let Predicates = [UseSSE3] in { 4381 def : Pat<(v4i32 (X86Movshdup VR128:$src)), 4382 (MOVSHDUPrr VR128:$src)>; 4383 def : Pat<(v4i32 (X86Movshdup (memop addr:$src))), 4384 (MOVSHDUPrm addr:$src)>; 4385 def : Pat<(v4i32 (X86Movsldup VR128:$src)), 4386 (MOVSLDUPrr VR128:$src)>; 4387 def : Pat<(v4i32 (X86Movsldup (memop addr:$src))), 4388 (MOVSLDUPrm addr:$src)>; 4389} 4390 4391//===---------------------------------------------------------------------===// 4392// SSE3 - Replicate Double FP - MOVDDUP 4393//===---------------------------------------------------------------------===// 4394 4395multiclass sse3_replicate_dfp<string OpcodeStr, X86SchedWriteWidths sched> { 4396def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 4397 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4398 [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))]>, 4399 Sched<[sched.XMM]>; 4400def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), 4401 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4402 [(set VR128:$dst, 4403 (v2f64 (X86Movddup 4404 (scalar_to_vector (loadf64 addr:$src)))))]>, 4405 Sched<[sched.XMM.Folded]>; 4406} 4407 4408// FIXME: Merge with above classes when there are patterns for the ymm version 4409multiclass sse3_replicate_dfp_y<string OpcodeStr, X86SchedWriteWidths sched> { 4410def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 4411 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4412 [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>, 4413 Sched<[sched.YMM]>; 4414def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 4415 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4416 [(set VR256:$dst, 4417 (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>, 4418 Sched<[sched.YMM.Folded]>; 4419} 4420 4421let Predicates = [HasAVX, NoVLX] in { 4422 defm VMOVDDUP : sse3_replicate_dfp<"vmovddup", SchedWriteFShuffle>, 4423 VEX, VEX_WIG; 4424 defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup", SchedWriteFShuffle>, 4425 VEX, VEX_L, VEX_WIG; 4426} 4427 4428defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>; 4429 4430 4431let Predicates = [HasAVX, NoVLX] in { 4432 def : Pat<(X86Movddup (v2f64 (simple_load addr:$src))), 4433 (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; 4434 def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))), 4435 (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; 4436} 4437 4438let Predicates = [UseSSE3] in { 4439 // No need for aligned memory as this only loads 64-bits. 
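  // movddup reads only the low 8 bytes of its memory operand and broadcasts
  // them, e.g. "movddup (%rax), %xmm0" gives xmm0 = <m64, m64>, so an
  // unaligned simple_load is acceptable here.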
4440 def : Pat<(X86Movddup (v2f64 (simple_load addr:$src))), 4441 (MOVDDUPrm addr:$src)>; 4442 def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))), 4443 (MOVDDUPrm addr:$src)>; 4444} 4445 4446//===---------------------------------------------------------------------===// 4447// SSE3 - Move Unaligned Integer 4448//===---------------------------------------------------------------------===// 4449 4450let Predicates = [HasAVX] in { 4451 def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 4452 "vlddqu\t{$src, $dst|$dst, $src}", 4453 [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, 4454 Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG; 4455 def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), 4456 "vlddqu\t{$src, $dst|$dst, $src}", 4457 [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>, 4458 Sched<[SchedWriteVecMoveLS.YMM.RM]>, VEX, VEX_L, VEX_WIG; 4459} // Predicates 4460 4461def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 4462 "lddqu\t{$src, $dst|$dst, $src}", 4463 [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, 4464 Sched<[SchedWriteVecMoveLS.XMM.RM]>; 4465 4466//===---------------------------------------------------------------------===// 4467// SSE3 - Arithmetic 4468//===---------------------------------------------------------------------===// 4469 4470multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC, 4471 X86MemOperand x86memop, X86FoldableSchedWrite sched, 4472 PatFrag ld_frag, bit Is2Addr = 1> { 4473let Uses = [MXCSR], mayRaiseFPException = 1 in { 4474 def rr : I<0xD0, MRMSrcReg, 4475 (outs RC:$dst), (ins RC:$src1, RC:$src2), 4476 !if(Is2Addr, 4477 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4478 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4479 [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))]>, 4480 Sched<[sched]>; 4481 def rm : I<0xD0, MRMSrcMem, 4482 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 4483 !if(Is2Addr, 4484 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4485 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4486 [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>, 4487 Sched<[sched.Folded, sched.ReadAfterFold]>; 4488} 4489} 4490 4491let Predicates = [HasAVX] in { 4492 let ExeDomain = SSEPackedSingle in { 4493 defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem, 4494 SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>, 4495 XD, VEX_4V, VEX_WIG; 4496 defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem, 4497 SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>, 4498 XD, VEX_4V, VEX_L, VEX_WIG; 4499 } 4500 let ExeDomain = SSEPackedDouble in { 4501 defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem, 4502 SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>, 4503 PD, VEX_4V, VEX_WIG; 4504 defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem, 4505 SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>, 4506 PD, VEX_4V, VEX_L, VEX_WIG; 4507 } 4508} 4509let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in { 4510 let ExeDomain = SSEPackedSingle in 4511 defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem, 4512 SchedWriteFAddSizes.PS.XMM, memopv4f32>, XD; 4513 let ExeDomain = SSEPackedDouble in 4514 defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem, 4515 SchedWriteFAddSizes.PD.XMM, memopv2f64>, PD; 4516} 4517 4518//===---------------------------------------------------------------------===// 4519// SSE3 Instructions 
4520//===---------------------------------------------------------------------===// 4521 4522// Horizontal ops 4523multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, 4524 X86MemOperand x86memop, SDNode OpNode, 4525 X86FoldableSchedWrite sched, PatFrag ld_frag, 4526 bit Is2Addr = 1> { 4527let Uses = [MXCSR], mayRaiseFPException = 1 in { 4528 def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), 4529 !if(Is2Addr, 4530 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4531 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4532 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>, 4533 Sched<[sched]>; 4534 4535 def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 4536 !if(Is2Addr, 4537 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4538 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4539 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>, 4540 Sched<[sched.Folded, sched.ReadAfterFold]>; 4541} 4542} 4543multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, 4544 X86MemOperand x86memop, SDNode OpNode, 4545 X86FoldableSchedWrite sched, PatFrag ld_frag, 4546 bit Is2Addr = 1> { 4547let Uses = [MXCSR], mayRaiseFPException = 1 in { 4548 def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), 4549 !if(Is2Addr, 4550 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4551 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4552 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>, 4553 Sched<[sched]>; 4554 4555 def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 4556 !if(Is2Addr, 4557 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4558 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4559 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>, 4560 Sched<[sched.Folded, sched.ReadAfterFold]>; 4561} 4562} 4563 4564let Predicates = [HasAVX] in { 4565 let ExeDomain = SSEPackedSingle in { 4566 defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem, 4567 X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG; 4568 defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem, 4569 X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG; 4570 defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem, 4571 X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG; 4572 defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem, 4573 X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG; 4574 } 4575 let ExeDomain = SSEPackedDouble in { 4576 defm VHADDPD : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem, 4577 X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG; 4578 defm VHSUBPD : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem, 4579 X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG; 4580 defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem, 4581 X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG; 4582 defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem, 4583 X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG; 4584 } 4585} 4586 4587let Constraints = "$src1 = $dst" in { 4588 let ExeDomain = SSEPackedSingle in { 4589 defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd, 4590 WriteFHAdd, memopv4f32>; 4591 defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub, 4592 WriteFHAdd, memopv4f32>; 4593 } 4594 let ExeDomain = SSEPackedDouble in { 4595 defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, 
VR128, f128mem, X86fhadd, 4596 WriteFHAdd, memopv2f64>; 4597 defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub, 4598 WriteFHAdd, memopv2f64>; 4599 } 4600} 4601 4602//===---------------------------------------------------------------------===// 4603// SSSE3 - Packed Absolute Instructions 4604//===---------------------------------------------------------------------===// 4605 4606/// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}. 4607multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt, 4608 SDNode OpNode, X86SchedWriteWidths sched, PatFrag ld_frag> { 4609 def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst), 4610 (ins VR128:$src), 4611 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4612 [(set VR128:$dst, (vt (OpNode VR128:$src)))]>, 4613 Sched<[sched.XMM]>; 4614 4615 def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst), 4616 (ins i128mem:$src), 4617 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4618 [(set VR128:$dst, 4619 (vt (OpNode (ld_frag addr:$src))))]>, 4620 Sched<[sched.XMM.Folded]>; 4621} 4622 4623/// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}. 4624multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt, 4625 SDNode OpNode, X86SchedWriteWidths sched> { 4626 def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst), 4627 (ins VR256:$src), 4628 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4629 [(set VR256:$dst, (vt (OpNode VR256:$src)))]>, 4630 Sched<[sched.YMM]>; 4631 4632 def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst), 4633 (ins i256mem:$src), 4634 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4635 [(set VR256:$dst, 4636 (vt (OpNode (load addr:$src))))]>, 4637 Sched<[sched.YMM.Folded]>; 4638} 4639 4640let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 4641 defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU, 4642 load>, VEX, VEX_WIG; 4643 defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU, 4644 load>, VEX, VEX_WIG; 4645} 4646let Predicates = [HasAVX, NoVLX] in { 4647 defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU, 4648 load>, VEX, VEX_WIG; 4649} 4650let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 4651 defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>, 4652 VEX, VEX_L, VEX_WIG; 4653 defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SchedWriteVecALU>, 4654 VEX, VEX_L, VEX_WIG; 4655} 4656let Predicates = [HasAVX2, NoVLX] in { 4657 defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SchedWriteVecALU>, 4658 VEX, VEX_L, VEX_WIG; 4659} 4660 4661defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU, 4662 memop>; 4663defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU, 4664 memop>; 4665defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU, 4666 memop>; 4667 4668//===---------------------------------------------------------------------===// 4669// SSSE3 - Packed Binary Operator Instructions 4670//===---------------------------------------------------------------------===// 4671 4672/// SS3I_binop_rm - Simple SSSE3 bin op 4673multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, 4674 ValueType DstVT, ValueType OpVT, RegisterClass RC, 4675 PatFrag memop_frag, X86MemOperand x86memop, 4676 X86FoldableSchedWrite sched, bit Is2Addr = 1> { 4677 let isCommutable = 1 in 4678 def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst), 4679 (ins RC:$src1, RC:$src2), 4680 !if(Is2Addr, 4681 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, 
$src2}"), 4682 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4683 [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))]>, 4684 Sched<[sched]>; 4685 def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst), 4686 (ins RC:$src1, x86memop:$src2), 4687 !if(Is2Addr, 4688 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4689 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4690 [(set RC:$dst, 4691 (DstVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))]>, 4692 Sched<[sched.Folded, sched.ReadAfterFold]>; 4693} 4694 4695/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}. 4696multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, 4697 Intrinsic IntId128, X86FoldableSchedWrite sched, 4698 PatFrag ld_frag, bit Is2Addr = 1> { 4699 let isCommutable = 1 in 4700 def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst), 4701 (ins VR128:$src1, VR128:$src2), 4702 !if(Is2Addr, 4703 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4704 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4705 [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, 4706 Sched<[sched]>; 4707 def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst), 4708 (ins VR128:$src1, i128mem:$src2), 4709 !if(Is2Addr, 4710 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4711 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4712 [(set VR128:$dst, 4713 (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>, 4714 Sched<[sched.Folded, sched.ReadAfterFold]>; 4715} 4716 4717multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr, 4718 Intrinsic IntId256, 4719 X86FoldableSchedWrite sched> { 4720 let isCommutable = 1 in 4721 def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst), 4722 (ins VR256:$src1, VR256:$src2), 4723 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 4724 [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>, 4725 Sched<[sched]>; 4726 def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst), 4727 (ins VR256:$src1, i256mem:$src2), 4728 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 4729 [(set VR256:$dst, 4730 (IntId256 VR256:$src1, (load addr:$src2)))]>, 4731 Sched<[sched.Folded, sched.ReadAfterFold]>; 4732} 4733 4734let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 4735let isCommutable = 0 in { 4736 defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8, 4737 VR128, load, i128mem, 4738 SchedWriteVarShuffle.XMM, 0>, VEX_4V, VEX_WIG; 4739 defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16, 4740 v16i8, VR128, load, i128mem, 4741 SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG; 4742} 4743defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16, 4744 VR128, load, i128mem, 4745 SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG; 4746} 4747 4748let ImmT = NoImm, Predicates = [HasAVX] in { 4749let isCommutable = 0 in { 4750 defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128, 4751 load, i128mem, 4752 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG; 4753 defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128, 4754 load, i128mem, 4755 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG; 4756 defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128, 4757 load, i128mem, 4758 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG; 4759 defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128, 4760 load, i128mem, 4761 SchedWritePHAdd.XMM, 0>, VEX_4V; 4762 defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb", 4763 
int_x86_ssse3_psign_b_128, 4764 SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG; 4765 defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw", 4766 int_x86_ssse3_psign_w_128, 4767 SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG; 4768 defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd", 4769 int_x86_ssse3_psign_d_128, 4770 SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG; 4771 defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw", 4772 int_x86_ssse3_phadd_sw_128, 4773 SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG; 4774 defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw", 4775 int_x86_ssse3_phsub_sw_128, 4776 SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG; 4777} 4778} 4779 4780let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 4781let isCommutable = 0 in { 4782 defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8, 4783 VR256, load, i256mem, 4784 SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; 4785 defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16, 4786 v32i8, VR256, load, i256mem, 4787 SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; 4788} 4789defm VPMULHRSWY : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16, 4790 VR256, load, i256mem, 4791 SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; 4792} 4793 4794let ImmT = NoImm, Predicates = [HasAVX2] in { 4795let isCommutable = 0 in { 4796 defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16, 4797 VR256, load, i256mem, 4798 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; 4799 defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256, 4800 load, i256mem, 4801 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; 4802 defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16, 4803 VR256, load, i256mem, 4804 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; 4805 defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256, 4806 load, i256mem, 4807 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L; 4808 defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b, 4809 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG; 4810 defm VPSIGNW : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w, 4811 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG; 4812 defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d, 4813 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG; 4814 defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw", 4815 int_x86_avx2_phadd_sw, 4816 SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG; 4817 defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw", 4818 int_x86_avx2_phsub_sw, 4819 SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG; 4820} 4821} 4822 4823// None of these have i8 immediate fields. 
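// The legacy (non-VEX) forms below are destructive: $src1 is tied to $dst,
// so e.g. "pshufb %xmm1, %xmm0" shuffles xmm0 in place using xmm1 as the
// control mask, unlike the three-operand VEX encodings defined above.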
let ImmT = NoImm, Constraints = "$src1 = $dst" in {
let isCommutable = 0 in {
  defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
                              memop, i128mem, SchedWritePHAdd.XMM>;
  defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
                              memop, i128mem, SchedWritePHAdd.XMM>;
  defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
                              memop, i128mem, SchedWritePHAdd.XMM>;
  defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
                              memop, i128mem, SchedWritePHAdd.XMM>;
  defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
                                  SchedWriteVecALU.XMM, memop>;
  defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
                                  SchedWriteVecALU.XMM, memop>;
  defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
                                  SchedWriteVecALU.XMM, memop>;
  defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
                              memop, i128mem, SchedWriteVarShuffle.XMM>;
  defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw",
                                   int_x86_ssse3_phadd_sw_128,
                                   SchedWritePHAdd.XMM, memop>;
  defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw",
                                   int_x86_ssse3_phsub_sw_128,
                                   SchedWritePHAdd.XMM, memop>;
  defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
                                 v16i8, VR128, memop, i128mem,
                                 SchedWriteVecIMul.XMM>;
}
defm PMULHRSW : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
                              VR128, memop, i128mem, SchedWriteVecIMul.XMM>;
}

//===---------------------------------------------------------------------===//
// SSSE3 - Packed Align Instruction Patterns
//===---------------------------------------------------------------------===//

multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let hasSideEffects = 0 in {
  def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst),
      (ins RC:$src1, RC:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 timm:$src3))))]>,
      Sched<[sched]>;
  let mayLoad = 1 in
  def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst),
      (ins RC:$src1, x86memop:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set RC:$dst, (VT (X86PAlignr RC:$src1,
                                     (memop_frag addr:$src2),
                                     (i8 timm:$src3))))]>,
      Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
  defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem,
                                SchedWriteShuffle.XMM, 0>, VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
  defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem,
                                 SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
  defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem,
                               SchedWriteShuffle.XMM>;

//===---------------------------------------------------------------------===//
// SSE3 - Thread synchronization (MONITOR/MWAIT)
4898//===---------------------------------------------------------------------===// 4899 4900let SchedRW = [WriteSystem] in { 4901let Uses = [EAX, ECX, EDX] in 4902def MONITOR32rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>, 4903 TB, Requires<[HasSSE3, Not64BitMode]>; 4904let Uses = [RAX, ECX, EDX] in 4905def MONITOR64rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>, 4906 TB, Requires<[HasSSE3, In64BitMode]>; 4907 4908let Uses = [ECX, EAX] in 4909def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait", 4910 [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>; 4911} // SchedRW 4912 4913def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>; 4914def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>; 4915 4916def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITOR32rrr)>, 4917 Requires<[Not64BitMode]>; 4918def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITOR64rrr)>, 4919 Requires<[In64BitMode]>; 4920 4921//===----------------------------------------------------------------------===// 4922// SSE4.1 - Packed Move with Sign/Zero Extend 4923// NOTE: Any Extend is promoted to Zero Extend in X86ISelDAGToDAG.cpp 4924//===----------------------------------------------------------------------===// 4925 4926multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp, 4927 RegisterClass OutRC, RegisterClass InRC, 4928 X86FoldableSchedWrite sched> { 4929 def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src), 4930 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>, 4931 Sched<[sched]>; 4932 4933 def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src), 4934 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>, 4935 Sched<[sched.Folded]>; 4936} 4937 4938multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr, 4939 X86MemOperand MemOp, X86MemOperand MemYOp, 4940 Predicate prd> { 4941 defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128, 4942 SchedWriteShuffle.XMM>; 4943 let Predicates = [HasAVX, prd] in 4944 defm V#NAME : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp, 4945 VR128, VR128, SchedWriteShuffle.XMM>, 4946 VEX, VEX_WIG; 4947 let Predicates = [HasAVX2, prd] in 4948 defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp, 4949 VR256, VR128, WriteShuffle256>, 4950 VEX, VEX_L, VEX_WIG; 4951} 4952 4953multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp, 4954 X86MemOperand MemYOp, Predicate prd> { 4955 defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr), 4956 MemOp, MemYOp, prd>; 4957 defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10), 4958 !strconcat("pmovzx", OpcodeStr), 4959 MemOp, MemYOp, prd>; 4960} 4961 4962defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>; 4963defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>; 4964defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>; 4965 4966defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>; 4967defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>; 4968 4969defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>; 4970 4971// AVX2 Patterns 4972multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, 4973 SDNode ExtOp, SDNode InVecOp> { 4974 // Register-Register patterns 4975 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 4976 def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))), 4977 (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>; 4978 } 4979 let Predicates = 
[HasAVX2, NoVLX] in { 4980 def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))), 4981 (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>; 4982 def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))), 4983 (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>; 4984 4985 def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))), 4986 (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>; 4987 def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))), 4988 (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>; 4989 4990 def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))), 4991 (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>; 4992 } 4993 4994 // Simple Register-Memory patterns 4995 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 4996 def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), 4997 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; 4998 4999 def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))), 5000 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; 5001 } 5002 5003 let Predicates = [HasAVX2, NoVLX] in { 5004 def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), 5005 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; 5006 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), 5007 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; 5008 5009 def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), 5010 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; 5011 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), 5012 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; 5013 5014 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)), 5015 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; 5016 } 5017 5018 // AVX2 Register-Memory patterns 5019 let Predicates = [HasAVX2, NoVLX] in { 5020 def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))), 5021 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; 5022 5023 def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 5024 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; 5025 def : Pat<(v8i32 (InVecOp (v16i8 (X86vzload64 addr:$src)))), 5026 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; 5027 5028 def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))), 5029 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; 5030 5031 def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), 5032 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; 5033 def : Pat<(v4i64 (InVecOp (v16i8 (X86vzload64 addr:$src)))), 5034 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; 5035 5036 def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 5037 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; 5038 def : Pat<(v4i64 (InVecOp (v8i16 (X86vzload64 addr:$src)))), 5039 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; 5040 } 5041} 5042 5043defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>; 5044defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>; 5045 5046// SSE4.1/AVX patterns. 
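// As with the AVX2 patterns above: OpcPrefix names the instruction family
// ("VPMOVSX"/"VPMOVZX" or "PMOVSX"/"PMOVZX"), ExtTy selects the matching
// extending-load fragments (ExtTy#"extloadvi8" etc.), and ExtOp is the
// in-vector extend node matched by the register forms.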
5047multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy, 5048 SDNode ExtOp> { 5049 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 5050 def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))), 5051 (!cast<I>(OpcPrefix#BWrr) VR128:$src)>; 5052 } 5053 let Predicates = [HasAVX, NoVLX] in { 5054 def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))), 5055 (!cast<I>(OpcPrefix#BDrr) VR128:$src)>; 5056 def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))), 5057 (!cast<I>(OpcPrefix#BQrr) VR128:$src)>; 5058 5059 def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))), 5060 (!cast<I>(OpcPrefix#WDrr) VR128:$src)>; 5061 def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))), 5062 (!cast<I>(OpcPrefix#WQrr) VR128:$src)>; 5063 5064 def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))), 5065 (!cast<I>(OpcPrefix#DQrr) VR128:$src)>; 5066 } 5067 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 5068 def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), 5069 (!cast<I>(OpcPrefix#BWrm) addr:$src)>; 5070 } 5071 let Predicates = [HasAVX, NoVLX] in { 5072 def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), 5073 (!cast<I>(OpcPrefix#BDrm) addr:$src)>; 5074 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), 5075 (!cast<I>(OpcPrefix#BQrm) addr:$src)>; 5076 5077 def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), 5078 (!cast<I>(OpcPrefix#WDrm) addr:$src)>; 5079 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), 5080 (!cast<I>(OpcPrefix#WQrm) addr:$src)>; 5081 5082 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)), 5083 (!cast<I>(OpcPrefix#DQrm) addr:$src)>; 5084 } 5085 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 5086 def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 5087 (!cast<I>(OpcPrefix#BWrm) addr:$src)>; 5088 def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), 5089 (!cast<I>(OpcPrefix#BWrm) addr:$src)>; 5090 def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))), 5091 (!cast<I>(OpcPrefix#BWrm) addr:$src)>; 5092 def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))), 5093 (!cast<I>(OpcPrefix#BWrm) addr:$src)>; 5094 } 5095 let Predicates = [HasAVX, NoVLX] in { 5096 def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), 5097 (!cast<I>(OpcPrefix#BDrm) addr:$src)>; 5098 def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))), 5099 (!cast<I>(OpcPrefix#BDrm) addr:$src)>; 5100 def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))), 5101 (!cast<I>(OpcPrefix#BDrm) addr:$src)>; 5102 5103 def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))), 5104 (!cast<I>(OpcPrefix#BQrm) addr:$src)>; 5105 def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))), 5106 (!cast<I>(OpcPrefix#BQrm) addr:$src)>; 5107 5108 def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 5109 (!cast<I>(OpcPrefix#WDrm) addr:$src)>; 5110 def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), 5111 (!cast<I>(OpcPrefix#WDrm) addr:$src)>; 5112 def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))), 5113 (!cast<I>(OpcPrefix#WDrm) addr:$src)>; 5114 def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))), 5115 (!cast<I>(OpcPrefix#WDrm) addr:$src)>; 5116 5117 def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), 5118 (!cast<I>(OpcPrefix#WQrm) addr:$src)>; 5119 def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))), 5120 (!cast<I>(OpcPrefix#WQrm) addr:$src)>; 5121 def : Pat<(v2i64 (ExtOp 
(loadv8i16 addr:$src))), 5122 (!cast<I>(OpcPrefix#WQrm) addr:$src)>; 5123 5124 def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 5125 (!cast<I>(OpcPrefix#DQrm) addr:$src)>; 5126 def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), 5127 (!cast<I>(OpcPrefix#DQrm) addr:$src)>; 5128 def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))), 5129 (!cast<I>(OpcPrefix#DQrm) addr:$src)>; 5130 def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))), 5131 (!cast<I>(OpcPrefix#DQrm) addr:$src)>; 5132 } 5133} 5134 5135defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec>; 5136defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec>; 5137 5138let Predicates = [UseSSE41] in { 5139 defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec>; 5140 defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec>; 5141} 5142 5143//===----------------------------------------------------------------------===// 5144// SSE4.1 - Extract Instructions 5145//===----------------------------------------------------------------------===// 5146 5147/// SS41I_binop_ext8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem 5148multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> { 5149 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst), 5150 (ins VR128:$src1, u8imm:$src2), 5151 !strconcat(OpcodeStr, 5152 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5153 [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1), 5154 imm:$src2))]>, 5155 Sched<[WriteVecExtract]>; 5156 let hasSideEffects = 0, mayStore = 1 in 5157 def mr : SS4AIi8<opc, MRMDestMem, (outs), 5158 (ins i8mem:$dst, VR128:$src1, u8imm:$src2), 5159 !strconcat(OpcodeStr, 5160 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5161 [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), 5162 addr:$dst)]>, Sched<[WriteVecExtractSt]>; 5163} 5164 5165let Predicates = [HasAVX, NoBWI] in 5166 defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX, VEX_WIG; 5167 5168defm PEXTRB : SS41I_extract8<0x14, "pextrb">; 5169 5170 5171/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination 5172multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> { 5173 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in 5174 def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst), 5175 (ins VR128:$src1, u8imm:$src2), 5176 !strconcat(OpcodeStr, 5177 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, 5178 Sched<[WriteVecExtract]>, FoldGenData<NAME#rr>; 5179 5180 let hasSideEffects = 0, mayStore = 1 in 5181 def mr : SS4AIi8<opc, MRMDestMem, (outs), 5182 (ins i16mem:$dst, VR128:$src1, u8imm:$src2), 5183 !strconcat(OpcodeStr, 5184 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5185 [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), imm:$src2))), 5186 addr:$dst)]>, Sched<[WriteVecExtractSt]>; 5187} 5188 5189let Predicates = [HasAVX, NoBWI] in 5190 defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX, VEX_WIG; 5191 5192defm PEXTRW : SS41I_extract16<0x15, "pextrw">; 5193 5194 5195/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination 5196multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> { 5197 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst), 5198 (ins VR128:$src1, u8imm:$src2), 5199 !strconcat(OpcodeStr, 5200 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5201 [(set GR32:$dst, 5202 (extractelt (v4i32 VR128:$src1), imm:$src2))]>, 5203 Sched<[WriteVecExtract]>; 5204 def mr : SS4AIi8<opc, MRMDestMem, (outs), 5205 (ins i32mem:$dst, VR128:$src1, u8imm:$src2), 5206 
!strconcat(OpcodeStr, 5207 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5208 [(store (extractelt (v4i32 VR128:$src1), imm:$src2), 5209 addr:$dst)]>, Sched<[WriteVecExtractSt]>; 5210} 5211 5212let Predicates = [HasAVX, NoDQI] in 5213 defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX; 5214 5215defm PEXTRD : SS41I_extract32<0x16, "pextrd">; 5216 5217/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination 5218multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> { 5219 def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst), 5220 (ins VR128:$src1, u8imm:$src2), 5221 !strconcat(OpcodeStr, 5222 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5223 [(set GR64:$dst, 5224 (extractelt (v2i64 VR128:$src1), imm:$src2))]>, 5225 Sched<[WriteVecExtract]>; 5226 def mr : SS4AIi8<opc, MRMDestMem, (outs), 5227 (ins i64mem:$dst, VR128:$src1, u8imm:$src2), 5228 !strconcat(OpcodeStr, 5229 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5230 [(store (extractelt (v2i64 VR128:$src1), imm:$src2), 5231 addr:$dst)]>, Sched<[WriteVecExtractSt]>; 5232} 5233 5234let Predicates = [HasAVX, NoDQI] in 5235 defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W; 5236 5237defm PEXTRQ : SS41I_extract64<0x16, "pextrq">, REX_W; 5238 5239/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory 5240/// destination 5241multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> { 5242 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst), 5243 (ins VR128:$src1, u8imm:$src2), 5244 !strconcat(OpcodeStr, 5245 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5246 [(set GR32orGR64:$dst, 5247 (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>, 5248 Sched<[WriteVecExtract]>; 5249 def mr : SS4AIi8<opc, MRMDestMem, (outs), 5250 (ins f32mem:$dst, VR128:$src1, u8imm:$src2), 5251 !strconcat(OpcodeStr, 5252 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5253 [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2), 5254 addr:$dst)]>, Sched<[WriteVecExtractSt]>; 5255} 5256 5257let ExeDomain = SSEPackedSingle in { 5258 let Predicates = [UseAVX] in 5259 defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, VEX_WIG; 5260 defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">; 5261} 5262 5263//===----------------------------------------------------------------------===// 5264// SSE4.1 - Insert Instructions 5265//===----------------------------------------------------------------------===// 5266 5267multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> { 5268 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), 5269 (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3), 5270 !if(Is2Addr, 5271 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5272 !strconcat(asm, 5273 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5274 [(set VR128:$dst, 5275 (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>, 5276 Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>; 5277 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), 5278 (ins VR128:$src1, i8mem:$src2, u8imm:$src3), 5279 !if(Is2Addr, 5280 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5281 !strconcat(asm, 5282 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5283 [(set VR128:$dst, 5284 (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), imm:$src3))]>, 5285 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>; 5286} 5287 5288let Predicates = [HasAVX, NoBWI] in 5289 defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V, VEX_WIG; 5290let Constraints = "$src1 = $dst" in 5291 defm PINSRB 

multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src1, GR32:$src2, u8imm:$src3),
                   !if(Is2Addr,
                       !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                       !strconcat(asm,
                                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                   [(set VR128:$dst,
                         (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
                   Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
                   (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
                   !if(Is2Addr,
                       !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                       !strconcat(asm,
                                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                   [(set VR128:$dst,
                         (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), imm:$src3)))]>,
                   Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
let Constraints = "$src1 = $dst" in
  defm PINSRD : SS41I_insert32<0x22, "pinsrd">;

multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src1, GR64:$src2, u8imm:$src3),
                   !if(Is2Addr,
                       !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                       !strconcat(asm,
                                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                   [(set VR128:$dst,
                         (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
                   Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
                   (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
                   !if(Is2Addr,
                       !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                       !strconcat(asm,
                                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                   [(set VR128:$dst,
                         (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), imm:$src3)))]>,
                   Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
let Constraints = "$src1 = $dst" in
  defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;

// insertps has a few different modes; the first two below are optimized
// inserts that won't zero arbitrary elements in the destination vector. The
// next one matches the intrinsic and can zero arbitrary elements in the
// target vector.
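//
// A brief note on the immediate, taken from the ISA definition of insertps
// rather than restated anywhere else in this file: in the register form the
// imm8 is interpreted as bits [7:6] = source element, bits [5:4] = destination
// element, and bits [3:0] = zero mask applied to the result; in the memory
// form a single f32 is loaded, so only the destination-element and zero-mask
// fields are meaningful.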
5349multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> { 5350 let isCommutable = 1 in 5351 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), 5352 (ins VR128:$src1, VR128:$src2, u8imm:$src3), 5353 !if(Is2Addr, 5354 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5355 !strconcat(asm, 5356 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5357 [(set VR128:$dst, 5358 (X86insertps VR128:$src1, VR128:$src2, timm:$src3))]>, 5359 Sched<[SchedWriteFShuffle.XMM]>; 5360 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), 5361 (ins VR128:$src1, f32mem:$src2, u8imm:$src3), 5362 !if(Is2Addr, 5363 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5364 !strconcat(asm, 5365 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5366 [(set VR128:$dst, 5367 (X86insertps VR128:$src1, 5368 (v4f32 (scalar_to_vector (loadf32 addr:$src2))), 5369 timm:$src3))]>, 5370 Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>; 5371} 5372 5373let ExeDomain = SSEPackedSingle in { 5374 let Predicates = [UseAVX] in 5375 defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, 5376 VEX_4V, VEX_WIG; 5377 let Constraints = "$src1 = $dst" in 5378 defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>; 5379} 5380 5381//===----------------------------------------------------------------------===// 5382// SSE4.1 - Round Instructions 5383//===----------------------------------------------------------------------===// 5384 5385multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr, 5386 X86MemOperand x86memop, RegisterClass RC, 5387 ValueType VT, PatFrag mem_frag, SDNode OpNode, 5388 X86FoldableSchedWrite sched> { 5389 // Intrinsic operation, reg. 5390 // Vector intrinsic operation, reg 5391let Uses = [MXCSR], mayRaiseFPException = 1 in { 5392 def r : SS4AIi8<opc, MRMSrcReg, 5393 (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2), 5394 !strconcat(OpcodeStr, 5395 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5396 [(set RC:$dst, (VT (OpNode RC:$src1, timm:$src2)))]>, 5397 Sched<[sched]>; 5398 5399 // Vector intrinsic operation, mem 5400 def m : SS4AIi8<opc, MRMSrcMem, 5401 (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2), 5402 !strconcat(OpcodeStr, 5403 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5404 [(set RC:$dst, 5405 (VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>, 5406 Sched<[sched.Folded]>; 5407} 5408} 5409 5410multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd, 5411 string OpcodeStr, X86FoldableSchedWrite sched> { 5412let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in { 5413 def SSr : SS4AIi8<opcss, MRMSrcReg, 5414 (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3), 5415 !strconcat(OpcodeStr, 5416 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 5417 []>, Sched<[sched]>; 5418 5419 let mayLoad = 1 in 5420 def SSm : SS4AIi8<opcss, MRMSrcMem, 5421 (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3), 5422 !strconcat(OpcodeStr, 5423 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 5424 []>, Sched<[sched.Folded, sched.ReadAfterFold]>; 5425} // ExeDomain = SSEPackedSingle, hasSideEffects = 0 5426 5427let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in { 5428 def SDr : SS4AIi8<opcsd, MRMSrcReg, 5429 (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3), 5430 !strconcat(OpcodeStr, 5431 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 5432 []>, Sched<[sched]>; 5433 5434 let mayLoad = 1 in 5435 def SDm : SS4AIi8<opcsd, MRMSrcMem, 
5436 (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3), 5437 !strconcat(OpcodeStr, 5438 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 5439 []>, Sched<[sched.Folded, sched.ReadAfterFold]>; 5440} // ExeDomain = SSEPackedDouble, hasSideEffects = 0 5441} 5442 5443multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd, 5444 string OpcodeStr, X86FoldableSchedWrite sched> { 5445let Uses = [MXCSR], mayRaiseFPException = 1 in { 5446let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in { 5447 def SSr : SS4AIi8<opcss, MRMSrcReg, 5448 (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2), 5449 !strconcat(OpcodeStr, 5450 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5451 []>, Sched<[sched]>; 5452 5453 let mayLoad = 1 in 5454 def SSm : SS4AIi8<opcss, MRMSrcMem, 5455 (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2), 5456 !strconcat(OpcodeStr, 5457 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5458 []>, Sched<[sched.Folded, sched.ReadAfterFold]>; 5459} // ExeDomain = SSEPackedSingle, hasSideEffects = 0 5460 5461let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in { 5462 def SDr : SS4AIi8<opcsd, MRMSrcReg, 5463 (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2), 5464 !strconcat(OpcodeStr, 5465 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5466 []>, Sched<[sched]>; 5467 5468 let mayLoad = 1 in 5469 def SDm : SS4AIi8<opcsd, MRMSrcMem, 5470 (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2), 5471 !strconcat(OpcodeStr, 5472 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5473 []>, Sched<[sched.Folded, sched.ReadAfterFold]>; 5474} // ExeDomain = SSEPackedDouble, hasSideEffects = 0 5475} 5476} 5477 5478multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd, 5479 string OpcodeStr, X86FoldableSchedWrite sched, 5480 ValueType VT32, ValueType VT64, 5481 SDNode OpNode, bit Is2Addr = 1> { 5482let Uses = [MXCSR], mayRaiseFPException = 1 in { 5483let ExeDomain = SSEPackedSingle in { 5484 def SSr_Int : SS4AIi8<opcss, MRMSrcReg, 5485 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3), 5486 !if(Is2Addr, 5487 !strconcat(OpcodeStr, 5488 "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5489 !strconcat(OpcodeStr, 5490 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5491 [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>, 5492 Sched<[sched]>; 5493 5494 def SSm_Int : SS4AIi8<opcss, MRMSrcMem, 5495 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3), 5496 !if(Is2Addr, 5497 !strconcat(OpcodeStr, 5498 "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5499 !strconcat(OpcodeStr, 5500 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5501 [(set VR128:$dst, 5502 (OpNode VR128:$src1, sse_load_f32:$src2, timm:$src3))]>, 5503 Sched<[sched.Folded, sched.ReadAfterFold]>; 5504} // ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 5505 5506let ExeDomain = SSEPackedDouble in { 5507 def SDr_Int : SS4AIi8<opcsd, MRMSrcReg, 5508 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3), 5509 !if(Is2Addr, 5510 !strconcat(OpcodeStr, 5511 "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5512 !strconcat(OpcodeStr, 5513 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5514 [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>, 5515 Sched<[sched]>; 5516 5517 def SDm_Int : SS4AIi8<opcsd, MRMSrcMem, 5518 (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3), 5519 !if(Is2Addr, 5520 !strconcat(OpcodeStr, 5521 "sd\t{$src3, $src2, $dst|$dst, $src2, 
$src3}"), 5522 !strconcat(OpcodeStr, 5523 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5524 [(set VR128:$dst, 5525 (OpNode VR128:$src1, sse_load_f64:$src2, timm:$src3))]>, 5526 Sched<[sched.Folded, sched.ReadAfterFold]>; 5527} // ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 5528} 5529} 5530 5531// FP round - roundss, roundps, roundsd, roundpd 5532let Predicates = [HasAVX, NoVLX] in { 5533 let ExeDomain = SSEPackedSingle, Uses = [MXCSR], mayRaiseFPException = 1 in { 5534 // Intrinsic form 5535 defm VROUNDPS : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32, 5536 loadv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>, 5537 VEX, VEX_WIG; 5538 defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32, 5539 loadv8f32, X86any_VRndScale, SchedWriteFRnd.YMM>, 5540 VEX, VEX_L, VEX_WIG; 5541 } 5542 5543 let ExeDomain = SSEPackedDouble, Uses = [MXCSR], mayRaiseFPException = 1 in { 5544 defm VROUNDPD : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64, 5545 loadv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>, 5546 VEX, VEX_WIG; 5547 defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64, 5548 loadv4f64, X86any_VRndScale, SchedWriteFRnd.YMM>, 5549 VEX, VEX_L, VEX_WIG; 5550 } 5551} 5552let Predicates = [UseAVX] in { 5553 defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl, 5554 v4f32, v2f64, X86RndScales, 0>, 5555 VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC; 5556 defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>, 5557 VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC; 5558} 5559 5560let Predicates = [UseAVX] in { 5561 def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2), 5562 (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>; 5563 def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2), 5564 (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>; 5565} 5566 5567let Predicates = [UseAVX, OptForSize] in { 5568 def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2), 5569 (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>; 5570 def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2), 5571 (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>; 5572} 5573 5574let ExeDomain = SSEPackedSingle in 5575defm ROUNDPS : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32, 5576 memopv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>; 5577let ExeDomain = SSEPackedDouble in 5578defm ROUNDPD : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64, 5579 memopv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>; 5580 5581defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>; 5582 5583let Constraints = "$src1 = $dst" in 5584defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl, 5585 v4f32, v2f64, X86RndScales>; 5586 5587let Predicates = [UseSSE41] in { 5588 def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2), 5589 (ROUNDSSr FR32:$src1, timm:$src2)>; 5590 def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2), 5591 (ROUNDSDr FR64:$src1, timm:$src2)>; 5592} 5593 5594let Predicates = [UseSSE41, OptForSize] in { 5595 def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2), 5596 (ROUNDSSm addr:$src1, timm:$src2)>; 5597 def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2), 5598 (ROUNDSDm addr:$src1, timm:$src2)>; 5599} 5600 5601//===----------------------------------------------------------------------===// 5602// SSE4.1 - Packed Bit Test 5603//===----------------------------------------------------------------------===// 5604 5605// ptest instruction we'll lower to this in X86ISelLowering 
primarily from 5606// the intel intrinsic that corresponds to this. 5607let Defs = [EFLAGS], Predicates = [HasAVX] in { 5608def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), 5609 "vptest\t{$src2, $src1|$src1, $src2}", 5610 [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>, 5611 Sched<[SchedWriteVecTest.XMM]>, VEX, VEX_WIG; 5612def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2), 5613 "vptest\t{$src2, $src1|$src1, $src2}", 5614 [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>, 5615 Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>, 5616 VEX, VEX_WIG; 5617 5618def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2), 5619 "vptest\t{$src2, $src1|$src1, $src2}", 5620 [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>, 5621 Sched<[SchedWriteVecTest.YMM]>, VEX, VEX_L, VEX_WIG; 5622def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2), 5623 "vptest\t{$src2, $src1|$src1, $src2}", 5624 [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>, 5625 Sched<[SchedWriteVecTest.YMM.Folded, SchedWriteVecTest.YMM.ReadAfterFold]>, 5626 VEX, VEX_L, VEX_WIG; 5627} 5628 5629let Defs = [EFLAGS] in { 5630def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), 5631 "ptest\t{$src2, $src1|$src1, $src2}", 5632 [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>, 5633 Sched<[SchedWriteVecTest.XMM]>; 5634def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2), 5635 "ptest\t{$src2, $src1|$src1, $src2}", 5636 [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>, 5637 Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>; 5638} 5639 5640// The bit test instructions below are AVX only 5641multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC, 5642 X86MemOperand x86memop, PatFrag mem_frag, ValueType vt, 5643 X86FoldableSchedWrite sched> { 5644 def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), 5645 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), 5646 [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>, 5647 Sched<[sched]>, VEX; 5648 def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2), 5649 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), 5650 [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>, 5651 Sched<[sched.Folded, sched.ReadAfterFold]>, VEX; 5652} 5653 5654let Defs = [EFLAGS], Predicates = [HasAVX] in { 5655let ExeDomain = SSEPackedSingle in { 5656defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32, 5657 SchedWriteFTest.XMM>; 5658defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32, 5659 SchedWriteFTest.YMM>, VEX_L; 5660} 5661let ExeDomain = SSEPackedDouble in { 5662defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64, 5663 SchedWriteFTest.XMM>; 5664defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64, 5665 SchedWriteFTest.YMM>, VEX_L; 5666} 5667} 5668 5669//===----------------------------------------------------------------------===// 5670// SSE4.1 - Misc Instructions 5671//===----------------------------------------------------------------------===// 5672 5673let Defs = [EFLAGS], Predicates = [HasPOPCNT] in { 5674 def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), 5675 "popcnt{w}\t{$src, $dst|$dst, $src}", 5676 [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>, 5677 
Sched<[WritePOPCNT]>, OpSize16, XS; 5678 def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), 5679 "popcnt{w}\t{$src, $dst|$dst, $src}", 5680 [(set GR16:$dst, (ctpop (loadi16 addr:$src))), 5681 (implicit EFLAGS)]>, 5682 Sched<[WritePOPCNT.Folded]>, OpSize16, XS; 5683 5684 def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), 5685 "popcnt{l}\t{$src, $dst|$dst, $src}", 5686 [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>, 5687 Sched<[WritePOPCNT]>, OpSize32, XS; 5688 5689 def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), 5690 "popcnt{l}\t{$src, $dst|$dst, $src}", 5691 [(set GR32:$dst, (ctpop (loadi32 addr:$src))), 5692 (implicit EFLAGS)]>, 5693 Sched<[WritePOPCNT.Folded]>, OpSize32, XS; 5694 5695 def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), 5696 "popcnt{q}\t{$src, $dst|$dst, $src}", 5697 [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>, 5698 Sched<[WritePOPCNT]>, XS; 5699 def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), 5700 "popcnt{q}\t{$src, $dst|$dst, $src}", 5701 [(set GR64:$dst, (ctpop (loadi64 addr:$src))), 5702 (implicit EFLAGS)]>, 5703 Sched<[WritePOPCNT.Folded]>, XS; 5704} 5705 5706// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16. 5707multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr, 5708 SDNode OpNode, PatFrag ld_frag, 5709 X86FoldableSchedWrite Sched> { 5710 def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), 5711 (ins VR128:$src), 5712 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 5713 [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>, 5714 Sched<[Sched]>; 5715 def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), 5716 (ins i128mem:$src), 5717 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 5718 [(set VR128:$dst, 5719 (v8i16 (OpNode (ld_frag addr:$src))))]>, 5720 Sched<[Sched.Folded]>; 5721} 5722 5723// PHMIN has the same profile as PSAD, thus we use the same scheduling 5724// model, although the naming is misleading. 5725let Predicates = [HasAVX] in 5726defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw", 5727 X86phminpos, load, 5728 WritePHMINPOS>, VEX, VEX_WIG; 5729defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw", 5730 X86phminpos, memop, 5731 WritePHMINPOS>; 5732 5733/// SS48I_binop_rm - Simple SSE41 binary operator. 
5734multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, 5735 ValueType OpVT, RegisterClass RC, PatFrag memop_frag, 5736 X86MemOperand x86memop, X86FoldableSchedWrite sched, 5737 bit Is2Addr = 1> { 5738 let isCommutable = 1 in 5739 def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst), 5740 (ins RC:$src1, RC:$src2), 5741 !if(Is2Addr, 5742 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 5743 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 5744 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, 5745 Sched<[sched]>; 5746 def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst), 5747 (ins RC:$src1, x86memop:$src2), 5748 !if(Is2Addr, 5749 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 5750 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 5751 [(set RC:$dst, 5752 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>, 5753 Sched<[sched.Folded, sched.ReadAfterFold]>; 5754} 5755 5756let Predicates = [HasAVX, NoVLX] in { 5757 defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128, 5758 load, i128mem, SchedWriteVecALU.XMM, 0>, 5759 VEX_4V, VEX_WIG; 5760 defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128, 5761 load, i128mem, SchedWriteVecALU.XMM, 0>, 5762 VEX_4V, VEX_WIG; 5763 defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128, 5764 load, i128mem, SchedWriteVecALU.XMM, 0>, 5765 VEX_4V, VEX_WIG; 5766 defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128, 5767 load, i128mem, SchedWriteVecALU.XMM, 0>, 5768 VEX_4V, VEX_WIG; 5769 defm VPMULDQ : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128, 5770 load, i128mem, SchedWriteVecIMul.XMM, 0>, 5771 VEX_4V, VEX_WIG; 5772} 5773let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 5774 defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128, 5775 load, i128mem, SchedWriteVecALU.XMM, 0>, 5776 VEX_4V, VEX_WIG; 5777 defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128, 5778 load, i128mem, SchedWriteVecALU.XMM, 0>, 5779 VEX_4V, VEX_WIG; 5780 defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128, 5781 load, i128mem, SchedWriteVecALU.XMM, 0>, 5782 VEX_4V, VEX_WIG; 5783 defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128, 5784 load, i128mem, SchedWriteVecALU.XMM, 0>, 5785 VEX_4V, VEX_WIG; 5786} 5787 5788let Predicates = [HasAVX2, NoVLX] in { 5789 defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256, 5790 load, i256mem, SchedWriteVecALU.YMM, 0>, 5791 VEX_4V, VEX_L, VEX_WIG; 5792 defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256, 5793 load, i256mem, SchedWriteVecALU.YMM, 0>, 5794 VEX_4V, VEX_L, VEX_WIG; 5795 defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256, 5796 load, i256mem, SchedWriteVecALU.YMM, 0>, 5797 VEX_4V, VEX_L, VEX_WIG; 5798 defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256, 5799 load, i256mem, SchedWriteVecALU.YMM, 0>, 5800 VEX_4V, VEX_L, VEX_WIG; 5801 defm VPMULDQY : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256, 5802 load, i256mem, SchedWriteVecIMul.YMM, 0>, 5803 VEX_4V, VEX_L, VEX_WIG; 5804} 5805let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 5806 defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256, 5807 load, i256mem, SchedWriteVecALU.YMM, 0>, 5808 VEX_4V, VEX_L, VEX_WIG; 5809 defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256, 5810 load, i256mem, SchedWriteVecALU.YMM, 0>, 5811 VEX_4V, VEX_L, VEX_WIG; 5812 defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256, 5813 load, i256mem, 
SchedWriteVecALU.YMM, 0>, 5814 VEX_4V, VEX_L, VEX_WIG; 5815 defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256, 5816 load, i256mem, SchedWriteVecALU.YMM, 0>, 5817 VEX_4V, VEX_L, VEX_WIG; 5818} 5819 5820let Constraints = "$src1 = $dst" in { 5821 defm PMINSB : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128, 5822 memop, i128mem, SchedWriteVecALU.XMM, 1>; 5823 defm PMINSD : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128, 5824 memop, i128mem, SchedWriteVecALU.XMM, 1>; 5825 defm PMINUD : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128, 5826 memop, i128mem, SchedWriteVecALU.XMM, 1>; 5827 defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128, 5828 memop, i128mem, SchedWriteVecALU.XMM, 1>; 5829 defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128, 5830 memop, i128mem, SchedWriteVecALU.XMM, 1>; 5831 defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128, 5832 memop, i128mem, SchedWriteVecALU.XMM, 1>; 5833 defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128, 5834 memop, i128mem, SchedWriteVecALU.XMM, 1>; 5835 defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128, 5836 memop, i128mem, SchedWriteVecALU.XMM, 1>; 5837 defm PMULDQ : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128, 5838 memop, i128mem, SchedWriteVecIMul.XMM, 1>; 5839} 5840 5841let Predicates = [HasAVX, NoVLX] in 5842 defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128, 5843 load, i128mem, SchedWritePMULLD.XMM, 0>, 5844 VEX_4V, VEX_WIG; 5845let Predicates = [HasAVX] in 5846 defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128, 5847 load, i128mem, SchedWriteVecALU.XMM, 0>, 5848 VEX_4V, VEX_WIG; 5849 5850let Predicates = [HasAVX2, NoVLX] in 5851 defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256, 5852 load, i256mem, SchedWritePMULLD.YMM, 0>, 5853 VEX_4V, VEX_L, VEX_WIG; 5854let Predicates = [HasAVX2] in 5855 defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256, 5856 load, i256mem, SchedWriteVecALU.YMM, 0>, 5857 VEX_4V, VEX_L, VEX_WIG; 5858 5859let Constraints = "$src1 = $dst" in { 5860 defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128, 5861 memop, i128mem, SchedWritePMULLD.XMM, 1>; 5862 defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128, 5863 memop, i128mem, SchedWriteVecALU.XMM, 1>; 5864} 5865 5866/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate 5867multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, 5868 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag, 5869 X86MemOperand x86memop, bit Is2Addr, 5870 X86FoldableSchedWrite sched> { 5871 let isCommutable = 1 in 5872 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), 5873 (ins RC:$src1, RC:$src2, u8imm:$src3), 5874 !if(Is2Addr, 5875 !strconcat(OpcodeStr, 5876 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5877 !strconcat(OpcodeStr, 5878 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5879 [(set RC:$dst, (IntId RC:$src1, RC:$src2, timm:$src3))]>, 5880 Sched<[sched]>; 5881 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), 5882 (ins RC:$src1, x86memop:$src2, u8imm:$src3), 5883 !if(Is2Addr, 5884 !strconcat(OpcodeStr, 5885 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5886 !strconcat(OpcodeStr, 5887 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5888 [(set RC:$dst, 5889 (IntId RC:$src1, (memop_frag addr:$src2), timm:$src3))]>, 5890 Sched<[sched.Folded, sched.ReadAfterFold]>; 5891} 5892 5893/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate 
5894multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, 5895 ValueType OpVT, RegisterClass RC, PatFrag memop_frag, 5896 X86MemOperand x86memop, bit Is2Addr, 5897 X86FoldableSchedWrite sched> { 5898 let isCommutable = 1 in 5899 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), 5900 (ins RC:$src1, RC:$src2, u8imm:$src3), 5901 !if(Is2Addr, 5902 !strconcat(OpcodeStr, 5903 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5904 !strconcat(OpcodeStr, 5905 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5906 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>, 5907 Sched<[sched]>; 5908 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), 5909 (ins RC:$src1, x86memop:$src2, u8imm:$src3), 5910 !if(Is2Addr, 5911 !strconcat(OpcodeStr, 5912 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5913 !strconcat(OpcodeStr, 5914 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5915 [(set RC:$dst, 5916 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>, 5917 Sched<[sched.Folded, sched.ReadAfterFold]>; 5918} 5919 5920def BlendCommuteImm2 : SDNodeXForm<timm, [{ 5921 uint8_t Imm = N->getZExtValue() & 0x03; 5922 return getI8Imm(Imm ^ 0x03, SDLoc(N)); 5923}]>; 5924 5925def BlendCommuteImm4 : SDNodeXForm<timm, [{ 5926 uint8_t Imm = N->getZExtValue() & 0x0f; 5927 return getI8Imm(Imm ^ 0x0f, SDLoc(N)); 5928}]>; 5929 5930def BlendCommuteImm8 : SDNodeXForm<timm, [{ 5931 uint8_t Imm = N->getZExtValue() & 0xff; 5932 return getI8Imm(Imm ^ 0xff, SDLoc(N)); 5933}]>; 5934 5935// Turn a 4-bit blendi immediate to 8-bit for use with pblendw. 5936def BlendScaleImm4 : SDNodeXForm<timm, [{ 5937 uint8_t Imm = N->getZExtValue(); 5938 uint8_t NewImm = 0; 5939 for (unsigned i = 0; i != 4; ++i) { 5940 if (Imm & (1 << i)) 5941 NewImm |= 0x3 << (i * 2); 5942 } 5943 return getI8Imm(NewImm, SDLoc(N)); 5944}]>; 5945 5946// Turn a 2-bit blendi immediate to 8-bit for use with pblendw. 5947def BlendScaleImm2 : SDNodeXForm<timm, [{ 5948 uint8_t Imm = N->getZExtValue(); 5949 uint8_t NewImm = 0; 5950 for (unsigned i = 0; i != 2; ++i) { 5951 if (Imm & (1 << i)) 5952 NewImm |= 0xf << (i * 4); 5953 } 5954 return getI8Imm(NewImm, SDLoc(N)); 5955}]>; 5956 5957// Turn a 2-bit blendi immediate to 4-bit for use with pblendd. 5958def BlendScaleImm2to4 : SDNodeXForm<timm, [{ 5959 uint8_t Imm = N->getZExtValue(); 5960 uint8_t NewImm = 0; 5961 for (unsigned i = 0; i != 2; ++i) { 5962 if (Imm & (1 << i)) 5963 NewImm |= 0x3 << (i * 2); 5964 } 5965 return getI8Imm(NewImm, SDLoc(N)); 5966}]>; 5967 5968// Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it. 5969def BlendScaleCommuteImm4 : SDNodeXForm<timm, [{ 5970 uint8_t Imm = N->getZExtValue(); 5971 uint8_t NewImm = 0; 5972 for (unsigned i = 0; i != 4; ++i) { 5973 if (Imm & (1 << i)) 5974 NewImm |= 0x3 << (i * 2); 5975 } 5976 return getI8Imm(NewImm ^ 0xff, SDLoc(N)); 5977}]>; 5978 5979// Turn a 2-bit blendi immediate to 8-bit for use with pblendw and invert it. 5980def BlendScaleCommuteImm2 : SDNodeXForm<timm, [{ 5981 uint8_t Imm = N->getZExtValue(); 5982 uint8_t NewImm = 0; 5983 for (unsigned i = 0; i != 2; ++i) { 5984 if (Imm & (1 << i)) 5985 NewImm |= 0xf << (i * 4); 5986 } 5987 return getI8Imm(NewImm ^ 0xff, SDLoc(N)); 5988}]>; 5989 5990// Turn a 2-bit blendi immediate to 4-bit for use with pblendd and invert it. 
5991def BlendScaleCommuteImm2to4 : SDNodeXForm<timm, [{ 5992 uint8_t Imm = N->getZExtValue(); 5993 uint8_t NewImm = 0; 5994 for (unsigned i = 0; i != 2; ++i) { 5995 if (Imm & (1 << i)) 5996 NewImm |= 0x3 << (i * 2); 5997 } 5998 return getI8Imm(NewImm ^ 0xf, SDLoc(N)); 5999}]>; 6000 6001let Predicates = [HasAVX] in { 6002 let isCommutable = 0 in { 6003 defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, 6004 VR128, load, i128mem, 0, 6005 SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG; 6006 } 6007 6008let Uses = [MXCSR], mayRaiseFPException = 1 in { 6009 let ExeDomain = SSEPackedSingle in 6010 defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps, 6011 VR128, load, f128mem, 0, 6012 SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG; 6013 let ExeDomain = SSEPackedDouble in 6014 defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd, 6015 VR128, load, f128mem, 0, 6016 SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG; 6017 let ExeDomain = SSEPackedSingle in 6018 defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256, 6019 VR256, load, i256mem, 0, 6020 SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG; 6021} 6022} 6023 6024let Predicates = [HasAVX2] in { 6025 let isCommutable = 0 in { 6026 defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw, 6027 VR256, load, i256mem, 0, 6028 SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, VEX_WIG; 6029 } 6030} 6031 6032let Constraints = "$src1 = $dst" in { 6033 let isCommutable = 0 in { 6034 defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw, 6035 VR128, memop, i128mem, 1, 6036 SchedWriteMPSAD.XMM>; 6037 } 6038 6039 let ExeDomain = SSEPackedSingle in 6040 defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps, 6041 VR128, memop, f128mem, 1, 6042 SchedWriteDPPS.XMM>, SIMD_EXC; 6043 let ExeDomain = SSEPackedDouble in 6044 defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd, 6045 VR128, memop, f128mem, 1, 6046 SchedWriteDPPD.XMM>, SIMD_EXC; 6047} 6048 6049/// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate 6050multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, 6051 ValueType OpVT, RegisterClass RC, PatFrag memop_frag, 6052 X86MemOperand x86memop, bit Is2Addr, Domain d, 6053 X86FoldableSchedWrite sched, SDNodeXForm commuteXForm> { 6054let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in { 6055 let isCommutable = 1 in 6056 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), 6057 (ins RC:$src1, RC:$src2, u8imm:$src3), 6058 !if(Is2Addr, 6059 !strconcat(OpcodeStr, 6060 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6061 !strconcat(OpcodeStr, 6062 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6063 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>, 6064 Sched<[sched]>; 6065 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), 6066 (ins RC:$src1, x86memop:$src2, u8imm:$src3), 6067 !if(Is2Addr, 6068 !strconcat(OpcodeStr, 6069 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6070 !strconcat(OpcodeStr, 6071 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6072 [(set RC:$dst, 6073 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>, 6074 Sched<[sched.Folded, sched.ReadAfterFold]>; 6075} 6076 6077 // Pattern to commute if load is in first source. 
6078 def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, timm:$src3)), 6079 (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2, 6080 (commuteXForm timm:$src3))>; 6081} 6082 6083let Predicates = [HasAVX] in { 6084 defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32, 6085 VR128, load, f128mem, 0, SSEPackedSingle, 6086 SchedWriteFBlend.XMM, BlendCommuteImm4>, 6087 VEX_4V, VEX_WIG; 6088 defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32, 6089 VR256, load, f256mem, 0, SSEPackedSingle, 6090 SchedWriteFBlend.YMM, BlendCommuteImm8>, 6091 VEX_4V, VEX_L, VEX_WIG; 6092 defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64, 6093 VR128, load, f128mem, 0, SSEPackedDouble, 6094 SchedWriteFBlend.XMM, BlendCommuteImm2>, 6095 VEX_4V, VEX_WIG; 6096 defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64, 6097 VR256, load, f256mem, 0, SSEPackedDouble, 6098 SchedWriteFBlend.YMM, BlendCommuteImm4>, 6099 VEX_4V, VEX_L, VEX_WIG; 6100 defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16, 6101 VR128, load, i128mem, 0, SSEPackedInt, 6102 SchedWriteBlend.XMM, BlendCommuteImm8>, 6103 VEX_4V, VEX_WIG; 6104} 6105 6106let Predicates = [HasAVX2] in { 6107 defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16, 6108 VR256, load, i256mem, 0, SSEPackedInt, 6109 SchedWriteBlend.YMM, BlendCommuteImm8>, 6110 VEX_4V, VEX_L, VEX_WIG; 6111} 6112 6113// Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw. 6114// ExecutionDomainFixPass will cleanup domains later on. 6115let Predicates = [HasAVX1Only] in { 6116def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3), 6117 (VBLENDPDYrri VR256:$src1, VR256:$src2, timm:$src3)>; 6118def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3), 6119 (VBLENDPDYrmi VR256:$src1, addr:$src2, timm:$src3)>; 6120def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3), 6121 (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 timm:$src3))>; 6122 6123// Use pblendw for 128-bit integer to keep it in the integer domain and prevent 6124// it from becoming movsd via commuting under optsize. 6125def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3), 6126 (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>; 6127def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3), 6128 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>; 6129def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3), 6130 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>; 6131 6132def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), timm:$src3), 6133 (VBLENDPSYrri VR256:$src1, VR256:$src2, timm:$src3)>; 6134def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), timm:$src3), 6135 (VBLENDPSYrmi VR256:$src1, addr:$src2, timm:$src3)>; 6136def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, timm:$src3), 6137 (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 timm:$src3))>; 6138 6139// Use pblendw for 128-bit integer to keep it in the integer domain and prevent 6140// it from becoming movss via commuting under optsize. 
6141def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3), 6142 (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>; 6143def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), timm:$src3), 6144 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>; 6145def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, timm:$src3), 6146 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>; 6147} 6148 6149defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32, 6150 VR128, memop, f128mem, 1, SSEPackedSingle, 6151 SchedWriteFBlend.XMM, BlendCommuteImm4>; 6152defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64, 6153 VR128, memop, f128mem, 1, SSEPackedDouble, 6154 SchedWriteFBlend.XMM, BlendCommuteImm2>; 6155defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16, 6156 VR128, memop, i128mem, 1, SSEPackedInt, 6157 SchedWriteBlend.XMM, BlendCommuteImm8>; 6158 6159let Predicates = [UseSSE41] in { 6160// Use pblendw for 128-bit integer to keep it in the integer domain and prevent 6161// it from becoming movss via commuting under optsize. 6162def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3), 6163 (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>; 6164def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), timm:$src3), 6165 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>; 6166def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, timm:$src3), 6167 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>; 6168 6169def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3), 6170 (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>; 6171def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), timm:$src3), 6172 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>; 6173def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, timm:$src3), 6174 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>; 6175} 6176 6177// For insertion into the zero index (low half) of a 256-bit vector, it is 6178// more efficient to generate a blend with immediate instead of an insert*128. 
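// For example, the patterns below turn an insert of a v2f64 into the low half
// of a v4f64 into a vblendpd with immediate 0x3 (take the low two elements
// from the widened 128-bit value); when the 256-bit operand instead comes from
// memory, the operands are swapped and the inverted immediate (0xc / 0xf0) is
// used.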
let Predicates = [HasAVX] in {
def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)),
          (VBLENDPDYrri VR256:$src1,
                        (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0x3)>;
def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;

def : Pat<(insert_subvector (loadv4f64 addr:$src2), (v2f64 VR128:$src1), (iPTR 0)),
          (VBLENDPDYrmi (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xc)>;
def : Pat<(insert_subvector (loadv8f32 addr:$src2), (v4f32 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
}

/// SS41I_quaternary_avx - AVX SSE 4.1 with 4 operands
multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC,
                                X86MemOperand x86memop, ValueType VT,
                                PatFrag mem_frag, SDNode OpNode,
                                X86FoldableSchedWrite sched> {
  def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2, RC:$src3),
                  !strconcat(OpcodeStr,
                             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                  [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))],
                  SSEPackedInt>, TAPD, VEX_4V,
                  Sched<[sched]>;

  def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, x86memop:$src2, RC:$src3),
                  !strconcat(OpcodeStr,
                             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                  [(set RC:$dst,
                        (OpNode RC:$src3, (mem_frag addr:$src2),
                         RC:$src1))], SSEPackedInt>, TAPD, VEX_4V,
                  Sched<[sched.Folded, sched.ReadAfterFold,
                         // x86memop:$src2
                         ReadDefault, ReadDefault, ReadDefault, ReadDefault,
                         ReadDefault,
                         // RC:$src3
                         sched.ReadAfterFold]>;
}

let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedDouble in {
defm VBLENDVPD  : SS41I_quaternary_avx<0x4B, "vblendvpd", VR128, f128mem,
                                       v2f64, loadv2f64, X86Blendv,
                                       SchedWriteFVarBlend.XMM>;
defm VBLENDVPDY : SS41I_quaternary_avx<0x4B, "vblendvpd", VR256, f256mem,
                                       v4f64, loadv4f64, X86Blendv,
                                       SchedWriteFVarBlend.YMM>, VEX_L;
} // ExeDomain = SSEPackedDouble
let ExeDomain = SSEPackedSingle in {
defm VBLENDVPS  : SS41I_quaternary_avx<0x4A, "vblendvps", VR128, f128mem,
                                       v4f32, loadv4f32, X86Blendv,
                                       SchedWriteFVarBlend.XMM>;
defm VBLENDVPSY : SS41I_quaternary_avx<0x4A, "vblendvps", VR256, f256mem,
                                       v8f32, loadv8f32, X86Blendv,
                                       SchedWriteFVarBlend.YMM>, VEX_L;
} // ExeDomain = SSEPackedSingle
defm VPBLENDVB  : SS41I_quaternary_avx<0x4C, "vpblendvb", VR128, i128mem,
                                       v16i8, loadv16i8, X86Blendv,
                                       SchedWriteVarBlend.XMM>;
}

let Predicates = [HasAVX2] in {
defm VPBLENDVBY : SS41I_quaternary_avx<0x4C, "vpblendvb", VR256, i256mem,
                                       v32i8, loadv32i8, X86Blendv,
                                       SchedWriteVarBlend.YMM>, VEX_L;
}

let Predicates = [HasAVX] in {
  def : Pat<(v4i32 (X86Blendv (v4i32 VR128:$mask), (v4i32 VR128:$src1),
                              (v4i32 VR128:$src2))),
            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v2i64 (X86Blendv (v2i64 VR128:$mask), (v2i64 VR128:$src1),
                              (v2i64 VR128:$src2))),
            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v8i32 (X86Blendv (v8i32 VR256:$mask), (v8i32 VR256:$src1),
                              (v8i32 VR256:$src2))),
            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v4i64 (X86Blendv (v4i64 VR256:$mask), (v4i64 VR256:$src1),
                              (v4i64 VR256:$src2))),
            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
}

// Prefer a movss or movsd over a blendps when optimizing for size. These
// patterns were changed to use blends because blends have better throughput on
// Sandy Bridge and Haswell, but movs[s/d] are 1-2 bytes shorter, so the blend
// forms below are only used when optimizing for speed.
let Predicates = [HasAVX, OptForSpeed] in {
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;

  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;

  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;

  // Move low f32 and clear high bits.
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
                     (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
                     (i8 1))), sub_xmm)>;
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
                     (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
                     (i8 3))), sub_xmm)>;
}

// Prefer a movss or movsd over a blendps when optimizing for size. These
// patterns were changed to use blends because blends have better throughput on
// Sandy Bridge and Haswell, but movs[s/d] are 1-2 bytes shorter, so the blend
// forms below are only used when optimizing for speed.
let Predicates = [UseSSE41, OptForSpeed] in {
  // With SSE41 we can use blends for these patterns.
6309 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), 6310 (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; 6311 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), 6312 (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>; 6313 6314 def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)), 6315 (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>; 6316 def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))), 6317 (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>; 6318 def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)), 6319 (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>; 6320 6321 def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)), 6322 (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>; 6323 def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))), 6324 (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>; 6325 def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)), 6326 (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>; 6327} 6328 6329 6330/// SS41I_ternary - SSE 4.1 ternary operator 6331let Uses = [XMM0], Constraints = "$src1 = $dst" in { 6332 multiclass SS41I_ternary<bits<8> opc, string OpcodeStr, ValueType VT, 6333 PatFrag mem_frag, X86MemOperand x86memop, 6334 SDNode OpNode, X86FoldableSchedWrite sched> { 6335 def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), 6336 (ins VR128:$src1, VR128:$src2), 6337 !strconcat(OpcodeStr, 6338 "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"), 6339 [(set VR128:$dst, 6340 (VT (OpNode XMM0, VR128:$src2, VR128:$src1)))]>, 6341 Sched<[sched]>; 6342 6343 def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst), 6344 (ins VR128:$src1, x86memop:$src2), 6345 !strconcat(OpcodeStr, 6346 "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"), 6347 [(set VR128:$dst, 6348 (OpNode XMM0, (mem_frag addr:$src2), VR128:$src1))]>, 6349 Sched<[sched.Folded, sched.ReadAfterFold]>; 6350 } 6351} 6352 6353let ExeDomain = SSEPackedDouble in 6354defm BLENDVPD : SS41I_ternary<0x15, "blendvpd", v2f64, memopv2f64, f128mem, 6355 X86Blendv, SchedWriteFVarBlend.XMM>; 6356let ExeDomain = SSEPackedSingle in 6357defm BLENDVPS : SS41I_ternary<0x14, "blendvps", v4f32, memopv4f32, f128mem, 6358 X86Blendv, SchedWriteFVarBlend.XMM>; 6359defm PBLENDVB : SS41I_ternary<0x10, "pblendvb", v16i8, memopv16i8, i128mem, 6360 X86Blendv, SchedWriteVarBlend.XMM>; 6361 6362// Aliases with the implicit xmm0 argument 6363def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}", 6364 (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>; 6365def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}", 6366 (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>; 6367def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}", 6368 (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>; 6369def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}", 6370 (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>; 6371def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}", 6372 (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>; 6373def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}", 6374 (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>; 6375 6376let Predicates = [UseSSE41] in { 6377 def : Pat<(v4i32 (X86Blendv (v4i32 XMM0), (v4i32 VR128:$src1), 6378 (v4i32 VR128:$src2))), 6379 (BLENDVPSrr0 VR128:$src2, VR128:$src1)>; 6380 def : Pat<(v2i64 (X86Blendv (v2i64 XMM0), (v2i64 VR128:$src1), 6381 (v2i64 VR128:$src2))), 6382 (BLENDVPDrr0 VR128:$src2, VR128:$src1)>; 6383} 6384 6385let AddedComplexity = 400 in { // Prefer non-temporal versions 6386 6387let Predicates = [HasAVX, NoVLX] in 6388def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 6389 "vmovntdqa\t{$src, 
$dst|$dst, $src}", []>, 6390 Sched<[SchedWriteVecMoveLSNT.XMM.RM]>, VEX, VEX_WIG; 6391let Predicates = [HasAVX2, NoVLX] in 6392def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), 6393 "vmovntdqa\t{$src, $dst|$dst, $src}", []>, 6394 Sched<[SchedWriteVecMoveLSNT.YMM.RM]>, VEX, VEX_L, VEX_WIG; 6395def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 6396 "movntdqa\t{$src, $dst|$dst, $src}", []>, 6397 Sched<[SchedWriteVecMoveLSNT.XMM.RM]>; 6398 6399let Predicates = [HasAVX2, NoVLX] in { 6400 def : Pat<(v8f32 (alignednontemporalload addr:$src)), 6401 (VMOVNTDQAYrm addr:$src)>; 6402 def : Pat<(v4f64 (alignednontemporalload addr:$src)), 6403 (VMOVNTDQAYrm addr:$src)>; 6404 def : Pat<(v4i64 (alignednontemporalload addr:$src)), 6405 (VMOVNTDQAYrm addr:$src)>; 6406 def : Pat<(v8i32 (alignednontemporalload addr:$src)), 6407 (VMOVNTDQAYrm addr:$src)>; 6408 def : Pat<(v16i16 (alignednontemporalload addr:$src)), 6409 (VMOVNTDQAYrm addr:$src)>; 6410 def : Pat<(v32i8 (alignednontemporalload addr:$src)), 6411 (VMOVNTDQAYrm addr:$src)>; 6412} 6413 6414let Predicates = [HasAVX, NoVLX] in { 6415 def : Pat<(v4f32 (alignednontemporalload addr:$src)), 6416 (VMOVNTDQArm addr:$src)>; 6417 def : Pat<(v2f64 (alignednontemporalload addr:$src)), 6418 (VMOVNTDQArm addr:$src)>; 6419 def : Pat<(v2i64 (alignednontemporalload addr:$src)), 6420 (VMOVNTDQArm addr:$src)>; 6421 def : Pat<(v4i32 (alignednontemporalload addr:$src)), 6422 (VMOVNTDQArm addr:$src)>; 6423 def : Pat<(v8i16 (alignednontemporalload addr:$src)), 6424 (VMOVNTDQArm addr:$src)>; 6425 def : Pat<(v16i8 (alignednontemporalload addr:$src)), 6426 (VMOVNTDQArm addr:$src)>; 6427} 6428 6429let Predicates = [UseSSE41] in { 6430 def : Pat<(v4f32 (alignednontemporalload addr:$src)), 6431 (MOVNTDQArm addr:$src)>; 6432 def : Pat<(v2f64 (alignednontemporalload addr:$src)), 6433 (MOVNTDQArm addr:$src)>; 6434 def : Pat<(v2i64 (alignednontemporalload addr:$src)), 6435 (MOVNTDQArm addr:$src)>; 6436 def : Pat<(v4i32 (alignednontemporalload addr:$src)), 6437 (MOVNTDQArm addr:$src)>; 6438 def : Pat<(v8i16 (alignednontemporalload addr:$src)), 6439 (MOVNTDQArm addr:$src)>; 6440 def : Pat<(v16i8 (alignednontemporalload addr:$src)), 6441 (MOVNTDQArm addr:$src)>; 6442} 6443 6444} // AddedComplexity 6445 6446//===----------------------------------------------------------------------===// 6447// SSE4.2 - Compare Instructions 6448//===----------------------------------------------------------------------===// 6449 6450/// SS42I_binop_rm - Simple SSE 4.2 binary operator 6451multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, 6452 ValueType OpVT, RegisterClass RC, PatFrag memop_frag, 6453 X86MemOperand x86memop, X86FoldableSchedWrite sched, 6454 bit Is2Addr = 1> { 6455 def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst), 6456 (ins RC:$src1, RC:$src2), 6457 !if(Is2Addr, 6458 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 6459 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 6460 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, 6461 Sched<[sched]>; 6462 def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst), 6463 (ins RC:$src1, x86memop:$src2), 6464 !if(Is2Addr, 6465 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 6466 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 6467 [(set RC:$dst, 6468 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>, 6469 Sched<[sched.Folded, sched.ReadAfterFold]>; 6470} 6471 6472let Predicates = [HasAVX] in 6473 defm VPCMPGTQ : 
SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128, 6474 load, i128mem, SchedWriteVecALU.XMM, 0>, 6475 VEX_4V, VEX_WIG; 6476 6477let Predicates = [HasAVX2] in 6478 defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256, 6479 load, i256mem, SchedWriteVecALU.YMM, 0>, 6480 VEX_4V, VEX_L, VEX_WIG; 6481 6482let Constraints = "$src1 = $dst" in 6483 defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128, 6484 memop, i128mem, SchedWriteVecALU.XMM>; 6485 6486//===----------------------------------------------------------------------===// 6487// SSE4.2 - String/text Processing Instructions 6488//===----------------------------------------------------------------------===// 6489 6490multiclass pcmpistrm_SS42AI<string asm> { 6491 def rr : SS42AI<0x62, MRMSrcReg, (outs), 6492 (ins VR128:$src1, VR128:$src2, u8imm:$src3), 6493 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), 6494 []>, Sched<[WritePCmpIStrM]>; 6495 let mayLoad = 1 in 6496 def rm :SS42AI<0x62, MRMSrcMem, (outs), 6497 (ins VR128:$src1, i128mem:$src2, u8imm:$src3), 6498 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), 6499 []>, Sched<[WritePCmpIStrM.Folded, WritePCmpIStrM.ReadAfterFold]>; 6500} 6501 6502let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in { 6503 let Predicates = [HasAVX] in 6504 defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX; 6505 defm PCMPISTRM : pcmpistrm_SS42AI<"pcmpistrm"> ; 6506} 6507 6508multiclass SS42AI_pcmpestrm<string asm> { 6509 def rr : SS42AI<0x60, MRMSrcReg, (outs), 6510 (ins VR128:$src1, VR128:$src3, u8imm:$src5), 6511 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), 6512 []>, Sched<[WritePCmpEStrM]>; 6513 let mayLoad = 1 in 6514 def rm : SS42AI<0x60, MRMSrcMem, (outs), 6515 (ins VR128:$src1, i128mem:$src3, u8imm:$src5), 6516 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), 6517 []>, Sched<[WritePCmpEStrM.Folded, WritePCmpEStrM.ReadAfterFold]>; 6518} 6519 6520let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in { 6521 let Predicates = [HasAVX] in 6522 defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX; 6523 defm PCMPESTRM : SS42AI_pcmpestrm<"pcmpestrm">; 6524} 6525 6526multiclass SS42AI_pcmpistri<string asm> { 6527 def rr : SS42AI<0x63, MRMSrcReg, (outs), 6528 (ins VR128:$src1, VR128:$src2, u8imm:$src3), 6529 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), 6530 []>, Sched<[WritePCmpIStrI]>; 6531 let mayLoad = 1 in 6532 def rm : SS42AI<0x63, MRMSrcMem, (outs), 6533 (ins VR128:$src1, i128mem:$src2, u8imm:$src3), 6534 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), 6535 []>, Sched<[WritePCmpIStrI.Folded, WritePCmpIStrI.ReadAfterFold]>; 6536} 6537 6538let Defs = [ECX, EFLAGS], hasSideEffects = 0 in { 6539 let Predicates = [HasAVX] in 6540 defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX; 6541 defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">; 6542} 6543 6544multiclass SS42AI_pcmpestri<string asm> { 6545 def rr : SS42AI<0x61, MRMSrcReg, (outs), 6546 (ins VR128:$src1, VR128:$src3, u8imm:$src5), 6547 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), 6548 []>, Sched<[WritePCmpEStrI]>; 6549 let mayLoad = 1 in 6550 def rm : SS42AI<0x61, MRMSrcMem, (outs), 6551 (ins VR128:$src1, i128mem:$src3, u8imm:$src5), 6552 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), 6553 []>, Sched<[WritePCmpEStrI.Folded, WritePCmpEStrI.ReadAfterFold]>; 6554} 6555 6556let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in { 6557 let Predicates = 
[HasAVX] in
  defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
  defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">;
}

//===----------------------------------------------------------------------===//
// SSE4.2 - CRC Instructions
//===----------------------------------------------------------------------===//

// No CRC instructions have AVX equivalents.

// crc intrinsic instructions
// This set of instructions is only rr and rm; the only difference between the
// variants is the size of r and m.
class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
                   RegisterClass RCIn, SDPatternOperator Int> :
  SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
         [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))]>,
  Sched<[WriteCRC32]>;

class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
                   X86MemOperand x86memop, SDPatternOperator Int> :
  SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
         [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))]>,
  Sched<[WriteCRC32.Folded, WriteCRC32.ReadAfterFold]>;

let Constraints = "$src1 = $dst" in {
  def CRC32r32m8  : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
                                 int_x86_sse42_crc32_32_8>;
  def CRC32r32r8  : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
                                 int_x86_sse42_crc32_32_8>;
  def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
                                 int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
                                 int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
                                 int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
                                 int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
                                 int_x86_sse42_crc32_64_64>, REX_W;
  def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
                                 int_x86_sse42_crc32_64_64>, REX_W;
  let hasSideEffects = 0 in {
    let mayLoad = 1 in
    def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
                                  null_frag>, REX_W;
    def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
                                  null_frag>, REX_W;
  }
}

//===----------------------------------------------------------------------===//
// SHA-NI Instructions
//===----------------------------------------------------------------------===//

// FIXME: Is there a better scheduler class for SHA than WriteVecIMul?
6616multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId, 6617 X86FoldableSchedWrite sched, bit UsesXMM0 = 0> { 6618 def rr : I<Opc, MRMSrcReg, (outs VR128:$dst), 6619 (ins VR128:$src1, VR128:$src2), 6620 !if(UsesXMM0, 6621 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"), 6622 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")), 6623 [!if(UsesXMM0, 6624 (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)), 6625 (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, 6626 T8, Sched<[sched]>; 6627 6628 def rm : I<Opc, MRMSrcMem, (outs VR128:$dst), 6629 (ins VR128:$src1, i128mem:$src2), 6630 !if(UsesXMM0, 6631 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"), 6632 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")), 6633 [!if(UsesXMM0, 6634 (set VR128:$dst, (IntId VR128:$src1, 6635 (memop addr:$src2), XMM0)), 6636 (set VR128:$dst, (IntId VR128:$src1, 6637 (memop addr:$src2))))]>, T8, 6638 Sched<[sched.Folded, sched.ReadAfterFold]>; 6639} 6640 6641let Constraints = "$src1 = $dst", Predicates = [HasSHA] in { 6642 def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst), 6643 (ins VR128:$src1, VR128:$src2, u8imm:$src3), 6644 "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}", 6645 [(set VR128:$dst, 6646 (int_x86_sha1rnds4 VR128:$src1, VR128:$src2, 6647 (i8 timm:$src3)))]>, TA, 6648 Sched<[SchedWriteVecIMul.XMM]>; 6649 def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst), 6650 (ins VR128:$src1, i128mem:$src2, u8imm:$src3), 6651 "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}", 6652 [(set VR128:$dst, 6653 (int_x86_sha1rnds4 VR128:$src1, 6654 (memop addr:$src2), 6655 (i8 timm:$src3)))]>, TA, 6656 Sched<[SchedWriteVecIMul.XMM.Folded, 6657 SchedWriteVecIMul.XMM.ReadAfterFold]>; 6658 6659 defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte, 6660 SchedWriteVecIMul.XMM>; 6661 defm SHA1MSG1 : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1, 6662 SchedWriteVecIMul.XMM>; 6663 defm SHA1MSG2 : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2, 6664 SchedWriteVecIMul.XMM>; 6665 6666 let Uses=[XMM0] in 6667 defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2, 6668 SchedWriteVecIMul.XMM, 1>; 6669 6670 defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1, 6671 SchedWriteVecIMul.XMM>; 6672 defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2, 6673 SchedWriteVecIMul.XMM>; 6674} 6675 6676// Aliases with explicit %xmm0 6677def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}", 6678 (SHA256RNDS2rr VR128:$dst, VR128:$src2), 0>; 6679def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}", 6680 (SHA256RNDS2rm VR128:$dst, i128mem:$src2), 0>; 6681 6682//===----------------------------------------------------------------------===// 6683// AES-NI Instructions 6684//===----------------------------------------------------------------------===// 6685 6686multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, 6687 Intrinsic IntId, PatFrag ld_frag, 6688 bit Is2Addr = 0, RegisterClass RC = VR128, 6689 X86MemOperand MemOp = i128mem> { 6690 let AsmString = OpcodeStr## 6691 !if(Is2Addr, "\t{$src2, $dst|$dst, $src2}", 6692 "\t{$src2, $src1, $dst|$dst, $src1, $src2}") in { 6693 def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst), 6694 (ins RC:$src1, RC:$src2), "", 6695 [(set RC:$dst, (IntId RC:$src1, RC:$src2))]>, 6696 Sched<[WriteAESDecEnc]>; 6697 def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst), 6698 (ins RC:$src1, MemOp:$src2), "", 6699 [(set RC:$dst, (IntId RC:$src1, (ld_frag addr:$src2)))]>, 6700 
Sched<[WriteAESDecEnc.Folded, WriteAESDecEnc.ReadAfterFold]>; 6701 } 6702} 6703 6704// Perform One Round of an AES Encryption/Decryption Flow 6705let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in { 6706 defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc", 6707 int_x86_aesni_aesenc, load>, VEX_4V, VEX_WIG; 6708 defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast", 6709 int_x86_aesni_aesenclast, load>, VEX_4V, VEX_WIG; 6710 defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec", 6711 int_x86_aesni_aesdec, load>, VEX_4V, VEX_WIG; 6712 defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast", 6713 int_x86_aesni_aesdeclast, load>, VEX_4V, VEX_WIG; 6714} 6715 6716let Predicates = [NoVLX, HasVAES] in { 6717 defm VAESENCY : AESI_binop_rm_int<0xDC, "vaesenc", 6718 int_x86_aesni_aesenc_256, load, 0, VR256, 6719 i256mem>, VEX_4V, VEX_L, VEX_WIG; 6720 defm VAESENCLASTY : AESI_binop_rm_int<0xDD, "vaesenclast", 6721 int_x86_aesni_aesenclast_256, load, 0, VR256, 6722 i256mem>, VEX_4V, VEX_L, VEX_WIG; 6723 defm VAESDECY : AESI_binop_rm_int<0xDE, "vaesdec", 6724 int_x86_aesni_aesdec_256, load, 0, VR256, 6725 i256mem>, VEX_4V, VEX_L, VEX_WIG; 6726 defm VAESDECLASTY : AESI_binop_rm_int<0xDF, "vaesdeclast", 6727 int_x86_aesni_aesdeclast_256, load, 0, VR256, 6728 i256mem>, VEX_4V, VEX_L, VEX_WIG; 6729} 6730 6731let Constraints = "$src1 = $dst" in { 6732 defm AESENC : AESI_binop_rm_int<0xDC, "aesenc", 6733 int_x86_aesni_aesenc, memop, 1>; 6734 defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast", 6735 int_x86_aesni_aesenclast, memop, 1>; 6736 defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec", 6737 int_x86_aesni_aesdec, memop, 1>; 6738 defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast", 6739 int_x86_aesni_aesdeclast, memop, 1>; 6740} 6741 6742// Perform the AES InvMixColumn Transformation 6743let Predicates = [HasAVX, HasAES] in { 6744 def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), 6745 (ins VR128:$src1), 6746 "vaesimc\t{$src1, $dst|$dst, $src1}", 6747 [(set VR128:$dst, 6748 (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>, 6749 VEX, VEX_WIG; 6750 def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), 6751 (ins i128mem:$src1), 6752 "vaesimc\t{$src1, $dst|$dst, $src1}", 6753 [(set VR128:$dst, (int_x86_aesni_aesimc (load addr:$src1)))]>, 6754 Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG; 6755} 6756def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), 6757 (ins VR128:$src1), 6758 "aesimc\t{$src1, $dst|$dst, $src1}", 6759 [(set VR128:$dst, 6760 (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>; 6761def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), 6762 (ins i128mem:$src1), 6763 "aesimc\t{$src1, $dst|$dst, $src1}", 6764 [(set VR128:$dst, (int_x86_aesni_aesimc (memop addr:$src1)))]>, 6765 Sched<[WriteAESIMC.Folded]>; 6766 6767// AES Round Key Generation Assist 6768let Predicates = [HasAVX, HasAES] in { 6769 def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), 6770 (ins VR128:$src1, u8imm:$src2), 6771 "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", 6772 [(set VR128:$dst, 6773 (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>, 6774 Sched<[WriteAESKeyGen]>, VEX, VEX_WIG; 6775 def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), 6776 (ins i128mem:$src1, u8imm:$src2), 6777 "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", 6778 [(set VR128:$dst, 6779 (int_x86_aesni_aeskeygenassist (load addr:$src1), timm:$src2))]>, 6780 Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG; 6781} 6782def AESKEYGENASSIST128rr : AESAI<0xDF, 
MRMSrcReg, (outs VR128:$dst), 6783 (ins VR128:$src1, u8imm:$src2), 6784 "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", 6785 [(set VR128:$dst, 6786 (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>, 6787 Sched<[WriteAESKeyGen]>; 6788def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), 6789 (ins i128mem:$src1, u8imm:$src2), 6790 "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", 6791 [(set VR128:$dst, 6792 (int_x86_aesni_aeskeygenassist (memop addr:$src1), timm:$src2))]>, 6793 Sched<[WriteAESKeyGen.Folded]>; 6794 6795//===----------------------------------------------------------------------===// 6796// PCLMUL Instructions 6797//===----------------------------------------------------------------------===// 6798 6799// Immediate transform to help with commuting. 6800def PCLMULCommuteImm : SDNodeXForm<timm, [{ 6801 uint8_t Imm = N->getZExtValue(); 6802 return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N)); 6803}]>; 6804 6805// SSE carry-less Multiplication instructions 6806let Predicates = [NoAVX, HasPCLMUL] in { 6807 let Constraints = "$src1 = $dst" in { 6808 let isCommutable = 1 in 6809 def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), 6810 (ins VR128:$src1, VR128:$src2, u8imm:$src3), 6811 "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", 6812 [(set VR128:$dst, 6813 (int_x86_pclmulqdq VR128:$src1, VR128:$src2, timm:$src3))]>, 6814 Sched<[WriteCLMul]>; 6815 6816 def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), 6817 (ins VR128:$src1, i128mem:$src2, u8imm:$src3), 6818 "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", 6819 [(set VR128:$dst, 6820 (int_x86_pclmulqdq VR128:$src1, (memop addr:$src2), 6821 timm:$src3))]>, 6822 Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>; 6823 } // Constraints = "$src1 = $dst" 6824 6825 def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1, 6826 (i8 timm:$src3)), 6827 (PCLMULQDQrm VR128:$src1, addr:$src2, 6828 (PCLMULCommuteImm timm:$src3))>; 6829} // Predicates = [NoAVX, HasPCLMUL] 6830 6831// SSE aliases 6832foreach HI = ["hq","lq"] in 6833foreach LO = ["hq","lq"] in { 6834 def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}", 6835 (PCLMULQDQrr VR128:$dst, VR128:$src, 6836 !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>; 6837 def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}", 6838 (PCLMULQDQrm VR128:$dst, i128mem:$src, 6839 !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>; 6840} 6841 6842// AVX carry-less Multiplication instructions 6843multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp, 6844 PatFrag LdFrag, Intrinsic IntId> { 6845 let isCommutable = 1 in 6846 def rr : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst), 6847 (ins RC:$src1, RC:$src2, u8imm:$src3), 6848 "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 6849 [(set RC:$dst, 6850 (IntId RC:$src1, RC:$src2, timm:$src3))]>, 6851 Sched<[WriteCLMul]>; 6852 6853 def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst), 6854 (ins RC:$src1, MemOp:$src2, u8imm:$src3), 6855 "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 6856 [(set RC:$dst, 6857 (IntId RC:$src1, (LdFrag addr:$src2), timm:$src3))]>, 6858 Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>; 6859 6860 // We can commute a load in the first operand by swapping the sources and 6861 // rotating the immediate. 
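  // Only imm[0] (quadword select for the first source) and imm[4] (quadword
  // select for the second source) are significant, so exchanging the sources
  // amounts to swapping the two nibbles, e.g. 0x01 <-> 0x10.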
6862 def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 timm:$src3)), 6863 (!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2, 6864 (PCLMULCommuteImm timm:$src3))>; 6865} 6866 6867let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in 6868defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, load, 6869 int_x86_pclmulqdq>, VEX_4V, VEX_WIG; 6870 6871let Predicates = [NoVLX, HasVPCLMULQDQ] in 6872defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load, 6873 int_x86_pclmulqdq_256>, VEX_4V, VEX_L, VEX_WIG; 6874 6875multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC, 6876 X86MemOperand MemOp, string Hi, string Lo> { 6877 def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}", 6878 (!cast<Instruction>(InstStr # "rr") RC:$dst, RC:$src1, RC:$src2, 6879 !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>; 6880 def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}", 6881 (!cast<Instruction>(InstStr # "rm") RC:$dst, RC:$src1, MemOp:$src2, 6882 !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>; 6883} 6884 6885multiclass vpclmulqdq_aliases<string InstStr, RegisterClass RC, 6886 X86MemOperand MemOp> { 6887 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "hq">; 6888 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "lq">; 6889 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "hq">; 6890 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "lq">; 6891} 6892 6893// AVX aliases 6894defm : vpclmulqdq_aliases<"VPCLMULQDQ", VR128, i128mem>; 6895defm : vpclmulqdq_aliases<"VPCLMULQDQY", VR256, i256mem>; 6896 6897//===----------------------------------------------------------------------===// 6898// SSE4A Instructions 6899//===----------------------------------------------------------------------===// 6900 6901let Predicates = [HasSSE4A] in { 6902 6903let ExeDomain = SSEPackedInt in { 6904let Constraints = "$src = $dst" in { 6905def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst), 6906 (ins VR128:$src, u8imm:$len, u8imm:$idx), 6907 "extrq\t{$idx, $len, $src|$src, $len, $idx}", 6908 [(set VR128:$dst, (X86extrqi VR128:$src, timm:$len, 6909 timm:$idx))]>, 6910 PD, Sched<[SchedWriteVecALU.XMM]>; 6911def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst), 6912 (ins VR128:$src, VR128:$mask), 6913 "extrq\t{$mask, $src|$src, $mask}", 6914 [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src, 6915 VR128:$mask))]>, 6916 PD, Sched<[SchedWriteVecALU.XMM]>; 6917 6918def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst), 6919 (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx), 6920 "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}", 6921 [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2, 6922 timm:$len, timm:$idx))]>, 6923 XD, Sched<[SchedWriteVecALU.XMM]>; 6924def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst), 6925 (ins VR128:$src, VR128:$mask), 6926 "insertq\t{$mask, $src|$src, $mask}", 6927 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src, 6928 VR128:$mask))]>, 6929 XD, Sched<[SchedWriteVecALU.XMM]>; 6930} 6931} // ExeDomain = SSEPackedInt 6932 6933// Non-temporal (unaligned) scalar stores. 
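// MOVNTSS/MOVNTSD store the low 32/64 bits of an XMM register with a
// non-temporal hint and have no alignment requirement, so they can also
// implement scalar FR32/FR64 non-temporal stores (patterns below).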
6934let AddedComplexity = 400 in { // Prefer non-temporal versions 6935let hasSideEffects = 0, mayStore = 1, SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in { 6936def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src), 6937 "movntss\t{$src, $dst|$dst, $src}", []>, XS; 6938 6939def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), 6940 "movntsd\t{$src, $dst|$dst, $src}", []>, XD; 6941} // SchedRW 6942 6943def : Pat<(nontemporalstore FR32:$src, addr:$dst), 6944 (MOVNTSS addr:$dst, (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>; 6945 6946def : Pat<(nontemporalstore FR64:$src, addr:$dst), 6947 (MOVNTSD addr:$dst, (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>; 6948 6949} // AddedComplexity 6950} // HasSSE4A 6951 6952//===----------------------------------------------------------------------===// 6953// AVX Instructions 6954//===----------------------------------------------------------------------===// 6955 6956//===----------------------------------------------------------------------===// 6957// VBROADCAST - Load from memory and broadcast to all elements of the 6958// destination operand 6959// 6960class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC, 6961 X86MemOperand x86memop, ValueType VT, 6962 PatFrag bcast_frag, SchedWrite Sched> : 6963 AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), 6964 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 6965 [(set RC:$dst, (VT (bcast_frag addr:$src)))]>, 6966 Sched<[Sched]>, VEX; 6967 6968// AVX2 adds register forms 6969class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC, 6970 ValueType ResVT, ValueType OpVT, SchedWrite Sched> : 6971 AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), 6972 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 6973 [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>, 6974 Sched<[Sched]>, VEX; 6975 6976let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in { 6977 def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128, 6978 f32mem, v4f32, X86VBroadcastld32, 6979 SchedWriteFShuffle.XMM.Folded>; 6980 def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256, 6981 f32mem, v8f32, X86VBroadcastld32, 6982 SchedWriteFShuffle.XMM.Folded>, VEX_L; 6983} 6984let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in 6985def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem, 6986 v4f64, X86VBroadcastld64, 6987 SchedWriteFShuffle.XMM.Folded>, VEX_L; 6988 6989let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in { 6990 def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128, 6991 v4f32, v4f32, SchedWriteFShuffle.XMM>; 6992 def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256, 6993 v8f32, v4f32, WriteFShuffle256>, VEX_L; 6994} 6995let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in 6996def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256, 6997 v4f64, v2f64, WriteFShuffle256>, VEX_L; 6998 6999//===----------------------------------------------------------------------===// 7000// VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both 7001// halves of a 256-bit vector. 
7002// 7003let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in 7004def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst), 7005 (ins i128mem:$src), 7006 "vbroadcasti128\t{$src, $dst|$dst, $src}", []>, 7007 Sched<[WriteShuffleLd]>, VEX, VEX_L; 7008 7009let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX], 7010 ExeDomain = SSEPackedSingle in 7011def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst), 7012 (ins f128mem:$src), 7013 "vbroadcastf128\t{$src, $dst|$dst, $src}", []>, 7014 Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L; 7015 7016let Predicates = [HasAVX, NoVLX] in { 7017def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))), 7018 (VBROADCASTF128 addr:$src)>; 7019def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))), 7020 (VBROADCASTF128 addr:$src)>; 7021} 7022 7023// NOTE: We're using FP instructions here, but execution domain fixing can 7024// convert to integer when profitable. 7025let Predicates = [HasAVX, NoVLX] in { 7026def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))), 7027 (VBROADCASTF128 addr:$src)>; 7028def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))), 7029 (VBROADCASTF128 addr:$src)>; 7030def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))), 7031 (VBROADCASTF128 addr:$src)>; 7032def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))), 7033 (VBROADCASTF128 addr:$src)>; 7034} 7035 7036//===----------------------------------------------------------------------===// 7037// VINSERTF128 - Insert packed floating-point values 7038// 7039let hasSideEffects = 0, ExeDomain = SSEPackedSingle in { 7040def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst), 7041 (ins VR256:$src1, VR128:$src2, u8imm:$src3), 7042 "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 7043 []>, Sched<[WriteFShuffle256]>, VEX_4V, VEX_L; 7044let mayLoad = 1 in 7045def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), 7046 (ins VR256:$src1, f128mem:$src2, u8imm:$src3), 7047 "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 7048 []>, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>, VEX_4V, VEX_L; 7049} 7050 7051// To create a 256-bit all ones value, we should produce VCMPTRUEPS 7052// with YMM register containing zero. 7053// FIXME: Avoid producing vxorps to clear the fake inputs. 
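// Compare predicate 0xf (TRUE_UQ) returns all-ones regardless of its source
// values, so the zeroed inputs below exist only to satisfy the instruction's
// operand requirements.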
7054let Predicates = [HasAVX1Only] in { 7055def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>; 7056} 7057 7058multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To, 7059 PatFrag memop_frag> { 7060 def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2), 7061 (iPTR imm)), 7062 (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2, 7063 (INSERT_get_vinsert128_imm VR256:$ins))>; 7064 def : Pat<(vinsert128_insert:$ins (To VR256:$src1), 7065 (From (memop_frag addr:$src2)), 7066 (iPTR imm)), 7067 (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2, 7068 (INSERT_get_vinsert128_imm VR256:$ins))>; 7069} 7070 7071let Predicates = [HasAVX, NoVLX] in { 7072 defm : vinsert_lowering<"VINSERTF128", v4f32, v8f32, loadv4f32>; 7073 defm : vinsert_lowering<"VINSERTF128", v2f64, v4f64, loadv2f64>; 7074} 7075 7076let Predicates = [HasAVX1Only] in { 7077 defm : vinsert_lowering<"VINSERTF128", v2i64, v4i64, loadv2i64>; 7078 defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32, loadv4i32>; 7079 defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv8i16>; 7080 defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8, loadv16i8>; 7081} 7082 7083//===----------------------------------------------------------------------===// 7084// VEXTRACTF128 - Extract packed floating-point values 7085// 7086let hasSideEffects = 0, ExeDomain = SSEPackedSingle in { 7087def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst), 7088 (ins VR256:$src1, u8imm:$src2), 7089 "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}", 7090 []>, Sched<[WriteFShuffle256]>, VEX, VEX_L; 7091let mayStore = 1 in 7092def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs), 7093 (ins f128mem:$dst, VR256:$src1, u8imm:$src2), 7094 "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}", 7095 []>, Sched<[WriteFStoreX]>, VEX, VEX_L; 7096} 7097 7098multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> { 7099 def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), 7100 (To (!cast<Instruction>(InstrStr#rr) 7101 (From VR256:$src1), 7102 (EXTRACT_get_vextract128_imm VR128:$ext)))>; 7103 def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1), 7104 (iPTR imm))), addr:$dst), 7105 (!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1, 7106 (EXTRACT_get_vextract128_imm VR128:$ext))>; 7107} 7108 7109// AVX1 patterns 7110let Predicates = [HasAVX, NoVLX] in { 7111 defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>; 7112 defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>; 7113} 7114 7115let Predicates = [HasAVX1Only] in { 7116 defm : vextract_lowering<"VEXTRACTF128", v4i64, v2i64>; 7117 defm : vextract_lowering<"VEXTRACTF128", v8i32, v4i32>; 7118 defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>; 7119 defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>; 7120} 7121 7122//===----------------------------------------------------------------------===// 7123// VMASKMOV - Conditional SIMD Packed Loads and Stores 7124// 7125multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr, 7126 Intrinsic IntLd, Intrinsic IntLd256, 7127 Intrinsic IntSt, Intrinsic IntSt256, 7128 X86SchedWriteMaskMove schedX, 7129 X86SchedWriteMaskMove schedY> { 7130 def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst), 7131 (ins VR128:$src1, f128mem:$src2), 7132 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7133 [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>, 7134 VEX_4V, Sched<[schedX.RM]>; 7135 def Yrm : AVX8I<opc_rm, MRMSrcMem, 
(outs VR256:$dst), 7136 (ins VR256:$src1, f256mem:$src2), 7137 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7138 [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>, 7139 VEX_4V, VEX_L, Sched<[schedY.RM]>; 7140 def mr : AVX8I<opc_mr, MRMDestMem, (outs), 7141 (ins f128mem:$dst, VR128:$src1, VR128:$src2), 7142 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7143 [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, 7144 VEX_4V, Sched<[schedX.MR]>; 7145 def Ymr : AVX8I<opc_mr, MRMDestMem, (outs), 7146 (ins f256mem:$dst, VR256:$src1, VR256:$src2), 7147 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7148 [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, 7149 VEX_4V, VEX_L, Sched<[schedY.MR]>; 7150} 7151 7152let ExeDomain = SSEPackedSingle in 7153defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps", 7154 int_x86_avx_maskload_ps, 7155 int_x86_avx_maskload_ps_256, 7156 int_x86_avx_maskstore_ps, 7157 int_x86_avx_maskstore_ps_256, 7158 WriteFMaskMove32, WriteFMaskMove32Y>; 7159let ExeDomain = SSEPackedDouble in 7160defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd", 7161 int_x86_avx_maskload_pd, 7162 int_x86_avx_maskload_pd_256, 7163 int_x86_avx_maskstore_pd, 7164 int_x86_avx_maskstore_pd_256, 7165 WriteFMaskMove64, WriteFMaskMove64Y>; 7166 7167//===----------------------------------------------------------------------===// 7168// VPERMIL - Permute Single and Double Floating-Point Values 7169// 7170 7171multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr, 7172 RegisterClass RC, X86MemOperand x86memop_f, 7173 X86MemOperand x86memop_i, 7174 ValueType f_vt, ValueType i_vt, 7175 X86FoldableSchedWrite sched, 7176 X86FoldableSchedWrite varsched> { 7177 let Predicates = [HasAVX, NoVLX] in { 7178 def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst), 7179 (ins RC:$src1, RC:$src2), 7180 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7181 [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V, 7182 Sched<[varsched]>; 7183 def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst), 7184 (ins RC:$src1, x86memop_i:$src2), 7185 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7186 [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, 7187 (i_vt (load addr:$src2)))))]>, VEX_4V, 7188 Sched<[varsched.Folded, sched.ReadAfterFold]>; 7189 7190 def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst), 7191 (ins RC:$src1, u8imm:$src2), 7192 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7193 [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 timm:$src2))))]>, VEX, 7194 Sched<[sched]>; 7195 def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst), 7196 (ins x86memop_f:$src1, u8imm:$src2), 7197 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7198 [(set RC:$dst, 7199 (f_vt (X86VPermilpi (load addr:$src1), (i8 timm:$src2))))]>, VEX, 7200 Sched<[sched.Folded]>; 7201 }// Predicates = [HasAVX, NoVLX] 7202} 7203 7204let ExeDomain = SSEPackedSingle in { 7205 defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem, 7206 v4f32, v4i32, SchedWriteFShuffle.XMM, 7207 SchedWriteFVarShuffle.XMM>; 7208 defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem, 7209 v8f32, v8i32, SchedWriteFShuffle.YMM, 7210 SchedWriteFVarShuffle.YMM>, VEX_L; 7211} 7212let ExeDomain = SSEPackedDouble in { 7213 defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem, 7214 v2f64, v2i64, SchedWriteFShuffle.XMM, 7215 
SchedWriteFVarShuffle.XMM>; 7216 defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem, 7217 v4f64, v4i64, SchedWriteFShuffle.YMM, 7218 SchedWriteFVarShuffle.YMM>, VEX_L; 7219} 7220 7221//===----------------------------------------------------------------------===// 7222// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks 7223// 7224 7225let ExeDomain = SSEPackedSingle in { 7226let isCommutable = 1 in 7227def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst), 7228 (ins VR256:$src1, VR256:$src2, u8imm:$src3), 7229 "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 7230 [(set VR256:$dst, (v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, 7231 (i8 timm:$src3))))]>, VEX_4V, VEX_L, 7232 Sched<[WriteFShuffle256]>; 7233def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst), 7234 (ins VR256:$src1, f256mem:$src2, u8imm:$src3), 7235 "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 7236 [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4f64 addr:$src2), 7237 (i8 timm:$src3)))]>, VEX_4V, VEX_L, 7238 Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>; 7239} 7240 7241// Immediate transform to help with commuting. 7242def Perm2XCommuteImm : SDNodeXForm<timm, [{ 7243 return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N)); 7244}]>; 7245 7246let Predicates = [HasAVX] in { 7247// Pattern with load in other operand. 7248def : Pat<(v4f64 (X86VPerm2x128 (loadv4f64 addr:$src2), 7249 VR256:$src1, (i8 timm:$imm))), 7250 (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>; 7251} 7252 7253let Predicates = [HasAVX1Only] in { 7254def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))), 7255 (VPERM2F128rr VR256:$src1, VR256:$src2, timm:$imm)>; 7256def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, 7257 (loadv4i64 addr:$src2), (i8 timm:$imm))), 7258 (VPERM2F128rm VR256:$src1, addr:$src2, timm:$imm)>; 7259// Pattern with load in other operand. 7260def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2), 7261 VR256:$src1, (i8 timm:$imm))), 7262 (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>; 7263} 7264 7265//===----------------------------------------------------------------------===// 7266// VZERO - Zero YMM registers 7267// Note: These instructions do not affect YMM16-YMM31.
7268// 7269 7270let SchedRW = [WriteSystem] in { 7271let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, 7272 YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in { 7273 // Zero All YMM registers 7274 def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall", 7275 [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, 7276 Requires<[HasAVX]>, VEX_WIG; 7277 7278 // Zero Upper bits of YMM registers 7279 def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper", 7280 [(int_x86_avx_vzeroupper)]>, PS, VEX, 7281 Requires<[HasAVX]>, VEX_WIG; 7282} // Defs 7283} // SchedRW 7284 7285//===----------------------------------------------------------------------===// 7286// Half precision conversion instructions 7287// 7288 7289multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, 7290 X86FoldableSchedWrite sched> { 7291 def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), 7292 "vcvtph2ps\t{$src, $dst|$dst, $src}", 7293 [(set RC:$dst, (X86cvtph2ps VR128:$src))]>, 7294 T8PD, VEX, Sched<[sched]>; 7295 let hasSideEffects = 0, mayLoad = 1 in 7296 def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), 7297 "vcvtph2ps\t{$src, $dst|$dst, $src}", 7298 [(set RC:$dst, (X86cvtph2ps (loadv8i16 addr:$src)))]>, 7299 T8PD, VEX, Sched<[sched.Folded]>; 7300} 7301 7302multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, 7303 SchedWrite RR, SchedWrite MR> { 7304 def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst), 7305 (ins RC:$src1, i32u8imm:$src2), 7306 "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", 7307 [(set VR128:$dst, (X86cvtps2ph RC:$src1, timm:$src2))]>, 7308 TAPD, VEX, Sched<[RR]>; 7309 let hasSideEffects = 0, mayStore = 1 in 7310 def mr : Ii8<0x1D, MRMDestMem, (outs), 7311 (ins x86memop:$dst, RC:$src1, i32u8imm:$src2), 7312 "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, 7313 TAPD, VEX, Sched<[MR]>; 7314} 7315 7316let Predicates = [HasF16C, NoVLX] in { 7317 defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>, SIMD_EXC; 7318 defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L, SIMD_EXC; 7319 defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH, 7320 WriteCvtPS2PHSt>, SIMD_EXC; 7321 defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY, 7322 WriteCvtPS2PHYSt>, VEX_L, SIMD_EXC; 7323 7324 // Pattern match vcvtph2ps of a scalar i64 load. 7325 def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))), 7326 (VCVTPH2PSrm addr:$src)>; 7327 def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 7328 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 7329 (VCVTPH2PSrm addr:$src)>; 7330 7331 def : Pat<(store (f64 (extractelt 7332 (bc_v2f64 (v8i16 (X86cvtps2ph VR128:$src1, timm:$src2))), 7333 (iPTR 0))), addr:$dst), 7334 (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>; 7335 def : Pat<(store (i64 (extractelt 7336 (bc_v2i64 (v8i16 (X86cvtps2ph VR128:$src1, timm:$src2))), 7337 (iPTR 0))), addr:$dst), 7338 (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>; 7339 def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, timm:$src2)), addr:$dst), 7340 (VCVTPS2PHYmr addr:$dst, VR256:$src1, timm:$src2)>; 7341} 7342 7343// Patterns for matching conversions from float to half-float and vice versa. 7344let Predicates = [HasF16C, NoVLX] in { 7345 // Use MXCSR.RC for rounding instead of explicitly specifying the default 7346 // rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the 7347 // configurations we support (the default). 
However, falling back to MXCSR is 7348 // more consistent with other instructions, which are always controlled by it. 7349 // It's encoded as 0b100. 7350 def : Pat<(fp_to_f16 FR32:$src), 7351 (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (v8i16 (VCVTPS2PHrr 7352 (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4))), sub_16bit))>; 7353 7354 def : Pat<(f16_to_fp GR16:$src), 7355 (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr 7356 (v4i32 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)))), FR32)) >; 7357 7358 def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))), 7359 (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr 7360 (v8i16 (VCVTPS2PHrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4)))), FR32)) >; 7361} 7362 7363//===----------------------------------------------------------------------===// 7364// AVX2 Instructions 7365//===----------------------------------------------------------------------===// 7366 7367/// AVX2_blend_rmi - AVX2 blend with 8-bit immediate 7368multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, 7369 ValueType OpVT, X86FoldableSchedWrite sched, 7370 RegisterClass RC, 7371 X86MemOperand x86memop, SDNodeXForm commuteXForm> { 7372 let isCommutable = 1 in 7373 def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst), 7374 (ins RC:$src1, RC:$src2, u8imm:$src3), 7375 !strconcat(OpcodeStr, 7376 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 7377 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>, 7378 Sched<[sched]>, VEX_4V; 7379 def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst), 7380 (ins RC:$src1, x86memop:$src2, u8imm:$src3), 7381 !strconcat(OpcodeStr, 7382 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 7383 [(set RC:$dst, 7384 (OpVT (OpNode RC:$src1, (load addr:$src2), timm:$src3)))]>, 7385 Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V; 7386 7387 // Pattern to commute if load is in first source. 7388 def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, timm:$src3)), 7389 (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2, 7390 (commuteXForm timm:$src3))>; 7391} 7392 7393let Predicates = [HasAVX2] in { 7394defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32, 7395 SchedWriteBlend.XMM, VR128, i128mem, 7396 BlendCommuteImm4>; 7397defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32, 7398 SchedWriteBlend.YMM, VR256, i256mem, 7399 BlendCommuteImm8>, VEX_L; 7400 7401def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3), 7402 (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 timm:$src3))>; 7403def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3), 7404 (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>; 7405def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3), 7406 (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>; 7407 7408def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3), 7409 (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 timm:$src3))>; 7410def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3), 7411 (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 timm:$src3))>; 7412def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3), 7413 (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 timm:$src3))>; 7414} 7415 7416// For insertion into the zero index (low half) of a 256-bit vector, it is 7417// more efficient to generate a blend with immediate instead of an insert*128. 
7418// NOTE: We're using FP instructions here, but execution domain fixing should 7419// take care of using integer instructions when profitable. 7420let Predicates = [HasAVX] in { 7421def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)), 7422 (VBLENDPSYrri VR256:$src1, 7423 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7424 VR128:$src2, sub_xmm), 0xf)>; 7425def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)), 7426 (VBLENDPSYrri VR256:$src1, 7427 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7428 VR128:$src2, sub_xmm), 0xf)>; 7429def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)), 7430 (VBLENDPSYrri VR256:$src1, 7431 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7432 VR128:$src2, sub_xmm), 0xf)>; 7433def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)), 7434 (VBLENDPSYrri VR256:$src1, 7435 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7436 VR128:$src2, sub_xmm), 0xf)>; 7437 7438def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1), (iPTR 0)), 7439 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7440 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; 7441def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0)), 7442 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7443 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; 7444def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)), 7445 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7446 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; 7447def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)), 7448 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7449 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; 7450} 7451 7452//===----------------------------------------------------------------------===// 7453// VPBROADCAST - Load from memory and broadcast to all elements of the 7454// destination operand 7455// 7456multiclass avx2_broadcast<bits<8> opc, string OpcodeStr, 7457 X86MemOperand x86memop, PatFrag bcast_frag, 7458 ValueType OpVT128, ValueType OpVT256, Predicate prd> { 7459 let Predicates = [HasAVX2, prd] in { 7460 def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 7461 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7462 [(set VR128:$dst, 7463 (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>, 7464 Sched<[SchedWriteShuffle.XMM]>, VEX; 7465 def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), 7466 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7467 [(set VR128:$dst, 7468 (OpVT128 (bcast_frag addr:$src)))]>, 7469 Sched<[SchedWriteShuffle.XMM.Folded]>, VEX; 7470 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), 7471 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7472 [(set VR256:$dst, 7473 (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>, 7474 Sched<[WriteShuffle256]>, VEX, VEX_L; 7475 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src), 7476 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7477 [(set VR256:$dst, 7478 (OpVT256 (bcast_frag addr:$src)))]>, 7479 Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L; 7480 7481 // Provide aliases for broadcast from the same register class that 7482 // automatically does the extract.
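  // e.g. a broadcast whose source is already a 256-bit register is matched by
  // extracting its low 128 bits (sub_xmm) and reusing the Yrr form.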
7483 def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))), 7484 (!cast<Instruction>(NAME#"Yrr") 7485 (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>; 7486 } 7487} 7488 7489defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, X86VBroadcastld8, 7490 v16i8, v32i8, NoVLX_Or_NoBWI>; 7491defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, X86VBroadcastld16, 7492 v8i16, v16i16, NoVLX_Or_NoBWI>; 7493defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, X86VBroadcastld32, 7494 v4i32, v8i32, NoVLX>; 7495defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, X86VBroadcastld64, 7496 v2i64, v4i64, NoVLX>; 7497 7498let Predicates = [HasAVX2, NoVLX] in { 7499 // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD. 7500 def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))), 7501 (VPBROADCASTQrm addr:$src)>; 7502 def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))), 7503 (VPBROADCASTQYrm addr:$src)>; 7504 7505 // FIXME this is to handle aligned extloads from i8/i16. 7506 def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), 7507 (VPBROADCASTDrm addr:$src)>; 7508 def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), 7509 (VPBROADCASTDYrm addr:$src)>; 7510} 7511let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 7512 // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. 7513 // This means we'll encounter truncated i32 loads; match that here. 7514 def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), 7515 (VPBROADCASTWrm addr:$src)>; 7516 def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), 7517 (VPBROADCASTWYrm addr:$src)>; 7518 def : Pat<(v8i16 (X86VBroadcast 7519 (i16 (trunc (i32 (extloadi16 addr:$src)))))), 7520 (VPBROADCASTWrm addr:$src)>; 7521 def : Pat<(v8i16 (X86VBroadcast 7522 (i16 (trunc (i32 (zextloadi16 addr:$src)))))), 7523 (VPBROADCASTWrm addr:$src)>; 7524 def : Pat<(v16i16 (X86VBroadcast 7525 (i16 (trunc (i32 (extloadi16 addr:$src)))))), 7526 (VPBROADCASTWYrm addr:$src)>; 7527 def : Pat<(v16i16 (X86VBroadcast 7528 (i16 (trunc (i32 (zextloadi16 addr:$src)))))), 7529 (VPBROADCASTWYrm addr:$src)>; 7530 7531 // FIXME this is to handle aligned extloads from i8. 7532 def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))), 7533 (VPBROADCASTWrm addr:$src)>; 7534 def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))), 7535 (VPBROADCASTWYrm addr:$src)>; 7536} 7537 7538let Predicates = [HasAVX2, NoVLX] in { 7539 // Provide fallback in case the load node that is used in the patterns above 7540 // is used by additional users, which prevents the pattern selection. 
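  // e.g. splatting a value that is still live in FR32/FR64 copies it into
  // VR128 and uses the register forms of VBROADCASTSS/VBROADCASTSD.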
7541 def : Pat<(v4f32 (X86VBroadcast FR32:$src)), 7542 (VBROADCASTSSrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>; 7543 def : Pat<(v8f32 (X86VBroadcast FR32:$src)), 7544 (VBROADCASTSSYrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>; 7545 def : Pat<(v4f64 (X86VBroadcast FR64:$src)), 7546 (VBROADCASTSDYrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>; 7547} 7548 7549let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 7550 def : Pat<(v16i8 (X86VBroadcast GR8:$src)), 7551 (VPBROADCASTBrr (VMOVDI2PDIrr 7552 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), 7553 GR8:$src, sub_8bit))))>; 7554 def : Pat<(v32i8 (X86VBroadcast GR8:$src)), 7555 (VPBROADCASTBYrr (VMOVDI2PDIrr 7556 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), 7557 GR8:$src, sub_8bit))))>; 7558 7559 def : Pat<(v8i16 (X86VBroadcast GR16:$src)), 7560 (VPBROADCASTWrr (VMOVDI2PDIrr 7561 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), 7562 GR16:$src, sub_16bit))))>; 7563 def : Pat<(v16i16 (X86VBroadcast GR16:$src)), 7564 (VPBROADCASTWYrr (VMOVDI2PDIrr 7565 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), 7566 GR16:$src, sub_16bit))))>; 7567} 7568let Predicates = [HasAVX2, NoVLX] in { 7569 def : Pat<(v4i32 (X86VBroadcast GR32:$src)), 7570 (VPBROADCASTDrr (VMOVDI2PDIrr GR32:$src))>; 7571 def : Pat<(v8i32 (X86VBroadcast GR32:$src)), 7572 (VPBROADCASTDYrr (VMOVDI2PDIrr GR32:$src))>; 7573 def : Pat<(v2i64 (X86VBroadcast GR64:$src)), 7574 (VPBROADCASTQrr (VMOV64toPQIrr GR64:$src))>; 7575 def : Pat<(v4i64 (X86VBroadcast GR64:$src)), 7576 (VPBROADCASTQYrr (VMOV64toPQIrr GR64:$src))>; 7577} 7578 7579// AVX1 broadcast patterns 7580let Predicates = [HasAVX1Only] in { 7581def : Pat<(v8i32 (X86VBroadcastld32 addr:$src)), 7582 (VBROADCASTSSYrm addr:$src)>; 7583def : Pat<(v4i64 (X86VBroadcastld64 addr:$src)), 7584 (VBROADCASTSDYrm addr:$src)>; 7585def : Pat<(v4i32 (X86VBroadcastld32 addr:$src)), 7586 (VBROADCASTSSrm addr:$src)>; 7587} 7588 7589 // Provide fallback in case the load node that is used in the patterns above 7590 // is used by additional users, which prevents the pattern selection. 
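// There is no 128-bit destination form of vbroadcastsd, so the v2f64 splats
// below use VMOVDDUP instead.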
7591let Predicates = [HasAVX, NoVLX] in { 7592 // 128bit broadcasts: 7593 def : Pat<(v2f64 (X86VBroadcast f64:$src)), 7594 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>; 7595 def : Pat<(v2f64 (X86VBroadcastld64 addr:$src)), 7596 (VMOVDDUPrm addr:$src)>; 7597 7598 def : Pat<(v2f64 (X86VBroadcast v2f64:$src)), 7599 (VMOVDDUPrr VR128:$src)>; 7600 def : Pat<(v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))), 7601 (VMOVDDUPrm addr:$src)>; 7602 def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))), 7603 (VMOVDDUPrm addr:$src)>; 7604} 7605 7606let Predicates = [HasAVX1Only] in { 7607 def : Pat<(v4f32 (X86VBroadcast FR32:$src)), 7608 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>; 7609 def : Pat<(v8f32 (X86VBroadcast FR32:$src)), 7610 (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), 7611 (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm), 7612 (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>; 7613 def : Pat<(v4f64 (X86VBroadcast FR64:$src)), 7614 (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), 7615 (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm), 7616 (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>; 7617 7618 def : Pat<(v4i32 (X86VBroadcast GR32:$src)), 7619 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)>; 7620 def : Pat<(v8i32 (X86VBroadcast GR32:$src)), 7621 (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7622 (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), sub_xmm), 7623 (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), 1)>; 7624 def : Pat<(v4i64 (X86VBroadcast GR64:$src)), 7625 (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), 7626 (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), sub_xmm), 7627 (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), 1)>; 7628 7629 def : Pat<(v2i64 (X86VBroadcast i64:$src)), 7630 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)>; 7631 def : Pat<(v2i64 (X86VBroadcastld64 addr:$src)), 7632 (VMOVDDUPrm addr:$src)>; 7633} 7634 7635//===----------------------------------------------------------------------===// 7636// VPERM - Permute instructions 7637// 7638 7639multiclass avx2_perm<bits<8> opc, string OpcodeStr, 7640 ValueType OpVT, X86FoldableSchedWrite Sched, 7641 X86MemOperand memOp> { 7642 let Predicates = [HasAVX2, NoVLX] in { 7643 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), 7644 (ins VR256:$src1, VR256:$src2), 7645 !strconcat(OpcodeStr, 7646 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7647 [(set VR256:$dst, 7648 (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>, 7649 Sched<[Sched]>, VEX_4V, VEX_L; 7650 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), 7651 (ins VR256:$src1, memOp:$src2), 7652 !strconcat(OpcodeStr, 7653 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7654 [(set VR256:$dst, 7655 (OpVT (X86VPermv VR256:$src1, 7656 (load addr:$src2))))]>, 7657 Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX_4V, VEX_L; 7658 } 7659} 7660 7661defm VPERMD : avx2_perm<0x36, "vpermd", v8i32, WriteVarShuffle256, i256mem>; 7662let ExeDomain = SSEPackedSingle in 7663defm VPERMPS : avx2_perm<0x16, "vpermps", v8f32, WriteFVarShuffle256, f256mem>; 7664 7665multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, 7666 ValueType OpVT, X86FoldableSchedWrite Sched, 7667 X86MemOperand memOp> { 7668 let Predicates = [HasAVX2, NoVLX] in { 7669 def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst), 7670 (ins VR256:$src1, u8imm:$src2), 7671 !strconcat(OpcodeStr, 7672 "\t{$src2, $src1, $dst|$dst, $src1, 
$src2}"), 7673 [(set VR256:$dst, 7674 (OpVT (X86VPermi VR256:$src1, (i8 timm:$src2))))]>, 7675 Sched<[Sched]>, VEX, VEX_L; 7676 def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst), 7677 (ins memOp:$src1, u8imm:$src2), 7678 !strconcat(OpcodeStr, 7679 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7680 [(set VR256:$dst, 7681 (OpVT (X86VPermi (mem_frag addr:$src1), 7682 (i8 timm:$src2))))]>, 7683 Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VEX_L; 7684 } 7685} 7686 7687defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64, 7688 WriteShuffle256, i256mem>, VEX_W; 7689let ExeDomain = SSEPackedDouble in 7690defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64, 7691 WriteFShuffle256, f256mem>, VEX_W; 7692 7693//===----------------------------------------------------------------------===// 7694// VPERM2I128 - Permute Floating-Point Values in 128-bit chunks 7695// 7696let isCommutable = 1 in 7697def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst), 7698 (ins VR256:$src1, VR256:$src2, u8imm:$src3), 7699 "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 7700 [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, 7701 (i8 timm:$src3))))]>, Sched<[WriteShuffle256]>, 7702 VEX_4V, VEX_L; 7703def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst), 7704 (ins VR256:$src1, f256mem:$src2, u8imm:$src3), 7705 "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 7706 [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2), 7707 (i8 timm:$src3)))]>, 7708 Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L; 7709 7710let Predicates = [HasAVX2] in 7711def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2), 7712 VR256:$src1, (i8 timm:$imm))), 7713 (VPERM2I128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm timm:$imm))>; 7714 7715 7716//===----------------------------------------------------------------------===// 7717// VINSERTI128 - Insert packed integer values 7718// 7719let hasSideEffects = 0 in { 7720def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst), 7721 (ins VR256:$src1, VR128:$src2, u8imm:$src3), 7722 "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 7723 []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L; 7724let mayLoad = 1 in 7725def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst), 7726 (ins VR256:$src1, i128mem:$src2, u8imm:$src3), 7727 "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 7728 []>, Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L; 7729} 7730 7731let Predicates = [HasAVX2, NoVLX] in { 7732 defm : vinsert_lowering<"VINSERTI128", v2i64, v4i64, loadv2i64>; 7733 defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32, loadv4i32>; 7734 defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv8i16>; 7735 defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8, loadv16i8>; 7736} 7737 7738//===----------------------------------------------------------------------===// 7739// VEXTRACTI128 - Extract packed integer values 7740// 7741def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst), 7742 (ins VR256:$src1, u8imm:$src2), 7743 "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, 7744 Sched<[WriteShuffle256]>, VEX, VEX_L; 7745let hasSideEffects = 0, mayStore = 1 in 7746def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs), 7747 (ins i128mem:$dst, VR256:$src1, u8imm:$src2), 7748 "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, 7749 Sched<[SchedWriteVecMoveLS.XMM.MR]>, 
VEX, VEX_L; 7750 7751let Predicates = [HasAVX2, NoVLX] in { 7752 defm : vextract_lowering<"VEXTRACTI128", v4i64, v2i64>; 7753 defm : vextract_lowering<"VEXTRACTI128", v8i32, v4i32>; 7754 defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>; 7755 defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>; 7756} 7757 7758//===----------------------------------------------------------------------===// 7759// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores 7760// 7761multiclass avx2_pmovmask<string OpcodeStr, 7762 Intrinsic IntLd128, Intrinsic IntLd256, 7763 Intrinsic IntSt128, Intrinsic IntSt256> { 7764 def rm : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst), 7765 (ins VR128:$src1, i128mem:$src2), 7766 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7767 [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>, 7768 VEX_4V, Sched<[WriteVecMaskedLoad]>; 7769 def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst), 7770 (ins VR256:$src1, i256mem:$src2), 7771 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7772 [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>, 7773 VEX_4V, VEX_L, Sched<[WriteVecMaskedLoadY]>; 7774 def mr : AVX28I<0x8e, MRMDestMem, (outs), 7775 (ins i128mem:$dst, VR128:$src1, VR128:$src2), 7776 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7777 [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>, 7778 VEX_4V, Sched<[WriteVecMaskedStore]>; 7779 def Ymr : AVX28I<0x8e, MRMDestMem, (outs), 7780 (ins i256mem:$dst, VR256:$src1, VR256:$src2), 7781 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7782 [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, 7783 VEX_4V, VEX_L, Sched<[WriteVecMaskedStoreY]>; 7784} 7785 7786defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd", 7787 int_x86_avx2_maskload_d, 7788 int_x86_avx2_maskload_d_256, 7789 int_x86_avx2_maskstore_d, 7790 int_x86_avx2_maskstore_d_256>; 7791defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq", 7792 int_x86_avx2_maskload_q, 7793 int_x86_avx2_maskload_q_256, 7794 int_x86_avx2_maskstore_q, 7795 int_x86_avx2_maskstore_q_256>, VEX_W; 7796 7797multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT, 7798 ValueType MaskVT> { 7799 // masked store 7800 def: Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)), 7801 (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>; 7802 // masked load 7803 def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)), 7804 (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>; 7805 def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), 7806 (VT immAllZerosV))), 7807 (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>; 7808} 7809let Predicates = [HasAVX] in { 7810 defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32>; 7811 defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64>; 7812 defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32>; 7813 defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64>; 7814} 7815let Predicates = [HasAVX1Only] in { 7816 // load/store i32/i64 not supported use ps/pd version 7817 defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32>; 7818 defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64>; 7819 defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32>; 7820 defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64>; 7821} 7822let Predicates = [HasAVX2] in { 7823 defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32>; 7824 defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64>; 7825 defm : 
maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32>; 7826 defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64>; 7827} 7828 7829//===----------------------------------------------------------------------===// 7830// SubVector Broadcasts 7831// Provide fallback in case the load node that is used in the patterns above 7832// is used by additional users, which prevents the pattern selection. 7833 7834let Predicates = [HasAVX, NoVLX] in { 7835def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128:$src))), 7836 (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm), 7837 (v2f64 VR128:$src), 1)>; 7838def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128:$src))), 7839 (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm), 7840 (v4f32 VR128:$src), 1)>; 7841} 7842 7843// NOTE: We're using FP instructions here, but execution domain fixing can 7844// convert to integer when profitable. 7845let Predicates = [HasAVX, NoVLX] in { 7846def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))), 7847 (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm), 7848 (v2i64 VR128:$src), 1)>; 7849def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))), 7850 (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm), 7851 (v4i32 VR128:$src), 1)>; 7852def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))), 7853 (VINSERTF128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm), 7854 (v8i16 VR128:$src), 1)>; 7855def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))), 7856 (VINSERTF128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm), 7857 (v16i8 VR128:$src), 1)>; 7858} 7859 7860//===----------------------------------------------------------------------===// 7861// Variable Bit Shifts 7862// 7863multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, 7864 ValueType vt128, ValueType vt256> { 7865 def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), 7866 (ins VR128:$src1, VR128:$src2), 7867 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7868 [(set VR128:$dst, 7869 (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>, 7870 VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>; 7871 def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), 7872 (ins VR128:$src1, i128mem:$src2), 7873 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7874 [(set VR128:$dst, 7875 (vt128 (OpNode VR128:$src1, 7876 (vt128 (load addr:$src2)))))]>, 7877 VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded, 7878 SchedWriteVarVecShift.XMM.ReadAfterFold]>; 7879 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), 7880 (ins VR256:$src1, VR256:$src2), 7881 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7882 [(set VR256:$dst, 7883 (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>, 7884 VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>; 7885 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), 7886 (ins VR256:$src1, i256mem:$src2), 7887 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7888 [(set VR256:$dst, 7889 (vt256 (OpNode VR256:$src1, 7890 (vt256 (load addr:$src2)))))]>, 7891 VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded, 7892 SchedWriteVarVecShift.YMM.ReadAfterFold]>; 7893} 7894 7895let Predicates = [HasAVX2, NoVLX] in { 7896 defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", X86vshlv, v4i32, v8i32>; 7897 defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", X86vshlv, v2i64, v4i64>, VEX_W; 7898 defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", X86vsrlv, v4i32, v8i32>; 7899 defm VPSRLVQ : 
//===----------------------------------------------------------------------===//
// VGATHER - GATHER Operations

// FIXME: Improve scheduling of gather instructions.
multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx,
                       ValueType VTy, PatFrag GatherNode128,
                       PatFrag GatherNode256, RegisterClass RC256,
                       X86MemOperand memop128, X86MemOperand memop256,
                       ValueType MTx = VTx, ValueType MTy = VTy> {
  def rm  : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
            (ins VR128:$src1, memop128:$src2, VR128:$mask),
            !strconcat(OpcodeStr,
              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
            [(set (VTx VR128:$dst), (MTx VR128:$mask_wb),
                  (GatherNode128 VR128:$src1, VR128:$mask,
                                 vectoraddr:$src2))]>,
            VEX, Sched<[WriteLoad]>;
  def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
            (ins RC256:$src1, memop256:$src2, RC256:$mask),
            !strconcat(OpcodeStr,
              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
            [(set (VTy RC256:$dst), (MTy RC256:$mask_wb),
                  (GatherNode256 RC256:$src1, RC256:$mask,
                                 vectoraddr:$src2))]>,
            VEX, VEX_L, Sched<[WriteLoad]>;
}

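// The hardware clears mask elements as the corresponding loads complete, so
// the mask is modeled as a tied second result ($mask = $mask_wb). Both
// results are marked @earlyclobber so that the destination and writeback mask
// are not allocated to the same register as the vector index operand, which
// the hardware requires to be distinct.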
let Predicates = [HasAVX2] in {
  let mayLoad = 1, hasSideEffects = 0, Constraints
    = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
    in {
    defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64, mgatherv4i32,
                        mgatherv4i32, VR256, vx128mem, vx256mem>, VEX_W;
    defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64, mgatherv2i64,
                        mgatherv4i64, VR256, vx128mem, vy256mem>, VEX_W;
    defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32, mgatherv4i32,
                        mgatherv8i32, VR256, vx128mem, vy256mem>;
    defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32, mgatherv2i64,
                        mgatherv4i64, VR128, vx64mem, vy128mem>;

    let ExeDomain = SSEPackedDouble in {
      defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64, mgatherv4i32,
                          mgatherv4i32, VR256, vx128mem, vx256mem,
                          v2i64, v4i64>, VEX_W;
      defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64, mgatherv2i64,
                          mgatherv4i64, VR256, vx128mem, vy256mem,
                          v2i64, v4i64>, VEX_W;
    }

    let ExeDomain = SSEPackedSingle in {
      defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32, mgatherv4i32,
                          mgatherv8i32, VR256, vx128mem, vy256mem,
                          v4i32, v8i32>;
      defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32, mgatherv2i64,
                          mgatherv4i64, VR128, vx64mem, vy128mem,
                          v4i32, v4i32>;
    }
  }
}

//===----------------------------------------------------------------------===//
// GFNI instructions
//===----------------------------------------------------------------------===//

multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
                        RegisterClass RC, PatFrag MemOpFrag,
                        X86MemOperand X86MemOp, bit Is2Addr = 0> {
  let ExeDomain = SSEPackedInt,
      AsmString = !if(Is2Addr,
        OpcodeStr##"\t{$src2, $dst|$dst, $src2}",
        OpcodeStr##"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
    let isCommutable = 1 in
    def rr : PDI<0xCF, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
                 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
             Sched<[SchedWriteVecALU.XMM]>, T8PD;

    def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
                 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1,
                                       (MemOpFrag addr:$src2))))]>,
             Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>, T8PD;
  }
}

multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
                           SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag,
                           X86MemOperand X86MemOp, bit Is2Addr = 0> {
  let AsmString = !if(Is2Addr,
      OpStr##"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
      OpStr##"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in {
    def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2, u8imm:$src3), "",
                  [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))],
                  SSEPackedInt>, Sched<[SchedWriteVecALU.XMM]>;
    def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
                  [(set RC:$dst, (OpVT (OpNode RC:$src1,
                                        (MemOpFrag addr:$src2),
                                        timm:$src3)))], SSEPackedInt>,
                  Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>;
  }
}

multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
  let Constraints = "$src1 = $dst",
      Predicates = [HasGFNI, UseSSE2] in
  defm NAME : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
                              VR128, load, i128mem, 1>;
  let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
    defm V##NAME : GF2P8AFFINE_rmi<Op, "v"##OpStr, v16i8, OpNode, VR128,
                                   load, i128mem>, VEX_4V, VEX_W;
    defm V##NAME##Y : GF2P8AFFINE_rmi<Op, "v"##OpStr, v32i8, OpNode, VR256,
                                      load, i256mem>, VEX_4V, VEX_L, VEX_W;
  }
}

// GF2P8MULB
let Constraints = "$src1 = $dst",
    Predicates = [HasGFNI, UseSSE2] in
defm GF2P8MULB : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop,
                              i128mem, 1>;
let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
  defm VGF2P8MULB : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load,
                                 i128mem>, VEX_4V;
  defm VGF2P8MULBY : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load,
                                  i256mem>, VEX_4V, VEX_L;
}
// GF2P8AFFINEINVQB, GF2P8AFFINEQB
let isCommutable = 0 in {
  defm GF2P8AFFINEINVQB : GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb",
                                             X86GF2P8affineinvqb>, TAPD;
  defm GF2P8AFFINEQB    : GF2P8AFFINE_common<0xCE, "gf2p8affineqb",
                                             X86GF2P8affineqb>, TAPD;
}
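
// Per byte, gf2p8affineqb computes the GF(2) affine transform (A * x) xor b,
// where A is the 8x8 bit matrix taken from the corresponding qword of the
// second source and b is the immediate; gf2p8affineinvqb first replaces each
// byte x with its inverse in GF(2^8). Neither operation is commutable in its
// sources, unlike gf2p8mulb above.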