//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file describes the X86 SSE instruction set, defining the instructions,
// and properties of the instructions which are needed for code generation,
// machine code emission, and analysis.
//
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// SSE 1 & 2 Instructions Classes
//===----------------------------------------------------------------------===//

/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
///
/// Emits a register-register form (rr) and a register-memory form (rm).
/// Is2Addr = 1 selects the SSE two-operand asm string ("$src2, $dst");
/// Is2Addr = 0 selects the AVX three-operand form ("$src2, $src1, $dst").
multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, X86MemOperand x86memop,
                           Domain d, X86FoldableSchedWrite sched,
                           bit Is2Addr = 1> {
let isCodeGenOnly = 1 in {
  // rr: both operands in registers; OpNode is commutative here.
  let isCommutable = 1 in {
    def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], d>,
       Sched<[sched]>;
  }
  // rm: second source comes from memory (folded load).
  def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
     !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
     [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>,
     Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}

/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
///
/// Same shape as sse12_fp_scalar but operates on full vectors (VT) via
/// an SDPatternOperator, matching the intrinsic lowering.
multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
                               SDPatternOperator OpNode, RegisterClass RC,
                               ValueType VT, string asm, Operand memopr,
                               PatFrags mem_frags, Domain d,
                               X86FoldableSchedWrite sched, bit Is2Addr = 1> {
let hasSideEffects = 0 in {
  def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], d>,
       Sched<[sched]>;
  let mayLoad = 1 in
  def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (VT (OpNode RC:$src1, (mem_frags addr:$src2))))], d>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}

/// sse12_fp_packed - SSE 1 & 2 packed instructions class
///
/// Packed (full-vector) variant: rr is commutable, rm folds a load of
/// mem_frag into the second source operand.
multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, ValueType vt,
                           X86MemOperand x86memop, PatFrag mem_frag,
                           Domain d, X86FoldableSchedWrite sched,
                           bit Is2Addr = 1> {
  let isCommutable = 1 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>,
       Sched<[sched]>;
  let mayLoad = 1 in
    def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
          d>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}

/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
///
/// Like sse12_fp_packed, but the selection patterns (pat_rr / pat_rm) are
/// supplied by the caller instead of being built from an OpNode.
multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
                                      string OpcodeStr, X86MemOperand x86memop,
                                      X86FoldableSchedWrite sched,
                                      list<dag> pat_rr, list<dag> pat_rm,
                                      bit Is2Addr = 1> {
  let isCommutable = 1, hasSideEffects = 0 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
     !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
     pat_rr, d>,
     Sched<[sched]>;
  let hasSideEffects = 0, mayLoad = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
     !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
     pat_rm, d>,
     Sched<[sched.Folded, sched.ReadAfterFold]>;
}


// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
// This is expanded by ExpandPostRAPseudos.
// These pseudos materialize a floating-point zero without a load; they are
// rematerializable and as cheap as a move.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
  def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
                   [(set FR64:$dst, fp64imm0)]>, Requires<[HasSSE2, NoAVX512]>;
  def FsFLD0F128 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                     [(set VR128:$dst, fp128imm0)]>, Requires<[HasSSE1, NoAVX512]>;
}

//===----------------------------------------------------------------------===//
// AVX & SSE - Zero/One Vectors
//===----------------------------------------------------------------------===//

// Alias instruction that maps zero vector to pxor / xorp* for sse.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
// swizzled by ExecutionDomainFix to pxor.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
// V_SET0: 128-bit all-zeros pseudo (pattern-matched from v4f32 zero).
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
               [(set VR128:$dst, (v4f32 immAllZerosV))]>;
}

// Map every other 128-bit all-zeros vector type onto the single V_SET0
// pseudo; the eventual xorps is type-agnostic.
let Predicates = [NoAVX512] in {
def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
}


// The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support PI,
// and doesn't need it because on sandy bridge the register is set to zero
// at the rename stage without using any execution unit, so SET0PSY
// and SET0PDY can be used for vector int instructions without penalty
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                 [(set VR256:$dst, (v8i32 immAllZerosV))]>;
}

// Fold the remaining 256-bit all-zeros vector types to AVX_SET0 as well.
let Predicates = [NoAVX512] in {
def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
def : Pat<(v8f32 immAllZerosV), (AVX_SET0)>;
def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
}

// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-ones value if folding it would be beneficial.
// All-ones pseudos (expanded later; see canFoldAsLoad note above).
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
  // 256-bit all-ones on AVX1-only targets, used only when optimizing for
  // minimum size.
  let Predicates = [HasAVX1Only, OptForMinSize] in {
  def AVX1_SETALLONES: I<0, Pseudo, (outs VR256:$dst), (ins), "",
                         [(set VR256:$dst, (v8i32 immAllOnesV))]>;
  }
  // Preferred 256-bit all-ones pseudo when AVX2 is available.
  let Predicates = [HasAVX2] in
  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move FP Scalar Instructions
//
// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
// register copies because it's a partial register update; Register-to-register
// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
// that the insert be implementable in terms of a copy, and just mentioned, we
// don't use movss/movsd for copies.
//===----------------------------------------------------------------------===//

/// sse12_move_rr - register-register movss/movsd forms.
/// rr is the normal MRMSrcReg encoding (0x10); rr_REV is the MRMDestReg
/// store-form encoding (0x11), kept only for the disassembler and linked
/// back via FoldGenData.
multiclass sse12_move_rr<SDNode OpNode, ValueType vt,
                         X86MemOperand x86memop, string base_opc,
                         string asm_opr, Domain d, string Name> {
  let isCommutable = 1 in
  def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src1, VR128:$src2),
              !strconcat(base_opc, asm_opr),
              [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], d>,
              Sched<[SchedWriteFShuffle.XMM]>;

  // For the disassembler
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                  (ins VR128:$src1, VR128:$src2),
                  !strconcat(base_opc, asm_opr), []>,
                  Sched<[SchedWriteFShuffle.XMM]>, FoldGenData<Name#rr>;
}

/// sse12_move - instantiates both the AVX (V-prefixed, three-operand) and
/// SSE (two-operand, tied $src1 = $dst) families plus their stores and
/// GAS-compatible ".s" aliases.
multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
                      X86MemOperand x86memop, string OpcodeStr,
                      Domain d, string Name, Predicate pred> {
  // AVX
  let Predicates = [UseAVX, OptForSize] in
  defm V#NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d,
                              "V"#Name>,
                              VEX_4V, VEX_LIG, VEX_WIG;

  def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(store RC:$src, addr:$dst)], d>,
                     VEX, VEX_LIG, Sched<[WriteFStore]>, VEX_WIG;
  // SSE1 & 2
  let Constraints = "$src1 = $dst" in {
    let Predicates = [pred, NoSSE41_Or_OptForSize] in
    defm NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
                              "\t{$src2, $dst|$dst, $src2}", d, Name>;
  }

  def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(store RC:$src, addr:$dst)], d>,
                   Sched<[WriteFStore]>;

  // ".s" asm aliases for the reversed (store-form) encodings.
  def : InstAlias<"v"#OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  (!cast<Instruction>("V"#NAME#"rr_REV")
                   VR128:$dst, VR128:$src1, VR128:$src2), 0>;
  def : InstAlias<OpcodeStr#".s\t{$src2, $dst|$dst, $src2}",
                  (!cast<Instruction>(NAME#"rr_REV")
                   VR128:$dst, VR128:$src2), 0>;
}

// Loading from memory automatically zeroing upper bits.
multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
                         PatFrag mem_pat, PatFrag vzloadfrag, string OpcodeStr,
                         Domain d> {
  def V#NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
                     VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
  def NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
                   Sched<[WriteFLoad]>;

  // _alt version uses FR32/FR64 register class.
  let isCodeGenOnly = 1 in {
    def V#NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                           !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                           [(set RC:$dst, (mem_pat addr:$src))], d>,
                           VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
    def NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                         [(set RC:$dst, (mem_pat addr:$src))], d>,
                         Sched<[WriteFLoad]>;
  }
}

defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
                        SSEPackedSingle, "MOVSS", UseSSE1>, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
                        SSEPackedDouble, "MOVSD", UseSSE2>, XD;

let canFoldAsLoad = 1, isReMaterializable = 1 in {
  defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss",
                             SSEPackedSingle>, XS;
  defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, X86vzload64, "movsd",
                             SSEPackedDouble>, XD;
}

// Patterns
let Predicates = [UseAVX] in {
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (VMOVSSrm addr:$src)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (VMOVSDrm addr:$src)>;

  // Represent the same patterns above but in the form they appear for
  // 256-bit types
  def : Pat<(v8f32 (X86vzload32 addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
  def : Pat<(v4f64 (X86vzload64 addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
}

let Predicates = [UseAVX, OptForSize] in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVSS to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VMOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VMOVSSrr (v4i32 (V_SET0)), VR128:$src)>;

  // Move low f32 and clear high bits.
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4f32 (VMOVSSrr (v4f32 (V_SET0)),
                    (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))), sub_xmm)>;
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4i32 (VMOVSSrr (v4i32 (V_SET0)),
                    (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
}

let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
// Move scalar to XMM zero-extended, zeroing a VR128 then do a
// MOVSS to the lower bits.
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
          (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
          (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
}

let Predicates = [UseSSE2] in
def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
          (MOVSDrm addr:$src)>;

let Predicates = [UseSSE1] in
def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
          (MOVSSrm addr:$src)>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
//===----------------------------------------------------------------------===//

/// sse12_mov_packed - full-register packed move: rr is a pure register move
/// (isMoveReg), rm is a rematerializable, foldable load.
multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
                            X86MemOperand x86memop, PatFrag ld_frag,
                            string asm, Domain d,
                            X86SchedWriteMoveLS sched> {
let hasSideEffects = 0, isMoveReg = 1 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
              Sched<[sched.RR]>;
let canFoldAsLoad = 1, isReMaterializable = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set RC:$dst, (ld_frag addr:$src))], d>,
              Sched<[sched.RM]>;
}

// AVX 128-bit and 256-bit aligned/unaligned packed FP moves.
let Predicates = [HasAVX, NoVLX] in {
defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                                PS, VEX, VEX_WIG;
defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                                PD, VEX, VEX_WIG;
defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                                PS, VEX, VEX_WIG;
defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                                PD, VEX, VEX_WIG;

defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps",
                                 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
                                 PS, VEX, VEX_L, VEX_WIG;
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd",
                                 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
                                 PD, VEX, VEX_L, VEX_WIG;
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups",
                                 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
                                 PS, VEX, VEX_L, VEX_WIG;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
                                 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
                                 PD, VEX, VEX_L, VEX_WIG;
}

// Non-VEX SSE variants.
let Predicates = [UseSSE1] in {
defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
                               SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                               PS;
defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
                               SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                               PS;
}
let Predicates = [UseSSE2] in {
defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
                               SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                               PD;
defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
                               SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                               PD;
}

// AVX register-to-memory (store) forms.
let Predicates = [HasAVX, NoVLX] in {
let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movaps\t{$src, $dst|$dst, $src}",
                     [(alignedstore (v4f32 VR128:$src), addr:$dst)]>,
                     VEX, VEX_WIG;
def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movapd\t{$src, $dst|$dst, $src}",
                     [(alignedstore (v2f64 VR128:$src), addr:$dst)]>,
                     VEX, VEX_WIG;
def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movups\t{$src, $dst|$dst, $src}",
                     [(store (v4f32 VR128:$src), addr:$dst)]>,
                     VEX, VEX_WIG;
def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movupd\t{$src, $dst|$dst, $src}",
                     [(store (v2f64 VR128:$src), addr:$dst)]>,
                     VEX, VEX_WIG;
} // SchedRW

let SchedRW = [SchedWriteFMoveLS.YMM.MR] in {
def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                      "movaps\t{$src, $dst|$dst, $src}",
                      [(alignedstore (v8f32 VR256:$src), addr:$dst)]>,
                      VEX, VEX_L, VEX_WIG;
def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                      "movapd\t{$src, $dst|$dst, $src}",
                      [(alignedstore (v4f64 VR256:$src), addr:$dst)]>,
                      VEX, VEX_L, VEX_WIG;
def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                      "movups\t{$src, $dst|$dst, $src}",
                      [(store (v8f32 VR256:$src), addr:$dst)]>,
                      VEX, VEX_L, VEX_WIG;
def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                      "movupd\t{$src, $dst|$dst, $src}",
                      [(store (v4f64 VR256:$src), addr:$dst)]>,
                      VEX, VEX_L, VEX_WIG;
} // SchedRW
} // Predicate

// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    isMoveReg = 1 in {
let SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
  def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movaps\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVAPSrr">;
  def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movapd\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVAPDrr">;
  def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movups\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVUPSrr">;
  def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movupd\t{$src, $dst|$dst, $src}", []>,
                           VEX, VEX_WIG, FoldGenData<"VMOVUPDrr">;
} // SchedRW

let SchedRW = [SchedWriteFMoveLS.YMM.RR] in {
  def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movaps\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPSYrr">;
  def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movapd\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPDYrr">;
  def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movups\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPSYrr">;
  def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movupd\t{$src, $dst|$dst, $src}", []>,
                            VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPDYrr">;
} // SchedRW
} // Predicate

// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                (VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                (VMOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                (VMOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                (VMOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                (VMOVAPSYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                (VMOVAPDYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                (VMOVUPSYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                (VMOVUPDYrr_REV VR256:$dst, VR256:$src), 0>;

// SSE register-to-memory (store) forms.
let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)]>;
def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)]>;
} // SchedRW

// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movaps\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVAPSrr">;
  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movapd\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVAPDrr">;
  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movups\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVUPSrr">;
  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movupd\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVUPDrr">;
}

// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"movaps.s\t{$src, $dst|$dst, $src}",
                (MOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movapd.s\t{$src, $dst|$dst, $src}",
                (MOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movups.s\t{$src, $dst|$dst, $src}",
                (MOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movupd.s\t{$src, $dst|$dst, $src}",
                (MOVUPDrr_REV VR128:$dst, VR128:$src), 0>;

let Predicates = [HasAVX, NoVLX] in {
  // 256-bit load/store need to use floating point load/store in case we don't
  // have AVX2. Execution domain fixing will convert to integer if AVX2 is
  // available and changing the domain is beneficial.
  def : Pat<(alignedloadv4i64 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv8i32 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv16i16 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv32i8 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(loadv4i64 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv8i32 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv16i16 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv32i8 addr:$src),
            (VMOVUPSYrm addr:$src)>;

  def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v16i16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v32i8 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
}

// Use movaps / movups for SSE integer load / store (one byte shorter).
// The instructions selected below are then converted to MOVDQA/MOVDQU
// during the SSE domain pass.
// Integer 128-bit load/store via MOVAPS/MOVUPS (see comment above).
let Predicates = [UseSSE1] in {
  def : Pat<(alignedloadv2i64 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv4i32 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv8i16 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv16i8 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv4i32 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv8i16 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv16i8 addr:$src),
            (MOVUPSrm addr:$src)>;

  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low packed FP Instructions
//===----------------------------------------------------------------------===//

/// sse12_mov_hilo_packed_base - shared body for movlp[sd]/movhp[sd] loads.
/// Emits the "s" (packed-single) and "d" (packed-double) memory forms.
multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode pdnode,
                                      string base_opc, string asm_opr> {
  // No pattern as they need be special cased between high and low.
  let hasSideEffects = 0, mayLoad = 1 in
  def PSrm : PI<opc, MRMSrcMem,
                (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                !strconcat(base_opc, "s", asm_opr),
                [], SSEPackedSingle>, PS,
                Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;

  def PDrm : PI<opc, MRMSrcMem,
                (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                !strconcat(base_opc, "d", asm_opr),
                [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
                                  (scalar_to_vector (loadf64 addr:$src2)))))],
                SSEPackedDouble>, PD,
                Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
}

/// sse12_mov_hilo_packed - instantiates the AVX (V-prefixed) and SSE
/// (tied two-operand) variants of the hi/lo packed moves.
multiclass sse12_mov_hilo_packed<bits<8>opc, SDPatternOperator pdnode,
                                 string base_opc> {
  let Predicates = [UseAVX] in
  defm V#NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
                                VEX_4V, VEX_WIG;

  let Constraints = "$src1 = $dst" in
  defm NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
                                         "\t{$src2, $dst|$dst, $src2}">;
}

defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;

// MOVLPS/MOVLPD store forms (store the low 64 bits of the XMM register).
let SchedRW = [WriteFStore] in {
let Predicates = [UseAVX] in {
let mayStore = 1, hasSideEffects = 0 in
def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movlps\t{$src, $dst|$dst, $src}",
                     []>,
                     VEX, VEX_WIG;
def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movlpd\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt (v2f64 VR128:$src),
                                   (iPTR 0))), addr:$dst)]>,
                     VEX, VEX_WIG;
}// UseAVX
let mayStore = 1, hasSideEffects = 0 in
def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlps\t{$src, $dst|$dst, $src}",
                   []>;
def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt (v2f64 VR128:$src),
                                 (iPTR 0))), addr:$dst)]>;
} // SchedRW
674 675let Predicates = [UseSSE1] in { 676 // This pattern helps select MOVLPS on SSE1 only targets. With SSE2 we'll 677 // end up with a movsd or blend instead of shufp. 678 // No need for aligned load, we're only loading 64-bits. 679 def : Pat<(X86Shufp (v4f32 (simple_load addr:$src2)), VR128:$src1, 680 (i8 -28)), 681 (MOVLPSrm VR128:$src1, addr:$src2)>; 682 def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)), VR128:$src1, (i8 -28)), 683 (MOVLPSrm VR128:$src1, addr:$src2)>; 684 685 def : Pat<(v4f32 (X86vzload64 addr:$src)), 686 (MOVLPSrm (v4f32 (V_SET0)), addr:$src)>; 687 def : Pat<(X86vextractstore64 (v4f32 VR128:$src), addr:$dst), 688 (MOVLPSmr addr:$dst, VR128:$src)>; 689} 690 691//===----------------------------------------------------------------------===// 692// SSE 1 & 2 - Move Hi packed FP Instructions 693//===----------------------------------------------------------------------===// 694 695defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">; 696 697let SchedRW = [WriteFStore] in { 698// v2f64 extract element 1 is always custom lowered to unpack high to low 699// and extract element 0 so the non-store version isn't too horrible. 
700let Predicates = [UseAVX] in { 701let mayStore = 1, hasSideEffects = 0 in 702def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), 703 "movhps\t{$src, $dst|$dst, $src}", 704 []>, VEX, VEX_WIG; 705def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), 706 "movhpd\t{$src, $dst|$dst, $src}", 707 [(store (f64 (extractelt 708 (v2f64 (X86Unpckh VR128:$src, VR128:$src)), 709 (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG; 710} // UseAVX 711let mayStore = 1, hasSideEffects = 0 in 712def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), 713 "movhps\t{$src, $dst|$dst, $src}", 714 []>; 715def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), 716 "movhpd\t{$src, $dst|$dst, $src}", 717 [(store (f64 (extractelt 718 (v2f64 (X86Unpckh VR128:$src, VR128:$src)), 719 (iPTR 0))), addr:$dst)]>; 720} // SchedRW 721 722let Predicates = [UseAVX] in { 723 // MOVHPD patterns 724 def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))), 725 (VMOVHPDrm VR128:$src1, addr:$src2)>; 726 727 def : Pat<(store (f64 (extractelt 728 (v2f64 (X86VPermilpi VR128:$src, (i8 1))), 729 (iPTR 0))), addr:$dst), 730 (VMOVHPDmr addr:$dst, VR128:$src)>; 731 732 // MOVLPD patterns 733 def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))), 734 (VMOVLPDrm VR128:$src1, addr:$src2)>; 735} 736 737let Predicates = [UseSSE1] in { 738 // This pattern helps select MOVHPS on SSE1 only targets. With SSE2 we'll 739 // end up with a movsd or blend instead of shufp. 740 // No need for aligned load, we're only loading 64-bits. 
741 def : Pat<(X86Movlhps VR128:$src1, (v4f32 (simple_load addr:$src2))), 742 (MOVHPSrm VR128:$src1, addr:$src2)>; 743 def : Pat<(X86Movlhps VR128:$src1, (v4f32 (X86vzload64 addr:$src2))), 744 (MOVHPSrm VR128:$src1, addr:$src2)>; 745 746 def : Pat<(X86vextractstore64 (v4f32 (X86Movhlps VR128:$src, VR128:$src)), 747 addr:$dst), 748 (MOVHPSmr addr:$dst, VR128:$src)>; 749} 750 751let Predicates = [UseSSE2] in { 752 // MOVHPD patterns 753 def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))), 754 (MOVHPDrm VR128:$src1, addr:$src2)>; 755 756 def : Pat<(store (f64 (extractelt 757 (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))), 758 (iPTR 0))), addr:$dst), 759 (MOVHPDmr addr:$dst, VR128:$src)>; 760 761 // MOVLPD patterns 762 def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))), 763 (MOVLPDrm VR128:$src1, addr:$src2)>; 764} 765 766let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in { 767 // Use MOVLPD to load into the low bits from a full vector unless we can use 768 // BLENDPD. 
  def : Pat<(X86Movsd VR128:$src1, (v2f64 (simple_load addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
//===----------------------------------------------------------------------===//

let Predicates = [UseAVX] in {
  def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
                        (ins VR128:$src1, VR128:$src2),
                        "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                          (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
                        VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG;
  let isCommutable = 1 in
  def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
                        (ins VR128:$src1, VR128:$src2),
                        "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                          (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
                        VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG,
                        NotMemoryFoldable;
}
// Legacy SSE forms read and write $dst (2-address).
let Constraints = "$src1 = $dst" in {
  def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
                      Sched<[SchedWriteFShuffle.XMM]>;
  let isCommutable = 1 in
  def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
                      Sched<[SchedWriteFShuffle.XMM]>, NotMemoryFoldable;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Conversion Instructions
//===----------------------------------------------------------------------===//

/// sse12_cvt_s - scalar conversions between FP registers and GPRs.
/// "asm"/"mem" give the mnemonics for the register and memory forms.
multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                       SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
                       string asm, string mem, X86FoldableSchedWrite sched,
                       Domain d,
                       SchedRead Int2Fpu = ReadDefault> {
  let ExeDomain = d in {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
              !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
              [(set DstRC:$dst, (OpNode SrcRC:$src))]>,
           Sched<[sched, Int2Fpu]>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
              mem#"\t{$src, $dst|$dst, $src}",
              [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
           Sched<[sched.Folded]>;
  }
}

/// sse12_cvt_p - packed int->fp conversions (cvtdq2ps family).
multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
                       ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
                       string asm, Domain d, X86FoldableSchedWrite sched> {
let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in {
  def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
             [(set RC:$dst, (DstTy (any_sint_to_fp (SrcTy RC:$src))))], d>,
           Sched<[sched]>;
  let mayLoad = 1 in
  def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
             [(set RC:$dst, (DstTy (any_sint_to_fp
                                    (SrcTy (ld_frag addr:$src)))))], d>,
           Sched<[sched.Folded]>;
}
}

/// sse12_vcvt_avx - VEX 3-operand scalar conversions (no patterns; selected
/// via the Pat<> records below).
multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          X86MemOperand x86memop, string asm, string mem,
                          X86FoldableSchedWrite sched, Domain d> {
let hasSideEffects = 0, Predicates = [UseAVX], ExeDomain = d in {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
           Sched<[sched, ReadDefault, ReadInt2Fpu]>;
  let mayLoad = 1 in
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
              (ins DstRC:$src1, x86memop:$src),
              asm#"{"#mem#"}\t{$src, $src1, $dst|$dst, $src1, $src}", []>,
           Sched<[sched.Folded, sched.ReadAfterFold]>;
} // hasSideEffects = 0
}

// VEX-encoded scalar fp->int conversions (codegen-only; assembler uses the
// _Int forms defined later).
let isCodeGenOnly = 1, Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
defm VCVTTSS2SI   : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
                                "cvttss2si", "cvttss2si",
                                WriteCvtSS2I, SSEPackedSingle>,
                                XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
                                "cvttss2si", "cvttss2si",
                                WriteCvtSS2I, SSEPackedSingle>,
                                XS, VEX, VEX_W, VEX_LIG;
defm VCVTTSD2SI   : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
                                "cvttsd2si", "cvttsd2si",
                                WriteCvtSD2I, SSEPackedDouble>,
                                XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
                                "cvttsd2si", "cvttsd2si",
                                WriteCvtSD2I, SSEPackedDouble>,
                                XD, VEX, VEX_W, VEX_LIG;

// Rounding (non-truncating) forms select for lrint/llrint.
defm VCVTSS2SI   : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
                               "cvtss2si", "cvtss2si",
                               WriteCvtSS2I, SSEPackedSingle>,
                               XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
                               "cvtss2si", "cvtss2si",
                               WriteCvtSS2I, SSEPackedSingle>,
                               XS, VEX, VEX_W, VEX_LIG;
defm VCVTSD2SI   : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
                               "cvtsd2si", "cvtsd2si",
                               WriteCvtSD2I, SSEPackedDouble>,
                               XD, VEX, VEX_LIG;
defm VCVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
                               "cvtsd2si", "cvtsd2si",
                               WriteCvtSD2I, SSEPackedDouble>,
                               XD, VEX, VEX_W, VEX_LIG;
}

// The assembler can recognize rr 64-bit instructions by seeing a rxx
// register, but the same isn't true when only using memory operands,
// provide other assembly "l" and "q" forms to address this explicitly
// where appropriate to do so.
// VEX-encoded int->fp scalar conversions (codegen-only; selected by the
// patterns below which feed an IMPLICIT_DEF for the merged-in operand).
let isCodeGenOnly = 1 in {
defm VCVTSI2SS   : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss", "l",
                                  WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V,
                                  VEX_LIG, SIMD_EXC;
defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss", "q",
                                  WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V,
                                  VEX_W, VEX_LIG, SIMD_EXC;
// NOTE(review): i32->f64 is exact, so no SIMD_EXC here — presumably
// intentional (matches CVTSI2SD below); confirm against backend policy.
defm VCVTSI2SD   : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd", "l",
                                  WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V,
                                  VEX_LIG;
defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd", "q",
                                  WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V,
                                  VEX_W, VEX_LIG, SIMD_EXC;
} // isCodeGenOnly = 1

let Predicates = [UseAVX] in {
  def : Pat<(f32 (any_sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f32 (any_sint_to_fp (loadi64 addr:$src))),
            (VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (any_sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (any_sint_to_fp (loadi64 addr:$src))),
            (VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;

  def : Pat<(f32 (any_sint_to_fp GR32:$src)),
            (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f32 (any_sint_to_fp GR64:$src)),
            (VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
  def : Pat<(f64 (any_sint_to_fp GR32:$src)),
            (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f64 (any_sint_to_fp GR64:$src)),
            (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;

  def : Pat<(i64 (lrint FR32:$src)), (VCVTSS2SI64rr FR32:$src)>;
  def : Pat<(i64 (lrint (loadf32 addr:$src))), (VCVTSS2SI64rm addr:$src)>;

  def : Pat<(i64 (lrint FR64:$src)), (VCVTSD2SI64rr FR64:$src)>;
  def : Pat<(i64 (lrint (loadf64 addr:$src))), (VCVTSD2SI64rm addr:$src)>;
}

// Legacy SSE scalar conversions (codegen-only).
let isCodeGenOnly = 1 in {
defm CVTTSS2SI   : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
                               "cvttss2si", "cvttss2si",
                               WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC;
defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
                               "cvttss2si", "cvttss2si",
                               WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC;
defm CVTTSD2SI   : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
                               "cvttsd2si", "cvttsd2si",
                               WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC;
defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
                               "cvttsd2si", "cvttsd2si",
                               WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC;

defm CVTSS2SI   : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
                              "cvtss2si", "cvtss2si",
                              WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC;
defm CVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
                              "cvtss2si", "cvtss2si",
                              WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC;
defm CVTSD2SI   : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
                              "cvtsd2si", "cvtsd2si",
                              WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC;
defm CVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
                              "cvtsd2si", "cvtsd2si",
                              WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC;

defm CVTSI2SS   : sse12_cvt_s<0x2A, GR32, FR32, any_sint_to_fp, i32mem, loadi32,
                              "cvtsi2ss", "cvtsi2ss{l}",
                              WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, SIMD_EXC;
defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, any_sint_to_fp, i64mem, loadi64,
                              "cvtsi2ss", "cvtsi2ss{q}",
                              WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, REX_W, SIMD_EXC;
defm CVTSI2SD   : sse12_cvt_s<0x2A, GR32, FR64, any_sint_to_fp, i32mem, loadi32,
                              "cvtsi2sd", "cvtsi2sd{l}",
                              WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD;
defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, any_sint_to_fp, i64mem, loadi64,
                              "cvtsi2sd", "cvtsi2sd{q}",
                              WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD, REX_W, SIMD_EXC;
} // isCodeGenOnly = 1

let Predicates = [UseSSE1] in {
  def : Pat<(i64 (lrint
FR32:$src)), (CVTSS2SI64rr FR32:$src)>;
  def : Pat<(i64 (lrint (loadf32 addr:$src))), (CVTSS2SI64rm addr:$src)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(i64 (lrint FR64:$src)), (CVTSD2SI64rr FR64:$src)>;
  def : Pat<(i64 (lrint (loadf64 addr:$src))), (CVTSD2SI64rm addr:$src)>;
}

// Conversion Instructions Intrinsics - Match intrinsics which expect MM
// and/or XMM operand(s).

/// sse12_cvt_sint - XMM -> GPR conversions used by the intrinsic (_Int) forms.
multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          ValueType DstVT, ValueType SrcVT, SDNode OpNode,
                          Operand memop, PatFrags mem_frags, string asm,
                          X86FoldableSchedWrite sched, Domain d> {
let ExeDomain = d in {
  def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
                  !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                  [(set DstRC:$dst, (DstVT (OpNode (SrcVT SrcRC:$src))))]>,
               Sched<[sched]>;
  def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
                  !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                  [(set DstRC:$dst, (DstVT (OpNode (SrcVT (mem_frags addr:$src)))))]>,
               Sched<[sched.Folded]>;
}
}

/// sse12_cvt_sint_3addr - GPR -> XMM intrinsic conversions; Is2Addr selects
/// the legacy 2-address asm string versus the VEX 3-operand one.
multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
                                RegisterClass DstRC, X86MemOperand x86memop,
                                string asm, string mem, X86FoldableSchedWrite sched,
                                Domain d, bit Is2Addr = 1> {
let hasSideEffects = 0, ExeDomain = d in {
  def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
                  !if(Is2Addr,
                      !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                      !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                  []>, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
  let mayLoad = 1 in
  def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst),
                  (ins DstRC:$src1, x86memop:$src2),
                  !if(Is2Addr,
                      asm#"{"#mem#"}\t{$src2, $dst|$dst, $src2}",
                      asm#"{"#mem#"}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}

let Uses = [MXCSR], mayRaiseFPException = 1 in {
let Predicates = [UseAVX] in {
defm VCVTSD2SI   : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64,
                                  X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
                                  WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64,
                                  X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
                                  WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_W, VEX_LIG;
}
defm CVTSD2SI   : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, X86cvts2si,
                                 sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I,
                                 SSEPackedDouble>, XD;
defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si,
                                 sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I,
                                 SSEPackedDouble>, XD, REX_W;
}

// VEX 3-operand GPR -> XMM intrinsic conversions (Is2Addr = 0).
let Predicates = [UseAVX] in {
defm VCVTSI2SS   : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                                        i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle, 0>,
                                        XS, VEX_4V, VEX_LIG, SIMD_EXC;
defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                                        i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle, 0>,
                                        XS, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC;
defm VCVTSI2SD   : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                                        i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble, 0>,
                                        XD, VEX_4V, VEX_LIG;
defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                                        i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble, 0>,
                                        XD, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC;
}
let Constraints = "$src1 = $dst" in {
  defm CVTSI2SS   : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                                         i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle>,
                                         XS, SIMD_EXC;
  defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                                         i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle>,
                                         XS, REX_W, SIMD_EXC;
  defm CVTSI2SD   : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                                         i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble>,
                                         XD;
  defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble>,
                                         XD, REX_W, SIMD_EXC;
}

// AT&T-syntax aliases for the VEX 3-operand intrinsic forms.
def : InstAlias<"vcvtsi2ss{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VCVTSI2SSrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
def : InstAlias<"vcvtsi2ss{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VCVTSI642SSrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
def : InstAlias<"vcvtsi2sd{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VCVTSI2SDrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
def : InstAlias<"vcvtsi2sd{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VCVTSI642SDrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;

// Unsuffixed memory forms default to the 32-bit ("l") operand size.
def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
                (VCVTSI2SSrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
                (VCVTSI2SDrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;

def : InstAlias<"cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
                (CVTSI2SSrr_Int VR128:$dst, GR32:$src), 0, "att">;
def : InstAlias<"cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
                (CVTSI642SSrr_Int VR128:$dst, GR64:$src), 0, "att">;
def : InstAlias<"cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
                (CVTSI2SDrr_Int VR128:$dst, GR32:$src), 0, "att">;
def : InstAlias<"cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
                (CVTSI642SDrr_Int VR128:$dst, GR64:$src), 0, "att">;

def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
                (CVTSI2SSrm_Int VR128:$dst, i32mem:$src), 0, "att">;
def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
                (CVTSI2SDrm_Int VR128:$dst, i32mem:$src), 0, "att">;

/// SSE 1 Only

// Aliases for intrinsics
let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
defm VCVTTSS2SI   : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
                                   ssmem, sse_load_f32, "cvttss2si",
                                   WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
                                   X86cvtts2Int, ssmem, sse_load_f32,
                                   "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
                                   XS, VEX, VEX_LIG, VEX_W;
// FIX: these double-precision forms were tagged with the single-precision
// scheduling class WriteCvtSS2I; use WriteCvtSD2I to match the non-VEX
// CVTTSD2SI/CVTTSD2SI64 below and the other (V)CVTSD2SI definitions.
defm VCVTTSD2SI   : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
                                   sdmem, sse_load_f64, "cvttsd2si",
                                   WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
                                   X86cvtts2Int, sdmem, sse_load_f64,
                                   "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>,
                                   XD, VEX, VEX_LIG, VEX_W;
}
let Uses = [MXCSR], mayRaiseFPException = 1 in {
defm CVTTSS2SI   : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
                                  ssmem, sse_load_f32, "cvttss2si",
                                  WriteCvtSS2I, SSEPackedSingle>, XS;
defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
                                  X86cvtts2Int, ssmem, sse_load_f32,
                                  "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
                                  XS, REX_W;
defm CVTTSD2SI   : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
                                  sdmem, sse_load_f64, "cvttsd2si",
                                  WriteCvtSD2I, SSEPackedDouble>, XD;
defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
                                  X86cvtts2Int, sdmem, sse_load_f64,
                                  "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>,
                                  XD, REX_W;
}

def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rm_Int
GR64:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;

// AT&T-syntax aliases for the legacy SSE truncating conversions.
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;

let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
defm VCVTSS2SI   : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_W, VEX_LIG;
}
let Uses = [MXCSR], mayRaiseFPException = 1 in {
defm CVTSS2SI   : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
                                 ssmem, sse_load_f32, "cvtss2si",
                                 WriteCvtSS2I, SSEPackedSingle>, XS;
defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
                                 ssmem, sse_load_f32, "cvtss2si",
                                 WriteCvtSS2I, SSEPackedSingle>, XS, REX_W;

// Packed int->fp: (v)cvtdq2ps.
defm VCVTDQ2PS  : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load,
                              "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                              SSEPackedSingle, WriteCvtI2PS>,
                              PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load,
                              "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                              SSEPackedSingle, WriteCvtI2PSY>,
                              PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;

defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop,
                            "cvtdq2ps\t{$src, $dst|$dst, $src}",
                            SSEPackedSingle, WriteCvtI2PS>,
                            PS, Requires<[UseSSE2]>;
}

// AVX aliases
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;

// SSE aliases
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;

/// SSE 2 Only

// Convert scalar double to scalar single
let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX],
    ExeDomain = SSEPackedSingle in {
def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
                       (ins FR32:$src1, FR64:$src2),
                       "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                       VEX_4V, VEX_LIG, VEX_WIG,
                       Sched<[WriteCvtSD2SS]>, SIMD_EXC;
let mayLoad = 1 in
def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
                    (ins FR32:$src1, f64mem:$src2),
                    "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                    XD, VEX_4V, VEX_LIG, VEX_WIG,
                    Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC;
}

def : Pat<(f32 (any_fpround FR64:$src)),
          (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
      Requires<[UseAVX]>;

let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in {
def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
                     "cvtsd2ss\t{$src, $dst|$dst, $src}",
                     [(set FR32:$dst, (any_fpround FR64:$src))]>,
                     Sched<[WriteCvtSD2SS]>, SIMD_EXC;
def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
                   "cvtsd2ss\t{$src, $dst|$dst, $src}",
                   [(set FR32:$dst, (any_fpround (loadf64 addr:$src)))]>,
                   XD, Requires<[UseSSE2, OptForSize]>,
                   Sched<[WriteCvtSD2SS.Folded]>,
                   SIMD_EXC;
}

// Intrinsic (_Int) forms operate on whole XMM registers via X86frounds.
let Uses = [MXCSR], mayRaiseFPException = 1, ExeDomain = SSEPackedSingle in {
def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
                       XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
                       Sched<[WriteCvtSD2SS]>;
def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
                       XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
                       Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
let Constraints = "$src1 = $dst" in {
def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                      "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
                      XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;
def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                      "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
                      XD, Requires<[UseSSE2]>,
                      Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
}
}

// Convert scalar single to scalar double
// SSE2 instructions with XS prefix
let isCodeGenOnly = 1, hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
                    (ins FR64:$src1, FR32:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                    XS, VEX_4V, VEX_LIG, VEX_WIG,
                    Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>, SIMD_EXC;
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
                    (ins FR64:$src1, f32mem:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                    XS, VEX_4V, VEX_LIG, VEX_WIG,
                    Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>,
                    Requires<[UseAVX, OptForSize]>, SIMD_EXC;
} // isCodeGenOnly = 1, hasSideEffects = 0

def : Pat<(f64 (any_fpextend FR32:$src)),
          (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
def : Pat<(any_fpextend (loadf32 addr:$src)),
          (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;

let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in {
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (any_fpextend FR32:$src))]>,
                   XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>, SIMD_EXC;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (any_fpextend (loadf32 addr:$src)))]>,
                   XS, Requires<[UseSSE2, OptForSize]>,
                   Sched<[WriteCvtSS2SD.Folded]>, SIMD_EXC;
} // isCodeGenOnly = 1

// _Int forms for cvtss2sd are pattern-less; selected by the Movsd patterns
// below.
let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1,
    ExeDomain = SSEPackedSingle in {
def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       []>, XS, VEX_4V, VEX_LIG, VEX_WIG,
                       Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;
let mayLoad = 1 in
def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                       "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       []>, XS, VEX_4V, VEX_LIG, VEX_WIG, Requires<[HasAVX]>,
                       Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                      "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                      []>, XS, Requires<[UseSSE2]>,
                      Sched<[WriteCvtSS2SD]>;
let mayLoad = 1 in
def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                      "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                      []>, XS, Requires<[UseSSE2]>,
                      Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
}
} // hasSideEffects = 0

// Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and
// (v)cvtss2sd intrinsic sequences from clang which produce unnecessary
// vmovs{s,d} instructions
let Predicates = [UseAVX] in {
def : Pat<(v4f32 (X86Movss
                  (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector
                          (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
          (VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v2f64 (X86Movsd
                  (v2f64 VR128:$dst),
                  (v2f64 (scalar_to_vector
                          (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
          (VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v4f32 (X86Movss
                  (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
          (VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v4f32 (X86Movss
                  (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
          (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v4f32 (X86Movss
                  (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
          (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v4f32 (X86Movss
                  (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
          (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v2f64 (X86Movsd
                  (v2f64 VR128:$dst),
                  (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
          (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v2f64 (X86Movsd
                  (v2f64 VR128:$dst),
                  (v2f64
(scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
          (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v2f64 (X86Movsd
                  (v2f64 VR128:$dst),
                  (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
          (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v2f64 (X86Movsd
                  (v2f64 VR128:$dst),
                  (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
          (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseAVX]

// Same folds for the legacy SSE encodings.
let Predicates = [UseSSE2] in {
def : Pat<(v4f32 (X86Movss
                  (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector
                          (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
          (CVTSD2SSrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v2f64 (X86Movsd
                  (v2f64 VR128:$dst),
                  (v2f64 (scalar_to_vector
                          (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
          (CVTSS2SDrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v2f64 (X86Movsd
                  (v2f64 VR128:$dst),
                  (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
          (CVTSI642SDrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v2f64 (X86Movsd
                  (v2f64 VR128:$dst),
                  (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
          (CVTSI642SDrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v2f64 (X86Movsd
                  (v2f64 VR128:$dst),
                  (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
          (CVTSI2SDrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v2f64 (X86Movsd
                  (v2f64 VR128:$dst),
                  (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
          (CVTSI2SDrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseSSE2]

let Predicates = [UseSSE1] in {
def : Pat<(v4f32 (X86Movss
                  (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
          (CVTSI642SSrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v4f32 (X86Movss
                  (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
          (CVTSI642SSrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v4f32 (X86Movss
                  (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
          (CVTSI2SSrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v4f32 (X86Movss
                  (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
          (CVTSI2SSrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseSSE1]

let Predicates = [HasAVX, NoVLX] in {
// Convert packed single/double fp to doubleword
def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
                       VEX, Sched<[WriteCvtPS2I]>, VEX_WIG, SIMD_EXC;
def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>,
                       VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG, SIMD_EXC;
def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG, SIMD_EXC;
def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, VEX_WIG, SIMD_EXC;
}
def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
                     Sched<[WriteCvtPS2I]>, SIMD_EXC;
def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}",
                     [(set
VR128:$dst, 1512 (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>, 1513 Sched<[WriteCvtPS2ILd]>, SIMD_EXC; 1514 1515 1516// Convert Packed Double FP to Packed DW Integers 1517let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in { 1518// The assembler can recognize rr 256-bit instructions by seeing a ymm 1519// register, but the same isn't true when using memory operands instead. 1520// Provide other assembly rr and rm forms to address this explicitly. 1521def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1522 "vcvtpd2dq\t{$src, $dst|$dst, $src}", 1523 [(set VR128:$dst, 1524 (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>, 1525 VEX, Sched<[WriteCvtPD2I]>, VEX_WIG; 1526 1527// XMM only 1528def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1529 "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}", 1530 [(set VR128:$dst, 1531 (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX, 1532 Sched<[WriteCvtPD2ILd]>, VEX_WIG; 1533 1534// YMM only 1535def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), 1536 "vcvtpd2dq\t{$src, $dst|$dst, $src}", 1537 [(set VR128:$dst, 1538 (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>, 1539 VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG; 1540def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), 1541 "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}", 1542 [(set VR128:$dst, 1543 (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>, 1544 VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG; 1545} 1546 1547def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}", 1548 (VCVTPD2DQrr VR128:$dst, VR128:$src), 0, "att">; 1549def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}", 1550 (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">; 1551 1552def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1553 "cvtpd2dq\t{$src, $dst|$dst, $src}", 1554 [(set VR128:$dst, 1555 (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>, 1556 Sched<[WriteCvtPD2ILd]>, SIMD_EXC; 
// Non-VEX SSE2 register form (memory form defined just above).
def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtpd2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                        (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
                     Sched<[WriteCvtPD2I]>, SIMD_EXC;

// Convert with truncation packed single/double fp to doubleword
// SSE2 packed instructions with XS prefix
let Uses = [MXCSR], mayRaiseFPException = 1 in {
let Predicates = [HasAVX, NoVLX] in {
def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "cvttps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
                         VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvttps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86any_cvttp2si (loadv4f32 addr:$src))))]>,
                         VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst,
                            (v8i32 (X86any_cvttp2si (v8f32 VR256:$src))))]>,
                          VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst,
                            (v8i32 (X86any_cvttp2si (loadv8f32 addr:$src))))]>,
                          VEX, VEX_L,
                          Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
}

def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
                       Sched<[WriteCvtPS2I]>;
def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvttps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86any_cvttp2si (memopv4f32 addr:$src))))]>,
                       Sched<[WriteCvtPS2ILd]>;
} // Uses = [MXCSR], mayRaiseFPException = 1

// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
// XMM only
def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvttpd2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
                        VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
// Memory form carries an explicit {x} suffix in the mnemonic to
// disambiguate the 128-bit source from the 256-bit one.
def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        "cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (X86any_cvttp2si (loadv2f64 addr:$src))))]>,
                        VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG;

// YMM only
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                         "cvttpd2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86any_cvttp2si (v4f64 VR256:$src))))]>,
                         VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                         "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86any_cvttp2si (loadv4f64 addr:$src))))]>,
                         VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
} // Predicates = [HasAVX, NoVLX]

// AT&T-mode aliases so the explicitly-suffixed mnemonics also match the
// register forms.
def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
                (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (any_fp_to_sint (v4f64 VR256:$src))),
            (VCVTTPD2DQYrr VR256:$src)>;
  def : Pat<(v4i32 (any_fp_to_sint (loadv4f64 addr:$src))),
            (VCVTTPD2DQYrm addr:$src)>;
}

def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
                      Sched<[WriteCvtPD2I]>, SIMD_EXC;
def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
                      "cvttpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86any_cvttp2si (memopv2f64 addr:$src))))]>,
                      Sched<[WriteCvtPD2ILd]>, SIMD_EXC;

// Convert packed single to packed double
let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
  // SSE2 instructions without OpSize prefix
def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
                    PS, VEX, Sched<[WriteCvtPS2PD]>, VEX_WIG;
def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                    "vcvtps2pd\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
                    PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, VEX_WIG;
def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst, (v4f64 (any_fpextend (v4f32 VR128:$src))))]>,
                     PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, VEX_WIG;
def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>,
                     PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, VEX_WIG;
}

let Predicates = [UseSSE2], Uses = [MXCSR], mayRaiseFPException = 1 in {
def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "cvtps2pd\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
                   PS, Sched<[WriteCvtPS2PD]>;
def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                   "cvtps2pd\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>,
                   PS, Sched<[WriteCvtPS2PD.Folded]>;
}

// Convert Packed DW Integers to Packed Double FP
let Predicates = [HasAVX, NoVLX] in {
// The pattern only reads the low 64 bits of the source, so the load is
// modeled explicitly (hasSideEffects = 0, mayLoad = 1).
let hasSideEffects = 0, mayLoad = 1 in
def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                       "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v2f64 (X86any_VSintToFP
                                 (bc_v4i32
                                  (v2i64 (scalar_to_vector
                                          (loadi64 addr:$src)))))))]>,
                       VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG;
def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
                       VEX, Sched<[WriteCvtI2PD]>, VEX_WIG;
def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (v4f64 (any_sint_to_fp (loadv4i32 addr:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>,
                        VEX_WIG;
def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                        "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (v4f64 (any_sint_to_fp (v4i32 VR128:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtI2PDY]>, VEX_WIG;
}

let hasSideEffects = 0, mayLoad = 1 in
def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                      "cvtdq2pd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v2f64 (X86any_VSintToFP
                                (bc_v4i32
                                 (v2i64 (scalar_to_vector
                                         (loadi64 addr:$src)))))))]>,
                      Sched<[WriteCvtI2PDLd]>;
def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvtdq2pd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
                      Sched<[WriteCvtI2PD]>;

// AVX register conversion intrinsics
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
            (VCVTDQ2PDrm addr:$src)>;
} // Predicates = [HasAVX, NoVLX]

// SSE2 register conversion intrinsics
let Predicates = [UseSSE2] in {
  def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
            (CVTDQ2PDrm addr:$src)>;
} // Predicates = [UseSSE2]

// Convert packed double to packed single
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
// XMM only
def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtpd2ps\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (X86any_vfpround (v2f64 VR128:$src)))]>,
                       VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG;
def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (X86any_vfpround (loadv2f64 addr:$src)))]>,
                       VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG;

def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                        "cvtpd2ps\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (X86any_vfpround VR256:$src))]>,
                        VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (X86any_vfpround (loadv4f64 addr:$src)))]>,
                        VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG;
} // Predicates = [HasAVX, NoVLX]

// AT&T-mode aliases so the explicitly-suffixed mnemonics also match the
// register forms.
def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSrr VR128:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0, "att">;

def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (X86any_vfpround (v2f64 VR128:$src)))]>,
                     Sched<[WriteCvtPD2PS]>, SIMD_EXC;
def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (X86any_vfpround (memopv2f64 addr:$src)))]>,
                     Sched<[WriteCvtPD2PS.Folded]>, SIMD_EXC;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Compare Instructions
//===----------------------------------------------------------------------===//

// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
// The *_Int forms operate on full VR128 values (for the intrinsics); the
// isCodeGenOnly rr/rm forms operate on the scalar register class RC.
multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
                            Operand memop, SDNode OpNode, ValueType VT,
                            PatFrag ld_frag, string asm,
                            X86FoldableSchedWrite sched,
                            PatFrags mem_frags> {
  def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2, u8imm:$cc), asm,
                    [(set VR128:$dst, (OpNode (VT VR128:$src1),
                                              VR128:$src2, timm:$cc))]>,
               Sched<[sched]>, SIMD_EXC;
  let mayLoad = 1 in
  def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, memop:$src2, u8imm:$cc), asm,
                    [(set VR128:$dst, (OpNode (VT VR128:$src1),
                                              (mem_frags addr:$src2), timm:$cc))]>,
               Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;

  let isCodeGenOnly = 1 in {
    let isCommutable = 1 in
    def rr : SIi8<0xC2, MRMSrcReg,
                  (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
                  [(set RC:$dst, (OpNode RC:$src1, RC:$src2, timm:$cc))]>,
             Sched<[sched]>, SIMD_EXC;
    def rm : SIi8<0xC2, MRMSrcMem,
                  (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
                  [(set RC:$dst, (OpNode RC:$src1,
                                         (ld_frag addr:$src2), timm:$cc))]>,
             Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
  }
}

let ExeDomain = SSEPackedSingle in
defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
                 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>,
                 XS, VEX_4V, VEX_LIG, VEX_WIG;
let ExeDomain = SSEPackedDouble in
defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
                 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
                 XD, VEX_4V, VEX_LIG, VEX_WIG;

let Constraints = "$src1 = $dst" in {
  let ExeDomain = SSEPackedSingle in
  defm CMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
                  "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                  SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
  let ExeDomain = SSEPackedDouble in
  defm CMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
                  "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                  SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
}

// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
                         ValueType vt, X86MemOperand x86memop,
                         PatFrag ld_frag, string OpcodeStr, Domain d,
                         X86FoldableSchedWrite sched = WriteFComX> {
  let ExeDomain = d in {
  def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
             [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
          Sched<[sched]>, SIMD_EXC;
  let mayLoad = 1 in
  def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
             [(set EFLAGS, (OpNode (vt RC:$src1),
                                   (ld_frag addr:$src2)))]>,
          Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}
}

// sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
                             ValueType vt, Operand memop,
                             PatFrags mem_frags, string OpcodeStr,
                             Domain d,
                             X86FoldableSchedWrite sched = WriteFComX> {
let ExeDomain = d in {
  def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
                 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                 [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
              Sched<[sched]>, SIMD_EXC;
let mayLoad = 1 in
  def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
                 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                 [(set EFLAGS, (OpNode (vt RC:$src1),
                                       (mem_frags addr:$src2)))]>,
              Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}
}

let Defs = [EFLAGS] in {
  defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
                                "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
  defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
                                "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
  defm VCOMISS : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
                               "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
  defm VCOMISD : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
                               "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;

  let isCodeGenOnly = 1 in {
    defm VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
                      sse_load_f32, "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
    defm VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
                      sse_load_f64, "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;

    defm VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
                     sse_load_f32, "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
    defm VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
                     sse_load_f64, "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
  }
  defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
                               "ucomiss", SSEPackedSingle>, PS;
  // Non-VEX SSE forms of the (u)comis instructions; these continue the
  // Defs = [EFLAGS] group opened earlier in this section.
  defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
                               "ucomisd", SSEPackedDouble>, PD;
  defm COMISS : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
                              "comiss", SSEPackedSingle>, PS;
  defm COMISD : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
                              "comisd", SSEPackedDouble>, PD;

  let isCodeGenOnly = 1 in {
    defm UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
                     sse_load_f32, "ucomiss", SSEPackedSingle>, PS;
    defm UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
                     sse_load_f64, "ucomisd", SSEPackedDouble>, PD;

    defm COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
                    sse_load_f32, "comiss", SSEPackedSingle>, PS;
    defm COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
                    sse_load_f64, "comisd", SSEPackedDouble>, PD;
  }
} // Defs = [EFLAGS]

// sse12_cmp_packed - sse 1 & 2 compare packed instructions
multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
                            ValueType VT, string asm,
                            X86FoldableSchedWrite sched,
                            Domain d, PatFrag ld_frag> {
  let isCommutable = 1 in
  def rri : PIi8<0xC2, MRMSrcReg,
             (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
             [(set RC:$dst, (VT (X86any_cmpp RC:$src1, RC:$src2, timm:$cc)))], d>,
            Sched<[sched]>, SIMD_EXC;
  def rmi : PIi8<0xC2, MRMSrcMem,
             (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
             [(set RC:$dst,
               (VT (X86any_cmpp RC:$src1, (ld_frag addr:$src2), timm:$cc)))], d>,
            Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}

defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG;
defm VCMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG;
defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, v8f32,
               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, v4f64,
               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst" in {
  defm CMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
                 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS;
  defm CMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
                 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD;
}

// Matches the cmp{pd,ps,sd,ss} immediates whose predicates are symmetric in
// their operands (0x0 EQ, 0x3 UNORD, 0x4 NEQ, 0x7 ORD), i.e. the compares
// whose operands may legally be swapped.
def CommutableCMPCC : PatLeaf<(timm), [{
  uint64_t Imm = N->getZExtValue() & 0x7;
  return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07);
}]>;

// Patterns to select compares with loads in first operand.
let Predicates = [HasAVX] in {
  // The compare predicate is commutable (see CommutableCMPCC), so a load on
  // the first operand can be folded by swapping the operands into the
  // instruction's memory form.
  def : Pat<(v4f64 (X86any_cmpp (loadv4f64 addr:$src2), VR256:$src1,
                                CommutableCMPCC:$cc)),
            (VCMPPDYrmi VR256:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(v8f32 (X86any_cmpp (loadv8f32 addr:$src2), VR256:$src1,
                                CommutableCMPCC:$cc)),
            (VCMPPSYrmi VR256:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(v2f64 (X86any_cmpp (loadv2f64 addr:$src2), VR128:$src1,
                                CommutableCMPCC:$cc)),
            (VCMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(v4f32 (X86any_cmpp (loadv4f32 addr:$src2), VR128:$src1,
                                CommutableCMPCC:$cc)),
            (VCMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
                          CommutableCMPCC:$cc)),
            (VCMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
                          CommutableCMPCC:$cc)),
            (VCMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v2f64 (X86any_cmpp (memopv2f64 addr:$src2), VR128:$src1,
                                CommutableCMPCC:$cc)),
            (CMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
                          CommutableCMPCC:$cc)),
            (CMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
}

let Predicates = [UseSSE1] in {
  def : Pat<(v4f32 (X86any_cmpp (memopv4f32 addr:$src2), VR128:$src1,
                                CommutableCMPCC:$cc)),
            (CMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;

  def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
                          CommutableCMPCC:$cc)),
            (CMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Shuffle Instructions
//===----------------------------------------------------------------------===//

/// sse12_shuffle - sse 1 & 2 fp shuffle instructions
multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
                         ValueType vt, string asm, PatFrag mem_frag,
                         X86FoldableSchedWrite sched, Domain d,
                         bit IsCommutable = 0> {
  def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
                 (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
                 [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
                                              (i8 timm:$src3))))], d>,
            Sched<[sched.Folded, sched.ReadAfterFold]>;
  let isCommutable = IsCommutable in
  def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
                 (ins RC:$src1, RC:$src2, u8imm:$src3), asm,
                 [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
                                              (i8 timm:$src3))))], d>,
            Sched<[sched]>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>,
           PS, VEX_4V, VEX_WIG;
  defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
           "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>,
           PS, VEX_4V, VEX_L, VEX_WIG;
  defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>,
           PD, VEX_4V, VEX_WIG;
  defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
           loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>,
           PD, VEX_4V, VEX_L, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
  defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
                    "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                    memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
  defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
                    "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                    memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
}
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Unpack FP Instructions
//===----------------------------------------------------------------------===//

/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
                                   PatFrag mem_frag, RegisterClass RC,
                                   X86MemOperand x86memop, string asm,
                                   X86FoldableSchedWrite sched, Domain d,
                                   bit IsCommutable = 0> {
    let isCommutable = IsCommutable in
    def rr : PI<opc, MRMSrcReg,
                (outs RC:$dst), (ins RC:$src1, RC:$src2),
                asm, [(set RC:$dst,
                           (vt (OpNode RC:$src1, RC:$src2)))], d>,
             Sched<[sched]>;
    def rm : PI<opc, MRMSrcMem,
                (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                asm, [(set RC:$dst,
                           (vt (OpNode RC:$src1,
                                       (mem_frag addr:$src2))))], d>,
             Sched<[sched.Folded, sched.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoVLX] in {
defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load,
      VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load,
      VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG;
defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load,
      VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load,
      VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;

defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load,
      VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load,
      VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load,
      VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load,
      VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
}// Predicates = [HasAVX, NoVLX]

let Constraints = "$src1 = $dst" in {
  defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memop,
        VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
                       SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
  defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memop,
        VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
                       SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD;
  defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop,
        VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
                       SchedWriteFShuffle.XMM, SSEPackedSingle>, PS;
  defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop,
        VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
                       SchedWriteFShuffle.XMM, SSEPackedDouble>, PD;
} // Constraints = "$src1 = $dst"

// Without AVX2 the 256-bit integer unpacks are not available, so select the
// FP unpack instructions for integer vector types as well.
let Predicates = [HasAVX1Only] in {
  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (loadv8i32 addr:$src2))),
            (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
            (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (loadv8i32 addr:$src2))),
            (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
            (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
            (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
            (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
            (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
            (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
}

let Predicates = [UseSSE2] in {
  // Use MOVHPD if the load isn't aligned enough for UNPCKLPD.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                              (v2f64 (simple_load addr:$src2)))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Extract Floating-Point Sign mask
//===----------------------------------------------------------------------===//

/// sse12_extr_sign_mask - sse 1 & 2 sign mask extraction (movmsk); copies the
/// sign bits of each packed fp element into a GR32/GR64 destination.
multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
                                string asm, Domain d> {
  def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], d>,
           Sched<[WriteFMOVMSK]>;
}

let Predicates = [HasAVX] in {
  defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
                                        SSEPackedSingle>, PS, VEX, VEX_WIG;
  defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
                                        SSEPackedDouble>, PD, VEX, VEX_WIG;
  defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
                                         SSEPackedSingle>, PS, VEX, VEX_L, VEX_WIG;
  defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
                                         SSEPackedDouble>, PD, VEX, VEX_L, VEX_WIG;

  // Also support integer VTs to avoid a int->fp bitcast in the DAG.
  def : Pat<(X86movmsk (v4i32 VR128:$src)),
            (VMOVMSKPSrr VR128:$src)>;
  def : Pat<(X86movmsk (v2i64 VR128:$src)),
            (VMOVMSKPDrr VR128:$src)>;
  def : Pat<(X86movmsk (v8i32 VR256:$src)),
            (VMOVMSKPSYrr VR256:$src)>;
  def : Pat<(X86movmsk (v4i64 VR256:$src)),
            (VMOVMSKPDYrr VR256:$src)>;
}

defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
                                     SSEPackedSingle>, PS;
defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
                                     SSEPackedDouble>, PD;

let Predicates = [UseSSE2] in {
  // Also support integer VTs to avoid a int->fp bitcast in the DAG.
  def : Pat<(X86movmsk (v4i32 VR128:$src)),
            (MOVMSKPSrr VR128:$src)>;
  def : Pat<(X86movmsk (v2i64 VR128:$src)),
            (MOVMSKPDrr VR128:$src)>;
}

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Logical Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in { // SSE integer instructions

/// PDI_binop_rm - Simple SSE2 binary operator.
multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                        ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                        X86MemOperand x86memop, X86FoldableSchedWrite sched,
                        bit IsCommutable, bit Is2Addr> {
  let isCommutable = IsCommutable in
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
               (ins RC:$src1, RC:$src2),
               !if(Is2Addr,
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
               Sched<[sched]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
               (ins RC:$src1, x86memop:$src2),
               !if(Is2Addr,
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst, (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
               Sched<[sched.Folded, sched.ReadAfterFold]>;
}
} // ExeDomain = SSEPackedInt

/// PDI_binop_all - Instantiate a packed-integer binary op in all three
/// encodings: legacy SSE (tied two-address), AVX 128-bit, and AVX2 256-bit.
multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
                         ValueType OpVT128, ValueType OpVT256,
                         X86SchedWriteWidths sched, bit IsCommutable,
                         Predicate prd> {
let Predicates = [HasAVX, prd] in
  defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
                             VR128, load, i128mem, sched.XMM,
                             IsCommutable, 0>, VEX_4V, VEX_WIG;

let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
                           memop, i128mem, sched.XMM, IsCommutable, 1>;

let Predicates = [HasAVX2, prd] in
  defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
                               OpVT256, VR256, load, i256mem, sched.YMM,
                               IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG;
}

// These are ordered here for pattern ordering requirements with the fp versions

defm PAND  : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
                           SchedWriteVecLogic, 1, NoVLX>;
defm POR   : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
                           SchedWriteVecLogic, 1, NoVLX>;
defm PXOR  : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
                           SchedWriteVecLogic, 1, NoVLX>;
defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
                           SchedWriteVecLogic, 0, NoVLX>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Logical Instructions
//===----------------------------------------------------------------------===//

/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
///
/// There are no patterns here because isel prefers integer versions for SSE2
/// and later. There are SSE1 v4f32 patterns later.
multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
                                   SDNode OpNode, X86SchedWriteWidths sched> {
  let Predicates = [HasAVX, NoVLX] in {
  defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
        !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM,
        [], [], 0>, PS, VEX_4V, VEX_L, VEX_WIG;

  defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
        !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM,
        [], [], 0>, PD, VEX_4V, VEX_L, VEX_WIG;

  defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
       !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
       [], [], 0>, PS, VEX_4V, VEX_WIG;

  defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
       !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
       [], [], 0>, PD, VEX_4V, VEX_WIG;
  }

  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
         !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
         [], []>, PS;

    defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
         !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
         [], []>, PD;
  }
}

defm AND : sse12_fp_packed_logical<0x54, "and", and, SchedWriteFLogic>;
defm OR  : sse12_fp_packed_logical<0x56, "or", or, SchedWriteFLogic>;
defm XOR : sse12_fp_packed_logical<0x57, "xor", xor, SchedWriteFLogic>;
let isCommutable = 0 in
  defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp, SchedWriteFLogic>;

// With AVX2, select the VEX-encoded integer logic instructions for all the
// integer element types (the instructions are bitwise; only v2i64/v4i64 have
// dedicated defs above, so the narrower VTs are mapped here by pattern).
let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
            (VPANDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
            (VPANDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
            (VPANDYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
            (VPORYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
            (VPORYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
            (VPORYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
            (VPXORYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
            (VPXORYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
            (VPXORYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
            (VPANDNYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
            (VPANDNYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
            (VPANDNYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
            (VPANDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
            (VPANDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
            (VPANDYrm VR256:$src1, addr:$src2)>;

  def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
            (VPORYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
            (VPORYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
            (VPORYrm VR256:$src1, addr:$src2)>;

  def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
            (VPXORYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
            (VPXORYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
            (VPXORYrm VR256:$src1, addr:$src2)>;

  def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
            (VPANDNYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
            (VPANDNYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
            (VPANDNYrm VR256:$src1, addr:$src2)>;
}

// If only AVX1 is supported, we need to handle integer operations with
// floating point instructions since the integer versions aren't available.
let Predicates = [HasAVX1Only] in {
  def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
            (VANDPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
            (VANDPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
            (VANDPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)),
            (VANDPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
            (VORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
            (VORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
            (VORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)),
            (VORPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
            (VXORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
            (VXORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
            (VXORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)),
            (VXORPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)),
            (VANDNPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
            (VANDPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
            (VANDPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
            (VANDPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)),
            (VANDPSYrm VR256:$src1, addr:$src2)>;

  def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
            (VORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
            (VORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
            (VORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)),
            (VORPSYrm VR256:$src1, addr:$src2)>;

  def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
            (VXORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
            (VXORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
            (VXORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)),
            (VXORPSYrm VR256:$src1, addr:$src2)>;

  def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
}

// 128-bit VEX-encoded integer logic for the narrower element types.
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
            (VPANDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
            (VPANDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
            (VPANDrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
            (VPORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
            (VPORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
            (VPORrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
            (VPXORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
            (VPXORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
            (VPXORrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
            (VPANDNrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
            (VPANDNrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
            (VPANDNrr VR128:$src1, VR128:$src2)>;

  def : Pat<(and VR128:$src1, (loadv16i8 addr:$src2)),
            (VPANDrm VR128:$src1, addr:$src2)>;
  def : Pat<(and VR128:$src1, (loadv8i16 addr:$src2)),
            (VPANDrm VR128:$src1, addr:$src2)>;
  def : Pat<(and VR128:$src1, (loadv4i32 addr:$src2)),
            (VPANDrm VR128:$src1, addr:$src2)>;

  def : Pat<(or VR128:$src1, (loadv16i8 addr:$src2)),
            (VPORrm VR128:$src1, addr:$src2)>;
  def : Pat<(or VR128:$src1, (loadv8i16 addr:$src2)),
            (VPORrm VR128:$src1, addr:$src2)>;
  def : Pat<(or VR128:$src1, (loadv4i32 addr:$src2)),
            (VPORrm VR128:$src1, addr:$src2)>;

  def : Pat<(xor VR128:$src1, (loadv16i8 addr:$src2)),
            (VPXORrm VR128:$src1, addr:$src2)>;
  def : Pat<(xor VR128:$src1, (loadv8i16 addr:$src2)),
            (VPXORrm VR128:$src1, addr:$src2)>;
  def : Pat<(xor VR128:$src1, (loadv4i32 addr:$src2)),
            (VPXORrm VR128:$src1, addr:$src2)>;

  def : Pat<(X86andnp VR128:$src1, (loadv16i8 addr:$src2)),
            (VPANDNrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR128:$src1, (loadv8i16 addr:$src2)),
            (VPANDNrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR128:$src1, (loadv4i32 addr:$src2)),
            (VPANDNrm VR128:$src1, addr:$src2)>;
}

// Legacy SSE2 encodings; note the memory forms require aligned loads (memop).
let Predicates = [UseSSE2] in {
  def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
            (PANDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
            (PANDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
            (PANDrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
            (PORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
            (PORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
            (PORrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
            (PXORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
            (PXORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
            (PXORrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
            (PANDNrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
            (PANDNrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
            (PANDNrr VR128:$src1, VR128:$src2)>;

  def : Pat<(and VR128:$src1, (memopv16i8 addr:$src2)),
            (PANDrm VR128:$src1, addr:$src2)>;
  def : Pat<(and VR128:$src1, (memopv8i16 addr:$src2)),
            (PANDrm VR128:$src1, addr:$src2)>;
  def : Pat<(and VR128:$src1, (memopv4i32 addr:$src2)),
            (PANDrm VR128:$src1, addr:$src2)>;

  def : Pat<(or VR128:$src1, (memopv16i8 addr:$src2)),
            (PORrm VR128:$src1, addr:$src2)>;
  def : Pat<(or VR128:$src1, (memopv8i16 addr:$src2)),
            (PORrm VR128:$src1, addr:$src2)>;
  def : Pat<(or VR128:$src1, (memopv4i32 addr:$src2)),
            (PORrm VR128:$src1, addr:$src2)>;

  def : Pat<(xor VR128:$src1, (memopv16i8 addr:$src2)),
            (PXORrm VR128:$src1, addr:$src2)>;
  def : Pat<(xor VR128:$src1, (memopv8i16 addr:$src2)),
            (PXORrm VR128:$src1, addr:$src2)>;
  def : Pat<(xor VR128:$src1, (memopv4i32 addr:$src2)),
            (PXORrm VR128:$src1, addr:$src2)>;

  def : Pat<(X86andnp VR128:$src1, (memopv16i8 addr:$src2)),
            (PANDNrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR128:$src1, (memopv8i16 addr:$src2)),
            (PANDNrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR128:$src1, (memopv4i32 addr:$src2)),
            (PANDNrm VR128:$src1, addr:$src2)>;
}

// Patterns for packed operations when we don't have integer type available.
def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
          (ANDPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)),
          (ORPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)),
          (XORPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)),
          (ANDNPSrr VR128:$src1, VR128:$src2)>;

// Memory forms fold only aligned loads (memop) for the legacy encodings.
def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)),
          (ANDPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)),
          (ORPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)),
          (XORPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
          (ANDNPSrm VR128:$src1, addr:$src2)>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Arithmetic Instructions
//===----------------------------------------------------------------------===//

/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
/// vector forms.
///
/// In addition, we also have a special variant of the scalar form here to
/// represent the associated intrinsic operation. This form is unlike the
/// plain scalar form, in that it takes an entire vector (instead of a scalar)
/// and leaves the top elements unmodified (therefore these cannot be commuted).
///
/// These three forms can each be reg+reg or reg+mem.
///

/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
/// classes below
multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
                                  SDNode OpNode, X86SchedWriteSizes sched> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  let Predicates = [HasAVX, NoVLX] in {
  defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
                               VR128, v4f32, f128mem, loadv4f32,
                               SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX_4V, VEX_WIG;
  defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
                               VR128, v2f64, f128mem, loadv2f64,
                               SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX_4V, VEX_WIG;

  defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
                        OpNode, VR256, v8f32, f256mem, loadv8f32,
                        SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX_4V, VEX_L, VEX_WIG;
  defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
                        OpNode, VR256, v4f64, f256mem, loadv4f64,
                        SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX_4V, VEX_L, VEX_WIG;
  }

  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
                              v4f32, f128mem, memopv4f32, SSEPackedSingle,
                              sched.PS.XMM>, PS;
    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
                              v2f64, f128mem, memopv2f64, SSEPackedDouble,
                              sched.PD.XMM>, PD;
  }
}
}

multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                                  X86SchedWriteSizes sched> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
                         OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>,
                         XS, VEX_4V, VEX_LIG, VEX_WIG;
  defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
                         OpNode, FR64, f64mem, SSEPackedDouble, sched.PD.Scl, 0>,
                         XD, VEX_4V, VEX_LIG, VEX_WIG;

  let Constraints = "$src1 = $dst" in {
    defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
                              OpNode, FR32, f32mem, SSEPackedSingle,
                              sched.PS.Scl>, XS;
    defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
                              OpNode, FR64, f64mem, SSEPackedDouble,
                              sched.PD.Scl>, XD;
  }
}
}

multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
                                      SDPatternOperator OpNode,
                                      X86SchedWriteSizes sched> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
                   !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
                   SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG;
  defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
                   !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
                   SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG;

  let Constraints = "$src1 = $dst" in {
    defm SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
                   !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
                   SSEPackedSingle, sched.PS.Scl>, XS;
    defm SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
                   !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
                   SSEPackedDouble, sched.PD.Scl>, XD;
  }
}
}

// Binary Arithmetic instructions
defm ADD : basic_sse12_fp_binop_p<0x58, "add", any_fadd, SchedWriteFAddSizes>,
           basic_sse12_fp_binop_s<0x58, "add", any_fadd, SchedWriteFAddSizes>,
           basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SchedWriteFAddSizes>;
defm MUL : basic_sse12_fp_binop_p<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
           basic_sse12_fp_binop_s<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
           basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SchedWriteFMulSizes>;
let isCommutable = 0 in {
  defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
             basic_sse12_fp_binop_s<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
             basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SchedWriteFAddSizes>;
  defm DIV : basic_sse12_fp_binop_p<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
             basic_sse12_fp_binop_s<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
             basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>;
  defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>;
  defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>;
}

// Commutative "max"/"min" variants, usable when the compiler has proven the
// operands are safe to commute; codegen-only (never parsed/printed).
let isCodeGenOnly = 1 in {
  defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>;
  defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SchedWriteFCmpSizes>;
}

// Patterns used to select SSE scalar fp arithmetic instructions from
// either:
//
// (1) a scalar fp operation followed by a blend
//
// The effect is that the backend no longer emits unnecessary vector
// insert instructions immediately after SSE scalar fp instructions
// like addss or mulss.
//
// For example, given the following code:
//   __m128 foo(__m128 A, __m128 B) {
//     A[0] += B[0];
//     return A;
//   }
//
// Previously we generated:
//   addss %xmm0, %xmm1
//   movss %xmm1, %xmm0
//
// We now generate:
//   addss %xmm1, %xmm0
//
// (2) a vector packed single/double fp operation followed by a vector insert
//
// The effect is that the backend converts the packed fp instruction
// followed by a vector insert into a single SSE scalar fp instruction.
//
// For example, given the following code:
//   __m128 foo(__m128 A, __m128 B) {
//     __m128 C = A + B;
//     return (__m128) {c[0], a[1], a[2], a[3]};
//   }
//
// Previously we generated:
//   addps %xmm0, %xmm1
//   movss %xmm1, %xmm0
//
// We now generate:
//   addss %xmm1, %xmm0

// TODO: Some canonicalization in lowering would simplify the number of
// patterns we have to try to match.
multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
                                ValueType VT, ValueType EltTy,
                                RegisterClass RC, PatFrag ld_frag,
                                Predicate BasePredicate> {
  let Predicates = [BasePredicate] in {
    // extracted scalar math op with insert via movss/movsd
    def : Pat<(VT (Move (VT VR128:$dst),
                        (VT (scalar_to_vector
                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
                                 RC:$src))))),
              (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst,
               (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
    def : Pat<(VT (Move (VT VR128:$dst),
                        (VT (scalar_to_vector
                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
                                 (ld_frag addr:$src)))))),
              (!cast<Instruction>(OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
  }

  // Repeat for AVX versions of the instructions.
  let Predicates = [UseAVX] in {
    // extracted scalar math op with insert via movss/movsd
    def : Pat<(VT (Move (VT VR128:$dst),
                        (VT (scalar_to_vector
                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
                                 RC:$src))))),
              (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst,
               (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
    def : Pat<(VT (Move (VT VR128:$dst),
                        (VT (scalar_to_vector
                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
                                 (ld_frag addr:$src)))))),
              (!cast<Instruction>("V"#OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
  }
}

defm : scalar_math_patterns<any_fadd, "ADDSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
defm : scalar_math_patterns<any_fsub, "SUBSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
defm : scalar_math_patterns<any_fmul, "MULSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
defm : scalar_math_patterns<any_fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;

defm : scalar_math_patterns<any_fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
defm : scalar_math_patterns<any_fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
defm : scalar_math_patterns<any_fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
defm : scalar_math_patterns<any_fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;

/// Unop Arithmetic
/// In addition, we also have a special variant of the scalar form here to
/// represent the associated intrinsic operation. This form is unlike the
/// plain scalar form, in that it takes an entire vector (instead of a
/// scalar) and leaves the top elements undefined.
///
/// And, we have a special variant form for a full-vector intrinsic form.

/// sse_fp_unop_s - SSE1 unops in scalar form
/// For the non-AVX defs, we need $src1 to be tied to $dst because
/// the HW instructions are 2 operand / destructive.
multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
                         ValueType ScalarVT, X86MemOperand x86memop,
                         Operand intmemop, SDNode OpNode, Domain d,
                         X86FoldableSchedWrite sched, Predicate target> {
  let isCodeGenOnly = 1, hasSideEffects = 0 in {
  def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
            !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
            [(set RC:$dst, (OpNode RC:$src1))], d>, Sched<[sched]>,
            Requires<[target]>;
  let mayLoad = 1 in
  def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1),
            !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"),
            [(set RC:$dst, (OpNode (load addr:$src1)))], d>,
            Sched<[sched.Folded]>,
            Requires<[target, OptForSize]>;
  }

  let hasSideEffects = 0, Constraints = "$src1 = $dst", ExeDomain = d in {
  def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
              Sched<[sched]>;
  let mayLoad = 1 in
  def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2),
                !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>,
              Sched<[sched.Folded, sched.ReadAfterFold]>;
  }

}

multiclass sse_fp_unop_s_intr<RegisterClass RC, ValueType vt,
                              PatFrags mem_frags, Intrinsic Intr,
                              Predicate target, string Suffix> {
  let Predicates = [target] in {
  // These are unary operations, but they are modeled as having 2 source operands
  // because the high elements of the destination are unchanged in SSE.
  def : Pat<(Intr VR128:$src),
            (!cast<Instruction>(NAME#r_Int) VR128:$src, VR128:$src)>;
  }
  // We don't want to fold scalar loads into these instructions unless
  // optimizing for size. This is because the folded instruction will have a
  // partial register update, while the unfolded sequence will not, e.g.
  // movss mem, %xmm0
  // rcpss %xmm0, %xmm0
  // which has a clobber before the rcp, vs.
  // rcpss mem, %xmm0
  let Predicates = [target, OptForSize] in {
    def : Pat<(Intr (mem_frags addr:$src2)),
              (!cast<Instruction>(NAME#m_Int)
                    (vt (IMPLICIT_DEF)), addr:$src2)>;
  }
}

multiclass avx_fp_unop_s_intr<RegisterClass RC, ValueType vt, PatFrags mem_frags,
                              Intrinsic Intr, Predicate target> {
  let Predicates = [target] in {
   def : Pat<(Intr VR128:$src),
             (!cast<Instruction>(NAME#r_Int) VR128:$src,
                                 VR128:$src)>;
  }
  let Predicates = [target, OptForSize] in {
    def : Pat<(Intr (mem_frags addr:$src2)),
              (!cast<Instruction>(NAME#m_Int)
                    (vt (IMPLICIT_DEF)), addr:$src2)>;
  }
}

multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
                         ValueType ScalarVT, X86MemOperand x86memop,
                         Operand intmemop, SDNode OpNode, Domain d,
                         X86FoldableSchedWrite sched, Predicate target> {
  let isCodeGenOnly = 1, hasSideEffects = 0 in {
  def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [], d>, Sched<[sched]>;
  let mayLoad = 1 in
  def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [], d>, Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
  let hasSideEffects = 0, ExeDomain = d in {
  def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
                (ins VR128:$src1, VR128:$src2),
                !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                []>, Sched<[sched]>;
  let mayLoad = 1 in
  def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
                (ins VR128:$src1, intmemop:$src2),
                !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
  }

  // We don't want to fold scalar loads into these instructions unless
  // optimizing for size. This is because the folded instruction will have a
  // partial register update, while the unfolded sequence will not, e.g.
  // vmovss mem, %xmm0
  // vrcpss %xmm0, %xmm0, %xmm0
  // which has a clobber before the rcp, vs.
  // vrcpss mem, %xmm0, %xmm0
  // TODO: In theory, we could fold the load, and avoid the stall caused by
  // the partial register store, either in BreakFalseDeps or with smarter RA.
  let Predicates = [target] in {
   def : Pat<(OpNode RC:$src),  (!cast<Instruction>(NAME#r)
                                (ScalarVT (IMPLICIT_DEF)), RC:$src)>;
  }
  let Predicates = [target, OptForSize] in {
    def : Pat<(ScalarVT (OpNode (load addr:$src))),
              (!cast<Instruction>(NAME#m) (ScalarVT (IMPLICIT_DEF)),
            addr:$src)>;
  }
}

/// sse1_fp_unop_p - SSE1 unops in packed form.
multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          X86SchedWriteWidths sched, list<Predicate> prds> {
let Predicates = prds in {
  def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       !strconcat("v", OpcodeStr,
                                  "ps\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
                       VEX, Sched<[sched.XMM]>, VEX_WIG;
  def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       !strconcat("v", OpcodeStr,
                                  "ps\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>,
                       VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
  def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        !strconcat("v", OpcodeStr,
                                   "ps\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>,
                        VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
  def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        !strconcat("v", OpcodeStr,
                                   "ps\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))]>,
                        VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
}

  def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
                Sched<[sched.XMM]>;
  def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>,
                Sched<[sched.XMM.Folded]>;
}

/// sse2_fp_unop_p - SSE2 unops in vector forms.
multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
                          SDNode OpNode, X86SchedWriteWidths sched> {
let Predicates = [HasAVX, NoVLX] in {
  def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       !strconcat("v", OpcodeStr,
                                  "pd\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
                       VEX, Sched<[sched.XMM]>, VEX_WIG;
  def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       !strconcat("v", OpcodeStr,
                                  "pd\t{$src, $dst|$dst, $src}"),
                       [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>,
                       VEX, Sched<[sched.XMM.Folded]>, VEX_WIG;
  def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        !strconcat("v", OpcodeStr,
                                   "pd\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>,
                        VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG;
  def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        !strconcat("v", OpcodeStr,
                                   "pd\t{$src, $dst|$dst, $src}"),
                        [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))]>,
                        VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG;
}

  def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
                Sched<[sched.XMM]>;
  def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
                [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>,
                Sched<[sched.XMM.Folded]>;
}

multiclass sse1_fp_unop_s_intr<bits<8> opc, string OpcodeStr, SDNode OpNode,
                               X86SchedWriteWidths sched, Predicate AVXTarget> {
  defm SS        :  sse_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
                      !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss),
                      UseSSE1, "SS">, XS;
  defm V#NAME#SS  : avx_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
                      !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss),
                      AVXTarget>,
                      XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
}

multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          X86SchedWriteWidths sched, Predicate AVXTarget> {
  defm SS        :  sse_fp_unop_s<opc, OpcodeStr#ss, FR32, f32, f32mem,
                      ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS;
  defm V#NAME#SS  : avx_fp_unop_s<opc, "v"#OpcodeStr#ss, FR32, f32,
                      f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>,
                      XS, VEX_4V, VEX_LIG, VEX_WIG;
}

multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          X86SchedWriteWidths sched, Predicate AVXTarget> {
  defm SD         : sse_fp_unop_s<opc, OpcodeStr#sd, FR64, f64, f64mem,
                      sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD;
  defm V#NAME#SD  : avx_fp_unop_s<opc, "v"#OpcodeStr#sd, FR64, f64,
                      f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>,
                      XD, VEX_4V, VEX_LIG, VEX_WIG;
}

// Square root.
defm SQRT  : sse1_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, UseAVX>,
             sse1_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>,
             sse2_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64, UseAVX>,
             sse2_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64>, SIMD_EXC;

// Reciprocal approximations.
Note that these typically require refinement 3011// in order to obtain suitable precision. 3012defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>, 3013 sse1_fp_unop_s_intr<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>, 3014 sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>; 3015defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>, 3016 sse1_fp_unop_s_intr<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>, 3017 sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>; 3018 3019// There is no f64 version of the reciprocal approximation instructions. 3020 3021multiclass scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix, SDNode Move, 3022 ValueType VT, Predicate BasePredicate> { 3023 let Predicates = [BasePredicate] in { 3024 def : Pat<(VT (Move VT:$dst, (scalar_to_vector 3025 (OpNode (extractelt VT:$src, 0))))), 3026 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>; 3027 } 3028 3029 // Repeat for AVX versions of the instructions. 3030 let Predicates = [UseAVX] in { 3031 def : Pat<(VT (Move VT:$dst, (scalar_to_vector 3032 (OpNode (extractelt VT:$src, 0))))), 3033 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>; 3034 } 3035} 3036 3037defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>; 3038defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>; 3039 3040multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix, 3041 SDNode Move, ValueType VT, 3042 Predicate BasePredicate> { 3043 let Predicates = [BasePredicate] in { 3044 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))), 3045 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>; 3046 } 3047 3048 // Repeat for AVX versions of the instructions. 
3049 let Predicates = [HasAVX] in { 3050 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))), 3051 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>; 3052 } 3053} 3054 3055defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss, 3056 v4f32, UseSSE1>; 3057defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss, 3058 v4f32, UseSSE1>; 3059 3060 3061//===----------------------------------------------------------------------===// 3062// SSE 1 & 2 - Non-temporal stores 3063//===----------------------------------------------------------------------===// 3064 3065let AddedComplexity = 400 in { // Prefer non-temporal versions 3066let Predicates = [HasAVX, NoVLX] in { 3067let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in { 3068def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs), 3069 (ins f128mem:$dst, VR128:$src), 3070 "movntps\t{$src, $dst|$dst, $src}", 3071 [(alignednontemporalstore (v4f32 VR128:$src), 3072 addr:$dst)]>, VEX, VEX_WIG; 3073def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs), 3074 (ins f128mem:$dst, VR128:$src), 3075 "movntpd\t{$src, $dst|$dst, $src}", 3076 [(alignednontemporalstore (v2f64 VR128:$src), 3077 addr:$dst)]>, VEX, VEX_WIG; 3078} // SchedRW 3079 3080let SchedRW = [SchedWriteFMoveLSNT.YMM.MR] in { 3081def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs), 3082 (ins f256mem:$dst, VR256:$src), 3083 "movntps\t{$src, $dst|$dst, $src}", 3084 [(alignednontemporalstore (v8f32 VR256:$src), 3085 addr:$dst)]>, VEX, VEX_L, VEX_WIG; 3086def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs), 3087 (ins f256mem:$dst, VR256:$src), 3088 "movntpd\t{$src, $dst|$dst, $src}", 3089 [(alignednontemporalstore (v4f64 VR256:$src), 3090 addr:$dst)]>, VEX, VEX_L, VEX_WIG; 3091} // SchedRW 3092 3093let ExeDomain = SSEPackedInt in { 3094def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs), 3095 (ins i128mem:$dst, VR128:$src), 3096 "movntdq\t{$src, $dst|$dst, $src}", 3097 [(alignednontemporalstore (v2i64 VR128:$src), 3098 addr:$dst)]>, VEX, VEX_WIG, 3099 
Sched<[SchedWriteVecMoveLSNT.XMM.MR]>; 3100def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs), 3101 (ins i256mem:$dst, VR256:$src), 3102 "movntdq\t{$src, $dst|$dst, $src}", 3103 [(alignednontemporalstore (v4i64 VR256:$src), 3104 addr:$dst)]>, VEX, VEX_L, VEX_WIG, 3105 Sched<[SchedWriteVecMoveLSNT.YMM.MR]>; 3106} // ExeDomain 3107} // Predicates 3108 3109let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in { 3110def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 3111 "movntps\t{$src, $dst|$dst, $src}", 3112 [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>; 3113def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 3114 "movntpd\t{$src, $dst|$dst, $src}", 3115 [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>; 3116} // SchedRW 3117 3118let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLSNT.XMM.MR] in 3119def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 3120 "movntdq\t{$src, $dst|$dst, $src}", 3121 [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)]>; 3122 3123let SchedRW = [WriteStoreNT] in { 3124// There is no AVX form for instructions below this point 3125def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), 3126 "movnti{l}\t{$src, $dst|$dst, $src}", 3127 [(nontemporalstore (i32 GR32:$src), addr:$dst)]>, 3128 PS, Requires<[HasSSE2]>; 3129def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), 3130 "movnti{q}\t{$src, $dst|$dst, $src}", 3131 [(nontemporalstore (i64 GR64:$src), addr:$dst)]>, 3132 PS, Requires<[HasSSE2]>; 3133} // SchedRW = [WriteStoreNT] 3134 3135let Predicates = [HasAVX, NoVLX] in { 3136 def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst), 3137 (VMOVNTDQYmr addr:$dst, VR256:$src)>; 3138 def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst), 3139 (VMOVNTDQYmr addr:$dst, VR256:$src)>; 3140 def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst), 3141 (VMOVNTDQYmr addr:$dst, 
VR256:$src)>; 3142 3143 def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), 3144 (VMOVNTDQmr addr:$dst, VR128:$src)>; 3145 def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst), 3146 (VMOVNTDQmr addr:$dst, VR128:$src)>; 3147 def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst), 3148 (VMOVNTDQmr addr:$dst, VR128:$src)>; 3149} 3150 3151let Predicates = [UseSSE2] in { 3152 def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), 3153 (MOVNTDQmr addr:$dst, VR128:$src)>; 3154 def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst), 3155 (MOVNTDQmr addr:$dst, VR128:$src)>; 3156 def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst), 3157 (MOVNTDQmr addr:$dst, VR128:$src)>; 3158} 3159 3160} // AddedComplexity 3161 3162//===----------------------------------------------------------------------===// 3163// SSE 1 & 2 - Prefetch and memory fence 3164//===----------------------------------------------------------------------===// 3165 3166// Prefetch intrinsic. 3167let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in { 3168def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src), 3169 "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>, TB; 3170def PREFETCHT1 : I<0x18, MRM2m, (outs), (ins i8mem:$src), 3171 "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>, TB; 3172def PREFETCHT2 : I<0x18, MRM3m, (outs), (ins i8mem:$src), 3173 "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>, TB; 3174def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src), 3175 "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>, TB; 3176} 3177 3178// FIXME: How should flush instruction be modeled? 3179let SchedRW = [WriteLoad] in { 3180// Flush cache 3181def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src), 3182 "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>, 3183 PS, Requires<[HasSSE2]>; 3184} 3185 3186let SchedRW = [WriteNop] in { 3187// Pause. 
This "instruction" is encoded as "rep; nop", so even though it 3188// was introduced with SSE2, it's backward compatible. 3189def PAUSE : I<0x90, RawFrm, (outs), (ins), 3190 "pause", [(int_x86_sse2_pause)]>, OBXS; 3191} 3192 3193let SchedRW = [WriteFence] in { 3194// Load, store, and memory fence 3195// TODO: As with mfence, we may want to ease the availability of sfence/lfence 3196// to include any 64-bit target. 3197def SFENCE : I<0xAE, MRM7X, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>, 3198 PS, Requires<[HasSSE1]>; 3199def LFENCE : I<0xAE, MRM5X, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>, 3200 PS, Requires<[HasSSE2]>; 3201def MFENCE : I<0xAE, MRM6X, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>, 3202 PS, Requires<[HasMFence]>; 3203} // SchedRW 3204 3205def : Pat<(X86MFence), (MFENCE)>; 3206 3207//===----------------------------------------------------------------------===// 3208// SSE 1 & 2 - Load/Store XCSR register 3209//===----------------------------------------------------------------------===// 3210 3211let mayLoad=1, hasSideEffects=1 in 3212def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src), 3213 "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>, 3214 VEX, Sched<[WriteLDMXCSR]>, VEX_WIG; 3215let mayStore=1, hasSideEffects=1 in 3216def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), 3217 "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>, 3218 VEX, Sched<[WriteSTMXCSR]>, VEX_WIG; 3219 3220let mayLoad=1, hasSideEffects=1 in 3221def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src), 3222 "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>, 3223 PS, Sched<[WriteLDMXCSR]>; 3224let mayStore=1, hasSideEffects=1 in 3225def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst), 3226 "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>, 3227 PS, Sched<[WriteSTMXCSR]>; 3228 3229//===---------------------------------------------------------------------===// 3230// SSE2 - Move Aligned/Unaligned Packed Integer Instructions 
3231//===---------------------------------------------------------------------===// 3232 3233let ExeDomain = SSEPackedInt in { // SSE integer instructions 3234 3235let hasSideEffects = 0 in { 3236def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3237 "movdqa\t{$src, $dst|$dst, $src}", []>, 3238 Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG; 3239def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3240 "movdqu\t{$src, $dst|$dst, $src}", []>, 3241 Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG; 3242def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 3243 "movdqa\t{$src, $dst|$dst, $src}", []>, 3244 Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG; 3245def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 3246 "movdqu\t{$src, $dst|$dst, $src}", []>, 3247 Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG; 3248} 3249 3250// For Disassembler 3251let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { 3252def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 3253 "movdqa\t{$src, $dst|$dst, $src}", []>, 3254 Sched<[SchedWriteVecMoveLS.XMM.RR]>, 3255 VEX, VEX_WIG, FoldGenData<"VMOVDQArr">; 3256def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), 3257 "movdqa\t{$src, $dst|$dst, $src}", []>, 3258 Sched<[SchedWriteVecMoveLS.YMM.RR]>, 3259 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQAYrr">; 3260def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 3261 "movdqu\t{$src, $dst|$dst, $src}", []>, 3262 Sched<[SchedWriteVecMoveLS.XMM.RR]>, 3263 VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">; 3264def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), 3265 "movdqu\t{$src, $dst|$dst, $src}", []>, 3266 Sched<[SchedWriteVecMoveLS.YMM.RR]>, 3267 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQUYrr">; 3268} 3269 3270let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, 3271 
hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in { 3272def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3273 "movdqa\t{$src, $dst|$dst, $src}", 3274 [(set VR128:$dst, (alignedloadv2i64 addr:$src))]>, 3275 Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG; 3276def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), 3277 "movdqa\t{$src, $dst|$dst, $src}", []>, 3278 Sched<[SchedWriteVecMoveLS.YMM.RM]>, 3279 VEX, VEX_L, VEX_WIG; 3280def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3281 "vmovdqu\t{$src, $dst|$dst, $src}", 3282 [(set VR128:$dst, (loadv2i64 addr:$src))]>, 3283 Sched<[SchedWriteVecMoveLS.XMM.RM]>, 3284 XS, VEX, VEX_WIG; 3285def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), 3286 "vmovdqu\t{$src, $dst|$dst, $src}", []>, 3287 Sched<[SchedWriteVecMoveLS.YMM.RM]>, 3288 XS, VEX, VEX_L, VEX_WIG; 3289} 3290 3291let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in { 3292def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), 3293 (ins i128mem:$dst, VR128:$src), 3294 "movdqa\t{$src, $dst|$dst, $src}", 3295 [(alignedstore (v2i64 VR128:$src), addr:$dst)]>, 3296 Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_WIG; 3297def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs), 3298 (ins i256mem:$dst, VR256:$src), 3299 "movdqa\t{$src, $dst|$dst, $src}", []>, 3300 Sched<[SchedWriteVecMoveLS.YMM.MR]>, VEX, VEX_L, VEX_WIG; 3301def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), 3302 "vmovdqu\t{$src, $dst|$dst, $src}", 3303 [(store (v2i64 VR128:$src), addr:$dst)]>, 3304 Sched<[SchedWriteVecMoveLS.XMM.MR]>, XS, VEX, VEX_WIG; 3305def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), 3306 "vmovdqu\t{$src, $dst|$dst, $src}",[]>, 3307 Sched<[SchedWriteVecMoveLS.YMM.MR]>, XS, VEX, VEX_L, VEX_WIG; 3308} 3309 3310let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in { 3311let hasSideEffects = 0 in { 3312def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs 
VR128:$dst), (ins VR128:$src), 3313 "movdqa\t{$src, $dst|$dst, $src}", []>; 3314 3315def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3316 "movdqu\t{$src, $dst|$dst, $src}", []>, 3317 XS, Requires<[UseSSE2]>; 3318} 3319 3320// For Disassembler 3321let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { 3322def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 3323 "movdqa\t{$src, $dst|$dst, $src}", []>, 3324 FoldGenData<"MOVDQArr">; 3325 3326def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 3327 "movdqu\t{$src, $dst|$dst, $src}", []>, 3328 XS, Requires<[UseSSE2]>, FoldGenData<"MOVDQUrr">; 3329} 3330} // SchedRW 3331 3332let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, 3333 hasSideEffects = 0, SchedRW = [SchedWriteVecMoveLS.XMM.RM] in { 3334def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3335 "movdqa\t{$src, $dst|$dst, $src}", 3336 [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>; 3337def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3338 "movdqu\t{$src, $dst|$dst, $src}", 3339 [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>, 3340 XS, Requires<[UseSSE2]>; 3341} 3342 3343let mayStore = 1, hasSideEffects = 0, 3344 SchedRW = [SchedWriteVecMoveLS.XMM.MR] in { 3345def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), 3346 "movdqa\t{$src, $dst|$dst, $src}", 3347 [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>; 3348def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), 3349 "movdqu\t{$src, $dst|$dst, $src}", 3350 [/*(store (v2i64 VR128:$src), addr:$dst)*/]>, 3351 XS, Requires<[UseSSE2]>; 3352} 3353 3354} // ExeDomain = SSEPackedInt 3355 3356// Reversed version with ".s" suffix for GAS compatibility. 
3357def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}", 3358 (VMOVDQArr_REV VR128:$dst, VR128:$src), 0>; 3359def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}", 3360 (VMOVDQAYrr_REV VR256:$dst, VR256:$src), 0>; 3361def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}", 3362 (VMOVDQUrr_REV VR128:$dst, VR128:$src), 0>; 3363def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}", 3364 (VMOVDQUYrr_REV VR256:$dst, VR256:$src), 0>; 3365 3366// Reversed version with ".s" suffix for GAS compatibility. 3367def : InstAlias<"movdqa.s\t{$src, $dst|$dst, $src}", 3368 (MOVDQArr_REV VR128:$dst, VR128:$src), 0>; 3369def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}", 3370 (MOVDQUrr_REV VR128:$dst, VR128:$src), 0>; 3371 3372let Predicates = [HasAVX, NoVLX] in { 3373 // Additional patterns for other integer sizes. 3374 def : Pat<(alignedloadv4i32 addr:$src), 3375 (VMOVDQArm addr:$src)>; 3376 def : Pat<(alignedloadv8i16 addr:$src), 3377 (VMOVDQArm addr:$src)>; 3378 def : Pat<(alignedloadv16i8 addr:$src), 3379 (VMOVDQArm addr:$src)>; 3380 def : Pat<(loadv4i32 addr:$src), 3381 (VMOVDQUrm addr:$src)>; 3382 def : Pat<(loadv8i16 addr:$src), 3383 (VMOVDQUrm addr:$src)>; 3384 def : Pat<(loadv16i8 addr:$src), 3385 (VMOVDQUrm addr:$src)>; 3386 3387 def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst), 3388 (VMOVDQAmr addr:$dst, VR128:$src)>; 3389 def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst), 3390 (VMOVDQAmr addr:$dst, VR128:$src)>; 3391 def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst), 3392 (VMOVDQAmr addr:$dst, VR128:$src)>; 3393 def : Pat<(store (v4i32 VR128:$src), addr:$dst), 3394 (VMOVDQUmr addr:$dst, VR128:$src)>; 3395 def : Pat<(store (v8i16 VR128:$src), addr:$dst), 3396 (VMOVDQUmr addr:$dst, VR128:$src)>; 3397 def : Pat<(store (v16i8 VR128:$src), addr:$dst), 3398 (VMOVDQUmr addr:$dst, VR128:$src)>; 3399} 3400 3401//===---------------------------------------------------------------------===// 3402// SSE2 - Packed Integer Arithmetic Instructions 
3403//===---------------------------------------------------------------------===// 3404 3405let ExeDomain = SSEPackedInt in { // SSE integer instructions 3406 3407/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types 3408multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode, 3409 ValueType DstVT, ValueType SrcVT, RegisterClass RC, 3410 PatFrag memop_frag, X86MemOperand x86memop, 3411 X86FoldableSchedWrite sched, bit Is2Addr = 1> { 3412 let isCommutable = 1 in 3413 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), 3414 (ins RC:$src1, RC:$src2), 3415 !if(Is2Addr, 3416 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3417 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3418 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>, 3419 Sched<[sched]>; 3420 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), 3421 (ins RC:$src1, x86memop:$src2), 3422 !if(Is2Addr, 3423 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3424 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3425 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), 3426 (memop_frag addr:$src2))))]>, 3427 Sched<[sched.Folded, sched.ReadAfterFold]>; 3428} 3429} // ExeDomain = SSEPackedInt 3430 3431defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8, 3432 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3433defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16, 3434 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3435defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32, 3436 SchedWriteVecALU, 1, NoVLX>; 3437defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64, 3438 SchedWriteVecALU, 1, NoVLX>; 3439defm PADDSB : PDI_binop_all<0xEC, "paddsb", saddsat, v16i8, v32i8, 3440 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3441defm PADDSW : PDI_binop_all<0xED, "paddsw", saddsat, v8i16, v16i16, 3442 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3443defm PADDUSB : PDI_binop_all<0xDC, "paddusb", uaddsat, v16i8, v32i8, 3444 SchedWriteVecALU, 1, 
NoVLX_Or_NoBWI>; 3445defm PADDUSW : PDI_binop_all<0xDD, "paddusw", uaddsat, v8i16, v16i16, 3446 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3447defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16, 3448 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>; 3449defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16, 3450 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>; 3451defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16, 3452 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>; 3453defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8, 3454 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3455defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16, 3456 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3457defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32, 3458 SchedWriteVecALU, 0, NoVLX>; 3459defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64, 3460 SchedWriteVecALU, 0, NoVLX>; 3461defm PSUBSB : PDI_binop_all<0xE8, "psubsb", ssubsat, v16i8, v32i8, 3462 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3463defm PSUBSW : PDI_binop_all<0xE9, "psubsw", ssubsat, v8i16, v16i16, 3464 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3465defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", usubsat, v16i8, v32i8, 3466 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3467defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", usubsat, v8i16, v16i16, 3468 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3469defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8, 3470 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3471defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16, 3472 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3473defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8, 3474 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3475defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16, 3476 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3477defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8, 3478 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3479defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16, 3480 
SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3481defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64, 3482 SchedWriteVecIMul, 1, NoVLX>; 3483 3484let Predicates = [HasAVX, NoVLX_Or_NoBWI] in 3485defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128, 3486 load, i128mem, SchedWriteVecIMul.XMM, 0>, 3487 VEX_4V, VEX_WIG; 3488 3489let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in 3490defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16, 3491 VR256, load, i256mem, SchedWriteVecIMul.YMM, 3492 0>, VEX_4V, VEX_L, VEX_WIG; 3493let Constraints = "$src1 = $dst" in 3494defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128, 3495 memop, i128mem, SchedWriteVecIMul.XMM>; 3496 3497let Predicates = [HasAVX, NoVLX_Or_NoBWI] in 3498defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128, 3499 load, i128mem, SchedWritePSADBW.XMM, 0>, 3500 VEX_4V, VEX_WIG; 3501let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in 3502defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256, 3503 load, i256mem, SchedWritePSADBW.YMM, 0>, 3504 VEX_4V, VEX_L, VEX_WIG; 3505let Constraints = "$src1 = $dst" in 3506defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128, 3507 memop, i128mem, SchedWritePSADBW.XMM>; 3508 3509//===---------------------------------------------------------------------===// 3510// SSE2 - Packed Integer Logical Instructions 3511//===---------------------------------------------------------------------===// 3512 3513multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm, 3514 string OpcodeStr, SDNode OpNode, 3515 SDNode OpNode2, RegisterClass RC, 3516 X86FoldableSchedWrite sched, 3517 X86FoldableSchedWrite schedImm, 3518 ValueType DstVT, ValueType SrcVT, 3519 PatFrag ld_frag, bit Is2Addr = 1> { 3520 // src2 is always 128-bit 3521 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), 3522 (ins RC:$src1, VR128:$src2), 3523 !if(Is2Addr, 3524 
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3525 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3526 [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))]>, 3527 Sched<[sched]>; 3528 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), 3529 (ins RC:$src1, i128mem:$src2), 3530 !if(Is2Addr, 3531 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3532 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3533 [(set RC:$dst, (DstVT (OpNode RC:$src1, 3534 (SrcVT (ld_frag addr:$src2)))))]>, 3535 Sched<[sched.Folded, sched.ReadAfterFold]>; 3536 def ri : PDIi8<opc2, ImmForm, (outs RC:$dst), 3537 (ins RC:$src1, u8imm:$src2), 3538 !if(Is2Addr, 3539 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3540 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3541 [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 timm:$src2))))]>, 3542 Sched<[schedImm]>; 3543} 3544 3545multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm, 3546 string OpcodeStr, SDNode OpNode, 3547 SDNode OpNode2, ValueType DstVT128, 3548 ValueType DstVT256, ValueType SrcVT, 3549 X86SchedWriteWidths sched, 3550 X86SchedWriteWidths schedImm, Predicate prd> { 3551let Predicates = [HasAVX, prd] in 3552 defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr), 3553 OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM, 3554 DstVT128, SrcVT, load, 0>, VEX_4V, VEX_WIG; 3555let Predicates = [HasAVX2, prd] in 3556 defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr), 3557 OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM, 3558 DstVT256, SrcVT, load, 0>, VEX_4V, VEX_L, 3559 VEX_WIG; 3560let Constraints = "$src1 = $dst" in 3561 defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2, 3562 VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT, 3563 memop>; 3564} 3565 3566multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr, 3567 SDNode OpNode, RegisterClass RC, ValueType VT, 3568 
X86FoldableSchedWrite sched, bit Is2Addr = 1> { 3569 def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2), 3570 !if(Is2Addr, 3571 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3572 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3573 [(set RC:$dst, (VT (OpNode RC:$src1, (i8 timm:$src2))))]>, 3574 Sched<[sched]>; 3575} 3576 3577multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr, 3578 SDNode OpNode, X86SchedWriteWidths sched> { 3579let Predicates = [HasAVX, NoVLX_Or_NoBWI] in 3580 defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode, 3581 VR128, v16i8, sched.XMM, 0>, VEX_4V, VEX_WIG; 3582let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in 3583 defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode, 3584 VR256, v32i8, sched.YMM, 0>, 3585 VEX_4V, VEX_L, VEX_WIG; 3586let Constraints = "$src1 = $dst" in 3587 defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8, 3588 sched.XMM>; 3589} 3590 3591let ExeDomain = SSEPackedInt in { 3592 defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli, 3593 v8i16, v16i16, v8i16, SchedWriteVecShift, 3594 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; 3595 defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli, 3596 v4i32, v8i32, v4i32, SchedWriteVecShift, 3597 SchedWriteVecShiftImm, NoVLX>; 3598 defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli, 3599 v2i64, v4i64, v2i64, SchedWriteVecShift, 3600 SchedWriteVecShiftImm, NoVLX>; 3601 3602 defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli, 3603 v8i16, v16i16, v8i16, SchedWriteVecShift, 3604 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; 3605 defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli, 3606 v4i32, v8i32, v4i32, SchedWriteVecShift, 3607 SchedWriteVecShiftImm, NoVLX>; 3608 defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli, 
3609 v2i64, v4i64, v2i64, SchedWriteVecShift, 3610 SchedWriteVecShiftImm, NoVLX>; 3611 3612 defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai, 3613 v8i16, v16i16, v8i16, SchedWriteVecShift, 3614 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; 3615 defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai, 3616 v4i32, v8i32, v4i32, SchedWriteVecShift, 3617 SchedWriteVecShiftImm, NoVLX>; 3618 3619 defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq, 3620 SchedWriteShuffle>; 3621 defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq, 3622 SchedWriteShuffle>; 3623} // ExeDomain = SSEPackedInt 3624 3625//===---------------------------------------------------------------------===// 3626// SSE2 - Packed Integer Comparison Instructions 3627//===---------------------------------------------------------------------===// 3628 3629defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8, 3630 SchedWriteVecALU, 1, TruePredicate>; 3631defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16, 3632 SchedWriteVecALU, 1, TruePredicate>; 3633defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32, 3634 SchedWriteVecALU, 1, TruePredicate>; 3635defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8, 3636 SchedWriteVecALU, 0, TruePredicate>; 3637defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16, 3638 SchedWriteVecALU, 0, TruePredicate>; 3639defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32, 3640 SchedWriteVecALU, 0, TruePredicate>; 3641 3642//===---------------------------------------------------------------------===// 3643// SSE2 - Packed Integer Shuffle Instructions 3644//===---------------------------------------------------------------------===// 3645 3646let ExeDomain = SSEPackedInt in { 3647multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256, 3648 SDNode OpNode, X86SchedWriteWidths sched, 3649 Predicate 
prd> { 3650let Predicates = [HasAVX, prd] in { 3651 def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst), 3652 (ins VR128:$src1, u8imm:$src2), 3653 !strconcat("v", OpcodeStr, 3654 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3655 [(set VR128:$dst, 3656 (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>, 3657 VEX, Sched<[sched.XMM]>, VEX_WIG; 3658 def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst), 3659 (ins i128mem:$src1, u8imm:$src2), 3660 !strconcat("v", OpcodeStr, 3661 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3662 [(set VR128:$dst, 3663 (vt128 (OpNode (load addr:$src1), 3664 (i8 timm:$src2))))]>, VEX, 3665 Sched<[sched.XMM.Folded]>, VEX_WIG; 3666} 3667 3668let Predicates = [HasAVX2, prd] in { 3669 def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst), 3670 (ins VR256:$src1, u8imm:$src2), 3671 !strconcat("v", OpcodeStr, 3672 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3673 [(set VR256:$dst, 3674 (vt256 (OpNode VR256:$src1, (i8 timm:$src2))))]>, 3675 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG; 3676 def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst), 3677 (ins i256mem:$src1, u8imm:$src2), 3678 !strconcat("v", OpcodeStr, 3679 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3680 [(set VR256:$dst, 3681 (vt256 (OpNode (load addr:$src1), 3682 (i8 timm:$src2))))]>, VEX, VEX_L, 3683 Sched<[sched.YMM.Folded]>, VEX_WIG; 3684} 3685 3686let Predicates = [UseSSE2] in { 3687 def ri : Ii8<0x70, MRMSrcReg, 3688 (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), 3689 !strconcat(OpcodeStr, 3690 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3691 [(set VR128:$dst, 3692 (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>, 3693 Sched<[sched.XMM]>; 3694 def mi : Ii8<0x70, MRMSrcMem, 3695 (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2), 3696 !strconcat(OpcodeStr, 3697 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3698 [(set VR128:$dst, 3699 (vt128 (OpNode (memop addr:$src1), 3700 (i8 timm:$src2))))]>, 3701 Sched<[sched.XMM.Folded]>; 3702} 3703} 3704} // ExeDomain = 
SSEPackedInt 3705 3706defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd, 3707 SchedWriteShuffle, NoVLX>, PD; 3708defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw, 3709 SchedWriteShuffle, NoVLX_Or_NoBWI>, XS; 3710defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw, 3711 SchedWriteShuffle, NoVLX_Or_NoBWI>, XD; 3712 3713//===---------------------------------------------------------------------===// 3714// Packed Integer Pack Instructions (SSE & AVX) 3715//===---------------------------------------------------------------------===// 3716 3717let ExeDomain = SSEPackedInt in { 3718multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, 3719 ValueType ArgVT, SDNode OpNode, RegisterClass RC, 3720 X86MemOperand x86memop, X86FoldableSchedWrite sched, 3721 PatFrag ld_frag, bit Is2Addr = 1> { 3722 def rr : PDI<opc, MRMSrcReg, 3723 (outs RC:$dst), (ins RC:$src1, RC:$src2), 3724 !if(Is2Addr, 3725 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3726 !strconcat(OpcodeStr, 3727 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3728 [(set RC:$dst, 3729 (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>, 3730 Sched<[sched]>; 3731 def rm : PDI<opc, MRMSrcMem, 3732 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 3733 !if(Is2Addr, 3734 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3735 !strconcat(OpcodeStr, 3736 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3737 [(set RC:$dst, 3738 (OutVT (OpNode (ArgVT RC:$src1), 3739 (ld_frag addr:$src2))))]>, 3740 Sched<[sched.Folded, sched.ReadAfterFold]>; 3741} 3742 3743multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, 3744 ValueType ArgVT, SDNode OpNode, RegisterClass RC, 3745 X86MemOperand x86memop, X86FoldableSchedWrite sched, 3746 PatFrag ld_frag, bit Is2Addr = 1> { 3747 def rr : SS48I<opc, MRMSrcReg, 3748 (outs RC:$dst), (ins RC:$src1, RC:$src2), 3749 !if(Is2Addr, 3750 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3751 !strconcat(OpcodeStr, 3752 
"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3753 [(set RC:$dst, 3754 (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>, 3755 Sched<[sched]>; 3756 def rm : SS48I<opc, MRMSrcMem, 3757 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 3758 !if(Is2Addr, 3759 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3760 !strconcat(OpcodeStr, 3761 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3762 [(set RC:$dst, 3763 (OutVT (OpNode (ArgVT RC:$src1), 3764 (ld_frag addr:$src2))))]>, 3765 Sched<[sched.Folded, sched.ReadAfterFold]>; 3766} 3767 3768let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 3769 defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128, 3770 i128mem, SchedWriteShuffle.XMM, load, 0>, 3771 VEX_4V, VEX_WIG; 3772 defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128, 3773 i128mem, SchedWriteShuffle.XMM, load, 0>, 3774 VEX_4V, VEX_WIG; 3775 3776 defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128, 3777 i128mem, SchedWriteShuffle.XMM, load, 0>, 3778 VEX_4V, VEX_WIG; 3779 defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128, 3780 i128mem, SchedWriteShuffle.XMM, load, 0>, 3781 VEX_4V, VEX_WIG; 3782} 3783 3784let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 3785 defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256, 3786 i256mem, SchedWriteShuffle.YMM, load, 0>, 3787 VEX_4V, VEX_L, VEX_WIG; 3788 defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256, 3789 i256mem, SchedWriteShuffle.YMM, load, 0>, 3790 VEX_4V, VEX_L, VEX_WIG; 3791 3792 defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256, 3793 i256mem, SchedWriteShuffle.YMM, load, 0>, 3794 VEX_4V, VEX_L, VEX_WIG; 3795 defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256, 3796 i256mem, SchedWriteShuffle.YMM, load, 0>, 3797 VEX_4V, VEX_L, VEX_WIG; 3798} 3799 3800let Constraints = "$src1 = $dst" in { 3801 defm PACKSSWB : 
sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128, 3802 i128mem, SchedWriteShuffle.XMM, memop>; 3803 defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128, 3804 i128mem, SchedWriteShuffle.XMM, memop>; 3805 3806 defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128, 3807 i128mem, SchedWriteShuffle.XMM, memop>; 3808 3809 defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128, 3810 i128mem, SchedWriteShuffle.XMM, memop>; 3811} 3812} // ExeDomain = SSEPackedInt 3813 3814//===---------------------------------------------------------------------===// 3815// SSE2 - Packed Integer Unpack Instructions 3816//===---------------------------------------------------------------------===// 3817 3818let ExeDomain = SSEPackedInt in { 3819multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt, 3820 SDNode OpNode, RegisterClass RC, X86MemOperand x86memop, 3821 X86FoldableSchedWrite sched, PatFrag ld_frag, 3822 bit Is2Addr = 1> { 3823 def rr : PDI<opc, MRMSrcReg, 3824 (outs RC:$dst), (ins RC:$src1, RC:$src2), 3825 !if(Is2Addr, 3826 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), 3827 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3828 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>, 3829 Sched<[sched]>; 3830 def rm : PDI<opc, MRMSrcMem, 3831 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 3832 !if(Is2Addr, 3833 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), 3834 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3835 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>, 3836 Sched<[sched.Folded, sched.ReadAfterFold]>; 3837} 3838 3839let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 3840 defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128, 3841 i128mem, SchedWriteShuffle.XMM, load, 0>, 3842 VEX_4V, VEX_WIG; 3843 defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128, 3844 i128mem, SchedWriteShuffle.XMM, 
load, 0>, 3845 VEX_4V, VEX_WIG; 3846 defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128, 3847 i128mem, SchedWriteShuffle.XMM, load, 0>, 3848 VEX_4V, VEX_WIG; 3849 defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128, 3850 i128mem, SchedWriteShuffle.XMM, load, 0>, 3851 VEX_4V, VEX_WIG; 3852} 3853 3854let Predicates = [HasAVX, NoVLX] in { 3855 defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128, 3856 i128mem, SchedWriteShuffle.XMM, load, 0>, 3857 VEX_4V, VEX_WIG; 3858 defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128, 3859 i128mem, SchedWriteShuffle.XMM, load, 0>, 3860 VEX_4V, VEX_WIG; 3861 defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128, 3862 i128mem, SchedWriteShuffle.XMM, load, 0>, 3863 VEX_4V, VEX_WIG; 3864 defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128, 3865 i128mem, SchedWriteShuffle.XMM, load, 0>, 3866 VEX_4V, VEX_WIG; 3867} 3868 3869let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 3870 defm VPUNPCKLBWY : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256, 3871 i256mem, SchedWriteShuffle.YMM, load, 0>, 3872 VEX_4V, VEX_L, VEX_WIG; 3873 defm VPUNPCKLWDY : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256, 3874 i256mem, SchedWriteShuffle.YMM, load, 0>, 3875 VEX_4V, VEX_L, VEX_WIG; 3876 defm VPUNPCKHBWY : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256, 3877 i256mem, SchedWriteShuffle.YMM, load, 0>, 3878 VEX_4V, VEX_L, VEX_WIG; 3879 defm VPUNPCKHWDY : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256, 3880 i256mem, SchedWriteShuffle.YMM, load, 0>, 3881 VEX_4V, VEX_L, VEX_WIG; 3882} 3883 3884let Predicates = [HasAVX2, NoVLX] in { 3885 defm VPUNPCKLDQY : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256, 3886 i256mem, SchedWriteShuffle.YMM, load, 0>, 3887 VEX_4V, VEX_L, VEX_WIG; 3888 defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256, 3889 i256mem, 
SchedWriteShuffle.YMM, load, 0>, 3890 VEX_4V, VEX_L, VEX_WIG; 3891 defm VPUNPCKHDQY : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256, 3892 i256mem, SchedWriteShuffle.YMM, load, 0>, 3893 VEX_4V, VEX_L, VEX_WIG; 3894 defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256, 3895 i256mem, SchedWriteShuffle.YMM, load, 0>, 3896 VEX_4V, VEX_L, VEX_WIG; 3897} 3898 3899let Constraints = "$src1 = $dst" in { 3900 defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128, 3901 i128mem, SchedWriteShuffle.XMM, memop>; 3902 defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128, 3903 i128mem, SchedWriteShuffle.XMM, memop>; 3904 defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128, 3905 i128mem, SchedWriteShuffle.XMM, memop>; 3906 defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128, 3907 i128mem, SchedWriteShuffle.XMM, memop>; 3908 3909 defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128, 3910 i128mem, SchedWriteShuffle.XMM, memop>; 3911 defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128, 3912 i128mem, SchedWriteShuffle.XMM, memop>; 3913 defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128, 3914 i128mem, SchedWriteShuffle.XMM, memop>; 3915 defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128, 3916 i128mem, SchedWriteShuffle.XMM, memop>; 3917} 3918} // ExeDomain = SSEPackedInt 3919 3920//===---------------------------------------------------------------------===// 3921// SSE2 - Packed Integer Extract and Insert 3922//===---------------------------------------------------------------------===// 3923 3924let ExeDomain = SSEPackedInt in { 3925multiclass sse2_pinsrw<bit Is2Addr = 1> { 3926 def rr : Ii8<0xC4, MRMSrcReg, 3927 (outs VR128:$dst), (ins VR128:$src1, 3928 GR32orGR64:$src2, u8imm:$src3), 3929 !if(Is2Addr, 3930 "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", 3931 "vpinsrw\t{$src3, $src2, $src1, 
$dst|$dst, $src1, $src2, $src3}"), 3932 [(set VR128:$dst, 3933 (X86pinsrw VR128:$src1, GR32orGR64:$src2, timm:$src3))]>, 3934 Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>; 3935 def rm : Ii8<0xC4, MRMSrcMem, 3936 (outs VR128:$dst), (ins VR128:$src1, 3937 i16mem:$src2, u8imm:$src3), 3938 !if(Is2Addr, 3939 "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", 3940 "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 3941 [(set VR128:$dst, 3942 (X86pinsrw VR128:$src1, (extloadi16 addr:$src2), 3943 timm:$src3))]>, 3944 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>; 3945} 3946 3947// Extract 3948let Predicates = [HasAVX, NoBWI] in 3949def VPEXTRWrr : Ii8<0xC5, MRMSrcReg, 3950 (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2), 3951 "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", 3952 [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), 3953 timm:$src2))]>, 3954 PD, VEX, VEX_WIG, Sched<[WriteVecExtract]>; 3955def PEXTRWrr : PDIi8<0xC5, MRMSrcReg, 3956 (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2), 3957 "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", 3958 [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), 3959 timm:$src2))]>, 3960 Sched<[WriteVecExtract]>; 3961 3962// Insert 3963let Predicates = [HasAVX, NoBWI] in 3964defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V, VEX_WIG; 3965 3966let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in 3967defm PINSRW : sse2_pinsrw, PD; 3968 3969} // ExeDomain = SSEPackedInt 3970 3971//===---------------------------------------------------------------------===// 3972// SSE2 - Packed Mask Creation 3973//===---------------------------------------------------------------------===// 3974 3975let ExeDomain = SSEPackedInt in { 3976 3977def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), 3978 (ins VR128:$src), 3979 "pmovmskb\t{$src, $dst|$dst, $src}", 3980 [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>, 3981 Sched<[WriteVecMOVMSK]>, VEX, VEX_WIG; 3982 3983let 
Predicates = [HasAVX2] in { 3984def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), 3985 (ins VR256:$src), 3986 "pmovmskb\t{$src, $dst|$dst, $src}", 3987 [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>, 3988 Sched<[WriteVecMOVMSKY]>, VEX, VEX_L, VEX_WIG; 3989} 3990 3991def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src), 3992 "pmovmskb\t{$src, $dst|$dst, $src}", 3993 [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>, 3994 Sched<[WriteVecMOVMSK]>; 3995 3996} // ExeDomain = SSEPackedInt 3997 3998//===---------------------------------------------------------------------===// 3999// SSE2 - Conditional Store 4000//===---------------------------------------------------------------------===// 4001 4002let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in { 4003let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in 4004def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs), 4005 (ins VR128:$src, VR128:$mask), 4006 "maskmovdqu\t{$mask, $src|$src, $mask}", 4007 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>, 4008 VEX, VEX_WIG; 4009let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in 4010def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs), 4011 (ins VR128:$src, VR128:$mask), 4012 "maskmovdqu\t{$mask, $src|$src, $mask}", 4013 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>, 4014 VEX, VEX_WIG; 4015 4016let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in 4017def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), 4018 "maskmovdqu\t{$mask, $src|$src, $mask}", 4019 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>; 4020let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in 4021def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), 4022 "maskmovdqu\t{$mask, $src|$src, $mask}", 4023 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>; 4024 4025} // ExeDomain = SSEPackedInt 4026 
//===---------------------------------------------------------------------===//
// SSE2 - Move Doubleword/Quadword
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//
// Move Int Doubleword to Packed Double Int
//
let ExeDomain = SSEPackedInt in {
def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (scalar_to_vector GR32:$src)))]>,
                        VEX, Sched<[WriteVecMoveFromGpr]>;
def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
                        VEX, Sched<[WriteVecLoad]>;
def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst,
                            (v2i64 (scalar_to_vector GR64:$src)))]>,
                          VEX, Sched<[WriteVecMoveFromGpr]>;
// Disassembler-only memory form: no pattern, side effects suppressed.
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                          "movq\t{$src, $dst|$dst, $src}", []>,
                          VEX, Sched<[WriteVecLoad]>;
let isCodeGenOnly = 1 in
def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(set FR64:$dst, (bitconvert GR64:$src))]>,
                         VEX, Sched<[WriteVecMoveFromGpr]>;

// Legacy SSE2 counterparts of the moves above.
def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector GR32:$src)))]>,
                      Sched<[WriteVecMoveFromGpr]>;
def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
                      Sched<[WriteVecLoad]>;
def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v2i64 (scalar_to_vector GR64:$src)))]>,
                        Sched<[WriteVecMoveFromGpr]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                        "movq\t{$src, $dst|$dst, $src}", []>,
                        Sched<[WriteVecLoad]>;
let isCodeGenOnly = 1 in
def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                       "movq\t{$src, $dst|$dst, $src}",
                       [(set FR64:$dst, (bitconvert GR64:$src))]>,
                       Sched<[WriteVecMoveFromGpr]>;
} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// Move Int Doubleword to Single Scalar
//
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
  def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                         "movd\t{$src, $dst|$dst, $src}",
                         [(set FR32:$dst, (bitconvert GR32:$src))]>,
                         VEX, Sched<[WriteVecMoveFromGpr]>;

  def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set FR32:$dst, (bitconvert GR32:$src))]>,
                       Sched<[WriteVecMoveFromGpr]>;

} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1

//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int to Packed Double Int
//
let ExeDomain = SSEPackedInt in {
def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
                                                     (iPTR 0)))]>, VEX,
                        Sched<[WriteVecMoveToGpr]>;
def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs),
                        (ins i32mem:$dst, VR128:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(store (i32 (extractelt (v4i32 VR128:$src),
                                                 (iPTR 0))), addr:$dst)]>,
                        VEX, Sched<[WriteVecStore]>;
def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
                                                   (iPTR 0)))]>,
                      Sched<[WriteVecMoveToGpr]>;
def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(store (i32 (extractelt (v4i32 VR128:$src),
                                               (iPTR 0))), addr:$dst)]>,
                      Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int first element to Doubleword Int
//
let ExeDomain = SSEPackedInt in {
let SchedRW = [WriteVecMoveToGpr] in {
def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
                                                       (iPTR 0)))]>,
                          VEX;

def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
                                                     (iPTR 0)))]>;
} //SchedRW

// Disassembler-only store forms (no patterns).
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs),
                          (ins i64mem:$dst, VR128:$src),
                          "movq\t{$src, $dst|$dst, $src}", []>,
                          VEX, Sched<[WriteVecStore]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}", []>,
                        Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// Bitcast FR64 <-> GR64
//
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
  def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
                           "movq\t{$src, $dst|$dst, $src}",
                           [(set GR64:$dst, (bitconvert FR64:$src))]>,
                           VEX, Sched<[WriteVecMoveToGpr]>;

  def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(set GR64:$dst, (bitconvert FR64:$src))]>,
                         Sched<[WriteVecMoveToGpr]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1

//===---------------------------------------------------------------------===//
// Move Scalar Single to Double Int
//
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
  def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                         "movd\t{$src, $dst|$dst, $src}",
                         [(set GR32:$dst, (bitconvert FR32:$src))]>,
                         VEX, Sched<[WriteVecMoveToGpr]>;
  def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set GR32:$dst, (bitconvert FR32:$src))]>,
                       Sched<[WriteVecMoveToGpr]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1

// Select the plain movd/movq moves for zero-extending scalar-to-vector moves.
let Predicates = [UseAVX] in {
  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
            (VMOVDI2PDIrr GR32:$src)>;

  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
            (VMOV64toPQIrr GR64:$src)>;

  // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
  // These instructions also write zeros in the high part of a 256-bit register.
  def : Pat<(v4i32 (X86vzload32 addr:$src)),
            (VMOVDI2PDIrm addr:$src)>;
  def : Pat<(v8i32 (X86vzload32 addr:$src)),
            (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
            (MOVDI2PDIrr GR32:$src)>;

  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
            (MOV64toPQIrr GR64:$src)>;
  def : Pat<(v4i32 (X86vzload32 addr:$src)),
            (MOVDI2PDIrm addr:$src)>;
}

// Before the MC layer of LLVM existed, clang emitted "movd" assembly instead of
// "movq" due to MacOS parsing limitation. In order to parse old assembly, we add
// these aliases.
def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
                (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
                (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;

//===---------------------------------------------------------------------===//
// SSE2 - Move Quadword
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//
// Move Quadword Int to Packed Quadword Int
//

let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in {
def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "vmovq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
                     VEX, Requires<[UseAVX]>, VEX_WIG;
def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                    "movq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst,
                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
                    XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
} // ExeDomain, SchedRW

//===---------------------------------------------------------------------===//
// Move Packed Quadword Int to Quadword Int
//
let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in {
def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(store (i64 (extractelt (v2i64 VR128:$src),
                                                 (iPTR 0))), addr:$dst)]>,
                        VEX, VEX_WIG;
def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(store (i64 (extractelt (v2i64 VR128:$src),
                                               (iPTR 0))), addr:$dst)]>;
} // ExeDomain, SchedRW

// For disassembler only
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [SchedWriteVecLogic.XMM] in {
def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_WIG;
def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}", []>;
}

// ".s" aliases force the store-form (0xD6) encoding of the reg-reg move.
def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
                (VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
                (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;

let Predicates = [UseAVX] in {
  def : Pat<(v2i64 (X86vzload64 addr:$src)),
            (VMOVQI2PQIrm addr:$src)>;
  def : Pat<(v4i64 (X86vzload64 addr:$src)),
            (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;

  def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
            (VMOVPQI2QImr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v2i64 (X86vzload64 addr:$src)), (MOVQI2PQIrm addr:$src)>;

  def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
            (MOVPQI2QImr addr:$dst, VR128:$src)>;
}

//===---------------------------------------------------------------------===//
// Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
// IA32 document. movq xmm1, xmm2 does clear the high bits.
//
let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "vmovq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
                         XS, VEX, Requires<[UseAVX]>, VEX_WIG;
def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
                        XS, Requires<[UseSSE2]>;
} // ExeDomain, SchedRW

// The same zeroing move also covers the f64 vector types.
let Predicates = [UseAVX] in {
  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
            (VMOVZPQILo2PQIrr VR128:$src)>;
}
let Predicates = [UseSSE2] in {
  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
            (MOVZPQILo2PQIrr VR128:$src)>;
}

// 256-bit vzmovl: operate on the low 128 bits; the instruction implicitly
// zeroes the upper half of the YMM register.
let Predicates = [UseAVX] in {
  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v2f64 (VMOVZPQILo2PQIrr
                     (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
             sub_xmm)>;
  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v2i64 (VMOVZPQILo2PQIrr
                     (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
             sub_xmm)>;
}

//===---------------------------------------------------------------------===//
// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
//===---------------------------------------------------------------------===//

/// sse3_replicate_sfp - movshdup/movsldup: duplicate the odd (0x16) or even
/// (0x12) single-precision elements across each pair of lanes.
multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
                              ValueType vt, RegisterClass RC, PatFrag mem_frag,
                              X86MemOperand x86memop, X86FoldableSchedWrite sched> {
def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set RC:$dst, (vt (OpNode RC:$src)))]>,
              Sched<[sched]>;
def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>,
              Sched<[sched.Folded]>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                      v4f32, VR128, loadv4f32, f128mem,
                                      SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
  defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                      v4f32, VR128, loadv4f32, f128mem,
                                      SchedWriteFShuffle.XMM>, VEX, VEX_WIG;
  defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                       v8f32, VR256, loadv8f32, f256mem,
                                       SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
  defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                       v8f32, VR256, loadv8f32, f256mem,
                                       SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG;
}
defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
                                   memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
                                   memopv4f32, f128mem, SchedWriteFShuffle.XMM>;

// The same shuffles applied to integer vector types.
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (VMOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (load addr:$src))),
            (VMOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (VMOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (load addr:$src))),
            (VMOVSLDUPrm addr:$src)>;
  def : Pat<(v8i32 (X86Movshdup VR256:$src)),
            (VMOVSHDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movshdup (load addr:$src))),
            (VMOVSHDUPYrm addr:$src)>;
  def : Pat<(v8i32 (X86Movsldup VR256:$src)),
            (VMOVSLDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movsldup (load addr:$src))),
            (VMOVSLDUPYrm addr:$src)>;
}

let Predicates = [UseSSE3] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (MOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (memop addr:$src))),
            (MOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (MOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (memop addr:$src))),
            (MOVSLDUPrm addr:$src)>;
}

//===---------------------------------------------------------------------===//
// SSE3 - Replicate Double FP - MOVDDUP
//===---------------------------------------------------------------------===//

/// sse3_replicate_dfp - movddup: broadcast the low double across both lanes.
/// The memory form reads only 64 bits.
multiclass sse3_replicate_dfp<string OpcodeStr, X86SchedWriteWidths sched> {
def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))]>,
              Sched<[sched.XMM]>;
def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set VR128:$dst,
                (v2f64 (X86Movddup
                        (scalar_to_vector (loadf64 addr:$src)))))]>,
              Sched<[sched.XMM.Folded]>;
}

// FIXME: Merge with above classes when there are patterns for the ymm version
multiclass sse3_replicate_dfp_y<string OpcodeStr, X86SchedWriteWidths sched> {
def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
              Sched<[sched.YMM]>;
def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set VR256:$dst,
                (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
              Sched<[sched.YMM.Folded]>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm VMOVDDUP : sse3_replicate_dfp<"vmovddup", SchedWriteFShuffle>,
                  VEX, VEX_WIG;
  defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup", SchedWriteFShuffle>,
                   VEX, VEX_L, VEX_WIG;
}

defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;


let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
}

let Predicates = [UseSSE3] in {
  def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
            (MOVDDUPrm addr:$src)>;
}

//===---------------------------------------------------------------------===//
// SSE3 - Move Unaligned Integer
//===---------------------------------------------------------------------===//

// lddqu: unaligned integer load; only selected via the intrinsic.
let Predicates = [HasAVX] in {
  def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                      "vlddqu\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
                      Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG;
  def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                       "vlddqu\t{$src, $dst|$dst, $src}",
                       [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
                       Sched<[SchedWriteVecMoveLS.YMM.RM]>, VEX, VEX_L, VEX_WIG;
} // Predicates

def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "lddqu\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
                   Sched<[SchedWriteVecMoveLS.XMM.RM]>;

//===---------------------------------------------------------------------===//
// SSE3 - Arithmetic
//===---------------------------------------------------------------------===//

/// sse3_addsub - addsubps/addsubpd: alternating subtract (even elements) and
/// add (odd elements). FP instructions, so they read MXCSR and may raise FP
/// exceptions. Is2Addr selects tied (SSE) vs. three-operand (VEX) assembly.
multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC,
                       X86MemOperand x86memop, X86FoldableSchedWrite sched,
                       PatFrag ld_frag, bit Is2Addr = 1> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  def rr : I<0xD0, MRMSrcReg,
             (outs RC:$dst), (ins RC:$src1, RC:$src2),
             !if(Is2Addr,
                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
             [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))]>,
             Sched<[sched]>;
  def rm : I<0xD0, MRMSrcMem,
             (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
             !if(Is2Addr,
                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
             [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>,
             Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}

let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem,
                                 SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>,
                                 XD, VEX_4V, VEX_WIG;
    defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem,
                                  SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>,
                                  XD, VEX_4V, VEX_L, VEX_WIG;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem,
                                 SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>,
                                 PD, VEX_4V, VEX_WIG;
    defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem,
                                  SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>,
                                  PD, VEX_4V, VEX_L, VEX_WIG;
  }
}
let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
  let ExeDomain = SSEPackedSingle in
  defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem,
                              SchedWriteFAddSizes.PS.XMM, memopv4f32>, XD;
  let ExeDomain = SSEPackedDouble in
  defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem,
                              SchedWriteFAddSizes.PD.XMM, memopv2f64>, PD;
}

//===---------------------------------------------------------------------===//
// SSE3 Instructions
//===---------------------------------------------------------------------===//

// Horizontal ops
// NOTE(review): the S3D_Int multiclass continues beyond this excerpt.
multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                   X86MemOperand x86memop, SDNode OpNode,
                   X86FoldableSchedWrite sched, PatFrag ld_frag,
                   bit Is2Addr = 1> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
                !if(Is2Addr,
4528 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4529 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4530 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>, 4531 Sched<[sched]>; 4532 4533 def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 4534 !if(Is2Addr, 4535 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4536 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4537 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>, 4538 Sched<[sched.Folded, sched.ReadAfterFold]>; 4539} 4540} 4541multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, 4542 X86MemOperand x86memop, SDNode OpNode, 4543 X86FoldableSchedWrite sched, PatFrag ld_frag, 4544 bit Is2Addr = 1> { 4545let Uses = [MXCSR], mayRaiseFPException = 1 in { 4546 def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), 4547 !if(Is2Addr, 4548 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4549 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4550 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>, 4551 Sched<[sched]>; 4552 4553 def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 4554 !if(Is2Addr, 4555 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4556 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4557 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>, 4558 Sched<[sched.Folded, sched.ReadAfterFold]>; 4559} 4560} 4561 4562let Predicates = [HasAVX] in { 4563 let ExeDomain = SSEPackedSingle in { 4564 defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem, 4565 X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG; 4566 defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem, 4567 X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG; 4568 defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem, 4569 X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG; 4570 defm VHSUBPSY : 
S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem, 4571 X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG; 4572 } 4573 let ExeDomain = SSEPackedDouble in { 4574 defm VHADDPD : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem, 4575 X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG; 4576 defm VHSUBPD : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem, 4577 X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG; 4578 defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem, 4579 X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG; 4580 defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem, 4581 X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG; 4582 } 4583} 4584 4585let Constraints = "$src1 = $dst" in { 4586 let ExeDomain = SSEPackedSingle in { 4587 defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd, 4588 WriteFHAdd, memopv4f32>; 4589 defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub, 4590 WriteFHAdd, memopv4f32>; 4591 } 4592 let ExeDomain = SSEPackedDouble in { 4593 defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd, 4594 WriteFHAdd, memopv2f64>; 4595 defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub, 4596 WriteFHAdd, memopv2f64>; 4597 } 4598} 4599 4600//===---------------------------------------------------------------------===// 4601// SSSE3 - Packed Absolute Instructions 4602//===---------------------------------------------------------------------===// 4603 4604/// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}. 
// 128-bit SSSE3 unary op (rr + folded rm) over v16i8/v8i16/v4i32.
multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
                        SDNode OpNode, X86SchedWriteWidths sched, PatFrag ld_frag> {
  def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (vt (OpNode VR128:$src)))]>,
                 Sched<[sched.XMM]>;

  def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
                 (ins i128mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst,
                   (vt (OpNode (ld_frag addr:$src))))]>,
                 Sched<[sched.XMM.Folded]>;
}

/// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
// 256-bit (YMM) variant of SS3I_unop_rm; memory form always uses plain 'load'.
multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
                          SDNode OpNode, X86SchedWriteWidths sched> {
  def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
                  (ins VR256:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
                  Sched<[sched.YMM]>;

  def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
                  (ins i256mem:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR256:$dst,
                    (vt (OpNode (load addr:$src))))]>,
                  Sched<[sched.YMM.Folded]>;
}

// VEX-encoded PABS*: byte/word forms need NoVLX_Or_NoBWI, dword needs NoVLX.
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU,
                             load>, VEX, VEX_WIG;
  defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU,
                             load>, VEX, VEX_WIG;
}
let Predicates = [HasAVX, NoVLX] in {
  defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU,
                             load>, VEX, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>,
                VEX, VEX_L, VEX_WIG;
  defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SchedWriteVecALU>,
                VEX, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX] in {
  defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SchedWriteVecALU>,
                VEX, VEX_L, VEX_WIG;
}

// Legacy SSE encodings (memory operand must be aligned -> memop).
defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU,
                          memop>;
defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU,
                          memop>;
defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU,
                          memop>;

//===---------------------------------------------------------------------===//
// SSSE3 - Packed Binary Operator Instructions
//===---------------------------------------------------------------------===//

/// SS3I_binop_rm - Simple SSSE3 bin op
// Selected via an SDNode; DstVT/OpVT may differ (e.g. pmaddubsw: v16i8->v8i16).
multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType DstVT, ValueType OpVT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))]>,
       Sched<[sched]>;
  def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (DstVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}

/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
// 128-bit SSSE3 binop selected directly via a target intrinsic (no SDNode).
multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
                             Intrinsic IntId128, X86FoldableSchedWrite sched,
                             PatFrag ld_frag, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, VR128:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
       Sched<[sched]>;
  def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i128mem:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst,
         (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}

// 256-bit (AVX2) intrinsic binop; always three-operand VEX asm syntax.
multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
                               Intrinsic IntId256,
                               X86FoldableSchedWrite sched> {
  let isCommutable = 1 in
  def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
       (ins VR256:$src1, VR256:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
       Sched<[sched]>;
  def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
       (ins VR256:$src1, i256mem:$src2),
       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
       [(set VR256:$dst,
         (IntId256 VR256:$src1, (load addr:$src2)))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}

let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
let isCommutable = 0 in {
  defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
                               VR128, load, i128mem,
                               SchedWriteVarShuffle.XMM, 0>, VEX_4V, VEX_WIG;
  defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
                                  v16i8, VR128, load, i128mem,
                                  SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
}
defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
                               VR128, load, i128mem,
                               SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG;
}

let ImmT = NoImm, Predicates = [HasAVX] in {
let isCommutable = 0 in {
  defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
                               load, i128mem,
                               SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
  defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
                               load, i128mem,
                               SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
  defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
                               load, i128mem,
                               SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
  defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
                               load, i128mem,
                               SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG;
  defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb",
                                   int_x86_ssse3_psign_b_128,
                                   SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
  defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw",
                                   int_x86_ssse3_psign_w_128,
                                   SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
  defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd",
                                   int_x86_ssse3_psign_d_128,
                                   SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG;
  defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw",
                                    int_x86_ssse3_phadd_sw_128,
                                    SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
  defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw",
                                    int_x86_ssse3_phsub_sw_128,
                                    SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG;
}
}

let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
let isCommutable = 0 in {
  defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
                                VR256, load, i256mem,
                                SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
  defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
                                   v32i8, VR256, load, i256mem,
                                   SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
}
defm VPMULHRSWY : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
                                VR256, load, i256mem,
                                SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
}

let ImmT = NoImm, Predicates = [HasAVX2] in {
let isCommutable = 0 in {
  defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
                                VR256, load, i256mem,
                                SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
  defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
                                load, i256mem,
                                SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
  defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
                                VR256, load, i256mem,
                                SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
  defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
                                load, i256mem,
                                SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
  defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
                                     SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
  defm VPSIGNW : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
                                     SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
  defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
                                     SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG;
  defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
                                      int_x86_avx2_phadd_sw,
                                      SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
  defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
                                      int_x86_avx2_phsub_sw,
                                      SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG;
}
}

// None of these have i8 immediate fields.
// Legacy SSE (two-operand) SSSE3 binops; $src1 is tied to $dst.
let ImmT = NoImm, Constraints = "$src1 = $dst" in {
let isCommutable = 0 in {
  defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
                              memop, i128mem, SchedWritePHAdd.XMM>;
  defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
                              memop, i128mem, SchedWritePHAdd.XMM>;
  defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
                              memop, i128mem, SchedWritePHAdd.XMM>;
  defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
                              memop, i128mem, SchedWritePHAdd.XMM>;
  defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
                                  SchedWriteVecALU.XMM, memop>;
  defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
                                  SchedWriteVecALU.XMM, memop>;
  defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
                                  SchedWriteVecALU.XMM, memop>;
  defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
                              memop, i128mem, SchedWriteVarShuffle.XMM>;
  defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw",
                                   int_x86_ssse3_phadd_sw_128,
                                   SchedWritePHAdd.XMM, memop>;
  defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw",
                                   int_x86_ssse3_phsub_sw_128,
                                   SchedWritePHAdd.XMM, memop>;
  defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
                                 v16i8, VR128, memop, i128mem,
                                 SchedWriteVecIMul.XMM>;
}
defm PMULHRSW : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
                              VR128, memop, i128mem, SchedWriteVecIMul.XMM>;
}

//===---------------------------------------------------------------------===//
// SSSE3 - Packed Align Instruction Patterns
//===---------------------------------------------------------------------===//

// PALIGNR: concatenate $src2:$src1 and extract a byte-aligned window $src3.
multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let hasSideEffects = 0 in {
  def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst),
      (ins RC:$src1, RC:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 timm:$src3))))]>,
      Sched<[sched]>;
  let mayLoad = 1 in
  def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst),
      (ins RC:$src1, x86memop:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set RC:$dst, (VT (X86PAlignr RC:$src1,
                                     (memop_frag addr:$src2),
                                     (i8 timm:$src3))))]>,
      Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
  defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem,
                                SchedWriteShuffle.XMM, 0>, VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
  defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem,
                                 SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
  defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem,
                               SchedWriteShuffle.XMM>;

//===---------------------------------------------------------------------===//
// SSSE3 - Thread synchronization
//===---------------------------------------------------------------------===//

let SchedRW = [WriteSystem] in {
// MONITOR implicitly reads the address from [R|E]AX plus hints in ECX/EDX,
// so the 32- and 64-bit forms differ only in their implicit Uses.
let Uses = [EAX, ECX, EDX] in
def MONITOR32rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
                   TB, Requires<[HasSSE3, Not64BitMode]>;
let Uses = [RAX, ECX, EDX] in
def MONITOR64rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
                   TB, Requires<[HasSSE3, In64BitMode]>;

let Uses = [ECX, EAX] in
def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",
                [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
} // SchedRW

def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;

def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITOR32rrr)>,
      Requires<[Not64BitMode]>;
def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITOR64rrr)>,
      Requires<[In64BitMode]>;

//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Move with Sign/Zero Extend
// NOTE: Any Extend is promoted to Zero Extend in X86ISelDAGToDAG.cpp
//===----------------------------------------------------------------------===//

// One rr + rm pair for a single PMOVSX/PMOVZX width; patterns are attached
// separately below, so the defs themselves carry empty pattern lists.
multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
                            RegisterClass OutRC, RegisterClass InRC,
                            X86FoldableSchedWrite sched> {
  def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
                 Sched<[sched]>;

  def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
                 Sched<[sched.Folded]>;
}

// Instantiates the legacy SSE, VEX-128 and VEX-256 encodings of one widening
// move, gated by the supplied AVX-512 exclusion predicate.
multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
                              X86MemOperand MemOp, X86MemOperand MemYOp,
                              Predicate prd> {
  defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128,
                               SchedWriteShuffle.XMM>;
  let Predicates = [HasAVX, prd] in
    defm V#NAME : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
                                   VR128, VR128, SchedWriteShuffle.XMM>,
                                   VEX, VEX_WIG;
  let Predicates = [HasAVX2, prd] in
    defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
                                     VR256, VR128, WriteShuffle256>,
                                     VEX, VEX_L, VEX_WIG;
}

// PMOVZX opcodes are PMOVSX + 0x10 for each width.
multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
                          X86MemOperand MemYOp, Predicate prd> {
  defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
                                        MemOp, MemYOp, prd>;
  defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
                                        !strconcat("pmovzx", OpcodeStr),
                                        MemOp, MemYOp, prd>;
}

defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>;
defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>;

defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>;
defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;

defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;

// AVX2 Patterns
multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy,
                                     SDNode ExtOp, SDNode InVecOp> {
  // Register-Register patterns
  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
    def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
              (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
  }
  let Predicates = [HasAVX2, NoVLX] in {
    def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))),
              (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
    def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))),
              (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;

    def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
              (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
    def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))),
              (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;

    def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
              (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
  }

  // Simple Register-Memory patterns
  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
    def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
              (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;

    def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
              (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  }

  let Predicates = [HasAVX2, NoVLX] in {
    def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
              (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
    def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
              (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;

    def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
              (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
    def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
              (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;

    def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
              (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
  }

  // AVX2 Register-Memory patterns
  let Predicates = [HasAVX2, NoVLX] in {
    def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
              (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;

    def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
              (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
    def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
              (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
    def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
              (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;

    def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
              (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;

    def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
              (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
    def : Pat<(v4i64 (InVecOp (bc_v16i8 (v2i64 (X86vzload32 addr:$src))))),
              (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;

    def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
              (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
    def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
              (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
    def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
              (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  }
}

defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>;
defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>;

// SSE4.1/AVX patterns.
multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
                                SDNode ExtOp> {
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
    def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
              (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
    def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
              (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
    def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
              (!cast<I>(OpcPrefix#BQrr) VR128:$src)>;

    def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
              (!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
    def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
              (!cast<I>(OpcPrefix#WQrr) VR128:$src)>;

    def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
              (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;
  }
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
    def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
              (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
    def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
              (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
    def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
              (!cast<I>(OpcPrefix#BQrm) addr:$src)>;

    def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
              (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
    def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
              (!cast<I>(OpcPrefix#WQrm) addr:$src)>;

    def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
              (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
    def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
              (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
    def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
              (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
    def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
              (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
    def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))),
              (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
    def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
              (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
    def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
              (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
    def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))),
              (!cast<I>(OpcPrefix#BDrm) addr:$src)>;

    def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
              (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
    def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))),
              (!cast<I>(OpcPrefix#BQrm) addr:$src)>;

    def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
              (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
    def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
              (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
    def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
              (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
    def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))),
              (!cast<I>(OpcPrefix#WDrm) addr:$src)>;

    def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
              (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
    def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))),
              (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
    def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))),
              (!cast<I>(OpcPrefix#WQrm) addr:$src)>;

    def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
              (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
    def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
              (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
    def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
              (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
    def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))),
              (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  }
}

defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec>;
defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec>;

let Predicates = [UseSSE41] in {
  defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec>;
  defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec>;
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Extract Instructions
//===----------------------------------------------------------------------===//

/// SS41I_extract8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
                                                     timm:$src2))]>,
                   Sched<[WriteVecExtract]>;
  let hasSideEffects = 0, mayStore = 1 in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), timm:$src2))),
                           addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoBWI] in
  defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX, VEX_WIG;

defm PEXTRB : SS41I_extract8<0x14, "pextrb">;


/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
// The rr form only exists for disassembly; codegen uses PEXTRWrr (FoldGenData).
multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
                   Sched<[WriteVecExtract]>, FoldGenData<NAME#rr>;

  let hasSideEffects = 0, mayStore = 1 in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), timm:$src2))),
                           addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoBWI] in
  defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX, VEX_WIG;

defm PEXTRW : SS41I_extract16<0x15, "pextrw">;


/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR32:$dst,
                     (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
                   Sched<[WriteVecExtract]>;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
                           addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;

defm PEXTRD : SS41I_extract32<0x16, "pextrd">;

/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR64:$dst,
                     (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
                   Sched<[WriteVecExtract]>;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
                           addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;

defm PEXTRQ : SS41I_extract64<0x16, "pextrq">, REX_W;

/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
/// destination
multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR32orGR64:$dst,
                     (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
                   Sched<[WriteVecExtract]>;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
                           addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let ExeDomain = SSEPackedSingle in {
  let Predicates = [UseAVX] in
    defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, VEX_WIG;
  defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">;
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Insert Instructions
//===----------------------------------------------------------------------===//

// PINSRB: insert a byte from a GPR or 8-bit memory into lane $src3.
multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, GR32orGR64:$src2, timm:$src3))]>,
      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), timm:$src3))]>,
      Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoBWI] in
  defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V, VEX_WIG;
let Constraints = "$src1 = $dst" in
  defm PINSRB : SS41I_insert8<0x20, "pinsrb">;

// PINSRD: insert a dword from a GPR or 32-bit memory into lane $src3.
multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), imm:$src3)))]>,
      Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
let Constraints = "$src1 = $dst" in
  defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
/// SS41I_insert64 - insert a qword from a GPR into an XMM lane selected by
/// imm8. Shares opcode 0x22 with PINSRD; REX.W/VEX.W selects the 64-bit form.
multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR64:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), imm:$src3)))]>,
      Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
let Constraints = "$src1 = $dst" in
  defm PINSRQ  : SS41I_insert64<0x22, "pinsrq">, REX_W;

// insertps has a few different modes, there's the first two here below which
// are optimized inserts that won't zero arbitrary elements in the destination
// vector. The next one matches the intrinsic and could zero arbitrary elements
// in the target vector.
/// SS41I_insertf32 - the INSERTPS instruction: imm8 selects source lane,
/// destination lane, and a zero-mask.
multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, VR128:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86insertps VR128:$src1, VR128:$src2, timm:$src3))]>,
      Sched<[SchedWriteFShuffle.XMM]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, f32mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86insertps VR128:$src1,
                     (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
                     timm:$src3))]>,
      Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
}

let ExeDomain = SSEPackedSingle in {
  let Predicates = [UseAVX] in
    defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>,
                     VEX_4V, VEX_WIG;
  let Constraints = "$src1 = $dst" in
    defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>;
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Round Instructions
//===----------------------------------------------------------------------===//

/// sse41_fp_unop_p - packed round (ROUNDPS/ROUNDPD and VEX forms); imm8 is
/// the rounding-control operand.
multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
                           X86MemOperand x86memop, RegisterClass RC,
                           ValueType VT, PatFrag mem_frag, SDNode OpNode,
                           X86FoldableSchedWrite sched> {
  // Rounding reads MXCSR (for the imm8 "use MXCSR.RC" mode) and may raise
  // precision exceptions.
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  // Vector intrinsic operation, reg
  def r : SS4AIi8<opc, MRMSrcReg,
                  (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
                  !strconcat(OpcodeStr,
                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set RC:$dst, (VT (OpNode RC:$src1, timm:$src2)))]>,
                  Sched<[sched]>;

  // Vector intrinsic operation, mem
  def m : SS4AIi8<opc, MRMSrcMem,
                  (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
                  !strconcat(OpcodeStr,
                             "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set RC:$dst,
                        (VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>,
                  Sched<[sched.Folded]>;
}
}

/// avx_fp_unop_rm - VEX scalar round forms with an explicit merge source
/// ($src1); patterns are supplied separately, so these are isCodeGenOnly.
multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
                          string OpcodeStr, X86FoldableSchedWrite sched> {
let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
  def SSr : SS4AIi8<opcss, MRMSrcReg,
        (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
            "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SSm : SS4AIi8<opcss, MRMSrcMem,
        (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
             "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle, hasSideEffects = 0

let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
  def SDr : SS4AIi8<opcsd, MRMSrcReg,
        (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
              "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SDm : SS4AIi8<opcsd, MRMSrcMem,
        (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
             "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedDouble, hasSideEffects = 0
}

/// sse41_fp_unop_s - legacy (2-operand) scalar ROUNDSS/ROUNDSD register and
/// load forms; patterns are supplied separately below.
multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
                           string OpcodeStr, X86FoldableSchedWrite sched> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
  def SSr : SS4AIi8<opcss, MRMSrcReg,
                    (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SSm : SS4AIi8<opcss, MRMSrcMem,
                    (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle, hasSideEffects = 0

let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
  def SDr : SS4AIi8<opcsd, MRMSrcReg,
                    (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SDm : SS4AIi8<opcsd, MRMSrcMem,
                    (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedDouble, hasSideEffects = 0
}
}

/// sse41_fp_binop_s - intrinsic (VR128) scalar round forms that merge the
/// upper elements from $src1.
multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
                            string OpcodeStr, X86FoldableSchedWrite sched,
                            ValueType VT32, ValueType VT64,
                            SDNode OpNode, bit Is2Addr = 1> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
let ExeDomain = SSEPackedSingle in {
  def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
        Sched<[sched]>;

  def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
             (OpNode VR128:$src1, (sse_load_f32 addr:$src2), timm:$src3))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle

let ExeDomain = SSEPackedDouble in {
  def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
        Sched<[sched]>;

  def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
              (OpNode VR128:$src1, (sse_load_f64 addr:$src2), timm:$src3))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedDouble
}
}

// FP round - roundss, roundps, roundsd, roundpd
let Predicates = [HasAVX, NoVLX] in {
  let ExeDomain = SSEPackedSingle, Uses = [MXCSR], mayRaiseFPException = 1 in {
    // Intrinsic form
    defm VROUNDPS  : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32,
                                     loadv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>,
                                     VEX, VEX_WIG;
    defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32,
                                     loadv8f32, X86any_VRndScale, SchedWriteFRnd.YMM>,
                                     VEX, VEX_L, VEX_WIG;
  }

  let ExeDomain = SSEPackedDouble, Uses = [MXCSR], mayRaiseFPException = 1 in {
    defm VROUNDPD  : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64,
                                     loadv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>,
                                     VEX, VEX_WIG;
    defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64,
                                     loadv4f64, X86any_VRndScale, SchedWriteFRnd.YMM>,
                                     VEX, VEX_L, VEX_WIG;
  }
}
// Both defms reuse the VROUND prefix: the first emits the *_Int intrinsic
// forms, the second the isCodeGenOnly FR32/FR64 forms.
let Predicates = [UseAVX] in {
  defm VROUND  : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
                                  v4f32, v2f64, X86RndScales, 0>,
                                  VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC;
  defm VROUND  : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>,
                                VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC;
}

let Predicates = [UseAVX] in {
  def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>;
  def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>;
}

// Only fold the scalar round load when optimizing for size; otherwise the
// separate load is preferred.
let Predicates = [UseAVX, OptForSize] in {
  def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
            (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
  def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
            (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
}

let ExeDomain = SSEPackedSingle in
defm ROUNDPS  : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32,
                                memopv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>;
let ExeDomain = SSEPackedDouble in
defm ROUNDPD  : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64,
                                memopv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>;

defm ROUND  : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>;

let Constraints = "$src1 = $dst" in
defm ROUND  : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
                               v4f32, v2f64, X86RndScales>;

let Predicates = [UseSSE41] in {
  def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
            (ROUNDSSr FR32:$src1, timm:$src2)>;
  def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
            (ROUNDSDr FR64:$src1, timm:$src2)>;
}

let Predicates = [UseSSE41, OptForSize] in {
  def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
            (ROUNDSSm addr:$src1, timm:$src2)>;
  def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
            (ROUNDSDm addr:$src1, timm:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Bit Test
//===----------------------------------------------------------------------===//

// ptest instruction we'll lower to this in X86ISelLowering primarily from
// the intel intrinsic that corresponds to this.
// PTEST/VPTEST: set ZF/CF in EFLAGS from (src1 AND src2) / (src1 ANDN src2).
let Defs = [EFLAGS], Predicates = [HasAVX] in {
def VPTESTrr  : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
                Sched<[SchedWriteVecTest.XMM]>, VEX, VEX_WIG;
def VPTESTrm  : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
                Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>,
                VEX, VEX_WIG;

def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
                Sched<[SchedWriteVecTest.YMM]>, VEX, VEX_L, VEX_WIG;
def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
                Sched<[SchedWriteVecTest.YMM.Folded, SchedWriteVecTest.YMM.ReadAfterFold]>,
                VEX, VEX_L, VEX_WIG;
}

let Defs = [EFLAGS] in {
def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
              "ptest\t{$src2, $src1|$src1, $src2}",
              [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
              Sched<[SchedWriteVecTest.XMM]>;
def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
              "ptest\t{$src2, $src1|$src1, $src2}",
              [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
              Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>;
}

// The bit test instructions below are AVX only
multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, PatFrag mem_frag, ValueType vt,
                       X86FoldableSchedWrite sched> {
  def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
            [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
            Sched<[sched]>, VEX;
  def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
            [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
            Sched<[sched.Folded, sched.ReadAfterFold]>, VEX;
}

// VTESTPS/VTESTPD: sign-bit test of packed FP elements into ZF/CF.
let Defs = [EFLAGS], Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32,
                            SchedWriteFTest.XMM>;
defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32,
                            SchedWriteFTest.YMM>, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64,
                            SchedWriteFTest.XMM>;
defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64,
                            SchedWriteFTest.YMM>, VEX_L;
}
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Misc Instructions
//===----------------------------------------------------------------------===//

// POPCNT has its own CPUID bit; all forms also clobber EFLAGS.
let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
  def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
                     Sched<[WritePOPCNT]>, OpSize16, XS;
  def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
                      (implicit EFLAGS)]>,
                      Sched<[WritePOPCNT.Folded]>, OpSize16, XS;

  def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>,
                     Sched<[WritePOPCNT]>, OpSize32, XS;

  def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
                      (implicit EFLAGS)]>,
                      Sched<[WritePOPCNT.Folded]>, OpSize32, XS;

  def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>,
                      Sched<[WritePOPCNT]>, XS;
  def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
                       (implicit EFLAGS)]>,
                       Sched<[WritePOPCNT.Folded]>, XS;
}

// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
                                 SDNode OpNode, PatFrag ld_frag,
                                 X86FoldableSchedWrite Sched> {
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>,
                 Sched<[Sched]>;
  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                 (ins i128mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst,
                   (v8i16 (OpNode (ld_frag addr:$src))))]>,
                 Sched<[Sched.Folded]>;
}

// PHMIN has the same profile as PSAD, thus we use the same scheduling
// model, although the naming is misleading.
let Predicates = [HasAVX] in
defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw",
                                         X86phminpos, load,
                                         WritePHMINPOS>, VEX, VEX_WIG;
defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
                                        X86phminpos, memop,
                                        WritePHMINPOS>;

/// SS48I_binop_rm - Simple SSE41 binary operator.
// Two-operand SSE4.1 integer ALU template (reg/reg and reg/mem forms).
multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, X86FoldableSchedWrite sched,
                          bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
       Sched<[sched]>;
  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}

// AVX 128-bit forms. Dword min/max and pmuldq are superseded by EVEX under
// VLX; byte/word min/max additionally require no-BWI.
let Predicates = [HasAVX, NoVLX] in {
  defm VPMINSD  : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPMINUD  : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPMAXSD  : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPMAXUD  : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPMULDQ  : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128,
                                 load, i128mem, SchedWriteVecIMul.XMM, 0>,
                                 VEX_4V, VEX_WIG;
}
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPMINSB  : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPMINUW  : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPMAXSB  : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX_4V, VEX_WIG;
  defm VPMAXUW  : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX_4V, VEX_WIG;
}

// AVX2 256-bit forms.
let Predicates = [HasAVX2, NoVLX] in {
  defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
  defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
  defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
  defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
  defm VPMULDQY : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256,
                                 load, i256mem, SchedWriteVecIMul.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
  defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
  defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
  defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX_4V, VEX_L, VEX_WIG;
}

// Legacy SSE4.1 (destructive two-address) forms.
let Constraints = "$src1 = $dst" in {
  defm PMINSB   : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMINSD   : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMINUD   : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMINUW   : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXSB   : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXSD   : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXUD   : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXUW   : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMULDQ   : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128,
                                 memop, i128mem, SchedWriteVecIMul.XMM, 1>;
}

let Predicates = [HasAVX, NoVLX] in
  defm VPMULLD  : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
                                 load, i128mem, SchedWritePMULLD.XMM, 0>,
                                 VEX_4V, VEX_WIG;
let Predicates = [HasAVX] in
  defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX_4V, VEX_WIG;

let Predicates = [HasAVX2, NoVLX] in
  defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
                                  load, i256mem, SchedWritePMULLD.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;
let Predicates = [HasAVX2] in
  defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;

let Constraints = "$src1 = $dst" in {
  defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
                                memop, i128mem, SchedWritePMULLD.XMM, 1>;
  defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
                                memop, i128mem, SchedWriteVecALU.XMM, 1>;
}

/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
// Intrinsic-based variant: the pattern is an Intrinsic rather than an SDNode.
multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
                 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
                 X86MemOperand x86memop, bit Is2Addr,
                 X86FoldableSchedWrite sched> {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst, (IntId RC:$src1, RC:$src2, timm:$src3))]>,
        Sched<[sched]>;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
              (IntId RC:$src1, (memop_frag addr:$src2), timm:$src3))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
}

/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                           X86MemOperand x86memop, bit Is2Addr,
                           X86FoldableSchedWrite sched> {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
        Sched<[sched]>;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
              (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
}

// Immediate transforms used when commuting blend operands: inverting the
// (masked) select immediate swaps which source each lane comes from.
def BlendCommuteImm2 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue() & 0x03;
  return getI8Imm(Imm ^ 0x03, SDLoc(N));
}]>;

def BlendCommuteImm4 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue() & 0x0f;
  return getI8Imm(Imm ^ 0x0f, SDLoc(N));
}]>;

def BlendCommuteImm8 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue() & 0xff;
  return getI8Imm(Imm ^ 0xff, SDLoc(N));
}]>;

// Turn a 4-bit blendi immediate to 8-bit for use with pblendw.
def BlendScaleImm4 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 4; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0x3 << (i * 2);
  }
  return getI8Imm(NewImm, SDLoc(N));
}]>;

// Turn a 2-bit blendi immediate to 8-bit for use with pblendw.
def BlendScaleImm2 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 2; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0xf << (i * 4);
  }
  return getI8Imm(NewImm, SDLoc(N));
}]>;

// Turn a 2-bit blendi immediate to 4-bit for use with pblendd.
def BlendScaleImm2to4 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 2; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0x3 << (i * 2);
  }
  return getI8Imm(NewImm, SDLoc(N));
}]>;

// Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it.
def BlendScaleCommuteImm4 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 4; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0x3 << (i * 2);
  }
  return getI8Imm(NewImm ^ 0xff, SDLoc(N));
}]>;

// Turn a 2-bit blendi immediate to 8-bit for use with pblendw and invert it.
def BlendScaleCommuteImm2 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 2; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0xf << (i * 4);
  }
  return getI8Imm(NewImm ^ 0xff, SDLoc(N));
}]>;

// Turn a 2-bit blendi immediate to 4-bit for use with pblendd and invert it.
def BlendScaleCommuteImm2to4 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 2; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0x3 << (i * 2);
  }
  return getI8Imm(NewImm ^ 0xf, SDLoc(N));
}]>;

let Predicates = [HasAVX] in {
  let isCommutable = 0 in {
  defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
                                      VR128, load, i128mem, 0,
                                      SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG;
  }

let Uses = [MXCSR], mayRaiseFPException = 1 in {
  let ExeDomain = SSEPackedSingle in
  defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
                                   VR128, load, f128mem, 0,
                                   SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG;
  let ExeDomain = SSEPackedDouble in
  defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
                                   VR128, load, f128mem, 0,
                                   SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG;
  let ExeDomain = SSEPackedSingle in
  defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
                                    VR256, load, i256mem, 0,
                                    SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG;
}
}

let Predicates = [HasAVX2] in {
  let isCommutable = 0 in {
  defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
                                       VR256, load, i256mem, 0,
                                       SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, VEX_WIG;
  }
}

let Constraints = "$src1 = $dst" in {
  let isCommutable = 0 in {
  defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
                                     VR128, memop, i128mem, 1,
                                     SchedWriteMPSAD.XMM>;
  }

  let ExeDomain = SSEPackedSingle in
  defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
                                  VR128, memop, f128mem, 1,
                                  SchedWriteDPPS.XMM>, SIMD_EXC;
  let ExeDomain = SSEPackedDouble in
  defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
                                  VR128, memop, f128mem, 1,
                                  SchedWriteDPPD.XMM>, SIMD_EXC;
}

/// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate
multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                           X86MemOperand x86memop, bit Is2Addr, Domain d,
                           X86FoldableSchedWrite sched, SDNodeXForm commuteXForm> {
let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                       "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                       "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
        Sched<[sched]>;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                       "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                       "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
          (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
}

  // Pattern to commute if load is in first source.
  def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, timm:$src3)),
            (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
                                            (commuteXForm timm:$src3))>;
}

let Predicates = [HasAVX] in {
  defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
                                  VR128, load, f128mem, 0, SSEPackedSingle,
                                  SchedWriteFBlend.XMM, BlendCommuteImm4>,
                                  VEX_4V, VEX_WIG;
  defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
                                   VR256, load, f256mem, 0, SSEPackedSingle,
                                   SchedWriteFBlend.YMM, BlendCommuteImm8>,
                                   VEX_4V, VEX_L, VEX_WIG;
  defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
                                  VR128, load, f128mem, 0, SSEPackedDouble,
                                  SchedWriteFBlend.XMM, BlendCommuteImm2>,
                                  VEX_4V, VEX_WIG;
  defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
                                   VR256, load, f256mem, 0, SSEPackedDouble,
                                   SchedWriteFBlend.YMM, BlendCommuteImm4>,
                                   VEX_4V, VEX_L, VEX_WIG;
  defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
                                  VR128, load, i128mem, 0, SSEPackedInt,
                                  SchedWriteBlend.XMM, BlendCommuteImm8>,
                                  VEX_4V, VEX_WIG;
}

let Predicates = [HasAVX2] in {
  defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
                                   VR256, load, i256mem, 0, SSEPackedInt,
                                   SchedWriteBlend.YMM, BlendCommuteImm8>,
                                   VEX_4V, VEX_L, VEX_WIG;
}

// Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw.
// ExecutionDomainFixPass will cleanup domains later on.
let Predicates = [HasAVX1Only] in {
def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
          (VBLENDPDYrri VR256:$src1, VR256:$src2, timm:$src3)>;
def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
          (VBLENDPDYrmi VR256:$src1, addr:$src2, timm:$src3)>;
def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
          (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 timm:$src3))>;

// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
// it from becoming movsd via commuting under optsize.
def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
          (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;

def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), timm:$src3),
          (VBLENDPSYrri VR256:$src1, VR256:$src2, timm:$src3)>;
def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), timm:$src3),
          (VBLENDPSYrmi VR256:$src1, addr:$src2, timm:$src3)>;
def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, timm:$src3),
          (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 timm:$src3))>;

// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
// it from becoming movss via commuting under optsize.
def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
          (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), timm:$src3),
          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, timm:$src3),
          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
}

defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
                               VR128, memop, f128mem, 1, SSEPackedSingle,
                               SchedWriteFBlend.XMM, BlendCommuteImm4>;
defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64,
                               VR128, memop, f128mem, 1, SSEPackedDouble,
                               SchedWriteFBlend.XMM, BlendCommuteImm2>;
defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
                               VR128, memop, i128mem, 1, SSEPackedInt,
                               SchedWriteBlend.XMM, BlendCommuteImm8>;

let Predicates = [UseSSE41] in {
// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
// it from becoming movss via commuting under optsize.
def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
          (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), timm:$src3),
          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, timm:$src3),
          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;

def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
          (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), timm:$src3),
          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, timm:$src3),
          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
}

// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
let Predicates = [HasAVX] in {
def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)),
          (VBLENDPDYrri VR256:$src1,
                        (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0x3)>;
def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;

def : Pat<(insert_subvector (loadv4f64 addr:$src2), (v2f64 VR128:$src1), (iPTR 0)),
          (VBLENDPDYrmi (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xc)>;
def : Pat<(insert_subvector (loadv8f32 addr:$src2), (v4f32 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
}

/// SS41I_quaternary_vx - AVX SSE 4.1 with 4 operators
multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC,
                                X86MemOperand x86memop, ValueType VT,
                                PatFrag mem_frag, SDNode OpNode,
                                X86FoldableSchedWrite sched> {
  def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2, RC:$src3),
                  !strconcat(OpcodeStr,
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                  [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))],
                  SSEPackedInt>, TAPD, VEX_4V,
                  Sched<[sched]>;

  def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, x86memop:$src2, RC:$src3),
                  !strconcat(OpcodeStr,
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                  [(set RC:$dst,
                        (OpNode RC:$src3, (mem_frag addr:$src2),
                                RC:$src1))], SSEPackedInt>, TAPD, VEX_4V,
                  Sched<[sched.Folded, sched.ReadAfterFold,
                         // x86memop:$src2
                         ReadDefault, ReadDefault, ReadDefault, ReadDefault,
                         ReadDefault,
                         // RC::$src3
                         sched.ReadAfterFold]>;
}

let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedDouble in {
defm VBLENDVPD  : SS41I_quaternary_avx<0x4B, "vblendvpd", VR128, f128mem,
                                       v2f64, loadv2f64, X86Blendv,
                                       SchedWriteFVarBlend.XMM>;
defm VBLENDVPDY : SS41I_quaternary_avx<0x4B, "vblendvpd", VR256, f256mem,
                                       v4f64, loadv4f64, X86Blendv,
                                       SchedWriteFVarBlend.YMM>, VEX_L;
} // ExeDomain = SSEPackedDouble
let ExeDomain = SSEPackedSingle in {
defm VBLENDVPS  : SS41I_quaternary_avx<0x4A, "vblendvps", VR128, f128mem,
                                       v4f32, loadv4f32, X86Blendv,
                                       SchedWriteFVarBlend.XMM>;
defm VBLENDVPSY : SS41I_quaternary_avx<0x4A, "vblendvps", VR256, f256mem,
                                       v8f32, loadv8f32, X86Blendv,
                                       SchedWriteFVarBlend.YMM>, VEX_L;
} // ExeDomain = SSEPackedSingle
defm VPBLENDVB  : SS41I_quaternary_avx<0x4C, "vpblendvb", VR128, i128mem,
                                       v16i8, loadv16i8, X86Blendv,
                                       SchedWriteVarBlend.XMM>;
}

let Predicates = [HasAVX2] in {
defm VPBLENDVBY : SS41I_quaternary_avx<0x4C, "vpblendvb", VR256, i256mem,
                                       v32i8, loadv32i8, X86Blendv,
                                       SchedWriteVarBlend.YMM>, VEX_L;
}

let Predicates = [HasAVX] in {
  def : Pat<(v4i32 (X86Blendv (v4i32 VR128:$mask), (v4i32 VR128:$src1),
                              (v4i32 VR128:$src2))),
            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v2i64 (X86Blendv (v2i64 VR128:$mask), (v2i64 VR128:$src1),
                              (v2i64 VR128:$src2))),
            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v8i32 (X86Blendv (v8i32 VR256:$mask), (v8i32 VR256:$src1),
                              (v8i32 VR256:$src2))),
            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v4i64 (X86Blendv (v4i64 VR256:$mask), (v4i64 VR256:$src1),
                              (v4i64 VR256:$src2))),
            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
}

// Prefer a movss or movsd over a blendps when optimizing for size. these were
// changed to use blends because blends have better throughput on sandybridge
// and haswell, but movs[s/d] are 1-2 byte shorter instructions.
let Predicates = [HasAVX, OptForSpeed] in {
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;

  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;

  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;

  // Move low f32 and clear high bits.
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
                    (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
                    (i8 1))), sub_xmm)>;
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
                    (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
                    (i8 3))), sub_xmm)>;
}

// Prefer a movss or movsd over a blendps when optimizing for size. these were
// changed to use blends because blends have better throughput on sandybridge
// and haswell, but movs[s/d] are 1-2 byte shorter instructions.
let Predicates = [UseSSE41, OptForSpeed] in {
  // With SSE41 we can use blends for these patterns.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;

  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))),
            (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
            (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;

  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))),
            (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
            (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
}


/// SS41I_ternary - SSE 4.1 ternary operator
let Uses = [XMM0], Constraints = "$src1 = $dst" in {
  multiclass SS41I_ternary<bits<8> opc, string OpcodeStr, ValueType VT,
                           PatFrag mem_frag, X86MemOperand x86memop,
                           SDNode OpNode, X86FoldableSchedWrite sched> {
    def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2),
                    !strconcat(OpcodeStr,
                     "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
                    [(set VR128:$dst,
                      (VT (OpNode XMM0, VR128:$src2, VR128:$src1)))]>,
                    Sched<[sched]>;

    def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, x86memop:$src2),
                    !strconcat(OpcodeStr,
                     "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
                    [(set VR128:$dst,
                      (OpNode XMM0, (mem_frag addr:$src2), VR128:$src1))]>,
                    Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}

let ExeDomain = SSEPackedDouble in
defm BLENDVPD : SS41I_ternary<0x15, "blendvpd", v2f64, memopv2f64, f128mem,
                              X86Blendv, SchedWriteFVarBlend.XMM>;
let ExeDomain = SSEPackedSingle in
defm BLENDVPS : SS41I_ternary<0x14, "blendvps", v4f32, memopv4f32, f128mem,
                              X86Blendv, SchedWriteFVarBlend.XMM>;
defm PBLENDVB : SS41I_ternary<0x10, "pblendvb", v16i8, memopv16i8, i128mem,
                              X86Blendv, SchedWriteVarBlend.XMM>;

// Aliases with the implicit xmm0 argument
def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
                (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>;
def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
                (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>;
def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
                (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>;
def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
                (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>;
def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
                (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>;
def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
                (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>;

let Predicates = [UseSSE41] in {
  def : Pat<(v4i32 (X86Blendv (v4i32 XMM0), (v4i32 VR128:$src1),
                              (v4i32 VR128:$src2))),
            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v2i64 (X86Blendv (v2i64 XMM0), (v2i64 VR128:$src1),
                              (v2i64 VR128:$src2))),
            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
}

let AddedComplexity = 400 in { // Prefer non-temporal versions

let Predicates = [HasAVX, NoVLX] in
def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                        "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
                        Sched<[SchedWriteVecMoveLSNT.XMM.RM]>, VEX, VEX_WIG;
let Predicates = [HasAVX2, NoVLX] in
def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                         "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
                         Sched<[SchedWriteVecMoveLSNT.YMM.RM]>, VEX, VEX_L, VEX_WIG;
def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                       "movntdqa\t{$src, $dst|$dst, $src}", []>,
                       Sched<[SchedWriteVecMoveLSNT.XMM.RM]>;

let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(v8f32 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
  def : Pat<(v4f64 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
  def : Pat<(v4i64 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
  def : Pat<(v8i32 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
  def : Pat<(v16i16 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
  def : Pat<(v32i8 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
}

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4f32 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
  def : Pat<(v2f64 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
  def : Pat<(v2i64 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
  def : Pat<(v4i32 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
}

let Predicates = [UseSSE41] in {
  def : Pat<(v4f32 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
  def : Pat<(v2f64 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
  def : Pat<(v2i64 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
  def : Pat<(v4i32 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
}

} // AddedComplexity

//===----------------------------------------------------------------------===//
// SSE4.2 - Compare Instructions
//===----------------------------------------------------------------------===//

/// SS42I_binop_rm - Simple SSE 4.2 binary operator
multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, X86FoldableSchedWrite sched,
                          bit Is2Addr = 1> {
  def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2),
                  !if(Is2Addr,
                      !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                      !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                  [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
                  Sched<[sched]>;
  def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, x86memop:$src2),
                  !if(Is2Addr,
                      !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                      !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                  [(set RC:$dst,
                    (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
                  Sched<[sched.Folded, sched.ReadAfterFold]>;
}

let Predicates = [HasAVX] in
  defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX_4V, VEX_WIG;

let Predicates = [HasAVX2] in
  defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX_4V, VEX_L, VEX_WIG;

let Constraints = "$src1 = $dst" in
  defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
                                memop, i128mem, SchedWriteVecALU.XMM>;

//===----------------------------------------------------------------------===//
// SSE4.2 - String/text Processing Instructions
//===----------------------------------------------------------------------===//

multiclass pcmpistrm_SS42AI<string asm> {
  def rr : SS42AI<0x62, MRMSrcReg, (outs),
                  (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                  !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
                  []>, Sched<[WritePCmpIStrM]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x62, MRMSrcMem, (outs),
                  (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
                  !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
                  []>, Sched<[WritePCmpIStrM.Folded, WritePCmpIStrM.ReadAfterFold]>;
}

let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX, VEX_WIG;
  defm PCMPISTRM  : pcmpistrm_SS42AI<"pcmpistrm">;
}

multiclass SS42AI_pcmpestrm<string asm> {
  def rr : SS42AI<0x60, MRMSrcReg, (outs),
                  (ins VR128:$src1, VR128:$src3, u8imm:$src5),
                  !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
                  []>, Sched<[WritePCmpEStrM]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x60, MRMSrcMem, (outs),
                  (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
                  !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
                  []>, Sched<[WritePCmpEStrM.Folded, WritePCmpEStrM.ReadAfterFold]>;
}

let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX, VEX_WIG;
  defm PCMPESTRM  : SS42AI_pcmpestrm<"pcmpestrm">;
}

multiclass SS42AI_pcmpistri<string asm> {
  def rr : SS42AI<0x63, MRMSrcReg, (outs),
                  (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                  !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
                  []>, Sched<[WritePCmpIStrI]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x63, MRMSrcMem, (outs),
                  (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
                  !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
                  []>, Sched<[WritePCmpIStrI.Folded, WritePCmpIStrI.ReadAfterFold]>;
}

let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX, VEX_WIG;
  defm PCMPISTRI  : SS42AI_pcmpistri<"pcmpistri">;
}

multiclass SS42AI_pcmpestri<string asm> {
  def rr : SS42AI<0x61, MRMSrcReg, (outs),
                  (ins VR128:$src1, VR128:$src3, u8imm:$src5),
                  !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
                  []>, Sched<[WritePCmpEStrI]>;
  let mayLoad = 1 in
  def rm : SS42AI<0x61, MRMSrcMem, (outs),
                  (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
                  !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
                  []>, Sched<[WritePCmpEStrI.Folded, WritePCmpEStrI.ReadAfterFold]>;
}

let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX, VEX_WIG;
  defm PCMPESTRI  : SS42AI_pcmpestri<"pcmpestri">;
}

//===----------------------------------------------------------------------===//
// SSE4.2 - CRC Instructions
//===----------------------------------------------------------------------===//

// No CRC instructions have AVX equivalents

// crc intrinsic instruction
// This set of instructions are only rm, the only difference is the size
// of r and m.
class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
                   RegisterClass RCIn, SDPatternOperator Int> :
      SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
             !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
             [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))]>,
      Sched<[WriteCRC32]>;

class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
                   X86MemOperand x86memop, SDPatternOperator Int> :
      SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
             !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
             [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))]>,
      Sched<[WriteCRC32.Folded, WriteCRC32.ReadAfterFold]>;

let Constraints = "$src1 = $dst" in {
  def CRC32r32m8  : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
                                 int_x86_sse42_crc32_32_8>;
  def CRC32r32r8  : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
                                 int_x86_sse42_crc32_32_8>;
  def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
                                 int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
                                 int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
                                 int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
                                 int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
                                 int_x86_sse42_crc32_64_64>, REX_W;
  def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
                                 int_x86_sse42_crc32_64_64>, REX_W;
  let hasSideEffects = 0 in {
    let mayLoad = 1 in
    def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
                                  null_frag>, REX_W;
    def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
                                  null_frag>, REX_W;
  }
}

//===----------------------------------------------------------------------===//
// SHA-NI Instructions
//===----------------------------------------------------------------------===//

// FIXME: Is there a better scheduler class for SHA than WriteVecIMul?
multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
                      X86FoldableSchedWrite sched, bit UsesXMM0 = 0> {
  def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2),
             !if(UsesXMM0,
                 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
             [!if(UsesXMM0,
                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>,
             T8PS, Sched<[sched]>;

  def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !if(UsesXMM0,
                 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
             [!if(UsesXMM0,
                  (set VR128:$dst, (IntId VR128:$src1,
                                    (memop addr:$src2), XMM0)),
                  (set VR128:$dst, (IntId VR128:$src1,
                                    (memop addr:$src2))))]>, T8PS,
             Sched<[sched.Folded, sched.ReadAfterFold]>;
}

let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
  def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
                         (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         [(set VR128:$dst,
                           (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
                                              (i8 timm:$src3)))]>, TAPS,
                         Sched<[SchedWriteVecIMul.XMM]>;
  def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
                         (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         [(set VR128:$dst,
                           (int_x86_sha1rnds4 VR128:$src1,
                                              (memop addr:$src2),
                                              (i8 timm:$src3)))]>, TAPS,
                         Sched<[SchedWriteVecIMul.XMM.Folded,
                                SchedWriteVecIMul.XMM.ReadAfterFold]>;

  defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte,
                              SchedWriteVecIMul.XMM>;
  defm SHA1MSG1  : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1,
                              SchedWriteVecIMul.XMM>;
  defm SHA1MSG2  : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2,
                              SchedWriteVecIMul.XMM>;

  let Uses=[XMM0] in
  defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2,
                                SchedWriteVecIMul.XMM, 1>;

  defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1,
                               SchedWriteVecIMul.XMM>;
  defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2,
                               SchedWriteVecIMul.XMM>;
}

// Aliases with explicit %xmm0
def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
                (SHA256RNDS2rr VR128:$dst, VR128:$src2), 0>;
def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
                (SHA256RNDS2rm VR128:$dst, i128mem:$src2), 0>;

//===----------------------------------------------------------------------===//
// AES-NI Instructions
//===----------------------------------------------------------------------===//

multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
                             Intrinsic IntId, PatFrag ld_frag,
                             bit Is2Addr = 0, RegisterClass RC = VR128,
                             X86MemOperand MemOp = i128mem> {
  let AsmString = OpcodeStr#
                  !if(Is2Addr, "\t{$src2, $dst|$dst, $src2}",
                               "\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
    def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst),
                   (ins RC:$src1, RC:$src2), "",
                   [(set RC:$dst, (IntId RC:$src1, RC:$src2))]>,
                   Sched<[WriteAESDecEnc]>;
    def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst),
                   (ins RC:$src1, MemOp:$src2), "",
                   [(set RC:$dst, (IntId RC:$src1, (ld_frag addr:$src2)))]>,
                   Sched<[WriteAESDecEnc.Folded, WriteAESDecEnc.ReadAfterFold]>;
  }
}

// Perform One Round of an AES Encryption/Decryption Flow
let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in {
  defm VAESENC     : AESI_binop_rm_int<0xDC, "vaesenc",
                         int_x86_aesni_aesenc, load>, VEX_4V, VEX_WIG;
  defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast",
                         int_x86_aesni_aesenclast, load>, VEX_4V, VEX_WIG;
  defm VAESDEC     : AESI_binop_rm_int<0xDE, "vaesdec",
                         int_x86_aesni_aesdec, load>, VEX_4V, VEX_WIG;
  defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast",
                         int_x86_aesni_aesdeclast, load>, VEX_4V, VEX_WIG;
}

let Predicates = [NoVLX, HasVAES] in {
  defm VAESENCY     : AESI_binop_rm_int<0xDC, "vaesenc",
                          int_x86_aesni_aesenc_256, load, 0, VR256,
                          i256mem>, VEX_4V, VEX_L, VEX_WIG;
  defm VAESENCLASTY : AESI_binop_rm_int<0xDD, "vaesenclast",
                          int_x86_aesni_aesenclast_256, load, 0, VR256,
                          i256mem>, VEX_4V, VEX_L, VEX_WIG;
  defm VAESDECY     : AESI_binop_rm_int<0xDE, "vaesdec",
                          int_x86_aesni_aesdec_256, load, 0, VR256,
                          i256mem>, VEX_4V, VEX_L, VEX_WIG;
  defm VAESDECLASTY : AESI_binop_rm_int<0xDF, "vaesdeclast",
                          int_x86_aesni_aesdeclast_256, load, 0, VR256,
                          i256mem>, VEX_4V, VEX_L, VEX_WIG;
}

let Constraints = "$src1 = $dst" in {
  defm AESENC     : AESI_binop_rm_int<0xDC, "aesenc",
                        int_x86_aesni_aesenc, memop, 1>;
  defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast",
                        int_x86_aesni_aesenclast, memop, 1>;
  defm AESDEC     : AESI_binop_rm_int<0xDE, "aesdec",
                        int_x86_aesni_aesdec, memop, 1>;
  defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast",
                        int_x86_aesni_aesdeclast, memop, 1>;
}

// Perform the AES InvMixColumn Transformation
let Predicates = [HasAVX, HasAES] in {
  def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
                        (ins VR128:$src1),
                        "vaesimc\t{$src1, $dst|$dst, $src1}",
                        [(set VR128:$dst,
                          (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
                        VEX, VEX_WIG;
  def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
                        (ins i128mem:$src1),
                        "vaesimc\t{$src1, $dst|$dst, $src1}",
                        [(set VR128:$dst, (int_x86_aesni_aesimc (load addr:$src1)))]>,
                        Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG;
}
def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
                     (ins VR128:$src1),
                     "aesimc\t{$src1, $dst|$dst, $src1}",
                     [(set VR128:$dst,
                       (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>;
def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
                     (ins i128mem:$src1),
                     "aesimc\t{$src1, $dst|$dst, $src1}",
                     [(set VR128:$dst, (int_x86_aesni_aesimc (memop addr:$src1)))]>,
                     Sched<[WriteAESIMC.Folded]>;

// AES Round Key Generation Assist
let Predicates = [HasAVX, HasAES] in {
  def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, u8imm:$src2),
      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set VR128:$dst,
        (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
      Sched<[WriteAESKeyGen]>, VEX, VEX_WIG;
  def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
      (ins i128mem:$src1, u8imm:$src2),
      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set VR128:$dst,
        (int_x86_aesni_aeskeygenassist (load addr:$src1), timm:$src2))]>,
      Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG;
}
def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
    (ins VR128:$src1, u8imm:$src2),
    "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
    [(set VR128:$dst,
      (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
    Sched<[WriteAESKeyGen]>;
def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
    (ins i128mem:$src1, u8imm:$src2),
    "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
    [(set VR128:$dst,
      (int_x86_aesni_aeskeygenassist (memop addr:$src1), timm:$src2))]>,
    Sched<[WriteAESKeyGen.Folded]>;

//===----------------------------------------------------------------------===//
// PCLMUL Instructions
6799//===----------------------------------------------------------------------===// 6800 6801// Immediate transform to help with commuting. 6802def PCLMULCommuteImm : SDNodeXForm<timm, [{ 6803 uint8_t Imm = N->getZExtValue(); 6804 return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N)); 6805}]>; 6806 6807// SSE carry-less Multiplication instructions 6808let Predicates = [NoAVX, HasPCLMUL] in { 6809 let Constraints = "$src1 = $dst" in { 6810 let isCommutable = 1 in 6811 def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), 6812 (ins VR128:$src1, VR128:$src2, u8imm:$src3), 6813 "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", 6814 [(set VR128:$dst, 6815 (int_x86_pclmulqdq VR128:$src1, VR128:$src2, timm:$src3))]>, 6816 Sched<[WriteCLMul]>; 6817 6818 def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), 6819 (ins VR128:$src1, i128mem:$src2, u8imm:$src3), 6820 "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", 6821 [(set VR128:$dst, 6822 (int_x86_pclmulqdq VR128:$src1, (memop addr:$src2), 6823 timm:$src3))]>, 6824 Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>; 6825 } // Constraints = "$src1 = $dst" 6826 6827 def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1, 6828 (i8 timm:$src3)), 6829 (PCLMULQDQrm VR128:$src1, addr:$src2, 6830 (PCLMULCommuteImm timm:$src3))>; 6831} // Predicates = [NoAVX, HasPCLMUL] 6832 6833// SSE aliases 6834foreach HI = ["hq","lq"] in 6835foreach LO = ["hq","lq"] in { 6836 def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}", 6837 (PCLMULQDQrr VR128:$dst, VR128:$src, 6838 !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>; 6839 def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}", 6840 (PCLMULQDQrm VR128:$dst, i128mem:$src, 6841 !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>; 6842} 6843 6844// AVX carry-less Multiplication instructions 6845multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp, 6846 PatFrag LdFrag, Intrinsic IntId> { 6847 let isCommutable = 1 in 6848 def 
rr : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst), 6849 (ins RC:$src1, RC:$src2, u8imm:$src3), 6850 "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 6851 [(set RC:$dst, 6852 (IntId RC:$src1, RC:$src2, timm:$src3))]>, 6853 Sched<[WriteCLMul]>; 6854 6855 def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst), 6856 (ins RC:$src1, MemOp:$src2, u8imm:$src3), 6857 "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 6858 [(set RC:$dst, 6859 (IntId RC:$src1, (LdFrag addr:$src2), timm:$src3))]>, 6860 Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>; 6861 6862 // We can commute a load in the first operand by swapping the sources and 6863 // rotating the immediate. 6864 def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 timm:$src3)), 6865 (!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2, 6866 (PCLMULCommuteImm timm:$src3))>; 6867} 6868 6869let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in 6870defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, load, 6871 int_x86_pclmulqdq>, VEX_4V, VEX_WIG; 6872 6873let Predicates = [NoVLX, HasVPCLMULQDQ] in 6874defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load, 6875 int_x86_pclmulqdq_256>, VEX_4V, VEX_L, VEX_WIG; 6876 6877multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC, 6878 X86MemOperand MemOp, string Hi, string Lo> { 6879 def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}", 6880 (!cast<Instruction>(InstStr # "rr") RC:$dst, RC:$src1, RC:$src2, 6881 !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>; 6882 def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}", 6883 (!cast<Instruction>(InstStr # "rm") RC:$dst, RC:$src1, MemOp:$src2, 6884 !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>; 6885} 6886 6887multiclass vpclmulqdq_aliases<string InstStr, RegisterClass RC, 6888 X86MemOperand MemOp> { 6889 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "hq">; 6890 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "lq">; 6891 defm : 
vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "hq">; 6892 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "lq">; 6893} 6894 6895// AVX aliases 6896defm : vpclmulqdq_aliases<"VPCLMULQDQ", VR128, i128mem>; 6897defm : vpclmulqdq_aliases<"VPCLMULQDQY", VR256, i256mem>; 6898 6899//===----------------------------------------------------------------------===// 6900// SSE4A Instructions 6901//===----------------------------------------------------------------------===// 6902 6903let Predicates = [HasSSE4A] in { 6904 6905let ExeDomain = SSEPackedInt in { 6906let Constraints = "$src = $dst" in { 6907def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst), 6908 (ins VR128:$src, u8imm:$len, u8imm:$idx), 6909 "extrq\t{$idx, $len, $src|$src, $len, $idx}", 6910 [(set VR128:$dst, (X86extrqi VR128:$src, timm:$len, 6911 timm:$idx))]>, 6912 PD, Sched<[SchedWriteVecALU.XMM]>; 6913def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst), 6914 (ins VR128:$src, VR128:$mask), 6915 "extrq\t{$mask, $src|$src, $mask}", 6916 [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src, 6917 VR128:$mask))]>, 6918 PD, Sched<[SchedWriteVecALU.XMM]>; 6919 6920def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst), 6921 (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx), 6922 "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}", 6923 [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2, 6924 timm:$len, timm:$idx))]>, 6925 XD, Sched<[SchedWriteVecALU.XMM]>; 6926def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst), 6927 (ins VR128:$src, VR128:$mask), 6928 "insertq\t{$mask, $src|$src, $mask}", 6929 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src, 6930 VR128:$mask))]>, 6931 XD, Sched<[SchedWriteVecALU.XMM]>; 6932} 6933} // ExeDomain = SSEPackedInt 6934 6935// Non-temporal (unaligned) scalar stores. 
let AddedComplexity = 400 in { // Prefer non-temporal versions
// No patterns on the defs themselves; the Pats below select them.
let hasSideEffects = 0, mayStore = 1, SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in {
def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
                "movntss\t{$src, $dst|$dst, $src}", []>, XS;

def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                "movntsd\t{$src, $dst|$dst, $src}", []>, XD;
} // SchedRW

// Scalar FR32/FR64 non-temporal stores: copy into a VR128 so the MOVNTS*
// definitions above (which take VR128 sources) can be used.
def : Pat<(nontemporalstore FR32:$src, addr:$dst),
          (MOVNTSS addr:$dst, (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;

def : Pat<(nontemporalstore FR64:$src, addr:$dst),
          (MOVNTSD addr:$dst, (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;

} // AddedComplexity
} // HasSSE4A

//===----------------------------------------------------------------------===//
// AVX Instructions
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// VBROADCAST - Load from memory and broadcast to all elements of the
// destination operand
//
// Memory-source broadcast: load one element via bcast_frag and splat it
// across RC.
class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, ValueType VT,
                       PatFrag bcast_frag, SchedWrite Sched> :
  AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
        !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
        [(set RC:$dst, (VT (bcast_frag addr:$src)))]>,
  Sched<[Sched]>, VEX;

// AVX2 adds register forms
class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
                        ValueType ResVT, ValueType OpVT, SchedWrite Sched> :
  AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
         [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
  Sched<[Sched]>, VEX;

let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in {
  def VBROADCASTSSrm  : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
                                         f32mem, v4f32, X86VBroadcastld32,
                                         SchedWriteFShuffle.XMM.Folded>;
  def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
                                         f32mem, v8f32, X86VBroadcastld32,
                                         SchedWriteFShuffle.XMM.Folded>, VEX_L;
}
let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in
def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
                                       v4f64, X86VBroadcastld64,
                                       SchedWriteFShuffle.XMM.Folded>, VEX_L;

// Register-source broadcasts require AVX2.
let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in {
  def VBROADCASTSSrr  : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
                                          v4f32, v4f32, SchedWriteFShuffle.XMM>;
  def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
                                          v8f32, v4f32, WriteFShuffle256>, VEX_L;
}
let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in
def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
                                        v4f64, v2f64, WriteFShuffle256>, VEX_L;

//===----------------------------------------------------------------------===//
// VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both
// halves of a 256-bit vector.
//
// Defined without patterns (mayLoad/hasSideEffects only); selection happens
// through the X86SubVBroadcastld128 patterns below.
let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in
def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
                           (ins i128mem:$src),
                           "vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
                           Sched<[WriteShuffleLd]>, VEX, VEX_L;

let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX],
    ExeDomain = SSEPackedSingle in
def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
                           (ins f128mem:$src),
                           "vbroadcastf128\t{$src, $dst|$dst, $src}", []>,
                           Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L;

let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4f64 (X86SubVBroadcastld128 addr:$src)),
          (VBROADCASTF128 addr:$src)>;
def : Pat<(v8f32 (X86SubVBroadcastld128 addr:$src)),
          (VBROADCASTF128 addr:$src)>;
// NOTE: We're using FP instructions here, but execution domain fixing can
// convert to integer when profitable.
def : Pat<(v4i64 (X86SubVBroadcastld128 addr:$src)),
          (VBROADCASTF128 addr:$src)>;
def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)),
          (VBROADCASTF128 addr:$src)>;
def : Pat<(v16i16 (X86SubVBroadcastld128 addr:$src)),
          (VBROADCASTF128 addr:$src)>;
def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)),
          (VBROADCASTF128 addr:$src)>;
}

//===----------------------------------------------------------------------===//
// VINSERTF128 - Insert packed floating-point values
//
let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
      (ins VR256:$src1, VR128:$src2, u8imm:$src3),
      "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
      []>, Sched<[WriteFShuffle256]>, VEX_4V, VEX_L;
let mayLoad = 1 in
def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
      (ins VR256:$src1, f128mem:$src2, u8imm:$src3),
      "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
      []>, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>, VEX_4V, VEX_L;
}

// To create a 256-bit all ones value, we should produce VCMPTRUEPS
// with YMM register containing zero.
// FIXME: Avoid producing vxorps to clear the fake inputs.
let Predicates = [HasAVX1Only] in {
def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>;
}

// Select vinsert128_insert nodes onto the rr/rm forms of InstrStr, deriving
// the lane immediate from the insertion index.
multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To,
                            PatFrag memop_frag> {
  def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2),
                                    (iPTR imm)),
            (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2,
                                             (INSERT_get_vinsert128_imm VR256:$ins))>;
  def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
                                    (From (memop_frag addr:$src2)),
                                    (iPTR imm)),
            (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
                                             (INSERT_get_vinsert128_imm VR256:$ins))>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm : vinsert_lowering<"VINSERTF128", v4f32, v8f32, loadv4f32>;
  defm : vinsert_lowering<"VINSERTF128", v2f64, v4f64, loadv2f64>;
}

// Without AVX2, integer inserts also go through the FP instruction.
let Predicates = [HasAVX1Only] in {
  defm : vinsert_lowering<"VINSERTF128", v2i64, v4i64, loadv2i64>;
  defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32, loadv4i32>;
  defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv8i16>;
  defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8, loadv16i8>;
}

//===----------------------------------------------------------------------===//
// VEXTRACTF128 - Extract packed floating-point values
//
let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
          (ins VR256:$src1, u8imm:$src2),
          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          []>, Sched<[WriteFShuffle256]>, VEX, VEX_L;
let mayStore = 1 in
def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
          (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          []>, Sched<[WriteFStoreX]>, VEX, VEX_L;
}

// Select vextract128_extract nodes onto the rr form, and extract+store onto
// the mr form, deriving the lane immediate from the extraction index.
multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
  def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
            (To (!cast<Instruction>(InstrStr#rr)
                                    (From VR256:$src1),
                                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
  def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1),
                                                 (iPTR imm))), addr:$dst),
            (!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1,
                                             (EXTRACT_get_vextract128_imm VR128:$ext))>;
}

// AVX1 patterns
let Predicates = [HasAVX, NoVLX] in {
  defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>;
  defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>;
}

let Predicates = [HasAVX1Only] in {
  defm : vextract_lowering<"VEXTRACTF128", v4i64, v2i64>;
  defm : vextract_lowering<"VEXTRACTF128", v8i32, v4i32>;
  defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
  defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>;
}

//===----------------------------------------------------------------------===//
// VMASKMOV - Conditional SIMD Packed Loads and Stores
//
// rm/Yrm are the masked loads (mask in $src1), mr/Ymr the masked stores
// (mask in $src1, data in $src2), in 128- and 256-bit widths.
multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
                          Intrinsic IntLd, Intrinsic IntLd256,
                          Intrinsic IntSt, Intrinsic IntSt256,
                          X86SchedWriteMaskMove schedX,
                          X86SchedWriteMaskMove schedY> {
  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, f128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
             VEX_4V, Sched<[schedX.RM]>;
  def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, f256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
             VEX_4V, VEX_L, Sched<[schedY.RM]>;
  def mr  : AVX8I<opc_mr, MRMDestMem, (outs),
             (ins f128mem:$dst, VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>,
             VEX_4V, Sched<[schedX.MR]>;
  def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
             (ins f256mem:$dst, VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
             VEX_4V, VEX_L, Sched<[schedY.MR]>;
}

let ExeDomain = SSEPackedSingle in
defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
                                 int_x86_avx_maskload_ps,
                                 int_x86_avx_maskload_ps_256,
                                 int_x86_avx_maskstore_ps,
                                 int_x86_avx_maskstore_ps_256,
                                 WriteFMaskMove32, WriteFMaskMove32Y>;
let ExeDomain = SSEPackedDouble in
defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
                                 int_x86_avx_maskload_pd,
                                 int_x86_avx_maskload_pd_256,
                                 int_x86_avx_maskstore_pd,
                                 int_x86_avx_maskstore_pd_256,
                                 WriteFMaskMove64, WriteFMaskMove64Y>;

//===----------------------------------------------------------------------===//
// AVX_VNNI
//===----------------------------------------------------------------------===//
// Three-source dot-product accumulate; $src1 is the accumulator and is tied
// to $dst via the Constraints below.
let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI], Constraints = "$src1 = $dst" in
multiclass avx_vnni_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                       bit IsCommutable> {
  let isCommutable = IsCommutable in
  def rr  : AVX8I<opc, MRMSrcReg, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2, VR128:$src3),
             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
             [(set VR128:$dst, (v4i32 (OpNode VR128:$src1,
                                       VR128:$src2, VR128:$src3)))]>,
             VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;

  def rm  : AVX8I<opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2, i128mem:$src3),
             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
             [(set VR128:$dst, (v4i32 (OpNode VR128:$src1, VR128:$src2,
                                      (loadv4i32 addr:$src3))))]>,
             VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;

  let isCommutable = IsCommutable in
  def Yrr : AVX8I<opc, MRMSrcReg, (outs VR256:$dst),
             (ins VR256:$src1, VR256:$src2, VR256:$src3),
             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
             [(set VR256:$dst, (v8i32 (OpNode VR256:$src1,
                                       VR256:$src2, VR256:$src3)))]>,
             VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.XMM]>;

  def Yrm : AVX8I<opc, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, VR256:$src2, i256mem:$src3),
             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
             [(set VR256:$dst, (v8i32 (OpNode VR256:$src1, VR256:$src2,
                                      (loadv8i32 addr:$src3))))]>,
             VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.XMM]>;
}

defm VPDPBUSD  : avx_vnni_rm<0x50, "vpdpbusd", X86Vpdpbusd, 0>, ExplicitVEXPrefix;
defm VPDPBUSDS : avx_vnni_rm<0x51, "vpdpbusds", X86Vpdpbusds, 0>, ExplicitVEXPrefix;
defm VPDPWSSD  : avx_vnni_rm<0x52, "vpdpwssd", X86Vpdpwssd, 1>, ExplicitVEXPrefix;
defm VPDPWSSDS : avx_vnni_rm<0x53, "vpdpwssds", X86Vpdpwssds, 1>, ExplicitVEXPrefix;

// X86vpmaddwd restricted to a single use, so the add+pmaddwd patterns below
// only fire when the pmaddwd result is not needed elsewhere.
def X86vpmaddwd_su : PatFrag<(ops node:$lhs, node:$rhs),
                             (X86vpmaddwd node:$lhs, node:$rhs), [{
  return N->hasOneUse();
}]>;

// Fuse add(acc, pmaddwd(a, b)) into vpdpwssd.
let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI] in {
  def : Pat<(v8i32 (add VR256:$src1,
                        (X86vpmaddwd_su VR256:$src2, VR256:$src3))),
            (VPDPWSSDYrr VR256:$src1, VR256:$src2, VR256:$src3)>;
  def : Pat<(v8i32 (add VR256:$src1,
                        (X86vpmaddwd_su VR256:$src2, (load addr:$src3)))),
            (VPDPWSSDYrm VR256:$src1, VR256:$src2, addr:$src3)>;
  def : Pat<(v4i32 (add VR128:$src1,
                        (X86vpmaddwd_su VR128:$src2, VR128:$src3))),
            (VPDPWSSDrr VR128:$src1, VR128:$src2, VR128:$src3)>;
  def : Pat<(v4i32 (add VR128:$src1,
                        (X86vpmaddwd_su VR128:$src2, (load addr:$src3)))),
            (VPDPWSSDrm VR128:$src1, VR128:$src2, addr:$src3)>;
}

//===----------------------------------------------------------------------===//
// VPERMIL - Permute Single and Double Floating-Point Values
//

// rr/rm are the variable (vector-control) forms, ri/mi the immediate forms.
multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
                      RegisterClass RC, X86MemOperand x86memop_f,
                      X86MemOperand x86memop_i,
                      ValueType f_vt, ValueType i_vt,
                      X86FoldableSchedWrite sched,
                      X86FoldableSchedWrite varsched> {
  let Predicates = [HasAVX, NoVLX] in {
    def rr  : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
               (ins RC:$src1, RC:$src2),
               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V,
               Sched<[varsched]>;
    def rm  : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
               (ins RC:$src1, x86memop_i:$src2),
               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
                              (i_vt (load addr:$src2)))))]>, VEX_4V,
               Sched<[varsched.Folded, sched.ReadAfterFold]>;

    def ri  : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
               (ins RC:$src1, u8imm:$src2),
               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 timm:$src2))))]>, VEX,
               Sched<[sched]>;
    def mi  : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
               (ins x86memop_f:$src1, u8imm:$src2),
               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set RC:$dst,
                 (f_vt (X86VPermilpi (load addr:$src1), (i8 timm:$src2))))]>, VEX,
               Sched<[sched.Folded]>;
  }// Predicates = [HasAVX, NoVLX]
}

let ExeDomain = SSEPackedSingle in {
  defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
                               v4f32, v4i32, SchedWriteFShuffle.XMM,
                               SchedWriteFVarShuffle.XMM>;
  defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
                               v8f32, v8i32, SchedWriteFShuffle.YMM,
                               SchedWriteFVarShuffle.YMM>, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
  defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
                               v2f64, v2i64, SchedWriteFShuffle.XMM,
                               SchedWriteFVarShuffle.XMM>;
  defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
                               v4f64, v4i64, SchedWriteFShuffle.YMM,
                               SchedWriteFVarShuffle.YMM>, VEX_L;
}

//===----------------------------------------------------------------------===//
// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
//

let ExeDomain = SSEPackedSingle in {
let isCommutable = 1 in
def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR256:$src2, u8imm:$src3),
          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
          VEX_4V, VEX_L, Sched<[WriteFShuffle256]>;
def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
          VEX_4V, VEX_L, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
}

// Immediate transform to help with commuting.
// XOR with 0x22 flips bit 1 of each lane selector, i.e. which source each
// selector refers to, which is what swapping the two sources requires.
def Perm2XCommuteImm : SDNodeXForm<timm, [{
  return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N));
}]>;

multiclass vperm2x128_lowering<string InstrStr, ValueType VT, PatFrag memop_frag> {
  def : Pat<(VT (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))),
            (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR256:$src2, timm:$imm)>;
  def : Pat<(VT (X86VPerm2x128 VR256:$src1, (memop_frag addr:$src2), (i8 timm:$imm))),
            (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2, timm:$imm)>;
  // Pattern with load in other operand.
  def : Pat<(VT (X86VPerm2x128 (memop_frag addr:$src2), VR256:$src1, (i8 timm:$imm))),
            (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
                                             (Perm2XCommuteImm timm:$imm))>;
}

let Predicates = [HasAVX] in {
  defm : vperm2x128_lowering<"VPERM2F128", v4f64, loadv4f64>;
  defm : vperm2x128_lowering<"VPERM2F128", v8f32, loadv8f32>;
}

let Predicates = [HasAVX1Only] in {
  defm : vperm2x128_lowering<"VPERM2F128", v4i64, loadv4i64>;
  defm : vperm2x128_lowering<"VPERM2F128", v8i32, loadv8i32>;
  defm : vperm2x128_lowering<"VPERM2F128", v16i16, loadv16i16>;
  defm : vperm2x128_lowering<"VPERM2F128", v32i8, loadv32i8>;
}

//===----------------------------------------------------------------------===//
// VZERO - Zero YMM registers
// Note: These instructions do not affect YMM16-YMM31.
//

let SchedRW = [WriteSystem] in {
let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
            YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
  // Zero All YMM registers
  def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
                   [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L,
                   Requires<[HasAVX]>, VEX_WIG;

  // Zero Upper bits of YMM registers
  def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
                     [(int_x86_avx_vzeroupper)]>, PS, VEX,
                     Requires<[HasAVX]>, VEX_WIG;
} // Defs
} // SchedRW

//===----------------------------------------------------------------------===//
// Half precision conversion instructions
//

// vcvtph2ps: half -> single conversion. The rm form carries no pattern
// (hasSideEffects = 0, mayLoad = 1); memory patterns are added separately.
multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop,
                      X86FoldableSchedWrite sched> {
  def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
             "vcvtph2ps\t{$src, $dst|$dst, $src}",
             [(set RC:$dst, (X86any_cvtph2ps VR128:$src))]>,
             T8PD, VEX, Sched<[sched]>;
  let hasSideEffects = 0, mayLoad = 1 in
  def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
             "vcvtph2ps\t{$src, $dst|$dst, $src}",
             []>, T8PD, VEX, Sched<[sched.Folded]>;
}

// vcvtps2ph: single -> half conversion with a rounding-control immediate.
multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
                      SchedWrite RR, SchedWrite MR> {
  def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
               (ins RC:$src1, i32u8imm:$src2),
               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               [(set VR128:$dst, (X86any_cvtps2ph RC:$src1, timm:$src2))]>,
               TAPD, VEX, Sched<[RR]>;
  let hasSideEffects = 0, mayStore = 1 in
  def mr : Ii8<0x1D, MRMDestMem, (outs),
               (ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
               TAPD, VEX, Sched<[MR]>;
}

let Predicates = [HasF16C, NoVLX] in {
  defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>, SIMD_EXC;
  defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L, SIMD_EXC;
  defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH,
                               WriteCvtPS2PHSt>, SIMD_EXC;
  defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY,
                               WriteCvtPS2PHYSt>, VEX_L, SIMD_EXC;

  // Pattern match vcvtph2ps of a scalar i64 load.
  def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
            (VCVTPH2PSrm addr:$src)>;
  def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16
                    (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (VCVTPH2PSrm addr:$src)>;
  def : Pat<(v8f32 (X86any_cvtph2ps (loadv8i16 addr:$src))),
            (VCVTPH2PSYrm addr:$src)>;

  // Fold the scalar extract+store of a cvtps2ph result into the memory form.
  def : Pat<(store (f64 (extractelt
                         (bc_v2f64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))),
                         (iPTR 0))), addr:$dst),
            (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
  def : Pat<(store (i64 (extractelt
                         (bc_v2i64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))),
                         (iPTR 0))), addr:$dst),
            (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
  def : Pat<(store (v8i16 (X86any_cvtps2ph VR256:$src1, timm:$src2)), addr:$dst),
            (VCVTPS2PHYmr addr:$dst, VR256:$src1, timm:$src2)>;
}

//===----------------------------------------------------------------------===//
// AVX2 Instructions
//===----------------------------------------------------------------------===//

/// AVX2_blend_rmi - AVX2 blend with 8-bit immediate
// rri/rmi forms plus a commute pattern for a load in the first source;
// commuteXForm rewrites the selector immediate for the swapped operands.
multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, X86FoldableSchedWrite sched,
                          RegisterClass RC,
                          X86MemOperand x86memop, SDNodeXForm commuteXForm> {
  let isCommutable = 1 in
  def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
             (ins RC:$src1, RC:$src2, u8imm:$src3),
             !strconcat(OpcodeStr,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
             [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
             Sched<[sched]>, VEX_4V;
  def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
             (ins RC:$src1, x86memop:$src2, u8imm:$src3),
             !strconcat(OpcodeStr,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
             [(set RC:$dst,
               (OpVT (OpNode RC:$src1, (load addr:$src2), timm:$src3)))]>,
             Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V;

  // Pattern to commute if load is in first source.
  def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, timm:$src3)),
            (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
                                            (commuteXForm timm:$src3))>;
}

let Predicates = [HasAVX2] in {
defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
                               SchedWriteBlend.XMM, VR128, i128mem,
                               BlendCommuteImm4>;
defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
                                SchedWriteBlend.YMM, VR256, i256mem,
                                BlendCommuteImm8>, VEX_L;

// 64-bit-element blends are expressed with vpblendd by scaling each i64
// selector bit to two i32 selector bits (BlendScale*Imm* transforms).
def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
          (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 timm:$src3))>;
def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
          (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
          (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;

def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
          (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 timm:$src3))>;
def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
          (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 timm:$src3))>;
def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
          (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 timm:$src3))>;
}

// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
// NOTE: We're using FP instructions here, but execution domain fixing should
// take care of using integer instructions when profitable.
let Predicates = [HasAVX] in {
// Low-half insert: widen the 128-bit value via IMPLICIT_DEF and blend its
// lanes in with immediate 0xf (low four i32 lanes from the second source).
def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;

// Same, with the 256-bit operand loaded from memory: immediate 0xf0 keeps
// the low lanes from the register and takes the high lanes from memory.
def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
}

//===----------------------------------------------------------------------===//
// VPBROADCAST - Load from memory and broadcast to all elements of the
// destination operand
//
// rr/rm broadcast into VR128, Yrr/Yrm into VR256 (both take a VR128 or
// memory source); prd gates each instantiation on the right AVX512 overlap.
multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
                          X86MemOperand x86memop, PatFrag bcast_frag,
                          ValueType OpVT128, ValueType OpVT256, Predicate prd> {
  let Predicates = [HasAVX2, prd] in {
    def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst,
                      (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
                    Sched<[SchedWriteShuffle.XMM]>, VEX;
    def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst,
                      (OpVT128 (bcast_frag addr:$src)))]>,
                    Sched<[SchedWriteShuffle.XMM.Folded]>, VEX;
    def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR256:$dst,
                       (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>,
                     Sched<[WriteShuffle256]>, VEX, VEX_L;
    def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR256:$dst,
                       (OpVT256 (bcast_frag addr:$src)))]>,
                     Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L;

    // Provide aliases for broadcast from the same register class that
    // automatically does the extract.
    def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))),
              (!cast<Instruction>(NAME#"Yrr")
                 (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>;
  }
}

defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, X86VBroadcastld8,
                                   v16i8, v32i8, NoVLX_Or_NoBWI>;
defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, X86VBroadcastld16,
                                   v8i16, v16i16, NoVLX_Or_NoBWI>;
defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, X86VBroadcastld32,
                                   v4i32, v8i32, NoVLX>;
defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, X86VBroadcastld64,
                                   v2i64, v4i64, NoVLX>;

let Predicates = [HasAVX2, NoVLX] in {
  // Provide fallback in case the load node that is used in the patterns above
  // is used by additional users, which prevents the pattern selection.
  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
            (VBROADCASTSSrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
            (VBROADCASTSSYrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
            (VBROADCASTSDYrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
}

// GR8/GR16 broadcasts: widen to i32 via INSERT_SUBREG, move to a vector
// register with VMOVDI2PDIrr, then broadcast the low element.
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
            (VPBROADCASTBrr (VMOVDI2PDIrr
                             (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                                 GR8:$src, sub_8bit))))>;
  def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
            (VPBROADCASTBYrr (VMOVDI2PDIrr
                              (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                                  GR8:$src, sub_8bit))))>;

  def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
            (VPBROADCASTWrr (VMOVDI2PDIrr
                             (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                                 GR16:$src, sub_16bit))))>;
  def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
            (VPBROADCASTWYrr (VMOVDI2PDIrr
                              (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                                  GR16:$src, sub_16bit))))>;
}
let Predicates = [HasAVX2, NoVLX] in {
  def
: Pat<(v4i32 (X86VBroadcast GR32:$src)), 7572 (VPBROADCASTDrr (VMOVDI2PDIrr GR32:$src))>; 7573 def : Pat<(v8i32 (X86VBroadcast GR32:$src)), 7574 (VPBROADCASTDYrr (VMOVDI2PDIrr GR32:$src))>; 7575 def : Pat<(v2i64 (X86VBroadcast GR64:$src)), 7576 (VPBROADCASTQrr (VMOV64toPQIrr GR64:$src))>; 7577 def : Pat<(v4i64 (X86VBroadcast GR64:$src)), 7578 (VPBROADCASTQYrr (VMOV64toPQIrr GR64:$src))>; 7579} 7580 7581// AVX1 broadcast patterns 7582let Predicates = [HasAVX1Only] in { 7583def : Pat<(v8i32 (X86VBroadcastld32 addr:$src)), 7584 (VBROADCASTSSYrm addr:$src)>; 7585def : Pat<(v4i64 (X86VBroadcastld64 addr:$src)), 7586 (VBROADCASTSDYrm addr:$src)>; 7587def : Pat<(v4i32 (X86VBroadcastld32 addr:$src)), 7588 (VBROADCASTSSrm addr:$src)>; 7589} 7590 7591 // Provide fallback in case the load node that is used in the patterns above 7592 // is used by additional users, which prevents the pattern selection. 7593let Predicates = [HasAVX, NoVLX] in { 7594 // 128bit broadcasts: 7595 def : Pat<(v2f64 (X86VBroadcast f64:$src)), 7596 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>; 7597 def : Pat<(v2f64 (X86VBroadcastld64 addr:$src)), 7598 (VMOVDDUPrm addr:$src)>; 7599 7600 def : Pat<(v2f64 (X86VBroadcast v2f64:$src)), 7601 (VMOVDDUPrr VR128:$src)>; 7602} 7603 7604let Predicates = [HasAVX1Only] in { 7605 def : Pat<(v4f32 (X86VBroadcast FR32:$src)), 7606 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>; 7607 def : Pat<(v8f32 (X86VBroadcast FR32:$src)), 7608 (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), 7609 (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm), 7610 (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>; 7611 def : Pat<(v4f64 (X86VBroadcast FR64:$src)), 7612 (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), 7613 (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm), 7614 (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>; 7615 7616 def : Pat<(v4i32 
(X86VBroadcast GR32:$src)), 7617 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)>; 7618 def : Pat<(v8i32 (X86VBroadcast GR32:$src)), 7619 (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7620 (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), sub_xmm), 7621 (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), 1)>; 7622 def : Pat<(v4i64 (X86VBroadcast GR64:$src)), 7623 (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), 7624 (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), sub_xmm), 7625 (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), 1)>; 7626 7627 def : Pat<(v2i64 (X86VBroadcast i64:$src)), 7628 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)>; 7629 def : Pat<(v2i64 (X86VBroadcastld64 addr:$src)), 7630 (VMOVDDUPrm addr:$src)>; 7631} 7632 7633//===----------------------------------------------------------------------===// 7634// VPERM - Permute instructions 7635// 7636 7637multiclass avx2_perm<bits<8> opc, string OpcodeStr, 7638 ValueType OpVT, X86FoldableSchedWrite Sched, 7639 X86MemOperand memOp> { 7640 let Predicates = [HasAVX2, NoVLX] in { 7641 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), 7642 (ins VR256:$src1, VR256:$src2), 7643 !strconcat(OpcodeStr, 7644 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7645 [(set VR256:$dst, 7646 (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>, 7647 Sched<[Sched]>, VEX_4V, VEX_L; 7648 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), 7649 (ins VR256:$src1, memOp:$src2), 7650 !strconcat(OpcodeStr, 7651 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7652 [(set VR256:$dst, 7653 (OpVT (X86VPermv VR256:$src1, 7654 (load addr:$src2))))]>, 7655 Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX_4V, VEX_L; 7656 } 7657} 7658 7659defm VPERMD : avx2_perm<0x36, "vpermd", v8i32, WriteVarShuffle256, i256mem>; 7660let ExeDomain = SSEPackedSingle in 7661defm VPERMPS : avx2_perm<0x16, "vpermps", v8f32, WriteFVarShuffle256, f256mem>; 7662 7663multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, 7664 ValueType OpVT, 
X86FoldableSchedWrite Sched, 7665 X86MemOperand memOp> { 7666 let Predicates = [HasAVX2, NoVLX] in { 7667 def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst), 7668 (ins VR256:$src1, u8imm:$src2), 7669 !strconcat(OpcodeStr, 7670 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7671 [(set VR256:$dst, 7672 (OpVT (X86VPermi VR256:$src1, (i8 timm:$src2))))]>, 7673 Sched<[Sched]>, VEX, VEX_L; 7674 def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst), 7675 (ins memOp:$src1, u8imm:$src2), 7676 !strconcat(OpcodeStr, 7677 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7678 [(set VR256:$dst, 7679 (OpVT (X86VPermi (mem_frag addr:$src1), 7680 (i8 timm:$src2))))]>, 7681 Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VEX_L; 7682 } 7683} 7684 7685defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64, 7686 WriteShuffle256, i256mem>, VEX_W; 7687let ExeDomain = SSEPackedDouble in 7688defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64, 7689 WriteFShuffle256, f256mem>, VEX_W; 7690 7691//===----------------------------------------------------------------------===// 7692// VPERM2I128 - Permute Integer vector Values in 128-bit chunks 7693// 7694let isCommutable = 1 in 7695def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst), 7696 (ins VR256:$src1, VR256:$src2, u8imm:$src3), 7697 "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>, 7698 Sched<[WriteShuffle256]>, VEX_4V, VEX_L; 7699def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst), 7700 (ins VR256:$src1, f256mem:$src2, u8imm:$src3), 7701 "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>, 7702 Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L; 7703 7704let Predicates = [HasAVX2] in { 7705 defm : vperm2x128_lowering<"VPERM2I128", v4i64, loadv4i64>; 7706 defm : vperm2x128_lowering<"VPERM2I128", v8i32, loadv8i32>; 7707 defm : vperm2x128_lowering<"VPERM2I128", v16i16, loadv16i16>; 7708 defm : vperm2x128_lowering<"VPERM2I128", v32i8, 
loadv32i8>; 7709} 7710 7711//===----------------------------------------------------------------------===// 7712// VINSERTI128 - Insert packed integer values 7713// 7714let hasSideEffects = 0 in { 7715def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst), 7716 (ins VR256:$src1, VR128:$src2, u8imm:$src3), 7717 "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 7718 []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L; 7719let mayLoad = 1 in 7720def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst), 7721 (ins VR256:$src1, i128mem:$src2, u8imm:$src3), 7722 "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 7723 []>, Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L; 7724} 7725 7726let Predicates = [HasAVX2, NoVLX] in { 7727 defm : vinsert_lowering<"VINSERTI128", v2i64, v4i64, loadv2i64>; 7728 defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32, loadv4i32>; 7729 defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv8i16>; 7730 defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8, loadv16i8>; 7731} 7732 7733//===----------------------------------------------------------------------===// 7734// VEXTRACTI128 - Extract packed integer values 7735// 7736def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst), 7737 (ins VR256:$src1, u8imm:$src2), 7738 "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, 7739 Sched<[WriteShuffle256]>, VEX, VEX_L; 7740let hasSideEffects = 0, mayStore = 1 in 7741def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs), 7742 (ins i128mem:$dst, VR256:$src1, u8imm:$src2), 7743 "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, 7744 Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_L; 7745 7746let Predicates = [HasAVX2, NoVLX] in { 7747 defm : vextract_lowering<"VEXTRACTI128", v4i64, v2i64>; 7748 defm : vextract_lowering<"VEXTRACTI128", v8i32, v4i32>; 7749 defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>; 7750 defm : 
vextract_lowering<"VEXTRACTI128", v32i8, v16i8>; 7751} 7752 7753//===----------------------------------------------------------------------===// 7754// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores 7755// 7756multiclass avx2_pmovmask<string OpcodeStr, 7757 Intrinsic IntLd128, Intrinsic IntLd256, 7758 Intrinsic IntSt128, Intrinsic IntSt256, 7759 X86SchedWriteMaskMove schedX, 7760 X86SchedWriteMaskMove schedY> { 7761 def rm : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst), 7762 (ins VR128:$src1, i128mem:$src2), 7763 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7764 [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>, 7765 VEX_4V, Sched<[schedX.RM]>; 7766 def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst), 7767 (ins VR256:$src1, i256mem:$src2), 7768 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7769 [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>, 7770 VEX_4V, VEX_L, Sched<[schedY.RM]>; 7771 def mr : AVX28I<0x8e, MRMDestMem, (outs), 7772 (ins i128mem:$dst, VR128:$src1, VR128:$src2), 7773 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7774 [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>, 7775 VEX_4V, Sched<[schedX.MR]>; 7776 def Ymr : AVX28I<0x8e, MRMDestMem, (outs), 7777 (ins i256mem:$dst, VR256:$src1, VR256:$src2), 7778 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7779 [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, 7780 VEX_4V, VEX_L, Sched<[schedY.MR]>; 7781} 7782 7783defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd", 7784 int_x86_avx2_maskload_d, 7785 int_x86_avx2_maskload_d_256, 7786 int_x86_avx2_maskstore_d, 7787 int_x86_avx2_maskstore_d_256, 7788 WriteVecMaskMove32, WriteVecMaskMove32Y>; 7789defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq", 7790 int_x86_avx2_maskload_q, 7791 int_x86_avx2_maskload_q_256, 7792 int_x86_avx2_maskstore_q, 7793 int_x86_avx2_maskstore_q_256, 7794 WriteVecMaskMove64, WriteVecMaskMove64Y>, VEX_W; 7795 7796multiclass 
maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT, 7797 ValueType MaskVT> { 7798 // masked store 7799 def: Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)), 7800 (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>; 7801 // masked load 7802 def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)), 7803 (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>; 7804 def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), 7805 (VT immAllZerosV))), 7806 (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>; 7807} 7808let Predicates = [HasAVX] in { 7809 defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32>; 7810 defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64>; 7811 defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32>; 7812 defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64>; 7813} 7814let Predicates = [HasAVX1Only] in { 7815 // load/store i32/i64 not supported use ps/pd version 7816 defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32>; 7817 defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64>; 7818 defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32>; 7819 defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64>; 7820} 7821let Predicates = [HasAVX2] in { 7822 defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32>; 7823 defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64>; 7824 defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32>; 7825 defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64>; 7826} 7827 7828//===----------------------------------------------------------------------===// 7829// Variable Bit Shifts 7830// 7831multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, 7832 ValueType vt128, ValueType vt256> { 7833 def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), 7834 (ins VR128:$src1, VR128:$src2), 7835 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7836 [(set VR128:$dst, 7837 (vt128 (OpNode 
VR128:$src1, (vt128 VR128:$src2))))]>, 7838 VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>; 7839 def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), 7840 (ins VR128:$src1, i128mem:$src2), 7841 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7842 [(set VR128:$dst, 7843 (vt128 (OpNode VR128:$src1, 7844 (vt128 (load addr:$src2)))))]>, 7845 VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded, 7846 SchedWriteVarVecShift.XMM.ReadAfterFold]>; 7847 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), 7848 (ins VR256:$src1, VR256:$src2), 7849 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7850 [(set VR256:$dst, 7851 (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>, 7852 VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>; 7853 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), 7854 (ins VR256:$src1, i256mem:$src2), 7855 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7856 [(set VR256:$dst, 7857 (vt256 (OpNode VR256:$src1, 7858 (vt256 (load addr:$src2)))))]>, 7859 VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded, 7860 SchedWriteVarVecShift.YMM.ReadAfterFold]>; 7861} 7862 7863let Predicates = [HasAVX2, NoVLX] in { 7864 defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", X86vshlv, v4i32, v8i32>; 7865 defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", X86vshlv, v2i64, v4i64>, VEX_W; 7866 defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", X86vsrlv, v4i32, v8i32>; 7867 defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", X86vsrlv, v2i64, v4i64>, VEX_W; 7868 defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", X86vsrav, v4i32, v8i32>; 7869} 7870 7871//===----------------------------------------------------------------------===// 7872// VGATHER - GATHER Operations 7873 7874// FIXME: Improve scheduling of gather instructions. 
// AVX2 gather: loads elements from memory at $src2's per-element addresses,
// under control of $mask, merging into $src1. Both forms have two results:
// the gathered data ($dst) and the written-back mask ($mask_wb) — the
// hardware clears mask bits as elements complete. No ISel patterns here
// (empty pattern lists); gathers are selected by custom lowering.
// MTx/MTy default to the data types and give the mask types for the FP forms.
multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx,
                       ValueType VTy, RegisterClass RC256,
                       X86MemOperand memop128, X86MemOperand memop256,
                       ValueType MTx = VTx, ValueType MTy = VTy> {
let mayLoad = 1, hasSideEffects = 0 in {
  def rm  : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
            (ins VR128:$src1, memop128:$src2, VR128:$mask),
            !strconcat(OpcodeStr,
              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
            []>, VEX, Sched<[WriteLoad]>;
  def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
            (ins RC256:$src1, memop256:$src2, RC256:$mask),
            !strconcat(OpcodeStr,
              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
            []>, VEX, VEX_L, Sched<[WriteLoad]>;
}
}

let Predicates = [HasAVX2] in {
  // The ISA requires destination, index, and mask registers to be distinct;
  // this is modeled with @earlyclobber plus the tied src/dst constraints.
  let mayLoad = 1, hasSideEffects = 0, Constraints
    = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
    in {
    defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64,
                                  VR256, vx128mem, vx256mem>, VEX_W;
    defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64,
                                  VR256, vx128mem, vy256mem>, VEX_W;
    defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32,
                                  VR256, vx128mem, vy256mem>;
    // QD: 256-bit indices select only four 32-bit elements, so the wide
    // result is still a VR128.
    defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32,
                                  VR128, vx64mem, vy128mem>;

    let ExeDomain = SSEPackedDouble in {
      defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64,
                                    VR256, vx128mem, vx256mem, v2i64, v4i64>, VEX_W;
      defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64,
                                    VR256, vx128mem, vy256mem, v2i64, v4i64>, VEX_W;
    }

    let ExeDomain = SSEPackedSingle in {
      defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32,
                                    VR256, vx128mem, vy256mem, v4i32, v8i32>;
      defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32,
                                    VR128, vx64mem, vy128mem, v4i32, v4i32>;
    }
  }
}

//===----------------------------------------------------------------------===//
// GFNI instructions
//===----------------------------------------------------------------------===//

// GF(2^8) multiply. Is2Addr selects the legacy SSE two-operand asm string;
// the AsmString is set via `let` because the instruction bodies pass "".
multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
                        RegisterClass RC, PatFrag MemOpFrag,
                        X86MemOperand X86MemOp, bit Is2Addr = 0> {
  let ExeDomain = SSEPackedInt,
      AsmString = !if(Is2Addr,
        OpcodeStr#"\t{$src2, $dst|$dst, $src2}",
        OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
    let isCommutable = 1 in
    def rr : PDI<0xCF, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
                 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
             Sched<[SchedWriteVecALU.XMM]>, T8PD;

    def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
                 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1,
                                       (MemOpFrag addr:$src2))))]>,
             Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>, T8PD;
  }
}

// GF(2^8) affine transform (optionally with inverse), with an 8-bit
// immediate ($src3). Same Is2Addr/AsmString scheme as GF2P8MULB_rm.
multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
                           SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag,
                           X86MemOperand X86MemOp, bit Is2Addr = 0> {
  let AsmString = !if(Is2Addr,
        OpStr#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
        OpStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in {
    def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2, u8imm:$src3), "",
                  [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))],
                  SSEPackedInt>, Sched<[SchedWriteVecALU.XMM]>;
    def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
                  [(set RC:$dst, (OpVT (OpNode RC:$src1,
                                        (MemOpFrag addr:$src2),
                                        timm:$src3)))], SSEPackedInt>,
                  Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>;
  }
}

// Instantiates the legacy SSE form (two-address, tied $src1 = $dst) plus the
// VEX 128/256-bit forms for one affine opcode.
multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
  let Constraints = "$src1 = $dst",
      Predicates  = [HasGFNI, UseSSE2] in
  defm NAME         : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
                                      VR128, load, i128mem, 1>;
  let Predicates  = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
    defm V#NAME    : GF2P8AFFINE_rmi<Op, "v"#OpStr, v16i8, OpNode, VR128,
                                     load, i128mem>, VEX_4V, VEX_W;
    defm V#NAME#Y  : GF2P8AFFINE_rmi<Op, "v"#OpStr, v32i8, OpNode, VR256,
                                     load, i256mem>, VEX_4V, VEX_L, VEX_W;
  }
}

// GF2P8MULB
let Constraints = "$src1 = $dst",
    Predicates  = [HasGFNI, UseSSE2] in
defm GF2P8MULB      : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop,
                                   i128mem, 1>;
let Predicates  = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
  defm VGF2P8MULB   : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load,
                                   i128mem>, VEX_4V;
  defm VGF2P8MULBY  : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load,
                                   i256mem>, VEX_4V, VEX_L;
}
// GF2P8AFFINEINVQB, GF2P8AFFINEQB
let isCommutable = 0 in {
  defm GF2P8AFFINEINVQB : GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb",
                                             X86GF2P8affineinvqb>, TAPD;
  defm GF2P8AFFINEQB    : GF2P8AFFINE_common<0xCE, "gf2p8affineqb",
                                             X86GF2P8affineqb>, TAPD;
}
