//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file describes the X86 SSE instruction set, defining the instructions
// and the properties of the instructions that are needed for code generation,
// machine code emission, and analysis.
//
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// SSE 1 & 2 Instruction Classes
//===----------------------------------------------------------------------===//

/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, X86MemOperand x86memop,
                           Domain d, X86FoldableSchedWrite sched,
                           bit Is2Addr = 1> {
let isCodeGenOnly = 1 in {
  let isCommutable = 1 in {
    def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
                !if(Is2Addr,
                    !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                    !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], d>,
                Sched<[sched]>;
  }
  def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>,
              Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}

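// Illustration (the "addss" mnemonic below is only an assumed example; the
// real mnemonic comes from the OpcodeStr parameter at instantiation time):
// the Is2Addr bit in this and the following multiclasses only selects the
// assembly string. Is2Addr = 1 prints the tied two-operand SSE form,
//   addss  %xmm1, %xmm0            # AT&T: $dst is also $src1
// while Is2Addr = 0 prints the three-operand form used by the VEX-encoded
// AVX variants,
//   addss  %xmm2, %xmm1, %xmm0     # $dst, $src1, $src2 all explicit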
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 82 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 83 [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))], 84 d>, 85 Sched<[sched.Folded, sched.ReadAfterFold]>; 86} 87 88/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class 89multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d, 90 string OpcodeStr, X86MemOperand x86memop, 91 X86FoldableSchedWrite sched, 92 list<dag> pat_rr, list<dag> pat_rm, 93 bit Is2Addr = 1> { 94 let isCommutable = 1, hasSideEffects = 0 in 95 def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), 96 !if(Is2Addr, 97 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 98 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 99 pat_rr, d>, 100 Sched<[sched]>; 101 let hasSideEffects = 0, mayLoad = 1 in 102 def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 103 !if(Is2Addr, 104 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 105 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 106 pat_rm, d>, 107 Sched<[sched.Folded, sched.ReadAfterFold]>; 108} 109 110 111// Alias instructions that map fld0 to xorps for sse or vxorps for avx. 112// This is expanded by ExpandPostRAPseudos. 113let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, 114 isPseudo = 1, SchedRW = [WriteZero] in { 115 def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "", 116 [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>; 117 def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "", 118 [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2, NoAVX512]>; 119} 120 121//===----------------------------------------------------------------------===// 122// AVX & SSE - Zero/One Vectors 123//===----------------------------------------------------------------------===// 124 125// Alias instruction that maps zero vector to pxor / xorp* for sse. 126// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then 127// swizzled by ExecutionDomainFix to pxor. 128// We set canFoldAsLoad because this can be converted to a constant-pool 129// load of an all-zeros value if folding it would be beneficial. 130let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, 131 isPseudo = 1, SchedRW = [WriteZero] in { 132def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "", 133 [(set VR128:$dst, (v4f32 immAllZerosV))]>; 134} 135 136let Predicates = [NoAVX512] in 137def : Pat<(v4i32 immAllZerosV), (V_SET0)>; 138 139 140// The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support PI, 141// and doesn't need it because on sandy bridge the register is set to zero 142// at the rename stage without using any execution unit, so SET0PSY 143// and SET0PDY can be used for vector int instructions without penalty 144let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, 145 isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in { 146def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "", 147 [(set VR256:$dst, (v8i32 immAllZerosV))]>; 148} 149 150// We set canFoldAsLoad because this can be converted to a constant-pool 151// load of an all-ones value if folding it would be beneficial. 
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-ones value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
  let Predicates = [HasAVX1Only, OptForMinSize] in {
  def AVX1_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
  }
  let Predicates = [HasAVX2] in
  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move FP Scalar Instructions
//
// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
// register copies because it's a partial register update; register-to-register
// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
// that the insert be implementable in terms of a copy, and, as just mentioned,
// we don't use movss/movsd for copies.
//===----------------------------------------------------------------------===//

multiclass sse12_move_rr<SDNode OpNode, ValueType vt,
                         X86MemOperand x86memop, string base_opc,
                         string asm_opr, Domain d, string Name> {
  let isCommutable = 1 in
  def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src1, VR128:$src2),
              !strconcat(base_opc, asm_opr),
              [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], d>,
              Sched<[SchedWriteFShuffle.XMM]>;

  // For the disassembler
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                  (ins VR128:$src1, VR128:$src2),
                  !strconcat(base_opc, asm_opr), []>,
                  Sched<[SchedWriteFShuffle.XMM]>, FoldGenData<Name#rr>;
}

multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
                      X86MemOperand x86memop, string OpcodeStr,
                      Domain d, string Name, Predicate pred> {
  // AVX
  let Predicates = [UseAVX, OptForSize] in
  defm V#NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d,
                              "V"#Name>,
                              VEX_4V, VEX_LIG, VEX_WIG;

  def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(store RC:$src, addr:$dst)], d>,
                     VEX, VEX_LIG, Sched<[WriteFStore]>, VEX_WIG;
  // SSE1 & 2
  let Constraints = "$src1 = $dst" in {
    let Predicates = [pred, NoSSE41_Or_OptForSize] in
    defm NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
                              "\t{$src2, $dst|$dst, $src2}", d, Name>;
  }

  def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(store RC:$src, addr:$dst)], d>,
                   Sched<[WriteFStore]>;

  def : InstAlias<"v"#OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  (!cast<Instruction>("V"#NAME#"rr_REV")
                   VR128:$dst, VR128:$src1, VR128:$src2), 0>;
  def : InstAlias<OpcodeStr#".s\t{$src2, $dst|$dst, $src2}",
                  (!cast<Instruction>(NAME#"rr_REV")
                   VR128:$dst, VR128:$src2), 0>;
}

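// Note on the moves above and below: a register-to-register movss/movsd only
// replaces the low element and merges it into the destination's existing
// upper bits (the partial-register-update concern mentioned above), whereas
// the memory-operand loads defined next zero the upper bits of the XMM
// register.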
// Loading from memory automatically zeroing upper bits.
multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
                         PatFrag mem_pat, PatFrag vzloadfrag, string OpcodeStr,
                         Domain d> {
  def V#NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
                     VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
  def NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
                   Sched<[WriteFLoad]>;

  // _alt version uses FR32/FR64 register class.
  let isCodeGenOnly = 1 in {
    def V#NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                           !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                           [(set RC:$dst, (mem_pat addr:$src))], d>,
                           VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
    def NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                         [(set RC:$dst, (mem_pat addr:$src))], d>,
                         Sched<[WriteFLoad]>;
  }
}

defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
                        SSEPackedSingle, "MOVSS", UseSSE1>, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
                        SSEPackedDouble, "MOVSD", UseSSE2>, XD;

let canFoldAsLoad = 1, isReMaterializable = 1 in {
  defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss",
                             SSEPackedSingle>, XS;
  defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, X86vzload64, "movsd",
                             SSEPackedDouble>, XD;
}

// Patterns
let Predicates = [UseAVX] in {
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (VMOVSSrm addr:$src)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (VMOVSDrm addr:$src)>;

  // Represent the same patterns above but in the form they appear for
  // 256-bit types
  def : Pat<(v8f32 (X86vzload32 addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
  def : Pat<(v4f64 (X86vzload64 addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
}

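// Informal note on the SUBREG_TO_REG patterns above (not from the original
// comments): a VEX-encoded 128-bit load such as VMOVSSrm already zeroes the
// upper 128 bits of the containing YMM register, so widening the result with
// SUBREG_TO_REG at index sub_xmm is sufficient and no extra zeroing
// instruction is emitted for the 256-bit X86vzload patterns.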
let Predicates = [UseAVX, OptForSize] in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVSS to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VMOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VMOVSSrr (v4i32 (V_SET0)), VR128:$src)>;

  // Move low f32 and clear high bits.
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4f32 (VMOVSSrr (v4f32 (V_SET0)),
                    (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))), sub_xmm)>;
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4i32 (VMOVSSrr (v4i32 (V_SET0)),
                    (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
}

let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
// Move scalar to XMM zero-extended, zeroing a VR128 then do a
// MOVSS to the lower bits.
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
          (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
          (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
}

let Predicates = [UseSSE2] in
def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
          (MOVSDrm addr:$src)>;

let Predicates = [UseSSE1] in
def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
          (MOVSSrm addr:$src)>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
                            X86MemOperand x86memop, PatFrag ld_frag,
                            string asm, Domain d,
                            X86SchedWriteMoveLS sched> {
let hasSideEffects = 0, isMoveReg = 1 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
              Sched<[sched.RR]>;
let canFoldAsLoad = 1, isReMaterializable = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set RC:$dst, (ld_frag addr:$src))], d>,
              Sched<[sched.RM]>;
}

let Predicates = [HasAVX, NoVLX] in {
defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                                PS, VEX, VEX_WIG;
defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                                PD, VEX, VEX_WIG;
defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                                PS, VEX, VEX_WIG;
defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                                PD, VEX, VEX_WIG;

defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps",
                                 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
                                 PS, VEX, VEX_L, VEX_WIG;
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd",
                                 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
                                 PD, VEX, VEX_L, VEX_WIG;
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups",
                                 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
                                 PS, VEX, VEX_L, VEX_WIG;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
                                 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
                                 PD, VEX, VEX_L, VEX_WIG;
}

let Predicates = [UseSSE1] in {
defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
                               SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                               PS;
defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
                               SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                               PS;
}
let Predicates = [UseSSE2] in {
defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
                               SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                               PD;
defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
                               SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                               PD;
}

let Predicates = [HasAVX, NoVLX] in {
let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movaps\t{$src, $dst|$dst, $src}",
                     [(alignedstore (v4f32 VR128:$src), addr:$dst)]>,
                     VEX, VEX_WIG;
def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movapd\t{$src, $dst|$dst, $src}", 388 [(alignedstore (v2f64 VR128:$src), addr:$dst)]>, 389 VEX, VEX_WIG; 390def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 391 "movups\t{$src, $dst|$dst, $src}", 392 [(store (v4f32 VR128:$src), addr:$dst)]>, 393 VEX, VEX_WIG; 394def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 395 "movupd\t{$src, $dst|$dst, $src}", 396 [(store (v2f64 VR128:$src), addr:$dst)]>, 397 VEX, VEX_WIG; 398} // SchedRW 399 400let SchedRW = [SchedWriteFMoveLS.YMM.MR] in { 401def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), 402 "movaps\t{$src, $dst|$dst, $src}", 403 [(alignedstore (v8f32 VR256:$src), addr:$dst)]>, 404 VEX, VEX_L, VEX_WIG; 405def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), 406 "movapd\t{$src, $dst|$dst, $src}", 407 [(alignedstore (v4f64 VR256:$src), addr:$dst)]>, 408 VEX, VEX_L, VEX_WIG; 409def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), 410 "movups\t{$src, $dst|$dst, $src}", 411 [(store (v8f32 VR256:$src), addr:$dst)]>, 412 VEX, VEX_L, VEX_WIG; 413def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), 414 "movupd\t{$src, $dst|$dst, $src}", 415 [(store (v4f64 VR256:$src), addr:$dst)]>, 416 VEX, VEX_L, VEX_WIG; 417} // SchedRW 418} // Predicate 419 420// For disassembler 421let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, 422 isMoveReg = 1 in { 423let SchedRW = [SchedWriteFMoveLS.XMM.RR] in { 424 def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst), 425 (ins VR128:$src), 426 "movaps\t{$src, $dst|$dst, $src}", []>, 427 VEX, VEX_WIG, FoldGenData<"VMOVAPSrr">; 428 def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst), 429 (ins VR128:$src), 430 "movapd\t{$src, $dst|$dst, $src}", []>, 431 VEX, VEX_WIG, FoldGenData<"VMOVAPDrr">; 432 def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst), 433 (ins VR128:$src), 434 "movups\t{$src, $dst|$dst, $src}", []>, 435 VEX, VEX_WIG, FoldGenData<"VMOVUPSrr">; 436 def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst), 437 (ins VR128:$src), 438 "movupd\t{$src, $dst|$dst, $src}", []>, 439 VEX, VEX_WIG, FoldGenData<"VMOVUPDrr">; 440} // SchedRW 441 442let SchedRW = [SchedWriteFMoveLS.YMM.RR] in { 443 def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst), 444 (ins VR256:$src), 445 "movaps\t{$src, $dst|$dst, $src}", []>, 446 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPSYrr">; 447 def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst), 448 (ins VR256:$src), 449 "movapd\t{$src, $dst|$dst, $src}", []>, 450 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPDYrr">; 451 def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst), 452 (ins VR256:$src), 453 "movups\t{$src, $dst|$dst, $src}", []>, 454 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPSYrr">; 455 def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst), 456 (ins VR256:$src), 457 "movupd\t{$src, $dst|$dst, $src}", []>, 458 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPDYrr">; 459} // SchedRW 460} // Predicate 461 462// Reversed version with ".s" suffix for GAS compatibility. 
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                (VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                (VMOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                (VMOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                (VMOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                (VMOVAPSYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                (VMOVAPDYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                (VMOVUPSYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                (VMOVUPDYrr_REV VR256:$dst, VR256:$src), 0>;

let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)]>;
def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)]>;
} // SchedRW

// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movaps\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVAPSrr">;
  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movapd\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVAPDrr">;
  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movups\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVUPSrr">;
  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movupd\t{$src, $dst|$dst, $src}", []>,
                         FoldGenData<"MOVUPDrr">;
}

// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"movaps.s\t{$src, $dst|$dst, $src}",
                (MOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movapd.s\t{$src, $dst|$dst, $src}",
                (MOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movups.s\t{$src, $dst|$dst, $src}",
                (MOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movupd.s\t{$src, $dst|$dst, $src}",
                (MOVUPDrr_REV VR128:$dst, VR128:$src), 0>;

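// Informal note on the ".s" aliases above: a register-to-register move has two
// valid encodings (the load-form opcode with the destination in ModRM.reg, or
// the store-form opcode with the destination in ModRM.r/m). Matching GAS
// behaviour, the ".s" mnemonic suffix requests the alternate encoding, which
// is what the *rr_REV definitions provide.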
let Predicates = [HasAVX, NoVLX] in {
  // 256-bit loads/stores need to use floating-point loads/stores in case we
  // don't have AVX2. Execution domain fixing will convert to integer if AVX2
  // is available and changing the domain is beneficial.
  def : Pat<(alignedloadv4i64 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv8i32 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv16i16 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv32i8 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(loadv4i64 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv8i32 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv16i16 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv32i8 addr:$src),
            (VMOVUPSYrm addr:$src)>;

  def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v16i16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v32i8 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
}

// Use movaps / movups for SSE integer load / store (one byte shorter).
// The instructions selected below are then converted to MOVDQA/MOVDQU
// during the SSE domain pass.
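// (movaps/movups encode with no mandatory prefix byte, while movdqa (66 0F 6F)
// and movdqu (F3 0F 6F) each need one; that is where the one-byte saving
// comes from.)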
let Predicates = [UseSSE1] in {
  def : Pat<(alignedloadv2i64 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv4i32 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv8i16 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv16i8 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv4i32 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv8i16 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv16i8 addr:$src),
            (MOVUPSrm addr:$src)>;

  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low packed FP Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_mov_hilo_packed_base<bits<8> opc, SDNode pdnode,
                                      string base_opc, string asm_opr> {
  // No pattern as they need to be special cased between high and low.
  let hasSideEffects = 0, mayLoad = 1 in
  def PSrm : PI<opc, MRMSrcMem,
                (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                !strconcat(base_opc, "s", asm_opr),
                [], SSEPackedSingle>, PS,
                Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;

  def PDrm : PI<opc, MRMSrcMem,
                (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                !strconcat(base_opc, "d", asm_opr),
                [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
                                  (scalar_to_vector (loadf64 addr:$src2)))))],
                SSEPackedDouble>, PD,
                Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
}

multiclass sse12_mov_hilo_packed<bits<8> opc, SDPatternOperator pdnode,
                                 string base_opc> {
  let Predicates = [UseAVX] in
  defm V#NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
                                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
                                           VEX_4V, VEX_WIG;

  let Constraints = "$src1 = $dst" in
  defm NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
                                         "\t{$src2, $dst|$dst, $src2}">;
}

defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;

let SchedRW = [WriteFStore] in {
let Predicates = [UseAVX] in {
let mayStore = 1, hasSideEffects = 0 in
def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movlps\t{$src, $dst|$dst, $src}",
                     []>,
                     VEX, VEX_WIG;
def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movlpd\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt (v2f64 VR128:$src),
                                   (iPTR 0))), addr:$dst)]>,
                     VEX, VEX_WIG;
} // UseAVX
let mayStore = 1, hasSideEffects = 0 in
def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlps\t{$src, $dst|$dst, $src}",
                   []>;
def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt (v2f64 VR128:$src),
                                 (iPTR 0))), addr:$dst)]>;
} // SchedRW

let Predicates = [UseSSE1] in {
  // This pattern helps select MOVLPS on SSE1 only targets. With SSE2 we'll
  // end up with a movsd or blend instead of shufp.
  // No need for aligned load, we're only loading 64-bits.
  def : Pat<(X86Shufp (v4f32 (nonvolatile_load addr:$src2)), VR128:$src1,
                      (i8 -28)),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)), VR128:$src1, (i8 -28)),
            (MOVLPSrm VR128:$src1, addr:$src2)>;

  def : Pat<(v4f32 (X86vzload64 addr:$src)),
            (MOVLPSrm (v4f32 (V_SET0)), addr:$src)>;
  def : Pat<(X86vextractstore64 (v4f32 VR128:$src), addr:$dst),
            (MOVLPSmr addr:$dst, VR128:$src)>;
}
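// Decoding the shuffle immediate in the X86Shufp patterns above (informal
// note): -28 as an i8 is 0xE4, binary 11'10'01'00, so result lanes 0-1 come
// from the loaded vector and lanes 2-3 from $src1, which is exactly the
// "replace the low 64 bits from memory" behaviour of MOVLPS.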

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Hi packed FP Instructions
//===----------------------------------------------------------------------===//

defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">;

let SchedRW = [WriteFStore] in {
// v2f64 extract element 1 is always custom lowered to unpack high to low
// and extract element 0 so the non-store version isn't too horrible.
let Predicates = [UseAVX] in {
let mayStore = 1, hasSideEffects = 0 in
def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movhps\t{$src, $dst|$dst, $src}",
                     []>, VEX, VEX_WIG;
def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movhpd\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt
                                   (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                   (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG;
} // UseAVX
let mayStore = 1, hasSideEffects = 0 in
def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhps\t{$src, $dst|$dst, $src}",
                   []>;
def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt
                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                 (iPTR 0))), addr:$dst)]>;
} // SchedRW

let Predicates = [UseAVX] in {
  // Also handle an i64 load because that may get selected as a faster way to
  // load the data.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                    (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;

  def : Pat<(store (f64 (extractelt
                         (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
                         (iPTR 0))), addr:$dst),
            (VMOVHPDmr addr:$dst, VR128:$src)>;

  // MOVLPD patterns
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;
}

let Predicates = [UseSSE1] in {
  // This pattern helps select MOVHPS on SSE1 only targets. With SSE2 we'll
  // end up with a movsd or blend instead of shufp.
  // No need for aligned load, we're only loading 64-bits.
  def : Pat<(X86Movlhps VR128:$src1, (v4f32 (nonvolatile_load addr:$src2))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Movlhps VR128:$src1, (v4f32 (X86vzload64 addr:$src2))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;

  def : Pat<(X86vextractstore64 (v4f32 (X86Movhlps VR128:$src, VR128:$src)),
                                addr:$dst),
            (MOVHPSmr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE2] in {
  // MOVHPD patterns

  // Also handle an i64 load because that may get selected as a faster way to
  // load the data.
  def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                    (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;
  def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;

  def : Pat<(store (f64 (extractelt
                         (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
                         (iPTR 0))), addr:$dst),
            (MOVHPDmr addr:$dst, VR128:$src)>;

  // MOVLPD patterns
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
}

let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in {
  // Use MOVLPD to load into the low bits from a full vector unless we can use
  // BLENDPD.
  def : Pat<(X86Movsd VR128:$src1, (v2f64 (nonvolatile_load addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
//===----------------------------------------------------------------------===//

let Predicates = [UseAVX] in {
  def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
                        (ins VR128:$src1, VR128:$src2),
                        "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                          (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
                        VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG;
  let isCommutable = 1 in
  def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
                        (ins VR128:$src1, VR128:$src2),
                        "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                          (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
                        VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG,
                        NotMemoryFoldable;
}
let Constraints = "$src1 = $dst" in {
  def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
                      Sched<[SchedWriteFShuffle.XMM]>;
  let isCommutable = 1 in
  def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
                      Sched<[SchedWriteFShuffle.XMM]>, NotMemoryFoldable;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Conversion Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                       SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
                       string asm, string mem, X86FoldableSchedWrite sched,
                       SchedRead Int2Fpu = ReadDefault> {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
              !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
              [(set DstRC:$dst, (OpNode SrcRC:$src))]>,
              Sched<[sched, Int2Fpu]>;
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
              mem#"\t{$src, $dst|$dst, $src}",
              [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
              Sched<[sched.Folded]>;
}

multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
                       ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
                       string asm, Domain d, X86FoldableSchedWrite sched> {
let hasSideEffects = 0 in {
  def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
             [(set RC:$dst, (DstTy (sint_to_fp (SrcTy RC:$src))))], d>,
             Sched<[sched]>;
  let mayLoad = 1 in
  def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
             [(set RC:$dst, (DstTy (sint_to_fp
                             (SrcTy (ld_frag addr:$src)))))], d>,
             Sched<[sched.Folded]>;
}
}

multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          X86MemOperand x86memop, string asm, string mem,
                          X86FoldableSchedWrite sched> {
let hasSideEffects = 0, Predicates = [UseAVX] in {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
              Sched<[sched, ReadDefault, ReadInt2Fpu]>;
  let mayLoad = 1 in
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
              (ins DstRC:$src1, x86memop:$src),
              asm#"{"#mem#"}\t{$src, $src1, $dst|$dst, $src1, $src}", []>,
              Sched<[sched.Folded, sched.ReadAfterFold]>;
} // hasSideEffects = 0
}

let isCodeGenOnly = 1, Predicates = [UseAVX] in {
defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
                              "cvttss2si", "cvttss2si",
                              WriteCvtSS2I>,
                              XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
                                "cvttss2si", "cvttss2si",
                                WriteCvtSS2I>,
                                XS, VEX, VEX_W, VEX_LIG;
defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
                              "cvttsd2si", "cvttsd2si",
                              WriteCvtSD2I>,
                              XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
                                "cvttsd2si", "cvttsd2si",
                                WriteCvtSD2I>,
                                XD, VEX, VEX_W, VEX_LIG;
}

// The assembler can recognize rr 64-bit instructions by seeing a rxx
// register, but the same isn't true when only using memory operands. Provide
// the "l" and "q" assembly forms to address this explicitly where it is
// appropriate to do so.
let isCodeGenOnly = 1 in {
defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss", "l",
                                WriteCvtI2SS>, XS, VEX_4V, VEX_LIG;
defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss", "q",
                                  WriteCvtI2SS>, XS, VEX_4V, VEX_W, VEX_LIG;
defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd", "l",
                                WriteCvtI2SD>, XD, VEX_4V, VEX_LIG;
defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd", "q",
                                  WriteCvtI2SD>, XD, VEX_4V, VEX_W, VEX_LIG;
} // isCodeGenOnly = 1

let Predicates = [UseAVX] in {
  def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
            (VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
            (VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;

  def : Pat<(f32 (sint_to_fp GR32:$src)),
            (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f32 (sint_to_fp GR64:$src)),
            (VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
  def : Pat<(f64 (sint_to_fp GR32:$src)),
            (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f64 (sint_to_fp GR64:$src)),
            (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
}

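// Informal example of why the {l}/{q} mnemonic suffixes are needed:
// "cvtsi2ss %rax, %xmm0" reveals the 64-bit source width through the register
// operand, but with only a memory operand the width is ambiguous, so the
// suffixed forms "cvtsi2ssl (%rax), %xmm0" and "cvtsi2ssq (%rax), %xmm0"
// spell it out explicitly.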
let isCodeGenOnly = 1 in {
defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
                             "cvttss2si", "cvttss2si",
                             WriteCvtSS2I>, XS;
defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
                               "cvttss2si", "cvttss2si",
                               WriteCvtSS2I>, XS, REX_W;
defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
                             "cvttsd2si", "cvttsd2si",
                             WriteCvtSD2I>, XD;
defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
                               "cvttsd2si", "cvttsd2si",
                               WriteCvtSD2I>, XD, REX_W;
defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
                            "cvtsi2ss", "cvtsi2ss{l}",
                            WriteCvtI2SS, ReadInt2Fpu>, XS;
defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
                              "cvtsi2ss", "cvtsi2ss{q}",
                              WriteCvtI2SS, ReadInt2Fpu>, XS, REX_W;
defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
                            "cvtsi2sd", "cvtsi2sd{l}",
                            WriteCvtI2SD, ReadInt2Fpu>, XD;
defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
                              "cvtsi2sd", "cvtsi2sd{q}",
                              WriteCvtI2SD, ReadInt2Fpu>, XD, REX_W;
} // isCodeGenOnly = 1

// Conversion Instructions Intrinsics - Match intrinsics which expect MM
// and/or XMM operand(s).

multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          ValueType DstVT, ValueType SrcVT, SDNode OpNode,
                          Operand memop, ComplexPattern mem_cpat, string asm,
                          X86FoldableSchedWrite sched> {
  def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
                  !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                  [(set DstRC:$dst, (DstVT (OpNode (SrcVT SrcRC:$src))))]>,
                  Sched<[sched]>;
  def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
                  !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                  [(set DstRC:$dst, (DstVT (OpNode (SrcVT mem_cpat:$src))))]>,
                  Sched<[sched.Folded]>;
}

multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
                                RegisterClass DstRC, X86MemOperand x86memop,
                                string asm, string mem, X86FoldableSchedWrite sched,
                                bit Is2Addr = 1> {
let hasSideEffects = 0 in {
  def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
                  !if(Is2Addr,
                      !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                      !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                  []>, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
  let mayLoad = 1 in
  def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst),
                  (ins DstRC:$src1, x86memop:$src2),
                  !if(Is2Addr,
                      asm#"{"#mem#"}\t{$src2, $dst|$dst, $src2}",
                      asm#"{"#mem#"}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}

let Predicates = [UseAVX] in {
defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64,
                                X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
                                WriteCvtSD2I>, XD, VEX, VEX_LIG;
defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64,
                                  X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
                                  WriteCvtSD2I>, XD, VEX, VEX_W, VEX_LIG;
}
defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, X86cvts2si,
                               sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD;
defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si,
                                 sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD, REX_W;


let Predicates = [UseAVX] in {
defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                   i32mem, "cvtsi2ss", "l", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_LIG;
defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                   i64mem, "cvtsi2ss", "q", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_LIG, VEX_W;
defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                   i32mem, "cvtsi2sd", "l", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_LIG;
defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                   i64mem, "cvtsi2sd", "q", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_LIG, VEX_W;
}
let Constraints = "$src1 = $dst" in {
  defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                    i32mem, "cvtsi2ss", "l", WriteCvtI2SS>, XS;
  defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                    i64mem, "cvtsi2ss", "q", WriteCvtI2SS>, XS, REX_W;
  defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                    i32mem, "cvtsi2sd", "l", WriteCvtI2SD>, XD;
  defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                    i64mem, "cvtsi2sd", "q", WriteCvtI2SD>, XD, REX_W;
}

def : InstAlias<"vcvtsi2ss{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
$src1, $src2}", 1006 (VCVTSI2SSrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">; 1007def : InstAlias<"vcvtsi2ss{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", 1008 (VCVTSI642SSrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">; 1009def : InstAlias<"vcvtsi2sd{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", 1010 (VCVTSI2SDrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">; 1011def : InstAlias<"vcvtsi2sd{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", 1012 (VCVTSI642SDrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">; 1013 1014def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}", 1015 (VCVTSI2SSrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">; 1016def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}", 1017 (VCVTSI2SDrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">; 1018 1019def : InstAlias<"cvtsi2ss{l}\t{$src, $dst|$dst, $src}", 1020 (CVTSI2SSrr_Int VR128:$dst, GR32:$src), 0, "att">; 1021def : InstAlias<"cvtsi2ss{q}\t{$src, $dst|$dst, $src}", 1022 (CVTSI642SSrr_Int VR128:$dst, GR64:$src), 0, "att">; 1023def : InstAlias<"cvtsi2sd{l}\t{$src, $dst|$dst, $src}", 1024 (CVTSI2SDrr_Int VR128:$dst, GR32:$src), 0, "att">; 1025def : InstAlias<"cvtsi2sd{q}\t{$src, $dst|$dst, $src}", 1026 (CVTSI642SDrr_Int VR128:$dst, GR64:$src), 0, "att">; 1027 1028def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}", 1029 (CVTSI2SSrm_Int VR128:$dst, i32mem:$src), 0, "att">; 1030def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}", 1031 (CVTSI2SDrm_Int VR128:$dst, i32mem:$src), 0, "att">; 1032 1033/// SSE 1 Only 1034 1035// Aliases for intrinsics 1036let Predicates = [UseAVX] in { 1037defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int, 1038 ssmem, sse_load_f32, "cvttss2si", 1039 WriteCvtSS2I>, XS, VEX, VEX_LIG; 1040defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32, 1041 X86cvtts2Int, ssmem, sse_load_f32, 1042 "cvttss2si", WriteCvtSS2I>, 1043 XS, VEX, VEX_LIG, VEX_W; 1044defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int, 1045 sdmem, sse_load_f64, "cvttsd2si", 1046 WriteCvtSS2I>, XD, VEX, VEX_LIG; 1047defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64, 1048 X86cvtts2Int, sdmem, sse_load_f64, 1049 "cvttsd2si", WriteCvtSS2I>, 1050 XD, VEX, VEX_LIG, VEX_W; 1051} 1052defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int, 1053 ssmem, sse_load_f32, "cvttss2si", 1054 WriteCvtSS2I>, XS; 1055defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32, 1056 X86cvtts2Int, ssmem, sse_load_f32, 1057 "cvttss2si", WriteCvtSS2I>, XS, REX_W; 1058defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int, 1059 sdmem, sse_load_f64, "cvttsd2si", 1060 WriteCvtSD2I>, XD; 1061defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64, 1062 X86cvtts2Int, sdmem, sse_load_f64, 1063 "cvttsd2si", WriteCvtSD2I>, XD, REX_W; 1064 1065def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}", 1066 (VCVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">; 1067def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}", 1068 (VCVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">; 1069def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}", 1070 (VCVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">; 1071def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}", 1072 (VCVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">; 1073def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}", 1074 (VCVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">; 1075def : InstAlias<"vcvttss2si{q}\t{$src, 
$dst|$dst, $src}", 1076 (VCVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">; 1077def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}", 1078 (VCVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">; 1079def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}", 1080 (VCVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">; 1081 1082def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}", 1083 (CVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">; 1084def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}", 1085 (CVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">; 1086def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}", 1087 (CVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">; 1088def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}", 1089 (CVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">; 1090def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}", 1091 (CVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">; 1092def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}", 1093 (CVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">; 1094def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}", 1095 (CVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">; 1096def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}", 1097 (CVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">; 1098 1099let Predicates = [UseAVX] in { 1100defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si, 1101 ssmem, sse_load_f32, "cvtss2si", 1102 WriteCvtSS2I>, XS, VEX, VEX_LIG; 1103defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si, 1104 ssmem, sse_load_f32, "cvtss2si", 1105 WriteCvtSS2I>, XS, VEX, VEX_W, VEX_LIG; 1106} 1107defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si, 1108 ssmem, sse_load_f32, "cvtss2si", 1109 WriteCvtSS2I>, XS; 1110defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si, 1111 ssmem, sse_load_f32, "cvtss2si", 1112 WriteCvtSS2I>, XS, REX_W; 1113 1114defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load, 1115 "vcvtdq2ps\t{$src, $dst|$dst, $src}", 1116 SSEPackedSingle, WriteCvtI2PS>, 1117 PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG; 1118defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load, 1119 "vcvtdq2ps\t{$src, $dst|$dst, $src}", 1120 SSEPackedSingle, WriteCvtI2PSY>, 1121 PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG; 1122 1123defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop, 1124 "cvtdq2ps\t{$src, $dst|$dst, $src}", 1125 SSEPackedSingle, WriteCvtI2PS>, 1126 PS, Requires<[UseSSE2]>; 1127 1128// AVX aliases 1129def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}", 1130 (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">; 1131def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}", 1132 (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">; 1133def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}", 1134 (VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">; 1135def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}", 1136 (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">; 1137def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}", 1138 (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">; 1139def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}", 1140 (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">; 1141def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}", 1142 (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">; 1143def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}", 1144 (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">; 1145 1146// SSE aliases 
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;

/// SSE 2 Only

// Convert scalar double to scalar single
let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX] in {
def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
                       (ins FR32:$src1, FR64:$src2),
                       "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                       VEX_4V, VEX_LIG, VEX_WIG,
                       Sched<[WriteCvtSD2SS]>;
let mayLoad = 1 in
def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
                    (ins FR32:$src1, f64mem:$src2),
                    "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                    XD, VEX_4V, VEX_LIG, VEX_WIG,
                    Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
}

def : Pat<(f32 (fpround FR64:$src)),
          (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
          Requires<[UseAVX]>;

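// Informal note on the pattern above: the VEX form of cvtsd2ss carries an
// extra pass-through source operand. For a plain scalar fpround its value
// does not matter, so IMPLICIT_DEF is supplied for it rather than tying up a
// real register.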
let isCodeGenOnly = 1 in {
def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
                     "cvtsd2ss\t{$src, $dst|$dst, $src}",
                     [(set FR32:$dst, (fpround FR64:$src))]>,
                     Sched<[WriteCvtSD2SS]>;
def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
                   "cvtsd2ss\t{$src, $dst|$dst, $src}",
                   [(set FR32:$dst, (fpround (loadf64 addr:$src)))]>,
                   XD, Requires<[UseSSE2, OptForSize]>,
                   Sched<[WriteCvtSD2SS.Folded]>;
}

def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
                       XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
                       Sched<[WriteCvtSD2SS]>;
def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (v4f32 (X86frounds VR128:$src1, sse_load_f64:$src2)))]>,
                       XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
                       Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
let Constraints = "$src1 = $dst" in {
def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                      "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
                      XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;
def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                      "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86frounds VR128:$src1, sse_load_f64:$src2)))]>,
                      XD, Requires<[UseSSE2]>,
                      Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
}

// Convert scalar single to scalar double
// SSE2 instructions with XS prefix
let isCodeGenOnly = 1, hasSideEffects = 0 in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
                    (ins FR64:$src1, FR32:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                    XS, VEX_4V, VEX_LIG, VEX_WIG,
                    Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>;
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
                    (ins FR64:$src1, f32mem:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                    XS, VEX_4V, VEX_LIG, VEX_WIG,
                    Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>,
                    Requires<[UseAVX, OptForSize]>;
} // isCodeGenOnly = 1, hasSideEffects = 0

def : Pat<(f64 (fpextend FR32:$src)),
          (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
def : Pat<(fpextend (loadf32 addr:$src)),
          (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;

let isCodeGenOnly = 1 in {
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (fpextend FR32:$src))]>,
                   XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (fpextend (loadf32 addr:$src)))]>,
                   XS, Requires<[UseSSE2, OptForSize]>,
                   Sched<[WriteCvtSS2SD.Folded]>;
} // isCodeGenOnly = 1

let hasSideEffects = 0 in {
def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       []>, XS, VEX_4V, VEX_LIG, VEX_WIG,
                       Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;
let mayLoad = 1 in
def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                       "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       []>, XS, VEX_4V, VEX_LIG, VEX_WIG, Requires<[HasAVX]>,
                       Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                      "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                      []>, XS, Requires<[UseSSE2]>,
                      Sched<[WriteCvtSS2SD]>;
let mayLoad = 1 in
def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                      "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                      []>, XS, Requires<[UseSSE2]>,
                      Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
}
} // hasSideEffects = 0

// Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and
// (v)cvtss2sd intrinsic sequences from clang which produce unnecessary
// vmovs{s,d} instructions
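// Informal illustration (the exact clang expansion may differ by version):
// clang's xmmintrin.h writes _mm_cvtsi32_ss(a, i) roughly as "a[0] = i", i.e.
// a sitofp followed by an insert into element 0. Without the patterns below,
// that would select a cvtsi2ss into a fresh register plus a movss blend; the
// patterns fold the whole sequence into a single (V)CVTSI2SS*_Int instruction.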
(X86Movss 1310 (v4f32 VR128:$dst), 1311 (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))), 1312 (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>; 1313 1314def : Pat<(v4f32 (X86Movss 1315 (v4f32 VR128:$dst), 1316 (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), 1317 (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>; 1318 1319def : Pat<(v4f32 (X86Movss 1320 (v4f32 VR128:$dst), 1321 (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))), 1322 (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>; 1323 1324def : Pat<(v2f64 (X86Movsd 1325 (v2f64 VR128:$dst), 1326 (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), 1327 (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>; 1328 1329def : Pat<(v2f64 (X86Movsd 1330 (v2f64 VR128:$dst), 1331 (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))), 1332 (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>; 1333 1334def : Pat<(v2f64 (X86Movsd 1335 (v2f64 VR128:$dst), 1336 (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), 1337 (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>; 1338 1339def : Pat<(v2f64 (X86Movsd 1340 (v2f64 VR128:$dst), 1341 (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))), 1342 (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>; 1343} // Predicates = [UseAVX] 1344 1345let Predicates = [UseSSE2] in { 1346def : Pat<(v4f32 (X86Movss 1347 (v4f32 VR128:$dst), 1348 (v4f32 (scalar_to_vector 1349 (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))), 1350 (CVTSD2SSrr_Int VR128:$dst, VR128:$src)>; 1351 1352def : Pat<(v2f64 (X86Movsd 1353 (v2f64 VR128:$dst), 1354 (v2f64 (scalar_to_vector 1355 (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))), 1356 (CVTSS2SDrr_Int VR128:$dst, VR128:$src)>; 1357 1358def : Pat<(v2f64 (X86Movsd 1359 (v2f64 VR128:$dst), 1360 (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), 1361 (CVTSI642SDrr_Int VR128:$dst, GR64:$src)>; 1362 1363def : Pat<(v2f64 (X86Movsd 1364 (v2f64 VR128:$dst), 1365 (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))), 1366 (CVTSI642SDrm_Int VR128:$dst, addr:$src)>; 1367 1368def : Pat<(v2f64 (X86Movsd 1369 (v2f64 VR128:$dst), 1370 (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), 1371 (CVTSI2SDrr_Int VR128:$dst, GR32:$src)>; 1372 1373def : Pat<(v2f64 (X86Movsd 1374 (v2f64 VR128:$dst), 1375 (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))), 1376 (CVTSI2SDrm_Int VR128:$dst, addr:$src)>; 1377} // Predicates = [UseSSE2] 1378 1379let Predicates = [UseSSE1] in { 1380def : Pat<(v4f32 (X86Movss 1381 (v4f32 VR128:$dst), 1382 (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), 1383 (CVTSI642SSrr_Int VR128:$dst, GR64:$src)>; 1384 1385def : Pat<(v4f32 (X86Movss 1386 (v4f32 VR128:$dst), 1387 (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))), 1388 (CVTSI642SSrm_Int VR128:$dst, addr:$src)>; 1389 1390def : Pat<(v4f32 (X86Movss 1391 (v4f32 VR128:$dst), 1392 (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), 1393 (CVTSI2SSrr_Int VR128:$dst, GR32:$src)>; 1394 1395def : Pat<(v4f32 (X86Movss 1396 (v4f32 VR128:$dst), 1397 (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))), 1398 (CVTSI2SSrm_Int VR128:$dst, addr:$src)>; 1399} // Predicates = [UseSSE1] 1400 1401let Predicates = [HasAVX, NoVLX] in { 1402// Convert packed single/double fp to doubleword 1403def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1404 "cvtps2dq\t{$src, $dst|$dst, $src}", 1405 [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>, 1406 VEX, Sched<[WriteCvtPS2I]>, VEX_WIG; 
1407def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1408 "cvtps2dq\t{$src, $dst|$dst, $src}", 1409 [(set VR128:$dst, 1410 (v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>, 1411 VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG; 1412def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 1413 "cvtps2dq\t{$src, $dst|$dst, $src}", 1414 [(set VR256:$dst, 1415 (v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>, 1416 VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG; 1417def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 1418 "cvtps2dq\t{$src, $dst|$dst, $src}", 1419 [(set VR256:$dst, 1420 (v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>, 1421 VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, VEX_WIG; 1422} 1423def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1424 "cvtps2dq\t{$src, $dst|$dst, $src}", 1425 [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>, 1426 Sched<[WriteCvtPS2I]>; 1427def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1428 "cvtps2dq\t{$src, $dst|$dst, $src}", 1429 [(set VR128:$dst, 1430 (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>, 1431 Sched<[WriteCvtPS2ILd]>; 1432 1433 1434// Convert Packed Double FP to Packed DW Integers 1435let Predicates = [HasAVX, NoVLX] in { 1436// The assembler can recognize rr 256-bit instructions by seeing a ymm 1437// register, but the same isn't true when using memory operands instead. 1438// Provide other assembly rr and rm forms to address this explicitly. 1439def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1440 "vcvtpd2dq\t{$src, $dst|$dst, $src}", 1441 [(set VR128:$dst, 1442 (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>, 1443 VEX, Sched<[WriteCvtPD2I]>, VEX_WIG; 1444 1445// XMM only 1446def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1447 "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}", 1448 [(set VR128:$dst, 1449 (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX, 1450 Sched<[WriteCvtPD2ILd]>, VEX_WIG; 1451 1452// YMM only 1453def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), 1454 "vcvtpd2dq\t{$src, $dst|$dst, $src}", 1455 [(set VR128:$dst, 1456 (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>, 1457 VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG; 1458def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), 1459 "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}", 1460 [(set VR128:$dst, 1461 (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>, 1462 VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG; 1463} 1464 1465def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}", 1466 (VCVTPD2DQrr VR128:$dst, VR128:$src), 0, "att">; 1467def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}", 1468 (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">; 1469 1470def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1471 "cvtpd2dq\t{$src, $dst|$dst, $src}", 1472 [(set VR128:$dst, 1473 (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>, 1474 Sched<[WriteCvtPD2ILd]>; 1475def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1476 "cvtpd2dq\t{$src, $dst|$dst, $src}", 1477 [(set VR128:$dst, 1478 (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>, 1479 Sched<[WriteCvtPD2I]>; 1480 1481// Convert with truncation packed single/double fp to doubleword 1482// SSE2 packed instructions with XS prefix 1483let Predicates = [HasAVX, NoVLX] in { 1484def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1485 "cvttps2dq\t{$src, $dst|$dst, $src}", 1486 
[(set VR128:$dst, 1487 (v4i32 (X86cvttp2si (v4f32 VR128:$src))))]>, 1488 VEX, Sched<[WriteCvtPS2I]>, VEX_WIG; 1489def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1490 "cvttps2dq\t{$src, $dst|$dst, $src}", 1491 [(set VR128:$dst, 1492 (v4i32 (X86cvttp2si (loadv4f32 addr:$src))))]>, 1493 VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG; 1494def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 1495 "cvttps2dq\t{$src, $dst|$dst, $src}", 1496 [(set VR256:$dst, 1497 (v8i32 (X86cvttp2si (v8f32 VR256:$src))))]>, 1498 VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG; 1499def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 1500 "cvttps2dq\t{$src, $dst|$dst, $src}", 1501 [(set VR256:$dst, 1502 (v8i32 (X86cvttp2si (loadv8f32 addr:$src))))]>, 1503 VEX, VEX_L, 1504 Sched<[WriteCvtPS2IYLd]>, VEX_WIG; 1505} 1506 1507def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1508 "cvttps2dq\t{$src, $dst|$dst, $src}", 1509 [(set VR128:$dst, 1510 (v4i32 (X86cvttp2si (v4f32 VR128:$src))))]>, 1511 Sched<[WriteCvtPS2I]>; 1512def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1513 "cvttps2dq\t{$src, $dst|$dst, $src}", 1514 [(set VR128:$dst, 1515 (v4i32 (X86cvttp2si (memopv4f32 addr:$src))))]>, 1516 Sched<[WriteCvtPS2ILd]>; 1517 1518// The assembler can recognize rr 256-bit instructions by seeing a ymm 1519// register, but the same isn't true when using memory operands instead. 1520// Provide other assembly rr and rm forms to address this explicitly. 1521let Predicates = [HasAVX, NoVLX] in { 1522// XMM only 1523def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1524 "cvttpd2dq\t{$src, $dst|$dst, $src}", 1525 [(set VR128:$dst, 1526 (v4i32 (X86cvttp2si (v2f64 VR128:$src))))]>, 1527 VEX, Sched<[WriteCvtPD2I]>, VEX_WIG; 1528def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1529 "cvttpd2dq{x}\t{$src, $dst|$dst, $src}", 1530 [(set VR128:$dst, 1531 (v4i32 (X86cvttp2si (loadv2f64 addr:$src))))]>, 1532 VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG; 1533 1534// YMM only 1535def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), 1536 "cvttpd2dq\t{$src, $dst|$dst, $src}", 1537 [(set VR128:$dst, 1538 (v4i32 (X86cvttp2si (v4f64 VR256:$src))))]>, 1539 VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG; 1540def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), 1541 "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", 1542 [(set VR128:$dst, 1543 (v4i32 (X86cvttp2si (loadv4f64 addr:$src))))]>, 1544 VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG; 1545} // Predicates = [HasAVX, NoVLX] 1546 1547def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}", 1548 (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0, "att">; 1549def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}", 1550 (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">; 1551 1552let Predicates = [HasAVX, NoVLX] in { 1553 def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))), 1554 (VCVTTPD2DQYrr VR256:$src)>; 1555 def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))), 1556 (VCVTTPD2DQYrm addr:$src)>; 1557} 1558 1559def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1560 "cvttpd2dq\t{$src, $dst|$dst, $src}", 1561 [(set VR128:$dst, 1562 (v4i32 (X86cvttp2si (v2f64 VR128:$src))))]>, 1563 Sched<[WriteCvtPD2I]>; 1564def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src), 1565 "cvttpd2dq\t{$src, $dst|$dst, $src}", 1566 [(set VR128:$dst, 1567 (v4i32 
(X86cvttp2si (memopv2f64 addr:$src))))]>, 1568 Sched<[WriteCvtPD2ILd]>; 1569 1570// Convert packed single to packed double 1571let Predicates = [HasAVX, NoVLX] in { 1572 // SSE2 instructions without OpSize prefix 1573def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1574 "vcvtps2pd\t{$src, $dst|$dst, $src}", 1575 [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))]>, 1576 PS, VEX, Sched<[WriteCvtPS2PD]>, VEX_WIG; 1577def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), 1578 "vcvtps2pd\t{$src, $dst|$dst, $src}", 1579 [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>, 1580 PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, VEX_WIG; 1581def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), 1582 "vcvtps2pd\t{$src, $dst|$dst, $src}", 1583 [(set VR256:$dst, (v4f64 (fpextend (v4f32 VR128:$src))))]>, 1584 PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, VEX_WIG; 1585def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), 1586 "vcvtps2pd\t{$src, $dst|$dst, $src}", 1587 [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>, 1588 PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, VEX_WIG; 1589} 1590 1591let Predicates = [UseSSE2] in { 1592def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1593 "cvtps2pd\t{$src, $dst|$dst, $src}", 1594 [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))]>, 1595 PS, Sched<[WriteCvtPS2PD]>; 1596def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), 1597 "cvtps2pd\t{$src, $dst|$dst, $src}", 1598 [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>, 1599 PS, Sched<[WriteCvtPS2PD.Folded]>; 1600} 1601 1602// Convert Packed DW Integers to Packed Double FP 1603let Predicates = [HasAVX, NoVLX] in { 1604let hasSideEffects = 0, mayLoad = 1 in 1605def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 1606 "vcvtdq2pd\t{$src, $dst|$dst, $src}", 1607 [(set VR128:$dst, 1608 (v2f64 (X86VSintToFP 1609 (bc_v4i32 1610 (v2i64 (scalar_to_vector 1611 (loadi64 addr:$src)))))))]>, 1612 VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG; 1613def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1614 "vcvtdq2pd\t{$src, $dst|$dst, $src}", 1615 [(set VR128:$dst, 1616 (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>, 1617 VEX, Sched<[WriteCvtI2PD]>, VEX_WIG; 1618def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src), 1619 "vcvtdq2pd\t{$src, $dst|$dst, $src}", 1620 [(set VR256:$dst, 1621 (v4f64 (sint_to_fp (loadv4i32 addr:$src))))]>, 1622 VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>, 1623 VEX_WIG; 1624def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), 1625 "vcvtdq2pd\t{$src, $dst|$dst, $src}", 1626 [(set VR256:$dst, 1627 (v4f64 (sint_to_fp (v4i32 VR128:$src))))]>, 1628 VEX, VEX_L, Sched<[WriteCvtI2PDY]>, VEX_WIG; 1629} 1630 1631let hasSideEffects = 0, mayLoad = 1 in 1632def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 1633 "cvtdq2pd\t{$src, $dst|$dst, $src}", 1634 [(set VR128:$dst, 1635 (v2f64 (X86VSintToFP 1636 (bc_v4i32 1637 (v2i64 (scalar_to_vector 1638 (loadi64 addr:$src)))))))]>, 1639 Sched<[WriteCvtI2PDLd]>; 1640def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1641 "cvtdq2pd\t{$src, $dst|$dst, $src}", 1642 [(set VR128:$dst, 1643 (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>, 1644 Sched<[WriteCvtI2PD]>; 1645 1646// AVX register conversion intrinsics 1647let Predicates = [HasAVX, NoVLX] in { 1648 def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 
(X86vzload64 addr:$src))))), 1649 (VCVTDQ2PDrm addr:$src)>; 1650} // Predicates = [HasAVX, NoVLX] 1651 1652// SSE2 register conversion intrinsics 1653let Predicates = [UseSSE2] in { 1654 def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))), 1655 (CVTDQ2PDrm addr:$src)>; 1656} // Predicates = [UseSSE2] 1657 1658// Convert packed double to packed single 1659// The assembler can recognize rr 256-bit instructions by seeing a ymm 1660// register, but the same isn't true when using memory operands instead. 1661// Provide other assembly rr and rm forms to address this explicitly. 1662let Predicates = [HasAVX, NoVLX] in { 1663// XMM only 1664def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1665 "cvtpd2ps\t{$src, $dst|$dst, $src}", 1666 [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))]>, 1667 VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG; 1668def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1669 "cvtpd2ps{x}\t{$src, $dst|$dst, $src}", 1670 [(set VR128:$dst, (X86vfpround (loadv2f64 addr:$src)))]>, 1671 VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG; 1672 1673def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), 1674 "cvtpd2ps\t{$src, $dst|$dst, $src}", 1675 [(set VR128:$dst, (X86vfpround VR256:$src))]>, 1676 VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG; 1677def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), 1678 "cvtpd2ps{y}\t{$src, $dst|$dst, $src}", 1679 [(set VR128:$dst, (X86vfpround (loadv4f64 addr:$src)))]>, 1680 VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG; 1681} // Predicates = [HasAVX, NoVLX] 1682 1683def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}", 1684 (VCVTPD2PSrr VR128:$dst, VR128:$src), 0, "att">; 1685def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}", 1686 (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0, "att">; 1687 1688def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1689 "cvtpd2ps\t{$src, $dst|$dst, $src}", 1690 [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))]>, 1691 Sched<[WriteCvtPD2PS]>; 1692def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1693 "cvtpd2ps\t{$src, $dst|$dst, $src}", 1694 [(set VR128:$dst, (X86vfpround (memopv2f64 addr:$src)))]>, 1695 Sched<[WriteCvtPD2PS.Folded]>; 1696 1697let Predicates = [HasAVX, NoVLX] in { 1698 def : Pat<(v4f32 (fpround (v4f64 VR256:$src))), 1699 (VCVTPD2PSYrr VR256:$src)>; 1700 def : Pat<(v4f32 (fpround (loadv4f64 addr:$src))), 1701 (VCVTPD2PSYrm addr:$src)>; 1702} 1703 1704//===----------------------------------------------------------------------===// 1705// SSE 1 & 2 - Compare Instructions 1706//===----------------------------------------------------------------------===// 1707 1708// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions 1709multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, 1710 SDNode OpNode, ValueType VT, 1711 PatFrag ld_frag, string asm, 1712 X86FoldableSchedWrite sched> { 1713 let isCommutable = 1 in 1714 def rr : SIi8<0xC2, MRMSrcReg, 1715 (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm, 1716 [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))]>, 1717 Sched<[sched]>; 1718 def rm : SIi8<0xC2, MRMSrcMem, 1719 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm, 1720 [(set RC:$dst, (OpNode (VT RC:$src1), 1721 (ld_frag addr:$src2), imm:$cc))]>, 1722 Sched<[sched.Folded, sched.ReadAfterFold]>; 1723} 1724 1725let isCodeGenOnly = 1 in { 1726 let ExeDomain = SSEPackedSingle in 1727 defm 
VCMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmps, f32, loadf32, 1728 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 1729 SchedWriteFCmpSizes.PS.Scl>, XS, VEX_4V, VEX_LIG, VEX_WIG; 1730 let ExeDomain = SSEPackedDouble in 1731 defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmps, f64, loadf64, 1732 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 1733 SchedWriteFCmpSizes.PD.Scl>, 1734 XD, VEX_4V, VEX_LIG, VEX_WIG; 1735 1736 let Constraints = "$src1 = $dst" in { 1737 let ExeDomain = SSEPackedSingle in 1738 defm CMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmps, f32, loadf32, 1739 "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", 1740 SchedWriteFCmpSizes.PS.Scl>, XS; 1741 let ExeDomain = SSEPackedDouble in 1742 defm CMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmps, f64, loadf64, 1743 "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}", 1744 SchedWriteFCmpSizes.PD.Scl>, XD; 1745 } 1746} 1747 1748multiclass sse12_cmp_scalar_int<Operand memop, 1749 Intrinsic Int, string asm, X86FoldableSchedWrite sched, 1750 ComplexPattern mem_cpat> { 1751 def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst), 1752 (ins VR128:$src1, VR128:$src, u8imm:$cc), asm, 1753 [(set VR128:$dst, (Int VR128:$src1, 1754 VR128:$src, imm:$cc))]>, 1755 Sched<[sched]>; 1756let mayLoad = 1 in 1757 def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst), 1758 (ins VR128:$src1, memop:$src, u8imm:$cc), asm, 1759 [(set VR128:$dst, (Int VR128:$src1, 1760 mem_cpat:$src, imm:$cc))]>, 1761 Sched<[sched.Folded, sched.ReadAfterFold]>; 1762} 1763 1764// Aliases to match intrinsics which expect XMM operand(s). 1765let ExeDomain = SSEPackedSingle in 1766defm VCMPSS : sse12_cmp_scalar_int<ssmem, int_x86_sse_cmp_ss, 1767 "cmpss\t{$cc, $src, $src1, $dst|$dst, $src1, $src, $cc}", 1768 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, 1769 XS, VEX_4V, VEX_LIG, VEX_WIG; 1770let ExeDomain = SSEPackedDouble in 1771defm VCMPSD : sse12_cmp_scalar_int<sdmem, int_x86_sse2_cmp_sd, 1772 "cmpsd\t{$cc, $src, $src1, $dst|$dst, $src1, $src, $cc}", 1773 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, 1774 XD, VEX_4V, VEX_LIG, VEX_WIG; 1775let Constraints = "$src1 = $dst" in { 1776 let ExeDomain = SSEPackedSingle in 1777 defm CMPSS : sse12_cmp_scalar_int<ssmem, int_x86_sse_cmp_ss, 1778 "cmpss\t{$cc, $src, $dst|$dst, $src, $cc}", 1779 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS; 1780 let ExeDomain = SSEPackedDouble in 1781 defm CMPSD : sse12_cmp_scalar_int<sdmem, int_x86_sse2_cmp_sd, 1782 "cmpsd\t{$cc, $src, $dst|$dst, $src, $cc}", 1783 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD; 1784} 1785 1786 1787// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS 1788multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode, 1789 ValueType vt, X86MemOperand x86memop, 1790 PatFrag ld_frag, string OpcodeStr, 1791 X86FoldableSchedWrite sched> { 1792let hasSideEffects = 0 in { 1793 def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), 1794 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), 1795 [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>, 1796 Sched<[sched]>; 1797let mayLoad = 1 in 1798 def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2), 1799 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), 1800 [(set EFLAGS, (OpNode (vt RC:$src1), 1801 (ld_frag addr:$src2)))]>, 1802 Sched<[sched.Folded, sched.ReadAfterFold]>; 1803} 1804} 1805 1806// sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp 1807multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode, 1808 ValueType vt, Operand memop, 1809 
ComplexPattern mem_cpat, string OpcodeStr, 1810 X86FoldableSchedWrite sched> { 1811 def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), 1812 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), 1813 [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>, 1814 Sched<[sched]>; 1815let mayLoad = 1 in 1816 def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2), 1817 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), 1818 [(set EFLAGS, (OpNode (vt RC:$src1), 1819 mem_cpat:$src2))]>, 1820 Sched<[sched.Folded, sched.ReadAfterFold]>; 1821} 1822 1823let Defs = [EFLAGS] in { 1824 defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, 1825 "ucomiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG; 1826 defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, 1827 "ucomisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG; 1828 let Pattern = []<dag> in { 1829 defm VCOMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32, 1830 "comiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG; 1831 defm VCOMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64, 1832 "comisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG; 1833 } 1834 1835 let isCodeGenOnly = 1 in { 1836 defm VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem, 1837 sse_load_f32, "ucomiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG; 1838 defm VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem, 1839 sse_load_f64, "ucomisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG; 1840 1841 defm VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem, 1842 sse_load_f32, "comiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG; 1843 defm VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem, 1844 sse_load_f64, "comisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG; 1845 } 1846 defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, 1847 "ucomiss", WriteFCom>, PS; 1848 defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, 1849 "ucomisd", WriteFCom>, PD; 1850 1851 let Pattern = []<dag> in { 1852 defm COMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32, 1853 "comiss", WriteFCom>, PS; 1854 defm COMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64, 1855 "comisd", WriteFCom>, PD; 1856 } 1857 1858 let isCodeGenOnly = 1 in { 1859 defm UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem, 1860 sse_load_f32, "ucomiss", WriteFCom>, PS; 1861 defm UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem, 1862 sse_load_f64, "ucomisd", WriteFCom>, PD; 1863 1864 defm COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem, 1865 sse_load_f32, "comiss", WriteFCom>, PS; 1866 defm COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem, 1867 sse_load_f64, "comisd", WriteFCom>, PD; 1868 } 1869} // Defs = [EFLAGS] 1870 1871// sse12_cmp_packed - sse 1 & 2 compare packed instructions 1872multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, 1873 ValueType VT, string asm, 1874 X86FoldableSchedWrite sched, 1875 Domain d, PatFrag ld_frag> { 1876 let isCommutable = 1 in 1877 def rri : PIi8<0xC2, MRMSrcReg, 1878 (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm, 1879 [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, imm:$cc)))], d>, 1880 Sched<[sched]>; 1881 def rmi : PIi8<0xC2, MRMSrcMem, 1882 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm, 1883 [(set RC:$dst, 1884 (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), imm:$cc)))], d>, 1885 Sched<[sched.Folded, sched.ReadAfterFold]>; 1886} 1887 1888defm VCMPPS : 
sse12_cmp_packed<VR128, f128mem, v4f32, 1889 "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 1890 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG; 1891defm VCMPPD : sse12_cmp_packed<VR128, f128mem, v2f64, 1892 "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 1893 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG; 1894defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, v8f32, 1895 "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 1896 SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, VEX_WIG; 1897defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, v4f64, 1898 "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 1899 SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, VEX_WIG; 1900let Constraints = "$src1 = $dst" in { 1901 defm CMPPS : sse12_cmp_packed<VR128, f128mem, v4f32, 1902 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}", 1903 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS; 1904 defm CMPPD : sse12_cmp_packed<VR128, f128mem, v2f64, 1905 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}", 1906 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD; 1907} 1908 1909def CommutableCMPCC : PatLeaf<(imm), [{ 1910 uint64_t Imm = N->getZExtValue() & 0x7; 1911 return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07); 1912}]>; 1913 1914// Patterns to select compares with loads in first operand. 1915let Predicates = [HasAVX] in { 1916 def : Pat<(v4f64 (X86cmpp (loadv4f64 addr:$src2), VR256:$src1, 1917 CommutableCMPCC:$cc)), 1918 (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>; 1919 1920 def : Pat<(v8f32 (X86cmpp (loadv8f32 addr:$src2), VR256:$src1, 1921 CommutableCMPCC:$cc)), 1922 (VCMPPSYrmi VR256:$src1, addr:$src2, imm:$cc)>; 1923 1924 def : Pat<(v2f64 (X86cmpp (loadv2f64 addr:$src2), VR128:$src1, 1925 CommutableCMPCC:$cc)), 1926 (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; 1927 1928 def : Pat<(v4f32 (X86cmpp (loadv4f32 addr:$src2), VR128:$src1, 1929 CommutableCMPCC:$cc)), 1930 (VCMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>; 1931 1932 def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1, 1933 CommutableCMPCC:$cc)), 1934 (VCMPSDrm FR64:$src1, addr:$src2, imm:$cc)>; 1935 1936 def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1, 1937 CommutableCMPCC:$cc)), 1938 (VCMPSSrm FR32:$src1, addr:$src2, imm:$cc)>; 1939} 1940 1941let Predicates = [UseSSE2] in { 1942 def : Pat<(v2f64 (X86cmpp (memopv2f64 addr:$src2), VR128:$src1, 1943 CommutableCMPCC:$cc)), 1944 (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; 1945 1946 def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1, 1947 CommutableCMPCC:$cc)), 1948 (CMPSDrm FR64:$src1, addr:$src2, imm:$cc)>; 1949} 1950 1951let Predicates = [UseSSE1] in { 1952 def : Pat<(v4f32 (X86cmpp (memopv4f32 addr:$src2), VR128:$src1, 1953 CommutableCMPCC:$cc)), 1954 (CMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>; 1955 1956 def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1, 1957 CommutableCMPCC:$cc)), 1958 (CMPSSrm FR32:$src1, addr:$src2, imm:$cc)>; 1959} 1960 1961//===----------------------------------------------------------------------===// 1962// SSE 1 & 2 - Shuffle Instructions 1963//===----------------------------------------------------------------------===// 1964 1965/// sse12_shuffle - sse 1 & 2 fp shuffle instructions 1966multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop, 1967 ValueType vt, string asm, PatFrag mem_frag, 1968 X86FoldableSchedWrite sched, Domain d, 1969 bit IsCommutable = 
0> { 1970 def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst), 1971 (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm, 1972 [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2), 1973 (i8 imm:$src3))))], d>, 1974 Sched<[sched.Folded, sched.ReadAfterFold]>; 1975 let isCommutable = IsCommutable in 1976 def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst), 1977 (ins RC:$src1, RC:$src2, u8imm:$src3), asm, 1978 [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, 1979 (i8 imm:$src3))))], d>, 1980 Sched<[sched]>; 1981} 1982 1983let Predicates = [HasAVX, NoVLX] in { 1984 defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, 1985 "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 1986 loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, 1987 PS, VEX_4V, VEX_WIG; 1988 defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, 1989 "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 1990 loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>, 1991 PS, VEX_4V, VEX_L, VEX_WIG; 1992 defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, 1993 "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 1994 loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>, 1995 PD, VEX_4V, VEX_WIG; 1996 defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, 1997 "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 1998 loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>, 1999 PD, VEX_4V, VEX_L, VEX_WIG; 2000} 2001let Constraints = "$src1 = $dst" in { 2002 defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32, 2003 "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}", 2004 memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS; 2005 defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64, 2006 "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}", 2007 memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD; 2008} 2009 2010//===----------------------------------------------------------------------===// 2011// SSE 1 & 2 - Unpack FP Instructions 2012//===----------------------------------------------------------------------===// 2013 2014/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave 2015multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt, 2016 PatFrag mem_frag, RegisterClass RC, 2017 X86MemOperand x86memop, string asm, 2018 X86FoldableSchedWrite sched, Domain d, 2019 bit IsCommutable = 0> { 2020 let isCommutable = IsCommutable in 2021 def rr : PI<opc, MRMSrcReg, 2022 (outs RC:$dst), (ins RC:$src1, RC:$src2), 2023 asm, [(set RC:$dst, 2024 (vt (OpNode RC:$src1, RC:$src2)))], d>, 2025 Sched<[sched]>; 2026 def rm : PI<opc, MRMSrcMem, 2027 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 2028 asm, [(set RC:$dst, 2029 (vt (OpNode RC:$src1, 2030 (mem_frag addr:$src2))))], d>, 2031 Sched<[sched.Folded, sched.ReadAfterFold]>; 2032} 2033 2034let Predicates = [HasAVX, NoVLX] in { 2035defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load, 2036 VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2037 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG; 2038defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load, 2039 VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2040 SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG; 2041defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load, 2042 VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2043 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG; 2044defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, 
v2f64, load, 2045 VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2046 SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG; 2047 2048defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load, 2049 VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2050 SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG; 2051defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load, 2052 VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2053 SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG; 2054defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load, 2055 VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2056 SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG; 2057defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load, 2058 VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2059 SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG; 2060}// Predicates = [HasAVX, NoVLX] 2061 2062let Constraints = "$src1 = $dst" in { 2063 defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memop, 2064 VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}", 2065 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS; 2066 defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memop, 2067 VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}", 2068 SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD; 2069 defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop, 2070 VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}", 2071 SchedWriteFShuffle.XMM, SSEPackedSingle>, PS; 2072 defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop, 2073 VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}", 2074 SchedWriteFShuffle.XMM, SSEPackedDouble>, PD; 2075} // Constraints = "$src1 = $dst" 2076 2077let Predicates = [HasAVX1Only] in { 2078 def : Pat<(v8i32 (X86Unpckl VR256:$src1, (loadv8i32 addr:$src2))), 2079 (VUNPCKLPSYrm VR256:$src1, addr:$src2)>; 2080 def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)), 2081 (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>; 2082 def : Pat<(v8i32 (X86Unpckh VR256:$src1, (loadv8i32 addr:$src2))), 2083 (VUNPCKHPSYrm VR256:$src1, addr:$src2)>; 2084 def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)), 2085 (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>; 2086 2087 def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))), 2088 (VUNPCKLPDYrm VR256:$src1, addr:$src2)>; 2089 def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)), 2090 (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>; 2091 def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))), 2092 (VUNPCKHPDYrm VR256:$src1, addr:$src2)>; 2093 def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)), 2094 (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>; 2095} 2096 2097let Predicates = [UseSSE2] in { 2098 // Use MOVHPD if the load isn't aligned enough for UNPCKLPD. 
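 // (Illustrative only: for source like
 //    __m128d f(__m128d a, const double *p) {      // p possibly unaligned
 //      return _mm_unpacklo_pd(a, _mm_loadu_pd(p));
 //    }
 //  only p[0] is actually needed, and a 64-bit MOVHPD load of it into the
 //  high half of the destination gives the same {a[0], p[0]} result without
 //  the 16-byte alignment the folded UNPCKLPD memory form would require.)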
 def : Pat<(v2f64 (X86Unpckl VR128:$src1,
                   (v2f64 (nonvolatile_load addr:$src2)))),
           (MOVHPDrm VR128:$src1, addr:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Extract Floating-Point Sign Mask
//===----------------------------------------------------------------------===//

/// sse12_extr_sign_mask - sse 1 & 2 packed FP sign-mask extraction
multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
                                string asm, Domain d> {
  def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], d>,
              Sched<[WriteFMOVMSK]>;
}

let Predicates = [HasAVX] in {
  defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
                                        SSEPackedSingle>, PS, VEX, VEX_WIG;
  defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
                                        SSEPackedDouble>, PD, VEX, VEX_WIG;
  defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
                                         SSEPackedSingle>, PS, VEX, VEX_L, VEX_WIG;
  defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
                                         SSEPackedDouble>, PD, VEX, VEX_L, VEX_WIG;

  // Also support integer VTs to avoid an int->fp bitcast in the DAG.
  def : Pat<(X86movmsk (v4i32 VR128:$src)),
            (VMOVMSKPSrr VR128:$src)>;
  def : Pat<(X86movmsk (v2i64 VR128:$src)),
            (VMOVMSKPDrr VR128:$src)>;
  def : Pat<(X86movmsk (v8i32 VR256:$src)),
            (VMOVMSKPSYrr VR256:$src)>;
  def : Pat<(X86movmsk (v4i64 VR256:$src)),
            (VMOVMSKPDYrr VR256:$src)>;
}

defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
                                     SSEPackedSingle>, PS;
defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
                                     SSEPackedDouble>, PD;

let Predicates = [UseSSE2] in {
  // Also support integer VTs to avoid an int->fp bitcast in the DAG.
  def : Pat<(X86movmsk (v4i32 VR128:$src)),
            (MOVMSKPSrr VR128:$src)>;
  def : Pat<(X86movmsk (v2i64 VR128:$src)),
            (MOVMSKPDrr VR128:$src)>;
}

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Logical Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in { // SSE integer instructions

/// PDI_binop_rm - Simple SSE2 binary operator.
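/// As a rough sketch (assuming the "pand" instantiation below with
/// Is2Addr = 1), the two emitted SSE forms correspond to:
///   pand %xmm1, %xmm0         # rr: xmm0 &= xmm1
///   pand (%rax), %xmm0        # rm: xmm0 &= 128-bit memory operand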
2158multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, 2159 ValueType OpVT, RegisterClass RC, PatFrag memop_frag, 2160 X86MemOperand x86memop, X86FoldableSchedWrite sched, 2161 bit IsCommutable, bit Is2Addr> { 2162 let isCommutable = IsCommutable in 2163 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), 2164 (ins RC:$src1, RC:$src2), 2165 !if(Is2Addr, 2166 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 2167 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 2168 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, 2169 Sched<[sched]>; 2170 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), 2171 (ins RC:$src1, x86memop:$src2), 2172 !if(Is2Addr, 2173 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 2174 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 2175 [(set RC:$dst, (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>, 2176 Sched<[sched.Folded, sched.ReadAfterFold]>; 2177} 2178} // ExeDomain = SSEPackedInt 2179 2180multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode, 2181 ValueType OpVT128, ValueType OpVT256, 2182 X86SchedWriteWidths sched, bit IsCommutable, 2183 Predicate prd> { 2184let Predicates = [HasAVX, prd] in 2185 defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128, 2186 VR128, load, i128mem, sched.XMM, 2187 IsCommutable, 0>, VEX_4V, VEX_WIG; 2188 2189let Constraints = "$src1 = $dst" in 2190 defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128, 2191 memop, i128mem, sched.XMM, IsCommutable, 1>; 2192 2193let Predicates = [HasAVX2, prd] in 2194 defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, 2195 OpVT256, VR256, load, i256mem, sched.YMM, 2196 IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG; 2197} 2198 2199// These are ordered here for pattern ordering requirements with the fp versions 2200 2201defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64, 2202 SchedWriteVecLogic, 1, NoVLX>; 2203defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64, 2204 SchedWriteVecLogic, 1, NoVLX>; 2205defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64, 2206 SchedWriteVecLogic, 1, NoVLX>; 2207defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64, 2208 SchedWriteVecLogic, 0, NoVLX>; 2209 2210//===----------------------------------------------------------------------===// 2211// SSE 1 & 2 - Logical Instructions 2212//===----------------------------------------------------------------------===// 2213 2214/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops 2215/// 2216/// There are no patterns here because isel prefers integer versions for SSE2 2217/// and later. There are SSE1 v4f32 patterns later. 
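/// For reference, the "and" instantiation below produces ANDPS/ANDPD plus the
/// VEX-encoded VANDPS/VANDPD and the 256-bit VANDPSY/VANDPDY variants; only
/// the SSE1 v4f32 use is covered by selection patterns (the X86fand/X86for/
/// X86fxor/X86fandn patterns further down, after the packed-integer patterns).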
2218multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, 2219 SDNode OpNode, X86SchedWriteWidths sched> { 2220 let Predicates = [HasAVX, NoVLX] in { 2221 defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle, 2222 !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM, 2223 [], [], 0>, PS, VEX_4V, VEX_L, VEX_WIG; 2224 2225 defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble, 2226 !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM, 2227 [], [], 0>, PD, VEX_4V, VEX_L, VEX_WIG; 2228 2229 defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, 2230 !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM, 2231 [], [], 0>, PS, VEX_4V, VEX_WIG; 2232 2233 defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, 2234 !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM, 2235 [], [], 0>, PD, VEX_4V, VEX_WIG; 2236 } 2237 2238 let Constraints = "$src1 = $dst" in { 2239 defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, 2240 !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM, 2241 [], []>, PS; 2242 2243 defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, 2244 !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM, 2245 [], []>, PD; 2246 } 2247} 2248 2249defm AND : sse12_fp_packed_logical<0x54, "and", and, SchedWriteFLogic>; 2250defm OR : sse12_fp_packed_logical<0x56, "or", or, SchedWriteFLogic>; 2251defm XOR : sse12_fp_packed_logical<0x57, "xor", xor, SchedWriteFLogic>; 2252let isCommutable = 0 in 2253 defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp, SchedWriteFLogic>; 2254 2255let Predicates = [HasAVX2, NoVLX] in { 2256 def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)), 2257 (VPANDYrr VR256:$src1, VR256:$src2)>; 2258 def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)), 2259 (VPANDYrr VR256:$src1, VR256:$src2)>; 2260 def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)), 2261 (VPANDYrr VR256:$src1, VR256:$src2)>; 2262 2263 def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)), 2264 (VPORYrr VR256:$src1, VR256:$src2)>; 2265 def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)), 2266 (VPORYrr VR256:$src1, VR256:$src2)>; 2267 def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)), 2268 (VPORYrr VR256:$src1, VR256:$src2)>; 2269 2270 def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)), 2271 (VPXORYrr VR256:$src1, VR256:$src2)>; 2272 def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)), 2273 (VPXORYrr VR256:$src1, VR256:$src2)>; 2274 def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)), 2275 (VPXORYrr VR256:$src1, VR256:$src2)>; 2276 2277 def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)), 2278 (VPANDNYrr VR256:$src1, VR256:$src2)>; 2279 def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)), 2280 (VPANDNYrr VR256:$src1, VR256:$src2)>; 2281 def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)), 2282 (VPANDNYrr VR256:$src1, VR256:$src2)>; 2283 2284 def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)), 2285 (VPANDYrm VR256:$src1, addr:$src2)>; 2286 def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)), 2287 (VPANDYrm VR256:$src1, addr:$src2)>; 2288 def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)), 2289 (VPANDYrm VR256:$src1, addr:$src2)>; 2290 2291 def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)), 2292 (VPORYrm VR256:$src1, addr:$src2)>; 2293 def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)), 2294 (VPORYrm VR256:$src1, addr:$src2)>; 2295 def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)), 2296 (VPORYrm VR256:$src1, addr:$src2)>; 2297 2298 def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)), 2299 (VPXORYrm VR256:$src1, addr:$src2)>; 2300 def : 
Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)), 2301 (VPXORYrm VR256:$src1, addr:$src2)>; 2302 def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)), 2303 (VPXORYrm VR256:$src1, addr:$src2)>; 2304 2305 def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)), 2306 (VPANDNYrm VR256:$src1, addr:$src2)>; 2307 def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)), 2308 (VPANDNYrm VR256:$src1, addr:$src2)>; 2309 def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)), 2310 (VPANDNYrm VR256:$src1, addr:$src2)>; 2311} 2312 2313// If only AVX1 is supported, we need to handle integer operations with 2314// floating point instructions since the integer versions aren't available. 2315let Predicates = [HasAVX1Only] in { 2316 def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)), 2317 (VANDPSYrr VR256:$src1, VR256:$src2)>; 2318 def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)), 2319 (VANDPSYrr VR256:$src1, VR256:$src2)>; 2320 def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)), 2321 (VANDPSYrr VR256:$src1, VR256:$src2)>; 2322 def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)), 2323 (VANDPSYrr VR256:$src1, VR256:$src2)>; 2324 2325 def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)), 2326 (VORPSYrr VR256:$src1, VR256:$src2)>; 2327 def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)), 2328 (VORPSYrr VR256:$src1, VR256:$src2)>; 2329 def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)), 2330 (VORPSYrr VR256:$src1, VR256:$src2)>; 2331 def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)), 2332 (VORPSYrr VR256:$src1, VR256:$src2)>; 2333 2334 def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)), 2335 (VXORPSYrr VR256:$src1, VR256:$src2)>; 2336 def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)), 2337 (VXORPSYrr VR256:$src1, VR256:$src2)>; 2338 def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)), 2339 (VXORPSYrr VR256:$src1, VR256:$src2)>; 2340 def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)), 2341 (VXORPSYrr VR256:$src1, VR256:$src2)>; 2342 2343 def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)), 2344 (VANDNPSYrr VR256:$src1, VR256:$src2)>; 2345 def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)), 2346 (VANDNPSYrr VR256:$src1, VR256:$src2)>; 2347 def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)), 2348 (VANDNPSYrr VR256:$src1, VR256:$src2)>; 2349 def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)), 2350 (VANDNPSYrr VR256:$src1, VR256:$src2)>; 2351 2352 def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)), 2353 (VANDPSYrm VR256:$src1, addr:$src2)>; 2354 def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)), 2355 (VANDPSYrm VR256:$src1, addr:$src2)>; 2356 def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)), 2357 (VANDPSYrm VR256:$src1, addr:$src2)>; 2358 def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)), 2359 (VANDPSYrm VR256:$src1, addr:$src2)>; 2360 2361 def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)), 2362 (VORPSYrm VR256:$src1, addr:$src2)>; 2363 def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)), 2364 (VORPSYrm VR256:$src1, addr:$src2)>; 2365 def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)), 2366 (VORPSYrm VR256:$src1, addr:$src2)>; 2367 def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)), 2368 (VORPSYrm VR256:$src1, addr:$src2)>; 2369 2370 def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)), 2371 (VXORPSYrm VR256:$src1, addr:$src2)>; 2372 def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)), 2373 (VXORPSYrm VR256:$src1, addr:$src2)>; 2374 def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)), 2375 (VXORPSYrm VR256:$src1, addr:$src2)>; 2376 def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)), 2377 (VXORPSYrm VR256:$src1, 
addr:$src2)>; 2378 2379 def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)), 2380 (VANDNPSYrm VR256:$src1, addr:$src2)>; 2381 def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)), 2382 (VANDNPSYrm VR256:$src1, addr:$src2)>; 2383 def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)), 2384 (VANDNPSYrm VR256:$src1, addr:$src2)>; 2385 def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)), 2386 (VANDNPSYrm VR256:$src1, addr:$src2)>; 2387} 2388 2389let Predicates = [HasAVX, NoVLX] in { 2390 def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)), 2391 (VPANDrr VR128:$src1, VR128:$src2)>; 2392 def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)), 2393 (VPANDrr VR128:$src1, VR128:$src2)>; 2394 def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)), 2395 (VPANDrr VR128:$src1, VR128:$src2)>; 2396 2397 def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)), 2398 (VPORrr VR128:$src1, VR128:$src2)>; 2399 def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)), 2400 (VPORrr VR128:$src1, VR128:$src2)>; 2401 def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)), 2402 (VPORrr VR128:$src1, VR128:$src2)>; 2403 2404 def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)), 2405 (VPXORrr VR128:$src1, VR128:$src2)>; 2406 def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)), 2407 (VPXORrr VR128:$src1, VR128:$src2)>; 2408 def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)), 2409 (VPXORrr VR128:$src1, VR128:$src2)>; 2410 2411 def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)), 2412 (VPANDNrr VR128:$src1, VR128:$src2)>; 2413 def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)), 2414 (VPANDNrr VR128:$src1, VR128:$src2)>; 2415 def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)), 2416 (VPANDNrr VR128:$src1, VR128:$src2)>; 2417 2418 def : Pat<(and VR128:$src1, (loadv16i8 addr:$src2)), 2419 (VPANDrm VR128:$src1, addr:$src2)>; 2420 def : Pat<(and VR128:$src1, (loadv8i16 addr:$src2)), 2421 (VPANDrm VR128:$src1, addr:$src2)>; 2422 def : Pat<(and VR128:$src1, (loadv4i32 addr:$src2)), 2423 (VPANDrm VR128:$src1, addr:$src2)>; 2424 2425 def : Pat<(or VR128:$src1, (loadv16i8 addr:$src2)), 2426 (VPORrm VR128:$src1, addr:$src2)>; 2427 def : Pat<(or VR128:$src1, (loadv8i16 addr:$src2)), 2428 (VPORrm VR128:$src1, addr:$src2)>; 2429 def : Pat<(or VR128:$src1, (loadv4i32 addr:$src2)), 2430 (VPORrm VR128:$src1, addr:$src2)>; 2431 2432 def : Pat<(xor VR128:$src1, (loadv16i8 addr:$src2)), 2433 (VPXORrm VR128:$src1, addr:$src2)>; 2434 def : Pat<(xor VR128:$src1, (loadv8i16 addr:$src2)), 2435 (VPXORrm VR128:$src1, addr:$src2)>; 2436 def : Pat<(xor VR128:$src1, (loadv4i32 addr:$src2)), 2437 (VPXORrm VR128:$src1, addr:$src2)>; 2438 2439 def : Pat<(X86andnp VR128:$src1, (loadv16i8 addr:$src2)), 2440 (VPANDNrm VR128:$src1, addr:$src2)>; 2441 def : Pat<(X86andnp VR128:$src1, (loadv8i16 addr:$src2)), 2442 (VPANDNrm VR128:$src1, addr:$src2)>; 2443 def : Pat<(X86andnp VR128:$src1, (loadv4i32 addr:$src2)), 2444 (VPANDNrm VR128:$src1, addr:$src2)>; 2445} 2446 2447let Predicates = [UseSSE2] in { 2448 def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)), 2449 (PANDrr VR128:$src1, VR128:$src2)>; 2450 def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)), 2451 (PANDrr VR128:$src1, VR128:$src2)>; 2452 def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)), 2453 (PANDrr VR128:$src1, VR128:$src2)>; 2454 2455 def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)), 2456 (PORrr VR128:$src1, VR128:$src2)>; 2457 def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)), 2458 (PORrr VR128:$src1, VR128:$src2)>; 2459 def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)), 2460 (PORrr VR128:$src1, VR128:$src2)>; 2461 2462 def : 
Pat<(v16i8 (xor VR128:$src1, VR128:$src2)), 2463 (PXORrr VR128:$src1, VR128:$src2)>; 2464 def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)), 2465 (PXORrr VR128:$src1, VR128:$src2)>; 2466 def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)), 2467 (PXORrr VR128:$src1, VR128:$src2)>; 2468 2469 def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)), 2470 (PANDNrr VR128:$src1, VR128:$src2)>; 2471 def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)), 2472 (PANDNrr VR128:$src1, VR128:$src2)>; 2473 def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)), 2474 (PANDNrr VR128:$src1, VR128:$src2)>; 2475 2476 def : Pat<(and VR128:$src1, (memopv16i8 addr:$src2)), 2477 (PANDrm VR128:$src1, addr:$src2)>; 2478 def : Pat<(and VR128:$src1, (memopv8i16 addr:$src2)), 2479 (PANDrm VR128:$src1, addr:$src2)>; 2480 def : Pat<(and VR128:$src1, (memopv4i32 addr:$src2)), 2481 (PANDrm VR128:$src1, addr:$src2)>; 2482 2483 def : Pat<(or VR128:$src1, (memopv16i8 addr:$src2)), 2484 (PORrm VR128:$src1, addr:$src2)>; 2485 def : Pat<(or VR128:$src1, (memopv8i16 addr:$src2)), 2486 (PORrm VR128:$src1, addr:$src2)>; 2487 def : Pat<(or VR128:$src1, (memopv4i32 addr:$src2)), 2488 (PORrm VR128:$src1, addr:$src2)>; 2489 2490 def : Pat<(xor VR128:$src1, (memopv16i8 addr:$src2)), 2491 (PXORrm VR128:$src1, addr:$src2)>; 2492 def : Pat<(xor VR128:$src1, (memopv8i16 addr:$src2)), 2493 (PXORrm VR128:$src1, addr:$src2)>; 2494 def : Pat<(xor VR128:$src1, (memopv4i32 addr:$src2)), 2495 (PXORrm VR128:$src1, addr:$src2)>; 2496 2497 def : Pat<(X86andnp VR128:$src1, (memopv16i8 addr:$src2)), 2498 (PANDNrm VR128:$src1, addr:$src2)>; 2499 def : Pat<(X86andnp VR128:$src1, (memopv8i16 addr:$src2)), 2500 (PANDNrm VR128:$src1, addr:$src2)>; 2501 def : Pat<(X86andnp VR128:$src1, (memopv4i32 addr:$src2)), 2502 (PANDNrm VR128:$src1, addr:$src2)>; 2503} 2504 2505// Patterns for packed operations when we don't have integer type available. 2506def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)), 2507 (ANDPSrr VR128:$src1, VR128:$src2)>; 2508def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)), 2509 (ORPSrr VR128:$src1, VR128:$src2)>; 2510def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)), 2511 (XORPSrr VR128:$src1, VR128:$src2)>; 2512def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)), 2513 (ANDNPSrr VR128:$src1, VR128:$src2)>; 2514 2515def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)), 2516 (ANDPSrm VR128:$src1, addr:$src2)>; 2517def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)), 2518 (ORPSrm VR128:$src1, addr:$src2)>; 2519def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)), 2520 (XORPSrm VR128:$src1, addr:$src2)>; 2521def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)), 2522 (ANDNPSrm VR128:$src1, addr:$src2)>; 2523 2524//===----------------------------------------------------------------------===// 2525// SSE 1 & 2 - Arithmetic Instructions 2526//===----------------------------------------------------------------------===// 2527 2528/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and 2529/// vector forms. 2530/// 2531/// In addition, we also have a special variant of the scalar form here to 2532/// represent the associated intrinsic operation. This form is unlike the 2533/// plain scalar form, in that it takes an entire vector (instead of a scalar) 2534/// and leaves the top elements unmodified (therefore these cannot be commuted). 2535/// 2536/// These three forms can each be reg+reg or reg+mem. 
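/// For illustration only, the intrinsic (_Int) variant matched for ADDSS
/// behaves roughly like:
///   __m128 add_ss_like(__m128 a, __m128 b) {
///     a[0] += b[0];      // only element 0 is combined
///     return a;          // elements 1-3 of the first operand pass through
///   }
/// which is why it takes whole vectors and cannot be treated as commutable.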
2537/// 2538 2539/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those 2540/// classes below 2541multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, 2542 SDNode OpNode, X86SchedWriteSizes sched> { 2543 let Predicates = [HasAVX, NoVLX] in { 2544 defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, 2545 VR128, v4f32, f128mem, loadv4f32, 2546 SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX_4V, VEX_WIG; 2547 defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, 2548 VR128, v2f64, f128mem, loadv2f64, 2549 SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX_4V, VEX_WIG; 2550 2551 defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), 2552 OpNode, VR256, v8f32, f256mem, loadv8f32, 2553 SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX_4V, VEX_L, VEX_WIG; 2554 defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), 2555 OpNode, VR256, v4f64, f256mem, loadv4f64, 2556 SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX_4V, VEX_L, VEX_WIG; 2557 } 2558 2559 let Constraints = "$src1 = $dst" in { 2560 defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, 2561 v4f32, f128mem, memopv4f32, SSEPackedSingle, 2562 sched.PS.XMM>, PS; 2563 defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, 2564 v2f64, f128mem, memopv2f64, SSEPackedDouble, 2565 sched.PD.XMM>, PD; 2566 } 2567} 2568 2569multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, 2570 X86SchedWriteSizes sched> { 2571 defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), 2572 OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>, 2573 XS, VEX_4V, VEX_LIG, VEX_WIG; 2574 defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), 2575 OpNode, FR64, f64mem, SSEPackedDouble, sched.PD.Scl, 0>, 2576 XD, VEX_4V, VEX_LIG, VEX_WIG; 2577 2578 let Constraints = "$src1 = $dst" in { 2579 defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), 2580 OpNode, FR32, f32mem, SSEPackedSingle, 2581 sched.PS.Scl>, XS; 2582 defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), 2583 OpNode, FR64, f64mem, SSEPackedDouble, 2584 sched.PD.Scl>, XD; 2585 } 2586} 2587 2588multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr, 2589 SDPatternOperator OpNode, 2590 X86SchedWriteSizes sched> { 2591 defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32, 2592 !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32, 2593 SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG; 2594 defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64, 2595 !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64, 2596 SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG; 2597 2598 let Constraints = "$src1 = $dst" in { 2599 defm SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32, 2600 !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32, 2601 SSEPackedSingle, sched.PS.Scl>, XS; 2602 defm SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64, 2603 !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64, 2604 SSEPackedDouble, sched.PD.Scl>, XD; 2605 } 2606} 2607 2608// Binary Arithmetic instructions 2609defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SchedWriteFAddSizes>, 2610 basic_sse12_fp_binop_s<0x58, "add", fadd, SchedWriteFAddSizes>, 2611 basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SchedWriteFAddSizes>; 2612defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SchedWriteFMulSizes>, 2613 basic_sse12_fp_binop_s<0x59, "mul", fmul, SchedWriteFMulSizes>, 2614 
                  basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SchedWriteFMulSizes>;
let isCommutable = 0 in {
  defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SchedWriteFAddSizes>,
             basic_sse12_fp_binop_s<0x5C, "sub", fsub, SchedWriteFAddSizes>,
             basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SchedWriteFAddSizes>;
  defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SchedWriteFDivSizes>,
             basic_sse12_fp_binop_s<0x5E, "div", fdiv, SchedWriteFDivSizes>,
             basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>;
  defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>;
  defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>;
}

let isCodeGenOnly = 1 in {
  defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>;
  defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SchedWriteFCmpSizes>;
}

// Patterns used to select SSE scalar fp arithmetic instructions from
// either:
//
// (1) a scalar fp operation followed by a blend
//
// The effect is that the backend no longer emits unnecessary vector
// insert instructions immediately after SSE scalar fp instructions
// like addss or mulss.
//
// For example, given the following code:
//   __m128 foo(__m128 A, __m128 B) {
//     A[0] += B[0];
//     return A;
//   }
//
// Previously we generated:
//   addss %xmm0, %xmm1
//   movss %xmm1, %xmm0
//
// We now generate:
//   addss %xmm1, %xmm0
//
// (2) a vector packed single/double fp operation followed by a vector insert
//
// The effect is that the backend converts the packed fp instruction
// followed by a vector insert into a single SSE scalar fp instruction.
//
// For example, given the following code:
//   __m128 foo(__m128 A, __m128 B) {
//     __m128 C = A + B;
//     return (__m128) {C[0], A[1], A[2], A[3]};
//   }
//
// Previously we generated:
//   addps %xmm0, %xmm1
//   movss %xmm1, %xmm0
//
// We now generate:
//   addss %xmm1, %xmm0

// TODO: Some canonicalization in lowering would simplify the number of
// patterns we have to try to match.
multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
                                ValueType VT, ValueType EltTy,
                                RegisterClass RC, PatFrag ld_frag,
                                Predicate BasePredicate> {
  let Predicates = [BasePredicate] in {
    // extracted scalar math op with insert via movss/movsd
    def : Pat<(VT (Move (VT VR128:$dst),
                        (VT (scalar_to_vector
                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
                                 RC:$src))))),
              (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst,
               (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
    def : Pat<(VT (Move (VT VR128:$dst),
                        (VT (scalar_to_vector
                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
                                 (ld_frag addr:$src)))))),
              (!cast<Instruction>(OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
  }

  // Repeat for AVX versions of the instructions.
2699 let Predicates = [UseAVX] in { 2700 // extracted scalar math op with insert via movss/movsd 2701 def : Pat<(VT (Move (VT VR128:$dst), 2702 (VT (scalar_to_vector 2703 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))), 2704 RC:$src))))), 2705 (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst, 2706 (VT (COPY_TO_REGCLASS RC:$src, VR128)))>; 2707 def : Pat<(VT (Move (VT VR128:$dst), 2708 (VT (scalar_to_vector 2709 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))), 2710 (ld_frag addr:$src)))))), 2711 (!cast<Instruction>("V"#OpcPrefix#rm_Int) VT:$dst, addr:$src)>; 2712 } 2713} 2714 2715defm : scalar_math_patterns<fadd, "ADDSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; 2716defm : scalar_math_patterns<fsub, "SUBSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; 2717defm : scalar_math_patterns<fmul, "MULSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; 2718defm : scalar_math_patterns<fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; 2719 2720defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; 2721defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; 2722defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; 2723defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; 2724 2725/// Unop Arithmetic 2726/// In addition, we also have a special variant of the scalar form here to 2727/// represent the associated intrinsic operation. This form is unlike the 2728/// plain scalar form, in that it takes an entire vector (instead of a 2729/// scalar) and leaves the top elements undefined. 2730/// 2731/// And, we have a special variant form for a full-vector intrinsic form. 2732 2733/// sse_fp_unop_s - SSE1 unops in scalar form 2734/// For the non-AVX defs, we need $src1 to be tied to $dst because 2735/// the HW instructions are 2 operand / destructive. 2736multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, 2737 ValueType ScalarVT, X86MemOperand x86memop, 2738 Operand intmemop, SDNode OpNode, Domain d, 2739 X86FoldableSchedWrite sched, Predicate target> { 2740 let isCodeGenOnly = 1, hasSideEffects = 0 in { 2741 def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1), 2742 !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"), 2743 [(set RC:$dst, (OpNode RC:$src1))], d>, Sched<[sched]>, 2744 Requires<[target]>; 2745 let mayLoad = 1 in 2746 def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1), 2747 !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"), 2748 [(set RC:$dst, (OpNode (load addr:$src1)))], d>, 2749 Sched<[sched.Folded]>, 2750 Requires<[target, OptForSize]>; 2751 } 2752 2753 let hasSideEffects = 0, Constraints = "$src1 = $dst", ExeDomain = d in { 2754 def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), 2755 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>, 2756 Sched<[sched]>; 2757 let mayLoad = 1 in 2758 def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2), 2759 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>, 2760 Sched<[sched.Folded, sched.ReadAfterFold]>; 2761 } 2762 2763} 2764 2765multiclass sse_fp_unop_s_intr<RegisterClass RC, ValueType vt, 2766 ComplexPattern int_cpat, Intrinsic Intr, 2767 Predicate target, string Suffix> { 2768 let Predicates = [target] in { 2769 // These are unary operations, but they are modeled as having 2 source operands 2770 // because the high elements of the destination are unchanged in SSE. 
2771 def : Pat<(Intr VR128:$src), 2772 (!cast<Instruction>(NAME#r_Int) VR128:$src, VR128:$src)>; 2773 } 2774 // We don't want to fold scalar loads into these instructions unless 2775 // optimizing for size. This is because the folded instruction will have a 2776 // partial register update, while the unfolded sequence will not, e.g. 2777 // movss mem, %xmm0 2778 // rcpss %xmm0, %xmm0 2779 // which has a clobber before the rcp, vs. 2780 // rcpss mem, %xmm0 2781 let Predicates = [target, OptForSize] in { 2782 def : Pat<(Intr int_cpat:$src2), 2783 (!cast<Instruction>(NAME#m_Int) 2784 (vt (IMPLICIT_DEF)), addr:$src2)>; 2785 } 2786} 2787 2788multiclass avx_fp_unop_s_intr<RegisterClass RC, ValueType vt, ComplexPattern int_cpat, 2789 Intrinsic Intr, Predicate target> { 2790 let Predicates = [target] in { 2791 def : Pat<(Intr VR128:$src), 2792 (!cast<Instruction>(NAME#r_Int) VR128:$src, 2793 VR128:$src)>; 2794 } 2795 let Predicates = [target, OptForSize] in { 2796 def : Pat<(Intr int_cpat:$src2), 2797 (!cast<Instruction>(NAME#m_Int) 2798 (vt (IMPLICIT_DEF)), addr:$src2)>; 2799 } 2800} 2801 2802multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, 2803 ValueType ScalarVT, X86MemOperand x86memop, 2804 Operand intmemop, SDNode OpNode, Domain d, 2805 X86FoldableSchedWrite sched, Predicate target> { 2806 let isCodeGenOnly = 1, hasSideEffects = 0 in { 2807 def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), 2808 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 2809 [], d>, Sched<[sched]>; 2810 let mayLoad = 1 in 2811 def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 2812 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 2813 [], d>, Sched<[sched.Folded, sched.ReadAfterFold]>; 2814 } 2815 let hasSideEffects = 0, ExeDomain = d in { 2816 def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), 2817 (ins VR128:$src1, VR128:$src2), 2818 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 2819 []>, Sched<[sched]>; 2820 let mayLoad = 1 in 2821 def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), 2822 (ins VR128:$src1, intmemop:$src2), 2823 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 2824 []>, Sched<[sched.Folded, sched.ReadAfterFold]>; 2825 } 2826 2827 // We don't want to fold scalar loads into these instructions unless 2828 // optimizing for size. This is because the folded instruction will have a 2829 // partial register update, while the unfolded sequence will not, e.g. 2830 // vmovss mem, %xmm0 2831 // vrcpss %xmm0, %xmm0, %xmm0 2832 // which has a clobber before the rcp, vs. 2833 // vrcpss mem, %xmm0, %xmm0 2834 // TODO: In theory, we could fold the load, and avoid the stall caused by 2835 // the partial register store, either in BreakFalseDeps or with smarter RA. 2836 let Predicates = [target] in { 2837 def : Pat<(OpNode RC:$src), (!cast<Instruction>(NAME#r) 2838 (ScalarVT (IMPLICIT_DEF)), RC:$src)>; 2839 } 2840 let Predicates = [target, OptForSize] in { 2841 def : Pat<(ScalarVT (OpNode (load addr:$src))), 2842 (!cast<Instruction>(NAME#m) (ScalarVT (IMPLICIT_DEF)), 2843 addr:$src)>; 2844 } 2845} 2846 2847/// sse1_fp_unop_p - SSE1 unops in packed form. 
2848multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, 2849 X86SchedWriteWidths sched, list<Predicate> prds> { 2850let Predicates = prds in { 2851 def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2852 !strconcat("v", OpcodeStr, 2853 "ps\t{$src, $dst|$dst, $src}"), 2854 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>, 2855 VEX, Sched<[sched.XMM]>, VEX_WIG; 2856 def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 2857 !strconcat("v", OpcodeStr, 2858 "ps\t{$src, $dst|$dst, $src}"), 2859 [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>, 2860 VEX, Sched<[sched.XMM.Folded]>, VEX_WIG; 2861 def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 2862 !strconcat("v", OpcodeStr, 2863 "ps\t{$src, $dst|$dst, $src}"), 2864 [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>, 2865 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG; 2866 def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 2867 !strconcat("v", OpcodeStr, 2868 "ps\t{$src, $dst|$dst, $src}"), 2869 [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))]>, 2870 VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG; 2871} 2872 2873 def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2874 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), 2875 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>, 2876 Sched<[sched.XMM]>; 2877 def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 2878 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), 2879 [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>, 2880 Sched<[sched.XMM.Folded]>; 2881} 2882 2883/// sse2_fp_unop_p - SSE2 unops in vector forms. 2884multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr, 2885 SDNode OpNode, X86SchedWriteWidths sched> { 2886let Predicates = [HasAVX, NoVLX] in { 2887 def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2888 !strconcat("v", OpcodeStr, 2889 "pd\t{$src, $dst|$dst, $src}"), 2890 [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>, 2891 VEX, Sched<[sched.XMM]>, VEX_WIG; 2892 def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 2893 !strconcat("v", OpcodeStr, 2894 "pd\t{$src, $dst|$dst, $src}"), 2895 [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>, 2896 VEX, Sched<[sched.XMM.Folded]>, VEX_WIG; 2897 def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 2898 !strconcat("v", OpcodeStr, 2899 "pd\t{$src, $dst|$dst, $src}"), 2900 [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>, 2901 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG; 2902 def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 2903 !strconcat("v", OpcodeStr, 2904 "pd\t{$src, $dst|$dst, $src}"), 2905 [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))]>, 2906 VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG; 2907} 2908 2909 def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2910 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), 2911 [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>, 2912 Sched<[sched.XMM]>; 2913 def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 2914 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), 2915 [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>, 2916 Sched<[sched.XMM.Folded]>; 2917} 2918 2919multiclass sse1_fp_unop_s_intr<bits<8> opc, string OpcodeStr, SDNode OpNode, 2920 X86SchedWriteWidths sched, Predicate AVXTarget> { 2921 defm SS : sse_fp_unop_s_intr<FR32, v4f32, sse_load_f32, 2922 !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), 2923 
UseSSE1, "SS">, XS; 2924 defm V#NAME#SS : avx_fp_unop_s_intr<FR32, v4f32, sse_load_f32, 2925 !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), 2926 AVXTarget>, 2927 XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable; 2928} 2929 2930multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, 2931 X86SchedWriteWidths sched, Predicate AVXTarget> { 2932 defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, f32, f32mem, 2933 ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS; 2934 defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, f32, 2935 f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>, 2936 XS, VEX_4V, VEX_LIG, VEX_WIG; 2937} 2938 2939multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, 2940 X86SchedWriteWidths sched, Predicate AVXTarget> { 2941 defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, f64, f64mem, 2942 sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD; 2943 defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, f64, 2944 f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>, 2945 XD, VEX_4V, VEX_LIG, VEX_WIG; 2946} 2947 2948// Square root. 2949defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SchedWriteFSqrt, UseAVX>, 2950 sse1_fp_unop_p<0x51, "sqrt", fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>, 2951 sse2_fp_unop_s<0x51, "sqrt", fsqrt, SchedWriteFSqrt64, UseAVX>, 2952 sse2_fp_unop_p<0x51, "sqrt", fsqrt, SchedWriteFSqrt64>; 2953 2954// Reciprocal approximations. Note that these typically require refinement 2955// in order to obtain suitable precision. 2956defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>, 2957 sse1_fp_unop_s_intr<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>, 2958 sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>; 2959defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>, 2960 sse1_fp_unop_s_intr<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>, 2961 sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>; 2962 2963// There is no f64 version of the reciprocal approximation instructions. 2964 2965multiclass scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix, SDNode Move, 2966 ValueType VT, Predicate BasePredicate> { 2967 let Predicates = [BasePredicate] in { 2968 def : Pat<(VT (Move VT:$dst, (scalar_to_vector 2969 (OpNode (extractelt VT:$src, 0))))), 2970 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>; 2971 } 2972 2973 // Repeat for AVX versions of the instructions. 2974 let Predicates = [UseAVX] in { 2975 def : Pat<(VT (Move VT:$dst, (scalar_to_vector 2976 (OpNode (extractelt VT:$src, 0))))), 2977 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>; 2978 } 2979} 2980 2981defm : scalar_unary_math_patterns<fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>; 2982defm : scalar_unary_math_patterns<fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>; 2983 2984multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix, 2985 SDNode Move, ValueType VT, 2986 Predicate BasePredicate> { 2987 let Predicates = [BasePredicate] in { 2988 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))), 2989 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>; 2990 } 2991 2992 // Repeat for AVX versions of the instructions. 
2993 let Predicates = [HasAVX] in { 2994 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))), 2995 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>; 2996 } 2997} 2998 2999defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss, 3000 v4f32, UseSSE1>; 3001defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss, 3002 v4f32, UseSSE1>; 3003 3004 3005//===----------------------------------------------------------------------===// 3006// SSE 1 & 2 - Non-temporal stores 3007//===----------------------------------------------------------------------===// 3008 3009let AddedComplexity = 400 in { // Prefer non-temporal versions 3010let Predicates = [HasAVX, NoVLX] in { 3011let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in { 3012def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs), 3013 (ins f128mem:$dst, VR128:$src), 3014 "movntps\t{$src, $dst|$dst, $src}", 3015 [(alignednontemporalstore (v4f32 VR128:$src), 3016 addr:$dst)]>, VEX, VEX_WIG; 3017def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs), 3018 (ins f128mem:$dst, VR128:$src), 3019 "movntpd\t{$src, $dst|$dst, $src}", 3020 [(alignednontemporalstore (v2f64 VR128:$src), 3021 addr:$dst)]>, VEX, VEX_WIG; 3022} // SchedRW 3023 3024let SchedRW = [SchedWriteFMoveLSNT.YMM.MR] in { 3025def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs), 3026 (ins f256mem:$dst, VR256:$src), 3027 "movntps\t{$src, $dst|$dst, $src}", 3028 [(alignednontemporalstore (v8f32 VR256:$src), 3029 addr:$dst)]>, VEX, VEX_L, VEX_WIG; 3030def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs), 3031 (ins f256mem:$dst, VR256:$src), 3032 "movntpd\t{$src, $dst|$dst, $src}", 3033 [(alignednontemporalstore (v4f64 VR256:$src), 3034 addr:$dst)]>, VEX, VEX_L, VEX_WIG; 3035} // SchedRW 3036 3037let ExeDomain = SSEPackedInt in { 3038def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs), 3039 (ins i128mem:$dst, VR128:$src), 3040 "movntdq\t{$src, $dst|$dst, $src}", 3041 [(alignednontemporalstore (v2i64 VR128:$src), 3042 addr:$dst)]>, VEX, VEX_WIG, 3043 Sched<[SchedWriteVecMoveLSNT.XMM.MR]>; 3044def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs), 3045 (ins i256mem:$dst, VR256:$src), 3046 "movntdq\t{$src, $dst|$dst, $src}", 3047 [(alignednontemporalstore (v4i64 VR256:$src), 3048 addr:$dst)]>, VEX, VEX_L, VEX_WIG, 3049 Sched<[SchedWriteVecMoveLSNT.YMM.MR]>; 3050} // ExeDomain 3051} // Predicates 3052 3053let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in { 3054def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 3055 "movntps\t{$src, $dst|$dst, $src}", 3056 [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>; 3057def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 3058 "movntpd\t{$src, $dst|$dst, $src}", 3059 [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>; 3060} // SchedRW 3061 3062let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLSNT.XMM.MR] in 3063def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 3064 "movntdq\t{$src, $dst|$dst, $src}", 3065 [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)]>; 3066 3067let SchedRW = [WriteStoreNT] in { 3068// There is no AVX form for instructions below this point 3069def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), 3070 "movnti{l}\t{$src, $dst|$dst, $src}", 3071 [(nontemporalstore (i32 GR32:$src), addr:$dst)]>, 3072 PS, Requires<[HasSSE2]>; 3073def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), 3074 "movnti{q}\t{$src, $dst|$dst, $src}", 3075 [(nontemporalstore (i64 GR64:$src), addr:$dst)]>, 3076 
                  PS, Requires<[HasSSE2]>;
} // SchedRW = [WriteStoreNT]

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;

  def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
            (MOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
            (MOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
            (MOVNTDQmr addr:$dst, VR128:$src)>;
}

} // AddedComplexity

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Prefetch and memory fence
//===----------------------------------------------------------------------===//

// Prefetch intrinsic.
let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in {
def PREFETCHT0  : I<0x18, MRM1m, (outs), (ins i8mem:$src),
    "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>, TB;
def PREFETCHT1  : I<0x18, MRM2m, (outs), (ins i8mem:$src),
    "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>, TB;
def PREFETCHT2  : I<0x18, MRM3m, (outs), (ins i8mem:$src),
    "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>, TB;
def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src),
    "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>, TB;
}

// FIXME: How should flush instruction be modeled?
let SchedRW = [WriteLoad] in {
// Flush cache
def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
               "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
               PS, Requires<[HasSSE2]>;
}

let SchedRW = [WriteNop] in {
// Pause. This "instruction" is encoded as "rep; nop", so even though it
// was introduced with SSE2, it's backward compatible.
def PAUSE : I<0x90, RawFrm, (outs), (ins),
              "pause", [(int_x86_sse2_pause)]>, OBXS;
}

let SchedRW = [WriteFence] in {
// Load, store, and memory fence
// TODO: As with mfence, we may want to ease the availability of sfence/lfence
// to include any 64-bit target.
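// NOTE: Illustrative usage sketch, not part of the instruction definitions.
// The non-temporal store and fence instructions modeled in this file are
// typically reached from C via the corresponding intrinsics; the helper
// below is hypothetical and assumes 'dst' is 16-byte aligned (any remainder
// elements beyond the last full vector are ignored for brevity):
//
//   #include <xmmintrin.h>
//
//   void fill_nt(float *dst, float v, int n) {
//     __m128 x = _mm_set1_ps(v);
//     for (int i = 0; i + 4 <= n; i += 4)
//       _mm_stream_ps(dst + i, x);   // non-temporal store -> MOVNTPSmr / VMOVNTPSmr
//     _mm_sfence();                  // order the streaming stores -> SFENCE
//   }
//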
3141def SFENCE : I<0xAE, MRM_F8, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>, 3142 PS, Requires<[HasSSE1]>; 3143def LFENCE : I<0xAE, MRM_E8, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>, 3144 PS, Requires<[HasSSE2]>; 3145def MFENCE : I<0xAE, MRM_F0, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>, 3146 PS, Requires<[HasMFence]>; 3147} // SchedRW 3148 3149def : Pat<(X86MFence), (MFENCE)>; 3150 3151//===----------------------------------------------------------------------===// 3152// SSE 1 & 2 - Load/Store XCSR register 3153//===----------------------------------------------------------------------===// 3154 3155let mayLoad=1, hasSideEffects=1 in 3156def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src), 3157 "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>, 3158 VEX, Sched<[WriteLDMXCSR]>, VEX_WIG; 3159let mayStore=1, hasSideEffects=1 in 3160def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), 3161 "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>, 3162 VEX, Sched<[WriteSTMXCSR]>, VEX_WIG; 3163 3164let mayLoad=1, hasSideEffects=1 in 3165def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src), 3166 "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>, 3167 TB, Sched<[WriteLDMXCSR]>; 3168let mayStore=1, hasSideEffects=1 in 3169def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst), 3170 "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>, 3171 TB, Sched<[WriteSTMXCSR]>; 3172 3173//===---------------------------------------------------------------------===// 3174// SSE2 - Move Aligned/Unaligned Packed Integer Instructions 3175//===---------------------------------------------------------------------===// 3176 3177let ExeDomain = SSEPackedInt in { // SSE integer instructions 3178 3179let hasSideEffects = 0 in { 3180def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3181 "movdqa\t{$src, $dst|$dst, $src}", []>, 3182 Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG; 3183def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3184 "movdqu\t{$src, $dst|$dst, $src}", []>, 3185 Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG; 3186def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 3187 "movdqa\t{$src, $dst|$dst, $src}", []>, 3188 Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG; 3189def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 3190 "movdqu\t{$src, $dst|$dst, $src}", []>, 3191 Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG; 3192} 3193 3194// For Disassembler 3195let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { 3196def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 3197 "movdqa\t{$src, $dst|$dst, $src}", []>, 3198 Sched<[SchedWriteVecMoveLS.XMM.RR]>, 3199 VEX, VEX_WIG, FoldGenData<"VMOVDQArr">; 3200def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), 3201 "movdqa\t{$src, $dst|$dst, $src}", []>, 3202 Sched<[SchedWriteVecMoveLS.YMM.RR]>, 3203 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQAYrr">; 3204def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 3205 "movdqu\t{$src, $dst|$dst, $src}", []>, 3206 Sched<[SchedWriteVecMoveLS.XMM.RR]>, 3207 VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">; 3208def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), 3209 "movdqu\t{$src, $dst|$dst, $src}", []>, 3210 Sched<[SchedWriteVecMoveLS.YMM.RR]>, 3211 VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQUYrr">; 3212} 3213 3214let canFoldAsLoad = 1, mayLoad = 1, 
isReMaterializable = 1, 3215 hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in { 3216def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3217 "movdqa\t{$src, $dst|$dst, $src}", 3218 [(set VR128:$dst, (alignedloadv2i64 addr:$src))]>, 3219 Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG; 3220def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), 3221 "movdqa\t{$src, $dst|$dst, $src}", []>, 3222 Sched<[SchedWriteVecMoveLS.YMM.RM]>, 3223 VEX, VEX_L, VEX_WIG; 3224def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3225 "vmovdqu\t{$src, $dst|$dst, $src}", 3226 [(set VR128:$dst, (loadv2i64 addr:$src))]>, 3227 Sched<[SchedWriteVecMoveLS.XMM.RM]>, 3228 XS, VEX, VEX_WIG; 3229def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), 3230 "vmovdqu\t{$src, $dst|$dst, $src}", []>, 3231 Sched<[SchedWriteVecMoveLS.YMM.RM]>, 3232 XS, VEX, VEX_L, VEX_WIG; 3233} 3234 3235let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in { 3236def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), 3237 (ins i128mem:$dst, VR128:$src), 3238 "movdqa\t{$src, $dst|$dst, $src}", 3239 [(alignedstore (v2i64 VR128:$src), addr:$dst)]>, 3240 Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_WIG; 3241def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs), 3242 (ins i256mem:$dst, VR256:$src), 3243 "movdqa\t{$src, $dst|$dst, $src}", []>, 3244 Sched<[SchedWriteVecMoveLS.YMM.MR]>, VEX, VEX_L, VEX_WIG; 3245def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), 3246 "vmovdqu\t{$src, $dst|$dst, $src}", 3247 [(store (v2i64 VR128:$src), addr:$dst)]>, 3248 Sched<[SchedWriteVecMoveLS.XMM.MR]>, XS, VEX, VEX_WIG; 3249def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), 3250 "vmovdqu\t{$src, $dst|$dst, $src}",[]>, 3251 Sched<[SchedWriteVecMoveLS.YMM.MR]>, XS, VEX, VEX_L, VEX_WIG; 3252} 3253 3254let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in { 3255let hasSideEffects = 0 in { 3256def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3257 "movdqa\t{$src, $dst|$dst, $src}", []>; 3258 3259def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3260 "movdqu\t{$src, $dst|$dst, $src}", []>, 3261 XS, Requires<[UseSSE2]>; 3262} 3263 3264// For Disassembler 3265let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { 3266def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 3267 "movdqa\t{$src, $dst|$dst, $src}", []>, 3268 FoldGenData<"MOVDQArr">; 3269 3270def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 3271 "movdqu\t{$src, $dst|$dst, $src}", []>, 3272 XS, Requires<[UseSSE2]>, FoldGenData<"MOVDQUrr">; 3273} 3274} // SchedRW 3275 3276let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, 3277 hasSideEffects = 0, SchedRW = [SchedWriteVecMoveLS.XMM.RM] in { 3278def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3279 "movdqa\t{$src, $dst|$dst, $src}", 3280 [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>; 3281def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3282 "movdqu\t{$src, $dst|$dst, $src}", 3283 [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>, 3284 XS, Requires<[UseSSE2]>; 3285} 3286 3287let mayStore = 1, hasSideEffects = 0, 3288 SchedRW = [SchedWriteVecMoveLS.XMM.MR] in { 3289def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), 3290 "movdqa\t{$src, $dst|$dst, $src}", 3291 [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>; 3292def MOVDQUmr : I<0x7F, 
MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), 3293 "movdqu\t{$src, $dst|$dst, $src}", 3294 [/*(store (v2i64 VR128:$src), addr:$dst)*/]>, 3295 XS, Requires<[UseSSE2]>; 3296} 3297 3298} // ExeDomain = SSEPackedInt 3299 3300// Reversed version with ".s" suffix for GAS compatibility. 3301def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}", 3302 (VMOVDQArr_REV VR128:$dst, VR128:$src), 0>; 3303def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}", 3304 (VMOVDQAYrr_REV VR256:$dst, VR256:$src), 0>; 3305def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}", 3306 (VMOVDQUrr_REV VR128:$dst, VR128:$src), 0>; 3307def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}", 3308 (VMOVDQUYrr_REV VR256:$dst, VR256:$src), 0>; 3309 3310// Reversed version with ".s" suffix for GAS compatibility. 3311def : InstAlias<"movdqa.s\t{$src, $dst|$dst, $src}", 3312 (MOVDQArr_REV VR128:$dst, VR128:$src), 0>; 3313def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}", 3314 (MOVDQUrr_REV VR128:$dst, VR128:$src), 0>; 3315 3316let Predicates = [HasAVX, NoVLX] in { 3317 // Additional patterns for other integer sizes. 3318 def : Pat<(alignedloadv4i32 addr:$src), 3319 (VMOVDQArm addr:$src)>; 3320 def : Pat<(alignedloadv8i16 addr:$src), 3321 (VMOVDQArm addr:$src)>; 3322 def : Pat<(alignedloadv16i8 addr:$src), 3323 (VMOVDQArm addr:$src)>; 3324 def : Pat<(loadv4i32 addr:$src), 3325 (VMOVDQUrm addr:$src)>; 3326 def : Pat<(loadv8i16 addr:$src), 3327 (VMOVDQUrm addr:$src)>; 3328 def : Pat<(loadv16i8 addr:$src), 3329 (VMOVDQUrm addr:$src)>; 3330 3331 def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst), 3332 (VMOVDQAmr addr:$dst, VR128:$src)>; 3333 def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst), 3334 (VMOVDQAmr addr:$dst, VR128:$src)>; 3335 def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst), 3336 (VMOVDQAmr addr:$dst, VR128:$src)>; 3337 def : Pat<(store (v4i32 VR128:$src), addr:$dst), 3338 (VMOVDQUmr addr:$dst, VR128:$src)>; 3339 def : Pat<(store (v8i16 VR128:$src), addr:$dst), 3340 (VMOVDQUmr addr:$dst, VR128:$src)>; 3341 def : Pat<(store (v16i8 VR128:$src), addr:$dst), 3342 (VMOVDQUmr addr:$dst, VR128:$src)>; 3343} 3344 3345//===---------------------------------------------------------------------===// 3346// SSE2 - Packed Integer Arithmetic Instructions 3347//===---------------------------------------------------------------------===// 3348 3349let ExeDomain = SSEPackedInt in { // SSE integer instructions 3350 3351/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types 3352multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode, 3353 ValueType DstVT, ValueType SrcVT, RegisterClass RC, 3354 PatFrag memop_frag, X86MemOperand x86memop, 3355 X86FoldableSchedWrite sched, bit Is2Addr = 1> { 3356 let isCommutable = 1 in 3357 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), 3358 (ins RC:$src1, RC:$src2), 3359 !if(Is2Addr, 3360 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3361 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3362 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>, 3363 Sched<[sched]>; 3364 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), 3365 (ins RC:$src1, x86memop:$src2), 3366 !if(Is2Addr, 3367 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3368 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3369 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), 3370 (memop_frag addr:$src2))))]>, 3371 Sched<[sched.Folded, sched.ReadAfterFold]>; 3372} 3373} // ExeDomain = SSEPackedInt 3374 3375defm PADDB : PDI_binop_all<0xFC, 
"paddb", add, v16i8, v32i8, 3376 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3377defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16, 3378 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3379defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32, 3380 SchedWriteVecALU, 1, NoVLX>; 3381defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64, 3382 SchedWriteVecALU, 1, NoVLX>; 3383defm PADDSB : PDI_binop_all<0xEC, "paddsb", saddsat, v16i8, v32i8, 3384 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3385defm PADDSW : PDI_binop_all<0xED, "paddsw", saddsat, v8i16, v16i16, 3386 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3387defm PADDUSB : PDI_binop_all<0xDC, "paddusb", uaddsat, v16i8, v32i8, 3388 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3389defm PADDUSW : PDI_binop_all<0xDD, "paddusw", uaddsat, v8i16, v16i16, 3390 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3391defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16, 3392 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>; 3393defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16, 3394 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>; 3395defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16, 3396 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>; 3397defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8, 3398 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3399defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16, 3400 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3401defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32, 3402 SchedWriteVecALU, 0, NoVLX>; 3403defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64, 3404 SchedWriteVecALU, 0, NoVLX>; 3405defm PSUBSB : PDI_binop_all<0xE8, "psubsb", ssubsat, v16i8, v32i8, 3406 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3407defm PSUBSW : PDI_binop_all<0xE9, "psubsw", ssubsat, v8i16, v16i16, 3408 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3409defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", usubsat, v16i8, v32i8, 3410 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3411defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", usubsat, v8i16, v16i16, 3412 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3413defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8, 3414 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3415defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16, 3416 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3417defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8, 3418 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3419defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16, 3420 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3421defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8, 3422 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3423defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16, 3424 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3425defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64, 3426 SchedWriteVecIMul, 1, NoVLX>; 3427 3428let Predicates = [HasAVX, NoVLX_Or_NoBWI] in 3429defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128, 3430 load, i128mem, SchedWriteVecIMul.XMM, 0>, 3431 VEX_4V, VEX_WIG; 3432 3433let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in 3434defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16, 3435 VR256, load, i256mem, SchedWriteVecIMul.YMM, 3436 0>, VEX_4V, VEX_L, VEX_WIG; 3437let Constraints = "$src1 = $dst" in 3438defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128, 3439 memop, i128mem, SchedWriteVecIMul.XMM>; 3440 3441let Predicates = [HasAVX, NoVLX_Or_NoBWI] in 3442defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, 
v2i64, v16i8, VR128, 3443 load, i128mem, SchedWritePSADBW.XMM, 0>, 3444 VEX_4V, VEX_WIG; 3445let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in 3446defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256, 3447 load, i256mem, SchedWritePSADBW.YMM, 0>, 3448 VEX_4V, VEX_L, VEX_WIG; 3449let Constraints = "$src1 = $dst" in 3450defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128, 3451 memop, i128mem, SchedWritePSADBW.XMM>; 3452 3453//===---------------------------------------------------------------------===// 3454// SSE2 - Packed Integer Logical Instructions 3455//===---------------------------------------------------------------------===// 3456 3457multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm, 3458 string OpcodeStr, SDNode OpNode, 3459 SDNode OpNode2, RegisterClass RC, 3460 X86FoldableSchedWrite sched, 3461 X86FoldableSchedWrite schedImm, 3462 ValueType DstVT, ValueType SrcVT, 3463 PatFrag ld_frag, bit Is2Addr = 1> { 3464 // src2 is always 128-bit 3465 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), 3466 (ins RC:$src1, VR128:$src2), 3467 !if(Is2Addr, 3468 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3469 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3470 [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))]>, 3471 Sched<[sched]>; 3472 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), 3473 (ins RC:$src1, i128mem:$src2), 3474 !if(Is2Addr, 3475 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3476 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3477 [(set RC:$dst, (DstVT (OpNode RC:$src1, 3478 (SrcVT (ld_frag addr:$src2)))))]>, 3479 Sched<[sched.Folded, sched.ReadAfterFold]>; 3480 def ri : PDIi8<opc2, ImmForm, (outs RC:$dst), 3481 (ins RC:$src1, u8imm:$src2), 3482 !if(Is2Addr, 3483 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3484 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3485 [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))]>, 3486 Sched<[schedImm]>; 3487} 3488 3489multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm, 3490 string OpcodeStr, SDNode OpNode, 3491 SDNode OpNode2, ValueType DstVT128, 3492 ValueType DstVT256, ValueType SrcVT, 3493 X86SchedWriteWidths sched, 3494 X86SchedWriteWidths schedImm, Predicate prd> { 3495let Predicates = [HasAVX, prd] in 3496 defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr), 3497 OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM, 3498 DstVT128, SrcVT, load, 0>, VEX_4V, VEX_WIG; 3499let Predicates = [HasAVX2, prd] in 3500 defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr), 3501 OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM, 3502 DstVT256, SrcVT, load, 0>, VEX_4V, VEX_L, 3503 VEX_WIG; 3504let Constraints = "$src1 = $dst" in 3505 defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2, 3506 VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT, 3507 memop>; 3508} 3509 3510multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr, 3511 SDNode OpNode, RegisterClass RC, ValueType VT, 3512 X86FoldableSchedWrite sched, bit Is2Addr = 1> { 3513 def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2), 3514 !if(Is2Addr, 3515 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3516 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3517 [(set RC:$dst, (VT (OpNode RC:$src1, (i8 imm:$src2))))]>, 3518 Sched<[sched]>; 3519} 3520 3521multiclass PDI_binop_ri_all<bits<8> opc, Format 
ImmForm, string OpcodeStr, 3522 SDNode OpNode, X86SchedWriteWidths sched> { 3523let Predicates = [HasAVX, NoVLX_Or_NoBWI] in 3524 defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode, 3525 VR128, v16i8, sched.XMM, 0>, VEX_4V, VEX_WIG; 3526let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in 3527 defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode, 3528 VR256, v32i8, sched.YMM, 0>, 3529 VEX_4V, VEX_L, VEX_WIG; 3530let Constraints = "$src1 = $dst" in 3531 defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8, 3532 sched.XMM>; 3533} 3534 3535let ExeDomain = SSEPackedInt in { 3536 defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli, 3537 v8i16, v16i16, v8i16, SchedWriteVecShift, 3538 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; 3539 defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli, 3540 v4i32, v8i32, v4i32, SchedWriteVecShift, 3541 SchedWriteVecShiftImm, NoVLX>; 3542 defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli, 3543 v2i64, v4i64, v2i64, SchedWriteVecShift, 3544 SchedWriteVecShiftImm, NoVLX>; 3545 3546 defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli, 3547 v8i16, v16i16, v8i16, SchedWriteVecShift, 3548 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; 3549 defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli, 3550 v4i32, v8i32, v4i32, SchedWriteVecShift, 3551 SchedWriteVecShiftImm, NoVLX>; 3552 defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli, 3553 v2i64, v4i64, v2i64, SchedWriteVecShift, 3554 SchedWriteVecShiftImm, NoVLX>; 3555 3556 defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai, 3557 v8i16, v16i16, v8i16, SchedWriteVecShift, 3558 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; 3559 defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai, 3560 v4i32, v8i32, v4i32, SchedWriteVecShift, 3561 SchedWriteVecShiftImm, NoVLX>; 3562 3563 defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq, 3564 SchedWriteShuffle>; 3565 defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq, 3566 SchedWriteShuffle>; 3567} // ExeDomain = SSEPackedInt 3568 3569//===---------------------------------------------------------------------===// 3570// SSE2 - Packed Integer Comparison Instructions 3571//===---------------------------------------------------------------------===// 3572 3573defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8, 3574 SchedWriteVecALU, 1, TruePredicate>; 3575defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16, 3576 SchedWriteVecALU, 1, TruePredicate>; 3577defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32, 3578 SchedWriteVecALU, 1, TruePredicate>; 3579defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8, 3580 SchedWriteVecALU, 0, TruePredicate>; 3581defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16, 3582 SchedWriteVecALU, 0, TruePredicate>; 3583defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32, 3584 SchedWriteVecALU, 0, TruePredicate>; 3585 3586//===---------------------------------------------------------------------===// 3587// SSE2 - Packed Integer Shuffle Instructions 3588//===---------------------------------------------------------------------===// 3589 3590let ExeDomain = SSEPackedInt in { 3591multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256, 3592 SDNode OpNode, X86SchedWriteWidths 
sched, 3593 Predicate prd> { 3594let Predicates = [HasAVX, prd] in { 3595 def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst), 3596 (ins VR128:$src1, u8imm:$src2), 3597 !strconcat("v", OpcodeStr, 3598 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3599 [(set VR128:$dst, 3600 (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))]>, 3601 VEX, Sched<[sched.XMM]>, VEX_WIG; 3602 def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst), 3603 (ins i128mem:$src1, u8imm:$src2), 3604 !strconcat("v", OpcodeStr, 3605 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3606 [(set VR128:$dst, 3607 (vt128 (OpNode (load addr:$src1), 3608 (i8 imm:$src2))))]>, VEX, 3609 Sched<[sched.XMM.Folded]>, VEX_WIG; 3610} 3611 3612let Predicates = [HasAVX2, prd] in { 3613 def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst), 3614 (ins VR256:$src1, u8imm:$src2), 3615 !strconcat("v", OpcodeStr, 3616 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3617 [(set VR256:$dst, 3618 (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))]>, 3619 VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG; 3620 def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst), 3621 (ins i256mem:$src1, u8imm:$src2), 3622 !strconcat("v", OpcodeStr, 3623 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3624 [(set VR256:$dst, 3625 (vt256 (OpNode (load addr:$src1), 3626 (i8 imm:$src2))))]>, VEX, VEX_L, 3627 Sched<[sched.YMM.Folded]>, VEX_WIG; 3628} 3629 3630let Predicates = [UseSSE2] in { 3631 def ri : Ii8<0x70, MRMSrcReg, 3632 (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), 3633 !strconcat(OpcodeStr, 3634 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3635 [(set VR128:$dst, 3636 (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))]>, 3637 Sched<[sched.XMM]>; 3638 def mi : Ii8<0x70, MRMSrcMem, 3639 (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2), 3640 !strconcat(OpcodeStr, 3641 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3642 [(set VR128:$dst, 3643 (vt128 (OpNode (memop addr:$src1), 3644 (i8 imm:$src2))))]>, 3645 Sched<[sched.XMM.Folded]>; 3646} 3647} 3648} // ExeDomain = SSEPackedInt 3649 3650defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd, 3651 SchedWriteShuffle, NoVLX>, PD; 3652defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw, 3653 SchedWriteShuffle, NoVLX_Or_NoBWI>, XS; 3654defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw, 3655 SchedWriteShuffle, NoVLX_Or_NoBWI>, XD; 3656 3657//===---------------------------------------------------------------------===// 3658// Packed Integer Pack Instructions (SSE & AVX) 3659//===---------------------------------------------------------------------===// 3660 3661let ExeDomain = SSEPackedInt in { 3662multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, 3663 ValueType ArgVT, SDNode OpNode, RegisterClass RC, 3664 X86MemOperand x86memop, X86FoldableSchedWrite sched, 3665 PatFrag ld_frag, bit Is2Addr = 1> { 3666 def rr : PDI<opc, MRMSrcReg, 3667 (outs RC:$dst), (ins RC:$src1, RC:$src2), 3668 !if(Is2Addr, 3669 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3670 !strconcat(OpcodeStr, 3671 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3672 [(set RC:$dst, 3673 (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>, 3674 Sched<[sched]>; 3675 def rm : PDI<opc, MRMSrcMem, 3676 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 3677 !if(Is2Addr, 3678 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3679 !strconcat(OpcodeStr, 3680 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3681 [(set RC:$dst, 3682 (OutVT (OpNode (ArgVT RC:$src1), 3683 (ld_frag addr:$src2))))]>, 3684 Sched<[sched.Folded, 
sched.ReadAfterFold]>; 3685} 3686 3687multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, 3688 ValueType ArgVT, SDNode OpNode, RegisterClass RC, 3689 X86MemOperand x86memop, X86FoldableSchedWrite sched, 3690 PatFrag ld_frag, bit Is2Addr = 1> { 3691 def rr : SS48I<opc, MRMSrcReg, 3692 (outs RC:$dst), (ins RC:$src1, RC:$src2), 3693 !if(Is2Addr, 3694 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3695 !strconcat(OpcodeStr, 3696 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3697 [(set RC:$dst, 3698 (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>, 3699 Sched<[sched]>; 3700 def rm : SS48I<opc, MRMSrcMem, 3701 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 3702 !if(Is2Addr, 3703 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3704 !strconcat(OpcodeStr, 3705 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3706 [(set RC:$dst, 3707 (OutVT (OpNode (ArgVT RC:$src1), 3708 (ld_frag addr:$src2))))]>, 3709 Sched<[sched.Folded, sched.ReadAfterFold]>; 3710} 3711 3712let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 3713 defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128, 3714 i128mem, SchedWriteShuffle.XMM, load, 0>, 3715 VEX_4V, VEX_WIG; 3716 defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128, 3717 i128mem, SchedWriteShuffle.XMM, load, 0>, 3718 VEX_4V, VEX_WIG; 3719 3720 defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128, 3721 i128mem, SchedWriteShuffle.XMM, load, 0>, 3722 VEX_4V, VEX_WIG; 3723 defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128, 3724 i128mem, SchedWriteShuffle.XMM, load, 0>, 3725 VEX_4V; 3726} 3727 3728let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 3729 defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256, 3730 i256mem, SchedWriteShuffle.YMM, load, 0>, 3731 VEX_4V, VEX_L, VEX_WIG; 3732 defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256, 3733 i256mem, SchedWriteShuffle.YMM, load, 0>, 3734 VEX_4V, VEX_L, VEX_WIG; 3735 3736 defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256, 3737 i256mem, SchedWriteShuffle.YMM, load, 0>, 3738 VEX_4V, VEX_L, VEX_WIG; 3739 defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256, 3740 i256mem, SchedWriteShuffle.YMM, load, 0>, 3741 VEX_4V, VEX_L; 3742} 3743 3744let Constraints = "$src1 = $dst" in { 3745 defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128, 3746 i128mem, SchedWriteShuffle.XMM, memop>; 3747 defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128, 3748 i128mem, SchedWriteShuffle.XMM, memop>; 3749 3750 defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128, 3751 i128mem, SchedWriteShuffle.XMM, memop>; 3752 3753 defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128, 3754 i128mem, SchedWriteShuffle.XMM, memop>; 3755} 3756} // ExeDomain = SSEPackedInt 3757 3758//===---------------------------------------------------------------------===// 3759// SSE2 - Packed Integer Unpack Instructions 3760//===---------------------------------------------------------------------===// 3761 3762let ExeDomain = SSEPackedInt in { 3763multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt, 3764 SDNode OpNode, RegisterClass RC, X86MemOperand x86memop, 3765 X86FoldableSchedWrite sched, PatFrag ld_frag, 3766 bit Is2Addr = 1> { 3767 def rr : PDI<opc, MRMSrcReg, 3768 (outs RC:$dst), (ins RC:$src1, RC:$src2), 3769 !if(Is2Addr, 3770 
!strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), 3771 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3772 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>, 3773 Sched<[sched]>; 3774 def rm : PDI<opc, MRMSrcMem, 3775 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 3776 !if(Is2Addr, 3777 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), 3778 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3779 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>, 3780 Sched<[sched.Folded, sched.ReadAfterFold]>; 3781} 3782 3783let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 3784 defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128, 3785 i128mem, SchedWriteShuffle.XMM, load, 0>, 3786 VEX_4V, VEX_WIG; 3787 defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128, 3788 i128mem, SchedWriteShuffle.XMM, load, 0>, 3789 VEX_4V, VEX_WIG; 3790 defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128, 3791 i128mem, SchedWriteShuffle.XMM, load, 0>, 3792 VEX_4V, VEX_WIG; 3793 defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128, 3794 i128mem, SchedWriteShuffle.XMM, load, 0>, 3795 VEX_4V, VEX_WIG; 3796} 3797 3798let Predicates = [HasAVX, NoVLX] in { 3799 defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128, 3800 i128mem, SchedWriteShuffle.XMM, load, 0>, 3801 VEX_4V, VEX_WIG; 3802 defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128, 3803 i128mem, SchedWriteShuffle.XMM, load, 0>, 3804 VEX_4V, VEX_WIG; 3805 defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128, 3806 i128mem, SchedWriteShuffle.XMM, load, 0>, 3807 VEX_4V, VEX_WIG; 3808 defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128, 3809 i128mem, SchedWriteShuffle.XMM, load, 0>, 3810 VEX_4V, VEX_WIG; 3811} 3812 3813let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 3814 defm VPUNPCKLBWY : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256, 3815 i256mem, SchedWriteShuffle.YMM, load, 0>, 3816 VEX_4V, VEX_L, VEX_WIG; 3817 defm VPUNPCKLWDY : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256, 3818 i256mem, SchedWriteShuffle.YMM, load, 0>, 3819 VEX_4V, VEX_L, VEX_WIG; 3820 defm VPUNPCKHBWY : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256, 3821 i256mem, SchedWriteShuffle.YMM, load, 0>, 3822 VEX_4V, VEX_L, VEX_WIG; 3823 defm VPUNPCKHWDY : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256, 3824 i256mem, SchedWriteShuffle.YMM, load, 0>, 3825 VEX_4V, VEX_L, VEX_WIG; 3826} 3827 3828let Predicates = [HasAVX2, NoVLX] in { 3829 defm VPUNPCKLDQY : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256, 3830 i256mem, SchedWriteShuffle.YMM, load, 0>, 3831 VEX_4V, VEX_L, VEX_WIG; 3832 defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256, 3833 i256mem, SchedWriteShuffle.YMM, load, 0>, 3834 VEX_4V, VEX_L, VEX_WIG; 3835 defm VPUNPCKHDQY : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256, 3836 i256mem, SchedWriteShuffle.YMM, load, 0>, 3837 VEX_4V, VEX_L, VEX_WIG; 3838 defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256, 3839 i256mem, SchedWriteShuffle.YMM, load, 0>, 3840 VEX_4V, VEX_L, VEX_WIG; 3841} 3842 3843let Constraints = "$src1 = $dst" in { 3844 defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128, 3845 i128mem, SchedWriteShuffle.XMM, memop>; 3846 defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128, 3847 i128mem, SchedWriteShuffle.XMM, memop>; 3848 
defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128, 3849 i128mem, SchedWriteShuffle.XMM, memop>; 3850 defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128, 3851 i128mem, SchedWriteShuffle.XMM, memop>; 3852 3853 defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128, 3854 i128mem, SchedWriteShuffle.XMM, memop>; 3855 defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128, 3856 i128mem, SchedWriteShuffle.XMM, memop>; 3857 defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128, 3858 i128mem, SchedWriteShuffle.XMM, memop>; 3859 defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128, 3860 i128mem, SchedWriteShuffle.XMM, memop>; 3861} 3862} // ExeDomain = SSEPackedInt 3863 3864//===---------------------------------------------------------------------===// 3865// SSE2 - Packed Integer Extract and Insert 3866//===---------------------------------------------------------------------===// 3867 3868let ExeDomain = SSEPackedInt in { 3869multiclass sse2_pinsrw<bit Is2Addr = 1> { 3870 def rr : Ii8<0xC4, MRMSrcReg, 3871 (outs VR128:$dst), (ins VR128:$src1, 3872 GR32orGR64:$src2, u8imm:$src3), 3873 !if(Is2Addr, 3874 "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", 3875 "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 3876 [(set VR128:$dst, 3877 (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))]>, 3878 Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>; 3879 def rm : Ii8<0xC4, MRMSrcMem, 3880 (outs VR128:$dst), (ins VR128:$src1, 3881 i16mem:$src2, u8imm:$src3), 3882 !if(Is2Addr, 3883 "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", 3884 "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 3885 [(set VR128:$dst, 3886 (X86pinsrw VR128:$src1, (extloadi16 addr:$src2), 3887 imm:$src3))]>, 3888 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>; 3889} 3890 3891// Extract 3892let Predicates = [HasAVX, NoBWI] in 3893def VPEXTRWrr : Ii8<0xC5, MRMSrcReg, 3894 (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2), 3895 "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", 3896 [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), 3897 imm:$src2))]>, 3898 PD, VEX, VEX_WIG, Sched<[WriteVecExtract]>; 3899def PEXTRWrr : PDIi8<0xC5, MRMSrcReg, 3900 (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2), 3901 "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", 3902 [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), 3903 imm:$src2))]>, 3904 Sched<[WriteVecExtract]>; 3905 3906// Insert 3907let Predicates = [HasAVX, NoBWI] in 3908defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V, VEX_WIG; 3909 3910let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in 3911defm PINSRW : sse2_pinsrw, PD; 3912 3913} // ExeDomain = SSEPackedInt 3914 3915//===---------------------------------------------------------------------===// 3916// SSE2 - Packed Mask Creation 3917//===---------------------------------------------------------------------===// 3918 3919let ExeDomain = SSEPackedInt in { 3920 3921def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), 3922 (ins VR128:$src), 3923 "pmovmskb\t{$src, $dst|$dst, $src}", 3924 [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>, 3925 Sched<[WriteVecMOVMSK]>, VEX, VEX_WIG; 3926 3927let Predicates = [HasAVX2] in { 3928def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), 3929 (ins VR256:$src), 3930 "pmovmskb\t{$src, $dst|$dst, $src}", 3931 [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>, 3932 
Sched<[WriteVecMOVMSKY]>, VEX, VEX_L, VEX_WIG; 3933} 3934 3935def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src), 3936 "pmovmskb\t{$src, $dst|$dst, $src}", 3937 [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>, 3938 Sched<[WriteVecMOVMSK]>; 3939 3940} // ExeDomain = SSEPackedInt 3941 3942//===---------------------------------------------------------------------===// 3943// SSE2 - Conditional Store 3944//===---------------------------------------------------------------------===// 3945 3946let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in { 3947let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in 3948def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs), 3949 (ins VR128:$src, VR128:$mask), 3950 "maskmovdqu\t{$mask, $src|$src, $mask}", 3951 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>, 3952 VEX, VEX_WIG; 3953let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in 3954def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs), 3955 (ins VR128:$src, VR128:$mask), 3956 "maskmovdqu\t{$mask, $src|$src, $mask}", 3957 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>, 3958 VEX, VEX_WIG; 3959 3960let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in 3961def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), 3962 "maskmovdqu\t{$mask, $src|$src, $mask}", 3963 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>; 3964let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in 3965def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), 3966 "maskmovdqu\t{$mask, $src|$src, $mask}", 3967 [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>; 3968 3969} // ExeDomain = SSEPackedInt 3970 3971//===---------------------------------------------------------------------===// 3972// SSE2 - Move Doubleword/Quadword 3973//===---------------------------------------------------------------------===// 3974 3975//===---------------------------------------------------------------------===// 3976// Move Int Doubleword to Packed Double Int 3977// 3978let ExeDomain = SSEPackedInt in { 3979def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), 3980 "movd\t{$src, $dst|$dst, $src}", 3981 [(set VR128:$dst, 3982 (v4i32 (scalar_to_vector GR32:$src)))]>, 3983 VEX, Sched<[WriteVecMoveFromGpr]>; 3984def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), 3985 "movd\t{$src, $dst|$dst, $src}", 3986 [(set VR128:$dst, 3987 (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>, 3988 VEX, Sched<[WriteVecLoad]>; 3989def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), 3990 "movq\t{$src, $dst|$dst, $src}", 3991 [(set VR128:$dst, 3992 (v2i64 (scalar_to_vector GR64:$src)))]>, 3993 VEX, Sched<[WriteVecMoveFromGpr]>; 3994let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in 3995def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 3996 "movq\t{$src, $dst|$dst, $src}", []>, 3997 VEX, Sched<[WriteVecLoad]>; 3998let isCodeGenOnly = 1 in 3999def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), 4000 "movq\t{$src, $dst|$dst, $src}", 4001 [(set FR64:$dst, (bitconvert GR64:$src))]>, 4002 VEX, Sched<[WriteVecMoveFromGpr]>; 4003 4004def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), 4005 "movd\t{$src, $dst|$dst, $src}", 4006 [(set VR128:$dst, 4007 (v4i32 (scalar_to_vector GR32:$src)))]>, 4008 Sched<[WriteVecMoveFromGpr]>; 4009def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs 
VR128:$dst), (ins i32mem:$src), 4010 "movd\t{$src, $dst|$dst, $src}", 4011 [(set VR128:$dst, 4012 (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>, 4013 Sched<[WriteVecLoad]>; 4014def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), 4015 "movq\t{$src, $dst|$dst, $src}", 4016 [(set VR128:$dst, 4017 (v2i64 (scalar_to_vector GR64:$src)))]>, 4018 Sched<[WriteVecMoveFromGpr]>; 4019let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in 4020def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 4021 "movq\t{$src, $dst|$dst, $src}", []>, 4022 Sched<[WriteVecLoad]>; 4023let isCodeGenOnly = 1 in 4024def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), 4025 "movq\t{$src, $dst|$dst, $src}", 4026 [(set FR64:$dst, (bitconvert GR64:$src))]>, 4027 Sched<[WriteVecMoveFromGpr]>; 4028} // ExeDomain = SSEPackedInt 4029 4030//===---------------------------------------------------------------------===// 4031// Move Int Doubleword to Single Scalar 4032// 4033let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { 4034 def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), 4035 "movd\t{$src, $dst|$dst, $src}", 4036 [(set FR32:$dst, (bitconvert GR32:$src))]>, 4037 VEX, Sched<[WriteVecMoveFromGpr]>; 4038 4039 def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), 4040 "movd\t{$src, $dst|$dst, $src}", 4041 [(set FR32:$dst, (bitconvert GR32:$src))]>, 4042 Sched<[WriteVecMoveFromGpr]>; 4043 4044} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 4045 4046//===---------------------------------------------------------------------===// 4047// Move Packed Doubleword Int to Packed Double Int 4048// 4049let ExeDomain = SSEPackedInt in { 4050def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), 4051 "movd\t{$src, $dst|$dst, $src}", 4052 [(set GR32:$dst, (extractelt (v4i32 VR128:$src), 4053 (iPTR 0)))]>, VEX, 4054 Sched<[WriteVecMoveToGpr]>; 4055def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs), 4056 (ins i32mem:$dst, VR128:$src), 4057 "movd\t{$src, $dst|$dst, $src}", 4058 [(store (i32 (extractelt (v4i32 VR128:$src), 4059 (iPTR 0))), addr:$dst)]>, 4060 VEX, Sched<[WriteVecStore]>; 4061def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), 4062 "movd\t{$src, $dst|$dst, $src}", 4063 [(set GR32:$dst, (extractelt (v4i32 VR128:$src), 4064 (iPTR 0)))]>, 4065 Sched<[WriteVecMoveToGpr]>; 4066def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), 4067 "movd\t{$src, $dst|$dst, $src}", 4068 [(store (i32 (extractelt (v4i32 VR128:$src), 4069 (iPTR 0))), addr:$dst)]>, 4070 Sched<[WriteVecStore]>; 4071} // ExeDomain = SSEPackedInt 4072 4073//===---------------------------------------------------------------------===// 4074// Move Packed Doubleword Int first element to Doubleword Int 4075// 4076let ExeDomain = SSEPackedInt in { 4077let SchedRW = [WriteVecMoveToGpr] in { 4078def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), 4079 "movq\t{$src, $dst|$dst, $src}", 4080 [(set GR64:$dst, (extractelt (v2i64 VR128:$src), 4081 (iPTR 0)))]>, 4082 VEX; 4083 4084def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), 4085 "movq\t{$src, $dst|$dst, $src}", 4086 [(set GR64:$dst, (extractelt (v2i64 VR128:$src), 4087 (iPTR 0)))]>; 4088} //SchedRW 4089 4090let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in 4091def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs), 4092 (ins i64mem:$dst, 
VR128:$src), 4093 "movq\t{$src, $dst|$dst, $src}", []>, 4094 VEX, Sched<[WriteVecStore]>; 4095let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in 4096def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), 4097 "movq\t{$src, $dst|$dst, $src}", []>, 4098 Sched<[WriteVecStore]>; 4099} // ExeDomain = SSEPackedInt 4100 4101//===---------------------------------------------------------------------===// 4102// Bitcast FR64 <-> GR64 4103// 4104let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { 4105 def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), 4106 "movq\t{$src, $dst|$dst, $src}", 4107 [(set GR64:$dst, (bitconvert FR64:$src))]>, 4108 VEX, Sched<[WriteVecMoveToGpr]>; 4109 4110 def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), 4111 "movq\t{$src, $dst|$dst, $src}", 4112 [(set GR64:$dst, (bitconvert FR64:$src))]>, 4113 Sched<[WriteVecMoveToGpr]>; 4114} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 4115 4116//===---------------------------------------------------------------------===// 4117// Move Scalar Single to Double Int 4118// 4119let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { 4120 def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), 4121 "movd\t{$src, $dst|$dst, $src}", 4122 [(set GR32:$dst, (bitconvert FR32:$src))]>, 4123 VEX, Sched<[WriteVecMoveToGpr]>; 4124 def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), 4125 "movd\t{$src, $dst|$dst, $src}", 4126 [(set GR32:$dst, (bitconvert FR32:$src))]>, 4127 Sched<[WriteVecMoveToGpr]>; 4128} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 4129 4130let Predicates = [UseAVX] in { 4131 def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), 4132 (VMOVDI2PDIrr GR32:$src)>; 4133 4134 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))), 4135 (VMOV64toPQIrr GR64:$src)>; 4136 4137 // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part. 4138 // These instructions also write zeros in the high part of a 256-bit register. 4139 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))), 4140 (VMOVDI2PDIrm addr:$src)>; 4141 def : Pat<(v4i32 (X86vzload32 addr:$src)), 4142 (VMOVDI2PDIrm addr:$src)>; 4143 def : Pat<(v8i32 (X86vzload32 addr:$src)), 4144 (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>; 4145} 4146 4147let Predicates = [UseSSE2] in { 4148 def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), 4149 (MOVDI2PDIrr GR32:$src)>; 4150 4151 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))), 4152 (MOV64toPQIrr GR64:$src)>; 4153 def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))), 4154 (MOVDI2PDIrm addr:$src)>; 4155 def : Pat<(v4i32 (X86vzload32 addr:$src)), 4156 (MOVDI2PDIrm addr:$src)>; 4157} 4158 4159// Before the MC layer of LLVM existed, clang emitted "movd" assembly instead of 4160// "movq" due to MacOS parsing limitation. In order to parse old assembly, we add 4161// these aliases. 4162def : InstAlias<"movd\t{$src, $dst|$dst, $src}", 4163 (MOV64toPQIrr VR128:$dst, GR64:$src), 0>; 4164def : InstAlias<"movd\t{$src, $dst|$dst, $src}", 4165 (MOVPQIto64rr GR64:$dst, VR128:$src), 0>; 4166// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX. 
4167def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}", 4168 (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>; 4169def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}", 4170 (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>; 4171 4172//===---------------------------------------------------------------------===// 4173// SSE2 - Move Quadword 4174//===---------------------------------------------------------------------===// 4175 4176//===---------------------------------------------------------------------===// 4177// Move Quadword Int to Packed Quadword Int 4178// 4179 4180let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in { 4181def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 4182 "vmovq\t{$src, $dst|$dst, $src}", 4183 [(set VR128:$dst, 4184 (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS, 4185 VEX, Requires<[UseAVX]>, VEX_WIG; 4186def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 4187 "movq\t{$src, $dst|$dst, $src}", 4188 [(set VR128:$dst, 4189 (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, 4190 XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix 4191} // ExeDomain, SchedRW 4192 4193//===---------------------------------------------------------------------===// 4194// Move Packed Quadword Int to Quadword Int 4195// 4196let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in { 4197def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), 4198 "movq\t{$src, $dst|$dst, $src}", 4199 [(store (i64 (extractelt (v2i64 VR128:$src), 4200 (iPTR 0))), addr:$dst)]>, 4201 VEX, VEX_WIG; 4202def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), 4203 "movq\t{$src, $dst|$dst, $src}", 4204 [(store (i64 (extractelt (v2i64 VR128:$src), 4205 (iPTR 0))), addr:$dst)]>; 4206} // ExeDomain, SchedRW 4207 4208// For disassembler only 4209let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, 4210 SchedRW = [SchedWriteVecLogic.XMM] in { 4211def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 4212 "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_WIG; 4213def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 4214 "movq\t{$src, $dst|$dst, $src}", []>; 4215} 4216 4217def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}", 4218 (VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>; 4219def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}", 4220 (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>; 4221 4222let Predicates = [UseAVX] in { 4223 def : Pat<(v2i64 (X86vzload64 addr:$src)), 4224 (VMOVQI2PQIrm addr:$src)>; 4225 def : Pat<(v4i64 (X86vzload64 addr:$src)), 4226 (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>; 4227 4228 def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst), 4229 (VMOVPQI2QImr addr:$dst, VR128:$src)>; 4230} 4231 4232let Predicates = [UseSSE2] in { 4233 def : Pat<(v2i64 (X86vzload64 addr:$src)), (MOVQI2PQIrm addr:$src)>; 4234 4235 def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst), 4236 (MOVPQI2QImr addr:$dst, VR128:$src)>; 4237} 4238 4239//===---------------------------------------------------------------------===// 4240// Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in 4241// IA32 document. movq xmm1, xmm2 does clear the high bits. 
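// Illustrative only: the zero-extending register-to-register form defined
// below corresponds to the _mm_move_epi64 intrinsic. A minimal sketch
// (helper name is hypothetical; assumes an SSE2-capable host compiler):
//
//   #include <emmintrin.h>
//
//   __m128i keep_low_qword(__m128i v) {
//     // movq xmm, xmm: copies the low 64 bits and zeroes bits 127:64.
//     return _mm_move_epi64(v);
//   }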
4242// 4243let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in { 4244def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 4245 "vmovq\t{$src, $dst|$dst, $src}", 4246 [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>, 4247 XS, VEX, Requires<[UseAVX]>, VEX_WIG; 4248def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 4249 "movq\t{$src, $dst|$dst, $src}", 4250 [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>, 4251 XS, Requires<[UseSSE2]>; 4252} // ExeDomain, SchedRW 4253 4254let Predicates = [UseAVX] in { 4255 def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), 4256 (VMOVZPQILo2PQIrr VR128:$src)>; 4257} 4258let Predicates = [UseSSE2] in { 4259 def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), 4260 (MOVZPQILo2PQIrr VR128:$src)>; 4261} 4262 4263let Predicates = [UseAVX] in { 4264 def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))), 4265 (SUBREG_TO_REG (i32 0), 4266 (v2f64 (VMOVZPQILo2PQIrr 4267 (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))), 4268 sub_xmm)>; 4269 def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))), 4270 (SUBREG_TO_REG (i32 0), 4271 (v2i64 (VMOVZPQILo2PQIrr 4272 (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))), 4273 sub_xmm)>; 4274} 4275 4276//===---------------------------------------------------------------------===// 4277// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP 4278//===---------------------------------------------------------------------===// 4279 4280multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr, 4281 ValueType vt, RegisterClass RC, PatFrag mem_frag, 4282 X86MemOperand x86memop, X86FoldableSchedWrite sched> { 4283def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src), 4284 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4285 [(set RC:$dst, (vt (OpNode RC:$src)))]>, 4286 Sched<[sched]>; 4287def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), 4288 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4289 [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>, 4290 Sched<[sched.Folded]>; 4291} 4292 4293let Predicates = [HasAVX, NoVLX] in { 4294 defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", 4295 v4f32, VR128, loadv4f32, f128mem, 4296 SchedWriteFShuffle.XMM>, VEX, VEX_WIG; 4297 defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", 4298 v4f32, VR128, loadv4f32, f128mem, 4299 SchedWriteFShuffle.XMM>, VEX, VEX_WIG; 4300 defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", 4301 v8f32, VR256, loadv8f32, f256mem, 4302 SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG; 4303 defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", 4304 v8f32, VR256, loadv8f32, f256mem, 4305 SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG; 4306} 4307defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128, 4308 memopv4f32, f128mem, SchedWriteFShuffle.XMM>; 4309defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128, 4310 memopv4f32, f128mem, SchedWriteFShuffle.XMM>; 4311 4312let Predicates = [HasAVX, NoVLX] in { 4313 def : Pat<(v4i32 (X86Movshdup VR128:$src)), 4314 (VMOVSHDUPrr VR128:$src)>; 4315 def : Pat<(v4i32 (X86Movshdup (load addr:$src))), 4316 (VMOVSHDUPrm addr:$src)>; 4317 def : Pat<(v4i32 (X86Movsldup VR128:$src)), 4318 (VMOVSLDUPrr VR128:$src)>; 4319 def : Pat<(v4i32 (X86Movsldup (load addr:$src))), 4320 (VMOVSLDUPrm addr:$src)>; 4321 def : Pat<(v8i32 (X86Movshdup VR256:$src)), 4322 (VMOVSHDUPYrr VR256:$src)>; 4323 def : Pat<(v8i32 (X86Movshdup (load 
addr:$src))), 4324 (VMOVSHDUPYrm addr:$src)>; 4325 def : Pat<(v8i32 (X86Movsldup VR256:$src)), 4326 (VMOVSLDUPYrr VR256:$src)>; 4327 def : Pat<(v8i32 (X86Movsldup (load addr:$src))), 4328 (VMOVSLDUPYrm addr:$src)>; 4329} 4330 4331let Predicates = [UseSSE3] in { 4332 def : Pat<(v4i32 (X86Movshdup VR128:$src)), 4333 (MOVSHDUPrr VR128:$src)>; 4334 def : Pat<(v4i32 (X86Movshdup (memop addr:$src))), 4335 (MOVSHDUPrm addr:$src)>; 4336 def : Pat<(v4i32 (X86Movsldup VR128:$src)), 4337 (MOVSLDUPrr VR128:$src)>; 4338 def : Pat<(v4i32 (X86Movsldup (memop addr:$src))), 4339 (MOVSLDUPrm addr:$src)>; 4340} 4341 4342//===---------------------------------------------------------------------===// 4343// SSE3 - Replicate Double FP - MOVDDUP 4344//===---------------------------------------------------------------------===// 4345 4346multiclass sse3_replicate_dfp<string OpcodeStr, X86SchedWriteWidths sched> { 4347def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 4348 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4349 [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))]>, 4350 Sched<[sched.XMM]>; 4351def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), 4352 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4353 [(set VR128:$dst, 4354 (v2f64 (X86Movddup 4355 (scalar_to_vector (loadf64 addr:$src)))))]>, 4356 Sched<[sched.XMM.Folded]>; 4357} 4358 4359// FIXME: Merge with above classes when there are patterns for the ymm version 4360multiclass sse3_replicate_dfp_y<string OpcodeStr, X86SchedWriteWidths sched> { 4361def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 4362 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4363 [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>, 4364 Sched<[sched.YMM]>; 4365def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 4366 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4367 [(set VR256:$dst, 4368 (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>, 4369 Sched<[sched.YMM.Folded]>; 4370} 4371 4372let Predicates = [HasAVX, NoVLX] in { 4373 defm VMOVDDUP : sse3_replicate_dfp<"vmovddup", SchedWriteFShuffle>, 4374 VEX, VEX_WIG; 4375 defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup", SchedWriteFShuffle>, 4376 VEX, VEX_L, VEX_WIG; 4377} 4378 4379defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>; 4380 4381 4382let Predicates = [HasAVX, NoVLX] in { 4383 def : Pat<(X86Movddup (v2f64 (nonvolatile_load addr:$src))), 4384 (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; 4385 def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))), 4386 (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; 4387} 4388 4389let Predicates = [UseSSE3] in { 4390 // No need for aligned memory as this only loads 64-bits. 
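// Illustrative only: MOVDDUP's memory form reads a single 64-bit element and
// duplicates it into both lanes, which is why 16-byte alignment is not needed.
// A minimal sketch using the matching intrinsic (helper name is hypothetical;
// assumes an SSE3-capable host compiler):
//
//   #include <pmmintrin.h>
//
//   __m128d splat_double(const double *p) {
//     // movddup (mem): loads 8 bytes and broadcasts them to both elements.
//     return _mm_loaddup_pd(p);   // result = { *p, *p }
//   }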
4391 def : Pat<(X86Movddup (v2f64 (nonvolatile_load addr:$src))), 4392 (MOVDDUPrm addr:$src)>; 4393 def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))), 4394 (MOVDDUPrm addr:$src)>; 4395} 4396 4397//===---------------------------------------------------------------------===// 4398// SSE3 - Move Unaligned Integer 4399//===---------------------------------------------------------------------===// 4400 4401let Predicates = [HasAVX] in { 4402 def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 4403 "vlddqu\t{$src, $dst|$dst, $src}", 4404 [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, 4405 Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG; 4406 def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), 4407 "vlddqu\t{$src, $dst|$dst, $src}", 4408 [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>, 4409 Sched<[SchedWriteVecMoveLS.YMM.RM]>, VEX, VEX_L, VEX_WIG; 4410} // Predicates 4411 4412def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 4413 "lddqu\t{$src, $dst|$dst, $src}", 4414 [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, 4415 Sched<[SchedWriteVecMoveLS.XMM.RM]>; 4416 4417//===---------------------------------------------------------------------===// 4418// SSE3 - Arithmetic 4419//===---------------------------------------------------------------------===// 4420 4421multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC, 4422 X86MemOperand x86memop, X86FoldableSchedWrite sched, 4423 PatFrag ld_frag, bit Is2Addr = 1> { 4424 def rr : I<0xD0, MRMSrcReg, 4425 (outs RC:$dst), (ins RC:$src1, RC:$src2), 4426 !if(Is2Addr, 4427 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4428 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4429 [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))]>, 4430 Sched<[sched]>; 4431 def rm : I<0xD0, MRMSrcMem, 4432 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 4433 !if(Is2Addr, 4434 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4435 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4436 [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>, 4437 Sched<[sched.Folded, sched.ReadAfterFold]>; 4438} 4439 4440let Predicates = [HasAVX] in { 4441 let ExeDomain = SSEPackedSingle in { 4442 defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem, 4443 SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>, 4444 XD, VEX_4V, VEX_WIG; 4445 defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem, 4446 SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>, 4447 XD, VEX_4V, VEX_L, VEX_WIG; 4448 } 4449 let ExeDomain = SSEPackedDouble in { 4450 defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem, 4451 SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>, 4452 PD, VEX_4V, VEX_WIG; 4453 defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem, 4454 SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>, 4455 PD, VEX_4V, VEX_L, VEX_WIG; 4456 } 4457} 4458let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in { 4459 let ExeDomain = SSEPackedSingle in 4460 defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem, 4461 SchedWriteFAddSizes.PS.XMM, memopv4f32>, XD; 4462 let ExeDomain = SSEPackedDouble in 4463 defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem, 4464 SchedWriteFAddSizes.PD.XMM, memopv2f64>, PD; 4465} 4466 4467//===---------------------------------------------------------------------===// 4468// SSE3 Instructions 4469//===---------------------------------------------------------------------===// 4470 
4471// Horizontal ops 4472multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, 4473 X86MemOperand x86memop, SDNode OpNode, 4474 X86FoldableSchedWrite sched, PatFrag ld_frag, 4475 bit Is2Addr = 1> { 4476 def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), 4477 !if(Is2Addr, 4478 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4479 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4480 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>, 4481 Sched<[sched]>; 4482 4483 def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 4484 !if(Is2Addr, 4485 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4486 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4487 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>, 4488 Sched<[sched.Folded, sched.ReadAfterFold]>; 4489} 4490multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, 4491 X86MemOperand x86memop, SDNode OpNode, 4492 X86FoldableSchedWrite sched, PatFrag ld_frag, 4493 bit Is2Addr = 1> { 4494 def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), 4495 !if(Is2Addr, 4496 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4497 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4498 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>, 4499 Sched<[sched]>; 4500 4501 def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 4502 !if(Is2Addr, 4503 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4504 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4505 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>, 4506 Sched<[sched.Folded, sched.ReadAfterFold]>; 4507} 4508 4509let Predicates = [HasAVX] in { 4510 let ExeDomain = SSEPackedSingle in { 4511 defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem, 4512 X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG; 4513 defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem, 4514 X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG; 4515 defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem, 4516 X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG; 4517 defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem, 4518 X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG; 4519 } 4520 let ExeDomain = SSEPackedDouble in { 4521 defm VHADDPD : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem, 4522 X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG; 4523 defm VHSUBPD : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem, 4524 X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG; 4525 defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem, 4526 X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG; 4527 defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem, 4528 X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG; 4529 } 4530} 4531 4532let Constraints = "$src1 = $dst" in { 4533 let ExeDomain = SSEPackedSingle in { 4534 defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd, 4535 WriteFHAdd, memopv4f32>; 4536 defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub, 4537 WriteFHAdd, memopv4f32>; 4538 } 4539 let ExeDomain = SSEPackedDouble in { 4540 defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd, 4541 WriteFHAdd, memopv2f64>; 4542 defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub, 4543 WriteFHAdd, memopv2f64>; 4544 } 4545} 4546 
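// Illustrative only: the horizontal add/subtract forms defined above combine
// adjacent element pairs within each source operand. A minimal sketch of
// HADDPS via the matching intrinsic (helper name is hypothetical; assumes an
// SSE3-capable host compiler):
//
//   #include <pmmintrin.h>
//
//   __m128 hadd_example(__m128 a, __m128 b) {
//     // haddps: result = { a0+a1, a2+a3, b0+b1, b2+b3 }
//     return _mm_hadd_ps(a, b);
//   }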
4547//===---------------------------------------------------------------------===// 4548// SSSE3 - Packed Absolute Instructions 4549//===---------------------------------------------------------------------===// 4550 4551/// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}. 4552multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt, 4553 SDNode OpNode, X86SchedWriteWidths sched, PatFrag ld_frag> { 4554 def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst), 4555 (ins VR128:$src), 4556 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4557 [(set VR128:$dst, (vt (OpNode VR128:$src)))]>, 4558 Sched<[sched.XMM]>; 4559 4560 def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst), 4561 (ins i128mem:$src), 4562 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4563 [(set VR128:$dst, 4564 (vt (OpNode (ld_frag addr:$src))))]>, 4565 Sched<[sched.XMM.Folded]>; 4566} 4567 4568/// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}. 4569multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt, 4570 SDNode OpNode, X86SchedWriteWidths sched> { 4571 def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst), 4572 (ins VR256:$src), 4573 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4574 [(set VR256:$dst, (vt (OpNode VR256:$src)))]>, 4575 Sched<[sched.YMM]>; 4576 4577 def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst), 4578 (ins i256mem:$src), 4579 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 4580 [(set VR256:$dst, 4581 (vt (OpNode (load addr:$src))))]>, 4582 Sched<[sched.YMM.Folded]>; 4583} 4584 4585let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 4586 defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU, 4587 load>, VEX, VEX_WIG; 4588 defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU, 4589 load>, VEX, VEX_WIG; 4590} 4591let Predicates = [HasAVX, NoVLX] in { 4592 defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU, 4593 load>, VEX, VEX_WIG; 4594} 4595let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 4596 defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>, 4597 VEX, VEX_L, VEX_WIG; 4598 defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SchedWriteVecALU>, 4599 VEX, VEX_L, VEX_WIG; 4600} 4601let Predicates = [HasAVX2, NoVLX] in { 4602 defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SchedWriteVecALU>, 4603 VEX, VEX_L, VEX_WIG; 4604} 4605 4606defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU, 4607 memop>; 4608defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU, 4609 memop>; 4610defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU, 4611 memop>; 4612 4613//===---------------------------------------------------------------------===// 4614// SSSE3 - Packed Binary Operator Instructions 4615//===---------------------------------------------------------------------===// 4616 4617/// SS3I_binop_rm - Simple SSSE3 bin op 4618multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, 4619 ValueType DstVT, ValueType OpVT, RegisterClass RC, 4620 PatFrag memop_frag, X86MemOperand x86memop, 4621 X86FoldableSchedWrite sched, bit Is2Addr = 1> { 4622 let isCommutable = 1 in 4623 def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst), 4624 (ins RC:$src1, RC:$src2), 4625 !if(Is2Addr, 4626 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4627 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4628 [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))]>, 4629 Sched<[sched]>; 4630 def 
rm : SS38I<opc, MRMSrcMem, (outs RC:$dst), 4631 (ins RC:$src1, x86memop:$src2), 4632 !if(Is2Addr, 4633 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4634 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4635 [(set RC:$dst, 4636 (DstVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))]>, 4637 Sched<[sched.Folded, sched.ReadAfterFold]>; 4638} 4639 4640/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}. 4641multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, 4642 Intrinsic IntId128, X86FoldableSchedWrite sched, 4643 PatFrag ld_frag, bit Is2Addr = 1> { 4644 let isCommutable = 1 in 4645 def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst), 4646 (ins VR128:$src1, VR128:$src2), 4647 !if(Is2Addr, 4648 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4649 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4650 [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, 4651 Sched<[sched]>; 4652 def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst), 4653 (ins VR128:$src1, i128mem:$src2), 4654 !if(Is2Addr, 4655 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 4656 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 4657 [(set VR128:$dst, 4658 (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>, 4659 Sched<[sched.Folded, sched.ReadAfterFold]>; 4660} 4661 4662multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr, 4663 Intrinsic IntId256, 4664 X86FoldableSchedWrite sched> { 4665 let isCommutable = 1 in 4666 def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst), 4667 (ins VR256:$src1, VR256:$src2), 4668 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 4669 [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>, 4670 Sched<[sched]>; 4671 def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst), 4672 (ins VR256:$src1, i256mem:$src2), 4673 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 4674 [(set VR256:$dst, 4675 (IntId256 VR256:$src1, (load addr:$src2)))]>, 4676 Sched<[sched.Folded, sched.ReadAfterFold]>; 4677} 4678 4679let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 4680let isCommutable = 0 in { 4681 defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8, 4682 VR128, load, i128mem, 4683 SchedWriteVarShuffle.XMM, 0>, VEX_4V, VEX_WIG; 4684 defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16, 4685 v16i8, VR128, load, i128mem, 4686 SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG; 4687} 4688defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16, 4689 VR128, load, i128mem, 4690 SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG; 4691} 4692 4693let ImmT = NoImm, Predicates = [HasAVX] in { 4694let isCommutable = 0 in { 4695 defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128, 4696 load, i128mem, 4697 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG; 4698 defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128, 4699 load, i128mem, 4700 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG; 4701 defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128, 4702 load, i128mem, 4703 SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG; 4704 defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128, 4705 load, i128mem, 4706 SchedWritePHAdd.XMM, 0>, VEX_4V; 4707 defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb", 4708 int_x86_ssse3_psign_b_128, 4709 SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG; 4710 defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw", 4711 int_x86_ssse3_psign_w_128, 4712 
SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG; 4713 defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd", 4714 int_x86_ssse3_psign_d_128, 4715 SchedWriteVecALU.XMM, load, 0>, VEX_4V, VEX_WIG; 4716 defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw", 4717 int_x86_ssse3_phadd_sw_128, 4718 SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG; 4719 defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw", 4720 int_x86_ssse3_phsub_sw_128, 4721 SchedWritePHAdd.XMM, load, 0>, VEX_4V, VEX_WIG; 4722} 4723} 4724 4725let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 4726let isCommutable = 0 in { 4727 defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8, 4728 VR256, load, i256mem, 4729 SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; 4730 defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16, 4731 v32i8, VR256, load, i256mem, 4732 SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; 4733} 4734defm VPMULHRSWY : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16, 4735 VR256, load, i256mem, 4736 SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; 4737} 4738 4739let ImmT = NoImm, Predicates = [HasAVX2] in { 4740let isCommutable = 0 in { 4741 defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16, 4742 VR256, load, i256mem, 4743 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; 4744 defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256, 4745 load, i256mem, 4746 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; 4747 defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16, 4748 VR256, load, i256mem, 4749 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; 4750 defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256, 4751 load, i256mem, 4752 SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L; 4753 defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b, 4754 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG; 4755 defm VPSIGNW : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w, 4756 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG; 4757 defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d, 4758 SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG; 4759 defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw", 4760 int_x86_avx2_phadd_sw, 4761 SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG; 4762 defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw", 4763 int_x86_avx2_phsub_sw, 4764 SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG; 4765} 4766} 4767 4768// None of these have i8 immediate fields. 
let ImmT = NoImm, Constraints = "$src1 = $dst" in {
let isCommutable = 0 in {
  defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
                              memop, i128mem, SchedWritePHAdd.XMM>;
  defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
                              memop, i128mem, SchedWritePHAdd.XMM>;
  defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
                              memop, i128mem, SchedWritePHAdd.XMM>;
  defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
                              memop, i128mem, SchedWritePHAdd.XMM>;
  defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
                                  SchedWriteVecALU.XMM, memop>;
  defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
                                  SchedWriteVecALU.XMM, memop>;
  defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
                                  SchedWriteVecALU.XMM, memop>;
  defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
                              memop, i128mem, SchedWriteVarShuffle.XMM>;
  defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw",
                                   int_x86_ssse3_phadd_sw_128,
                                   SchedWritePHAdd.XMM, memop>;
  defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw",
                                   int_x86_ssse3_phsub_sw_128,
                                   SchedWritePHAdd.XMM, memop>;
  defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
                                 v16i8, VR128, memop, i128mem,
                                 SchedWriteVecIMul.XMM>;
}
defm PMULHRSW : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
                              VR128, memop, i128mem, SchedWriteVecIMul.XMM>;
}

//===---------------------------------------------------------------------===//
// SSSE3 - Packed Align Instruction Patterns
//===---------------------------------------------------------------------===//

multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let hasSideEffects = 0 in {
  def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst),
      (ins RC:$src1, RC:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 imm:$src3))))]>,
      Sched<[sched]>;
  let mayLoad = 1 in
  def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst),
      (ins RC:$src1, x86memop:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set RC:$dst, (VT (X86PAlignr RC:$src1,
                                     (memop_frag addr:$src2),
                                     (i8 imm:$src3))))]>,
      Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
  defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem,
                                SchedWriteShuffle.XMM, 0>, VEX_4V, VEX_WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
  defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem,
                                 SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
  defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem,
                               SchedWriteShuffle.XMM>;

//===---------------------------------------------------------------------===//
// SSE3 - Thread synchronization
4843//===---------------------------------------------------------------------===// 4844 4845let SchedRW = [WriteSystem] in { 4846let Uses = [EAX, ECX, EDX] in 4847def MONITOR32rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>, 4848 TB, Requires<[HasSSE3, Not64BitMode]>; 4849let Uses = [RAX, ECX, EDX] in 4850def MONITOR64rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>, 4851 TB, Requires<[HasSSE3, In64BitMode]>; 4852 4853let Uses = [ECX, EAX] in 4854def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait", 4855 [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>; 4856} // SchedRW 4857 4858def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>; 4859def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>; 4860 4861def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITOR32rrr)>, 4862 Requires<[Not64BitMode]>; 4863def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITOR64rrr)>, 4864 Requires<[In64BitMode]>; 4865 4866//===----------------------------------------------------------------------===// 4867// SSE4.1 - Packed Move with Sign/Zero Extend 4868// NOTE: Any Extend is promoted to Zero Extend in X86ISelDAGToDAG.cpp 4869//===----------------------------------------------------------------------===// 4870 4871multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp, 4872 RegisterClass OutRC, RegisterClass InRC, 4873 X86FoldableSchedWrite sched> { 4874 def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src), 4875 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>, 4876 Sched<[sched]>; 4877 4878 def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src), 4879 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>, 4880 Sched<[sched.Folded]>; 4881} 4882 4883multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr, 4884 X86MemOperand MemOp, X86MemOperand MemYOp, 4885 Predicate prd> { 4886 defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128, 4887 SchedWriteShuffle.XMM>; 4888 let Predicates = [HasAVX, prd] in 4889 defm V#NAME : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp, 4890 VR128, VR128, SchedWriteShuffle.XMM>, 4891 VEX, VEX_WIG; 4892 let Predicates = [HasAVX2, prd] in 4893 defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp, 4894 VR256, VR128, WriteShuffle256>, 4895 VEX, VEX_L, VEX_WIG; 4896} 4897 4898multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp, 4899 X86MemOperand MemYOp, Predicate prd> { 4900 defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr), 4901 MemOp, MemYOp, prd>; 4902 defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10), 4903 !strconcat("pmovzx", OpcodeStr), 4904 MemOp, MemYOp, prd>; 4905} 4906 4907defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>; 4908defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>; 4909defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>; 4910 4911defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>; 4912defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>; 4913 4914defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>; 4915 4916// AVX2 Patterns 4917multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, 4918 SDNode ExtOp, SDNode InVecOp> { 4919 // Register-Register patterns 4920 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 4921 def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))), 4922 (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>; 4923 } 4924 let Predicates = 
[HasAVX2, NoVLX] in { 4925 def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))), 4926 (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>; 4927 def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))), 4928 (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>; 4929 4930 def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))), 4931 (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>; 4932 def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))), 4933 (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>; 4934 4935 def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))), 4936 (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>; 4937 } 4938 4939 // Simple Register-Memory patterns 4940 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 4941 def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), 4942 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; 4943 4944 def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))), 4945 (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; 4946 } 4947 4948 let Predicates = [HasAVX2, NoVLX] in { 4949 def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), 4950 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; 4951 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), 4952 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; 4953 4954 def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), 4955 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; 4956 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), 4957 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; 4958 4959 def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)), 4960 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; 4961 } 4962 4963 // AVX2 Register-Memory patterns 4964 let Predicates = [HasAVX2, NoVLX] in { 4965 def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))), 4966 (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; 4967 4968 def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 4969 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; 4970 def : Pat<(v8i32 (InVecOp (v16i8 (X86vzload64 addr:$src)))), 4971 (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; 4972 4973 def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))), 4974 (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; 4975 4976 def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), 4977 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; 4978 def : Pat<(v4i64 (InVecOp (v16i8 (X86vzload64 addr:$src)))), 4979 (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; 4980 4981 def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 4982 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; 4983 def : Pat<(v4i64 (InVecOp (v8i16 (X86vzload64 addr:$src)))), 4984 (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; 4985 } 4986} 4987 4988defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>; 4989defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>; 4990 4991// SSE4.1/AVX patterns. 
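// Illustrative only: the PMOVSX/PMOVZX patterns above (AVX2) and below
// (SSE4.1/AVX) select these instructions for vector sign/zero extension. A
// minimal sketch of the 128-bit byte-to-word forms using the matching
// intrinsics (helper names are hypothetical; assumes an SSE4.1-capable host
// compiler):
//
//   #include <smmintrin.h>
//
//   __m128i widen_signed(__m128i v)   { return _mm_cvtepi8_epi16(v); } // pmovsxbw
//   __m128i widen_unsigned(__m128i v) { return _mm_cvtepu8_epi16(v); } // pmovzxbw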
4992multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy, 4993 SDNode ExtOp> { 4994 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 4995 def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))), 4996 (!cast<I>(OpcPrefix#BWrr) VR128:$src)>; 4997 } 4998 let Predicates = [HasAVX, NoVLX] in { 4999 def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))), 5000 (!cast<I>(OpcPrefix#BDrr) VR128:$src)>; 5001 def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))), 5002 (!cast<I>(OpcPrefix#BQrr) VR128:$src)>; 5003 5004 def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))), 5005 (!cast<I>(OpcPrefix#WDrr) VR128:$src)>; 5006 def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))), 5007 (!cast<I>(OpcPrefix#WQrr) VR128:$src)>; 5008 5009 def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))), 5010 (!cast<I>(OpcPrefix#DQrr) VR128:$src)>; 5011 } 5012 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 5013 def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), 5014 (!cast<I>(OpcPrefix#BWrm) addr:$src)>; 5015 } 5016 let Predicates = [HasAVX, NoVLX] in { 5017 def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), 5018 (!cast<I>(OpcPrefix#BDrm) addr:$src)>; 5019 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), 5020 (!cast<I>(OpcPrefix#BQrm) addr:$src)>; 5021 5022 def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), 5023 (!cast<I>(OpcPrefix#WDrm) addr:$src)>; 5024 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), 5025 (!cast<I>(OpcPrefix#WQrm) addr:$src)>; 5026 5027 def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)), 5028 (!cast<I>(OpcPrefix#DQrm) addr:$src)>; 5029 } 5030 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 5031 def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 5032 (!cast<I>(OpcPrefix#BWrm) addr:$src)>; 5033 def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), 5034 (!cast<I>(OpcPrefix#BWrm) addr:$src)>; 5035 def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))), 5036 (!cast<I>(OpcPrefix#BWrm) addr:$src)>; 5037 def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))), 5038 (!cast<I>(OpcPrefix#BWrm) addr:$src)>; 5039 } 5040 let Predicates = [HasAVX, NoVLX] in { 5041 def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), 5042 (!cast<I>(OpcPrefix#BDrm) addr:$src)>; 5043 def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))), 5044 (!cast<I>(OpcPrefix#BDrm) addr:$src)>; 5045 def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))), 5046 (!cast<I>(OpcPrefix#BDrm) addr:$src)>; 5047 5048 def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))), 5049 (!cast<I>(OpcPrefix#BQrm) addr:$src)>; 5050 def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))), 5051 (!cast<I>(OpcPrefix#BQrm) addr:$src)>; 5052 5053 def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 5054 (!cast<I>(OpcPrefix#WDrm) addr:$src)>; 5055 def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), 5056 (!cast<I>(OpcPrefix#WDrm) addr:$src)>; 5057 def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))), 5058 (!cast<I>(OpcPrefix#WDrm) addr:$src)>; 5059 def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))), 5060 (!cast<I>(OpcPrefix#WDrm) addr:$src)>; 5061 5062 def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), 5063 (!cast<I>(OpcPrefix#WQrm) addr:$src)>; 5064 def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))), 5065 (!cast<I>(OpcPrefix#WQrm) addr:$src)>; 5066 def : Pat<(v2i64 (ExtOp 
(loadv8i16 addr:$src))), 5067 (!cast<I>(OpcPrefix#WQrm) addr:$src)>; 5068 5069 def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 5070 (!cast<I>(OpcPrefix#DQrm) addr:$src)>; 5071 def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), 5072 (!cast<I>(OpcPrefix#DQrm) addr:$src)>; 5073 def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))), 5074 (!cast<I>(OpcPrefix#DQrm) addr:$src)>; 5075 def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))), 5076 (!cast<I>(OpcPrefix#DQrm) addr:$src)>; 5077 } 5078} 5079 5080defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec>; 5081defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec>; 5082 5083let Predicates = [UseSSE41] in { 5084 defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec>; 5085 defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec>; 5086} 5087 5088//===----------------------------------------------------------------------===// 5089// SSE4.1 - Extract Instructions 5090//===----------------------------------------------------------------------===// 5091 5092/// SS41I_binop_ext8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem 5093multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> { 5094 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst), 5095 (ins VR128:$src1, u8imm:$src2), 5096 !strconcat(OpcodeStr, 5097 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5098 [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1), 5099 imm:$src2))]>, 5100 Sched<[WriteVecExtract]>; 5101 let hasSideEffects = 0, mayStore = 1 in 5102 def mr : SS4AIi8<opc, MRMDestMem, (outs), 5103 (ins i8mem:$dst, VR128:$src1, u8imm:$src2), 5104 !strconcat(OpcodeStr, 5105 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5106 [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), 5107 addr:$dst)]>, Sched<[WriteVecExtractSt]>; 5108} 5109 5110let Predicates = [HasAVX, NoBWI] in 5111 defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX, VEX_WIG; 5112 5113defm PEXTRB : SS41I_extract8<0x14, "pextrb">; 5114 5115 5116/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination 5117multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> { 5118 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in 5119 def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst), 5120 (ins VR128:$src1, u8imm:$src2), 5121 !strconcat(OpcodeStr, 5122 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, 5123 Sched<[WriteVecExtract]>, FoldGenData<NAME#rr>; 5124 5125 let hasSideEffects = 0, mayStore = 1 in 5126 def mr : SS4AIi8<opc, MRMDestMem, (outs), 5127 (ins i16mem:$dst, VR128:$src1, u8imm:$src2), 5128 !strconcat(OpcodeStr, 5129 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5130 [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), imm:$src2))), 5131 addr:$dst)]>, Sched<[WriteVecExtractSt]>; 5132} 5133 5134let Predicates = [HasAVX, NoBWI] in 5135 defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX, VEX_WIG; 5136 5137defm PEXTRW : SS41I_extract16<0x15, "pextrw">; 5138 5139 5140/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination 5141multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> { 5142 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst), 5143 (ins VR128:$src1, u8imm:$src2), 5144 !strconcat(OpcodeStr, 5145 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5146 [(set GR32:$dst, 5147 (extractelt (v4i32 VR128:$src1), imm:$src2))]>, 5148 Sched<[WriteVecExtract]>; 5149 def mr : SS4AIi8<opc, MRMDestMem, (outs), 5150 (ins i32mem:$dst, VR128:$src1, u8imm:$src2), 5151 
!strconcat(OpcodeStr, 5152 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5153 [(store (extractelt (v4i32 VR128:$src1), imm:$src2), 5154 addr:$dst)]>, Sched<[WriteVecExtractSt]>; 5155} 5156 5157let Predicates = [HasAVX, NoDQI] in 5158 defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX; 5159 5160defm PEXTRD : SS41I_extract32<0x16, "pextrd">; 5161 5162/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination 5163multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> { 5164 def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst), 5165 (ins VR128:$src1, u8imm:$src2), 5166 !strconcat(OpcodeStr, 5167 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5168 [(set GR64:$dst, 5169 (extractelt (v2i64 VR128:$src1), imm:$src2))]>, 5170 Sched<[WriteVecExtract]>; 5171 def mr : SS4AIi8<opc, MRMDestMem, (outs), 5172 (ins i64mem:$dst, VR128:$src1, u8imm:$src2), 5173 !strconcat(OpcodeStr, 5174 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5175 [(store (extractelt (v2i64 VR128:$src1), imm:$src2), 5176 addr:$dst)]>, Sched<[WriteVecExtractSt]>; 5177} 5178 5179let Predicates = [HasAVX, NoDQI] in 5180 defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W; 5181 5182defm PEXTRQ : SS41I_extract64<0x16, "pextrq">, REX_W; 5183 5184/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory 5185/// destination 5186multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> { 5187 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst), 5188 (ins VR128:$src1, u8imm:$src2), 5189 !strconcat(OpcodeStr, 5190 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5191 [(set GR32orGR64:$dst, 5192 (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>, 5193 Sched<[WriteVecExtract]>; 5194 def mr : SS4AIi8<opc, MRMDestMem, (outs), 5195 (ins f32mem:$dst, VR128:$src1, u8imm:$src2), 5196 !strconcat(OpcodeStr, 5197 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5198 [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2), 5199 addr:$dst)]>, Sched<[WriteVecExtractSt]>; 5200} 5201 5202let ExeDomain = SSEPackedSingle in { 5203 let Predicates = [UseAVX] in 5204 defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, VEX_WIG; 5205 defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">; 5206} 5207 5208//===----------------------------------------------------------------------===// 5209// SSE4.1 - Insert Instructions 5210//===----------------------------------------------------------------------===// 5211 5212multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> { 5213 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), 5214 (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3), 5215 !if(Is2Addr, 5216 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5217 !strconcat(asm, 5218 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5219 [(set VR128:$dst, 5220 (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>, 5221 Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>; 5222 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), 5223 (ins VR128:$src1, i8mem:$src2, u8imm:$src3), 5224 !if(Is2Addr, 5225 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5226 !strconcat(asm, 5227 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5228 [(set VR128:$dst, 5229 (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), imm:$src3))]>, 5230 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>; 5231} 5232 5233let Predicates = [HasAVX, NoBWI] in 5234 defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V, VEX_WIG; 5235let Constraints = "$src1 = $dst" in 5236 defm PINSRB 
: SS41I_insert8<0x20, "pinsrb">; 5237 5238multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> { 5239 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), 5240 (ins VR128:$src1, GR32:$src2, u8imm:$src3), 5241 !if(Is2Addr, 5242 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5243 !strconcat(asm, 5244 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5245 [(set VR128:$dst, 5246 (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>, 5247 Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>; 5248 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), 5249 (ins VR128:$src1, i32mem:$src2, u8imm:$src3), 5250 !if(Is2Addr, 5251 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5252 !strconcat(asm, 5253 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5254 [(set VR128:$dst, 5255 (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), imm:$src3)))]>, 5256 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>; 5257} 5258 5259let Predicates = [HasAVX, NoDQI] in 5260 defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V; 5261let Constraints = "$src1 = $dst" in 5262 defm PINSRD : SS41I_insert32<0x22, "pinsrd">; 5263 5264multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> { 5265 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), 5266 (ins VR128:$src1, GR64:$src2, u8imm:$src3), 5267 !if(Is2Addr, 5268 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5269 !strconcat(asm, 5270 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5271 [(set VR128:$dst, 5272 (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>, 5273 Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>; 5274 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), 5275 (ins VR128:$src1, i64mem:$src2, u8imm:$src3), 5276 !if(Is2Addr, 5277 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5278 !strconcat(asm, 5279 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5280 [(set VR128:$dst, 5281 (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), imm:$src3)))]>, 5282 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>; 5283} 5284 5285let Predicates = [HasAVX, NoDQI] in 5286 defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W; 5287let Constraints = "$src1 = $dst" in 5288 defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W; 5289 5290// insertps has a few different modes, there's the first two here below which 5291// are optimized inserts that won't zero arbitrary elements in the destination 5292// vector. The next one matches the intrinsic and could zero arbitrary elements 5293// in the target vector. 
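// Note for readers of the patterns below (insertps immediate semantics, per
// the ISA, not something re-encoded here): imm[7:6] selects the source
// element of $src2 (register form only), imm[5:4] selects the destination
// element of $src1, and imm[3:0] is a zero mask applied to the result.
// E.g. an immediate of 0x10 copies element 0 of $src2 into element 1 of
// $src1, while 0x1c does the same and additionally zeroes elements 2 and 3.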
5294multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> { 5295 let isCommutable = 1 in 5296 def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), 5297 (ins VR128:$src1, VR128:$src2, u8imm:$src3), 5298 !if(Is2Addr, 5299 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5300 !strconcat(asm, 5301 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5302 [(set VR128:$dst, 5303 (X86insertps VR128:$src1, VR128:$src2, imm:$src3))]>, 5304 Sched<[SchedWriteFShuffle.XMM]>; 5305 def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), 5306 (ins VR128:$src1, f32mem:$src2, u8imm:$src3), 5307 !if(Is2Addr, 5308 !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5309 !strconcat(asm, 5310 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5311 [(set VR128:$dst, 5312 (X86insertps VR128:$src1, 5313 (v4f32 (scalar_to_vector (loadf32 addr:$src2))), 5314 imm:$src3))]>, 5315 Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>; 5316} 5317 5318let ExeDomain = SSEPackedSingle in { 5319 let Predicates = [UseAVX] in 5320 defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, 5321 VEX_4V, VEX_WIG; 5322 let Constraints = "$src1 = $dst" in 5323 defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>; 5324} 5325 5326let Predicates = [UseAVX] in { 5327 // If we're inserting an element from a vbroadcast of a load, fold the 5328 // load into the X86insertps instruction. 5329 def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), 5330 (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)), 5331 (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; 5332 def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), 5333 (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)), 5334 (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; 5335} 5336 5337//===----------------------------------------------------------------------===// 5338// SSE4.1 - Round Instructions 5339//===----------------------------------------------------------------------===// 5340 5341multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr, 5342 X86MemOperand x86memop, RegisterClass RC, 5343 ValueType VT, PatFrag mem_frag, SDNode OpNode, 5344 X86FoldableSchedWrite sched> { 5345 // Intrinsic operation, reg. 
5346 // Vector intrinsic operation, reg 5347 def r : SS4AIi8<opc, MRMSrcReg, 5348 (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2), 5349 !strconcat(OpcodeStr, 5350 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5351 [(set RC:$dst, (VT (OpNode RC:$src1, imm:$src2)))]>, 5352 Sched<[sched]>; 5353 5354 // Vector intrinsic operation, mem 5355 def m : SS4AIi8<opc, MRMSrcMem, 5356 (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2), 5357 !strconcat(OpcodeStr, 5358 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5359 [(set RC:$dst, 5360 (VT (OpNode (mem_frag addr:$src1),imm:$src2)))]>, 5361 Sched<[sched.Folded]>; 5362} 5363 5364multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd, 5365 string OpcodeStr, X86FoldableSchedWrite sched> { 5366let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in { 5367 def SSr : SS4AIi8<opcss, MRMSrcReg, 5368 (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3), 5369 !strconcat(OpcodeStr, 5370 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 5371 []>, Sched<[sched]>; 5372 5373 let mayLoad = 1 in 5374 def SSm : SS4AIi8<opcss, MRMSrcMem, 5375 (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3), 5376 !strconcat(OpcodeStr, 5377 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 5378 []>, Sched<[sched.Folded, sched.ReadAfterFold]>; 5379} // ExeDomain = SSEPackedSingle, hasSideEffects = 0 5380 5381let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in { 5382 def SDr : SS4AIi8<opcsd, MRMSrcReg, 5383 (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3), 5384 !strconcat(OpcodeStr, 5385 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 5386 []>, Sched<[sched]>; 5387 5388 let mayLoad = 1 in 5389 def SDm : SS4AIi8<opcsd, MRMSrcMem, 5390 (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3), 5391 !strconcat(OpcodeStr, 5392 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 5393 []>, Sched<[sched.Folded, sched.ReadAfterFold]>; 5394} // ExeDomain = SSEPackedDouble, hasSideEffects = 0 5395} 5396 5397multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd, 5398 string OpcodeStr, X86FoldableSchedWrite sched> { 5399let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in { 5400 def SSr : SS4AIi8<opcss, MRMSrcReg, 5401 (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2), 5402 !strconcat(OpcodeStr, 5403 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5404 []>, Sched<[sched]>; 5405 5406 let mayLoad = 1 in 5407 def SSm : SS4AIi8<opcss, MRMSrcMem, 5408 (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2), 5409 !strconcat(OpcodeStr, 5410 "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5411 []>, Sched<[sched.Folded, sched.ReadAfterFold]>; 5412} // ExeDomain = SSEPackedSingle, hasSideEffects = 0 5413 5414let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in { 5415 def SDr : SS4AIi8<opcsd, MRMSrcReg, 5416 (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2), 5417 !strconcat(OpcodeStr, 5418 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5419 []>, Sched<[sched]>; 5420 5421 let mayLoad = 1 in 5422 def SDm : SS4AIi8<opcsd, MRMSrcMem, 5423 (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2), 5424 !strconcat(OpcodeStr, 5425 "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5426 []>, Sched<[sched.Folded, sched.ReadAfterFold]>; 5427} // ExeDomain = SSEPackedDouble, hasSideEffects = 0 5428} 5429 5430multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd, 5431 string OpcodeStr, X86FoldableSchedWrite sched, 5432 ValueType VT32, ValueType VT64, 5433 SDNode OpNode, bit Is2Addr = 1> { 5434let ExeDomain = SSEPackedSingle, 
isCodeGenOnly = 1 in { 5435 def SSr_Int : SS4AIi8<opcss, MRMSrcReg, 5436 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3), 5437 !if(Is2Addr, 5438 !strconcat(OpcodeStr, 5439 "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5440 !strconcat(OpcodeStr, 5441 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5442 [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>, 5443 Sched<[sched]>; 5444 5445 def SSm_Int : SS4AIi8<opcss, MRMSrcMem, 5446 (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3), 5447 !if(Is2Addr, 5448 !strconcat(OpcodeStr, 5449 "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5450 !strconcat(OpcodeStr, 5451 "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5452 [(set VR128:$dst, 5453 (OpNode VR128:$src1, sse_load_f32:$src2, imm:$src3))]>, 5454 Sched<[sched.Folded, sched.ReadAfterFold]>; 5455} // ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 5456 5457let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in { 5458 def SDr_Int : SS4AIi8<opcsd, MRMSrcReg, 5459 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3), 5460 !if(Is2Addr, 5461 !strconcat(OpcodeStr, 5462 "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5463 !strconcat(OpcodeStr, 5464 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5465 [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>, 5466 Sched<[sched]>; 5467 5468 def SDm_Int : SS4AIi8<opcsd, MRMSrcMem, 5469 (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3), 5470 !if(Is2Addr, 5471 !strconcat(OpcodeStr, 5472 "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5473 !strconcat(OpcodeStr, 5474 "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5475 [(set VR128:$dst, 5476 (OpNode VR128:$src1, sse_load_f64:$src2, imm:$src3))]>, 5477 Sched<[sched.Folded, sched.ReadAfterFold]>; 5478} // ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 5479} 5480 5481// FP round - roundss, roundps, roundsd, roundpd 5482let Predicates = [HasAVX, NoVLX] in { 5483 let ExeDomain = SSEPackedSingle in { 5484 // Intrinsic form 5485 defm VROUNDPS : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32, 5486 loadv4f32, X86VRndScale, SchedWriteFRnd.XMM>, 5487 VEX, VEX_WIG; 5488 defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32, 5489 loadv8f32, X86VRndScale, SchedWriteFRnd.YMM>, 5490 VEX, VEX_L, VEX_WIG; 5491 } 5492 5493 let ExeDomain = SSEPackedDouble in { 5494 defm VROUNDPD : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64, 5495 loadv2f64, X86VRndScale, SchedWriteFRnd.XMM>, 5496 VEX, VEX_WIG; 5497 defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64, 5498 loadv4f64, X86VRndScale, SchedWriteFRnd.YMM>, 5499 VEX, VEX_L, VEX_WIG; 5500 } 5501} 5502let Predicates = [UseAVX] in { 5503 defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl, 5504 v4f32, v2f64, X86RndScales, 0>, 5505 VEX_4V, VEX_LIG, VEX_WIG; 5506 defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>, 5507 VEX_4V, VEX_LIG, VEX_WIG; 5508} 5509 5510let Predicates = [UseAVX] in { 5511 def : Pat<(X86VRndScale FR32:$src1, imm:$src2), 5512 (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, imm:$src2)>; 5513 def : Pat<(X86VRndScale FR64:$src1, imm:$src2), 5514 (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, imm:$src2)>; 5515} 5516 5517let Predicates = [UseAVX, OptForSize] in { 5518 def : Pat<(X86VRndScale (loadf32 addr:$src1), imm:$src2), 5519 (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, imm:$src2)>; 5520 def : Pat<(X86VRndScale (loadf64 
addr:$src1), imm:$src2), 5521 (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, imm:$src2)>; 5522} 5523 5524let ExeDomain = SSEPackedSingle in 5525defm ROUNDPS : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32, 5526 memopv4f32, X86VRndScale, SchedWriteFRnd.XMM>; 5527let ExeDomain = SSEPackedDouble in 5528defm ROUNDPD : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64, 5529 memopv2f64, X86VRndScale, SchedWriteFRnd.XMM>; 5530 5531defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>; 5532 5533let Constraints = "$src1 = $dst" in 5534defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl, 5535 v4f32, v2f64, X86RndScales>; 5536 5537let Predicates = [UseSSE41] in { 5538 def : Pat<(X86VRndScale FR32:$src1, imm:$src2), 5539 (ROUNDSSr FR32:$src1, imm:$src2)>; 5540 def : Pat<(X86VRndScale FR64:$src1, imm:$src2), 5541 (ROUNDSDr FR64:$src1, imm:$src2)>; 5542} 5543 5544let Predicates = [UseSSE41, OptForSize] in { 5545 def : Pat<(X86VRndScale (loadf32 addr:$src1), imm:$src2), 5546 (ROUNDSSm addr:$src1, imm:$src2)>; 5547 def : Pat<(X86VRndScale (loadf64 addr:$src1), imm:$src2), 5548 (ROUNDSDm addr:$src1, imm:$src2)>; 5549} 5550 5551//===----------------------------------------------------------------------===// 5552// SSE4.1 - Packed Bit Test 5553//===----------------------------------------------------------------------===// 5554 5555// ptest instruction we'll lower to this in X86ISelLowering primarily from 5556// the intel intrinsic that corresponds to this. 5557let Defs = [EFLAGS], Predicates = [HasAVX] in { 5558def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), 5559 "vptest\t{$src2, $src1|$src1, $src2}", 5560 [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>, 5561 Sched<[SchedWriteVecTest.XMM]>, VEX, VEX_WIG; 5562def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2), 5563 "vptest\t{$src2, $src1|$src1, $src2}", 5564 [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>, 5565 Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>, 5566 VEX, VEX_WIG; 5567 5568def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2), 5569 "vptest\t{$src2, $src1|$src1, $src2}", 5570 [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>, 5571 Sched<[SchedWriteVecTest.YMM]>, VEX, VEX_L, VEX_WIG; 5572def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2), 5573 "vptest\t{$src2, $src1|$src1, $src2}", 5574 [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>, 5575 Sched<[SchedWriteVecTest.YMM.Folded, SchedWriteVecTest.YMM.ReadAfterFold]>, 5576 VEX, VEX_L, VEX_WIG; 5577} 5578 5579let Defs = [EFLAGS] in { 5580def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), 5581 "ptest\t{$src2, $src1|$src1, $src2}", 5582 [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>, 5583 Sched<[SchedWriteVecTest.XMM]>; 5584def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2), 5585 "ptest\t{$src2, $src1|$src1, $src2}", 5586 [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>, 5587 Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>; 5588} 5589 5590// The bit test instructions below are AVX only 5591multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC, 5592 X86MemOperand x86memop, PatFrag mem_frag, ValueType vt, 5593 X86FoldableSchedWrite sched> { 5594 def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), 5595 !strconcat(OpcodeStr, "\t{$src2, 
$src1|$src1, $src2}"), 5596 [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>, 5597 Sched<[sched]>, VEX; 5598 def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2), 5599 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), 5600 [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>, 5601 Sched<[sched.Folded, sched.ReadAfterFold]>, VEX; 5602} 5603 5604let Defs = [EFLAGS], Predicates = [HasAVX] in { 5605let ExeDomain = SSEPackedSingle in { 5606defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32, 5607 SchedWriteFTest.XMM>; 5608defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32, 5609 SchedWriteFTest.YMM>, VEX_L; 5610} 5611let ExeDomain = SSEPackedDouble in { 5612defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64, 5613 SchedWriteFTest.XMM>; 5614defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64, 5615 SchedWriteFTest.YMM>, VEX_L; 5616} 5617} 5618 5619//===----------------------------------------------------------------------===// 5620// SSE4.1 - Misc Instructions 5621//===----------------------------------------------------------------------===// 5622 5623let Defs = [EFLAGS], Predicates = [HasPOPCNT] in { 5624 def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), 5625 "popcnt{w}\t{$src, $dst|$dst, $src}", 5626 [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>, 5627 Sched<[WritePOPCNT]>, OpSize16, XS; 5628 def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), 5629 "popcnt{w}\t{$src, $dst|$dst, $src}", 5630 [(set GR16:$dst, (ctpop (loadi16 addr:$src))), 5631 (implicit EFLAGS)]>, 5632 Sched<[WritePOPCNT.Folded]>, OpSize16, XS; 5633 5634 def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), 5635 "popcnt{l}\t{$src, $dst|$dst, $src}", 5636 [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>, 5637 Sched<[WritePOPCNT]>, OpSize32, XS; 5638 5639 def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), 5640 "popcnt{l}\t{$src, $dst|$dst, $src}", 5641 [(set GR32:$dst, (ctpop (loadi32 addr:$src))), 5642 (implicit EFLAGS)]>, 5643 Sched<[WritePOPCNT.Folded]>, OpSize32, XS; 5644 5645 def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), 5646 "popcnt{q}\t{$src, $dst|$dst, $src}", 5647 [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>, 5648 Sched<[WritePOPCNT]>, XS; 5649 def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), 5650 "popcnt{q}\t{$src, $dst|$dst, $src}", 5651 [(set GR64:$dst, (ctpop (loadi64 addr:$src))), 5652 (implicit EFLAGS)]>, 5653 Sched<[WritePOPCNT.Folded]>, XS; 5654} 5655 5656// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16. 5657multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr, 5658 SDNode OpNode, PatFrag ld_frag, 5659 X86FoldableSchedWrite Sched> { 5660 def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), 5661 (ins VR128:$src), 5662 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 5663 [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>, 5664 Sched<[Sched]>; 5665 def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), 5666 (ins i128mem:$src), 5667 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 5668 [(set VR128:$dst, 5669 (v8i16 (OpNode (ld_frag addr:$src))))]>, 5670 Sched<[Sched.Folded]>; 5671} 5672 5673// PHMIN has the same profile as PSAD, thus we use the same scheduling 5674// model, although the naming is misleading. 
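// (Like psadbw, phminposuw is a horizontal reduction over eight words: it
// writes the minimum unsigned word of the source to dst[15:0] and that
// word's index to dst[18:16], zeroing the remaining bits.)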
5675let Predicates = [HasAVX] in 5676defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw", 5677 X86phminpos, load, 5678 WritePHMINPOS>, VEX, VEX_WIG; 5679defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw", 5680 X86phminpos, memop, 5681 WritePHMINPOS>; 5682 5683/// SS48I_binop_rm - Simple SSE41 binary operator. 5684multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, 5685 ValueType OpVT, RegisterClass RC, PatFrag memop_frag, 5686 X86MemOperand x86memop, X86FoldableSchedWrite sched, 5687 bit Is2Addr = 1> { 5688 let isCommutable = 1 in 5689 def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst), 5690 (ins RC:$src1, RC:$src2), 5691 !if(Is2Addr, 5692 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 5693 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 5694 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, 5695 Sched<[sched]>; 5696 def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst), 5697 (ins RC:$src1, x86memop:$src2), 5698 !if(Is2Addr, 5699 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 5700 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 5701 [(set RC:$dst, 5702 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>, 5703 Sched<[sched.Folded, sched.ReadAfterFold]>; 5704} 5705 5706let Predicates = [HasAVX, NoVLX] in { 5707 defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128, 5708 load, i128mem, SchedWriteVecALU.XMM, 0>, 5709 VEX_4V, VEX_WIG; 5710 defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128, 5711 load, i128mem, SchedWriteVecALU.XMM, 0>, 5712 VEX_4V, VEX_WIG; 5713 defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128, 5714 load, i128mem, SchedWriteVecALU.XMM, 0>, 5715 VEX_4V, VEX_WIG; 5716 defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128, 5717 load, i128mem, SchedWriteVecALU.XMM, 0>, 5718 VEX_4V, VEX_WIG; 5719 defm VPMULDQ : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128, 5720 load, i128mem, SchedWriteVecIMul.XMM, 0>, 5721 VEX_4V, VEX_WIG; 5722} 5723let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 5724 defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128, 5725 load, i128mem, SchedWriteVecALU.XMM, 0>, 5726 VEX_4V, VEX_WIG; 5727 defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128, 5728 load, i128mem, SchedWriteVecALU.XMM, 0>, 5729 VEX_4V, VEX_WIG; 5730 defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128, 5731 load, i128mem, SchedWriteVecALU.XMM, 0>, 5732 VEX_4V, VEX_WIG; 5733 defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128, 5734 load, i128mem, SchedWriteVecALU.XMM, 0>, 5735 VEX_4V, VEX_WIG; 5736} 5737 5738let Predicates = [HasAVX2, NoVLX] in { 5739 defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256, 5740 load, i256mem, SchedWriteVecALU.YMM, 0>, 5741 VEX_4V, VEX_L, VEX_WIG; 5742 defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256, 5743 load, i256mem, SchedWriteVecALU.YMM, 0>, 5744 VEX_4V, VEX_L, VEX_WIG; 5745 defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256, 5746 load, i256mem, SchedWriteVecALU.YMM, 0>, 5747 VEX_4V, VEX_L, VEX_WIG; 5748 defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256, 5749 load, i256mem, SchedWriteVecALU.YMM, 0>, 5750 VEX_4V, VEX_L, VEX_WIG; 5751 defm VPMULDQY : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256, 5752 load, i256mem, SchedWriteVecIMul.YMM, 0>, 5753 VEX_4V, VEX_L, VEX_WIG; 5754} 5755let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 5756 defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, 
v32i8, VR256, 5757 load, i256mem, SchedWriteVecALU.YMM, 0>, 5758 VEX_4V, VEX_L, VEX_WIG; 5759 defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256, 5760 load, i256mem, SchedWriteVecALU.YMM, 0>, 5761 VEX_4V, VEX_L, VEX_WIG; 5762 defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256, 5763 load, i256mem, SchedWriteVecALU.YMM, 0>, 5764 VEX_4V, VEX_L, VEX_WIG; 5765 defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256, 5766 load, i256mem, SchedWriteVecALU.YMM, 0>, 5767 VEX_4V, VEX_L, VEX_WIG; 5768} 5769 5770let Constraints = "$src1 = $dst" in { 5771 defm PMINSB : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128, 5772 memop, i128mem, SchedWriteVecALU.XMM, 1>; 5773 defm PMINSD : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128, 5774 memop, i128mem, SchedWriteVecALU.XMM, 1>; 5775 defm PMINUD : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128, 5776 memop, i128mem, SchedWriteVecALU.XMM, 1>; 5777 defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128, 5778 memop, i128mem, SchedWriteVecALU.XMM, 1>; 5779 defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128, 5780 memop, i128mem, SchedWriteVecALU.XMM, 1>; 5781 defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128, 5782 memop, i128mem, SchedWriteVecALU.XMM, 1>; 5783 defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128, 5784 memop, i128mem, SchedWriteVecALU.XMM, 1>; 5785 defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128, 5786 memop, i128mem, SchedWriteVecALU.XMM, 1>; 5787 defm PMULDQ : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128, 5788 memop, i128mem, SchedWriteVecIMul.XMM, 1>; 5789} 5790 5791let Predicates = [HasAVX, NoVLX] in 5792 defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128, 5793 load, i128mem, SchedWritePMULLD.XMM, 0>, 5794 VEX_4V, VEX_WIG; 5795let Predicates = [HasAVX] in 5796 defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128, 5797 load, i128mem, SchedWriteVecALU.XMM, 0>, 5798 VEX_4V, VEX_WIG; 5799 5800let Predicates = [HasAVX2, NoVLX] in 5801 defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256, 5802 load, i256mem, SchedWritePMULLD.YMM, 0>, 5803 VEX_4V, VEX_L, VEX_WIG; 5804let Predicates = [HasAVX2] in 5805 defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256, 5806 load, i256mem, SchedWriteVecALU.YMM, 0>, 5807 VEX_4V, VEX_L, VEX_WIG; 5808 5809let Constraints = "$src1 = $dst" in { 5810 defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128, 5811 memop, i128mem, SchedWritePMULLD.XMM, 1>; 5812 defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128, 5813 memop, i128mem, SchedWriteVecALU.XMM, 1>; 5814} 5815 5816/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate 5817multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, 5818 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag, 5819 X86MemOperand x86memop, bit Is2Addr, 5820 X86FoldableSchedWrite sched> { 5821 let isCommutable = 1 in 5822 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), 5823 (ins RC:$src1, RC:$src2, u8imm:$src3), 5824 !if(Is2Addr, 5825 !strconcat(OpcodeStr, 5826 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5827 !strconcat(OpcodeStr, 5828 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5829 [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>, 5830 Sched<[sched]>; 5831 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), 5832 (ins RC:$src1, x86memop:$src2, u8imm:$src3), 5833 !if(Is2Addr, 5834 !strconcat(OpcodeStr, 5835 "\t{$src3, $src2, 
$dst|$dst, $src2, $src3}"), 5836 !strconcat(OpcodeStr, 5837 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5838 [(set RC:$dst, 5839 (IntId RC:$src1, (memop_frag addr:$src2), imm:$src3))]>, 5840 Sched<[sched.Folded, sched.ReadAfterFold]>; 5841} 5842 5843/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate 5844multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, 5845 ValueType OpVT, RegisterClass RC, PatFrag memop_frag, 5846 X86MemOperand x86memop, bit Is2Addr, 5847 X86FoldableSchedWrite sched> { 5848 let isCommutable = 1 in 5849 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), 5850 (ins RC:$src1, RC:$src2, u8imm:$src3), 5851 !if(Is2Addr, 5852 !strconcat(OpcodeStr, 5853 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5854 !strconcat(OpcodeStr, 5855 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5856 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>, 5857 Sched<[sched]>; 5858 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), 5859 (ins RC:$src1, x86memop:$src2, u8imm:$src3), 5860 !if(Is2Addr, 5861 !strconcat(OpcodeStr, 5862 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 5863 !strconcat(OpcodeStr, 5864 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 5865 [(set RC:$dst, 5866 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), imm:$src3)))]>, 5867 Sched<[sched.Folded, sched.ReadAfterFold]>; 5868} 5869 5870def BlendCommuteImm2 : SDNodeXForm<imm, [{ 5871 uint8_t Imm = N->getZExtValue() & 0x03; 5872 return getI8Imm(Imm ^ 0x03, SDLoc(N)); 5873}]>; 5874 5875def BlendCommuteImm4 : SDNodeXForm<imm, [{ 5876 uint8_t Imm = N->getZExtValue() & 0x0f; 5877 return getI8Imm(Imm ^ 0x0f, SDLoc(N)); 5878}]>; 5879 5880def BlendCommuteImm8 : SDNodeXForm<imm, [{ 5881 uint8_t Imm = N->getZExtValue() & 0xff; 5882 return getI8Imm(Imm ^ 0xff, SDLoc(N)); 5883}]>; 5884 5885// Turn a 4-bit blendi immediate to 8-bit for use with pblendw. 5886def BlendScaleImm4 : SDNodeXForm<imm, [{ 5887 uint8_t Imm = N->getZExtValue(); 5888 uint8_t NewImm = 0; 5889 for (unsigned i = 0; i != 4; ++i) { 5890 if (Imm & (1 << i)) 5891 NewImm |= 0x3 << (i * 2); 5892 } 5893 return getI8Imm(NewImm, SDLoc(N)); 5894}]>; 5895 5896// Turn a 2-bit blendi immediate to 8-bit for use with pblendw. 5897def BlendScaleImm2 : SDNodeXForm<imm, [{ 5898 uint8_t Imm = N->getZExtValue(); 5899 uint8_t NewImm = 0; 5900 for (unsigned i = 0; i != 2; ++i) { 5901 if (Imm & (1 << i)) 5902 NewImm |= 0xf << (i * 4); 5903 } 5904 return getI8Imm(NewImm, SDLoc(N)); 5905}]>; 5906 5907// Turn a 2-bit blendi immediate to 4-bit for use with pblendd. 5908def BlendScaleImm2to4 : SDNodeXForm<imm, [{ 5909 uint8_t Imm = N->getZExtValue(); 5910 uint8_t NewImm = 0; 5911 for (unsigned i = 0; i != 2; ++i) { 5912 if (Imm & (1 << i)) 5913 NewImm |= 0x3 << (i * 2); 5914 } 5915 return getI8Imm(NewImm, SDLoc(N)); 5916}]>; 5917 5918// Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it. 5919def BlendScaleCommuteImm4 : SDNodeXForm<imm, [{ 5920 uint8_t Imm = N->getZExtValue(); 5921 uint8_t NewImm = 0; 5922 for (unsigned i = 0; i != 4; ++i) { 5923 if (Imm & (1 << i)) 5924 NewImm |= 0x3 << (i * 2); 5925 } 5926 return getI8Imm(NewImm ^ 0xff, SDLoc(N)); 5927}]>; 5928 5929// Turn a 2-bit blendi immediate to 8-bit for use with pblendw and invert it. 
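// For example, the 2-bit immediate 0b01 scales to the pblendw mask 0x0f
// (word mask for 64-bit lane 0) and then inverts to 0xf0 to account for the
// swapped operand order.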
5930def BlendScaleCommuteImm2 : SDNodeXForm<imm, [{ 5931 uint8_t Imm = N->getZExtValue(); 5932 uint8_t NewImm = 0; 5933 for (unsigned i = 0; i != 2; ++i) { 5934 if (Imm & (1 << i)) 5935 NewImm |= 0xf << (i * 4); 5936 } 5937 return getI8Imm(NewImm ^ 0xff, SDLoc(N)); 5938}]>; 5939 5940// Turn a 2-bit blendi immediate to 4-bit for use with pblendd and invert it. 5941def BlendScaleCommuteImm2to4 : SDNodeXForm<imm, [{ 5942 uint8_t Imm = N->getZExtValue(); 5943 uint8_t NewImm = 0; 5944 for (unsigned i = 0; i != 2; ++i) { 5945 if (Imm & (1 << i)) 5946 NewImm |= 0x3 << (i * 2); 5947 } 5948 return getI8Imm(NewImm ^ 0xf, SDLoc(N)); 5949}]>; 5950 5951let Predicates = [HasAVX] in { 5952 let isCommutable = 0 in { 5953 defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, 5954 VR128, load, i128mem, 0, 5955 SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG; 5956 } 5957 5958 let ExeDomain = SSEPackedSingle in 5959 defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps, 5960 VR128, load, f128mem, 0, 5961 SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG; 5962 let ExeDomain = SSEPackedDouble in 5963 defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd, 5964 VR128, load, f128mem, 0, 5965 SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG; 5966 let ExeDomain = SSEPackedSingle in 5967 defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256, 5968 VR256, load, i256mem, 0, 5969 SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG; 5970} 5971 5972let Predicates = [HasAVX2] in { 5973 let isCommutable = 0 in { 5974 defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw, 5975 VR256, load, i256mem, 0, 5976 SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, VEX_WIG; 5977 } 5978} 5979 5980let Constraints = "$src1 = $dst" in { 5981 let isCommutable = 0 in { 5982 defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw, 5983 VR128, memop, i128mem, 1, 5984 SchedWriteMPSAD.XMM>; 5985 } 5986 5987 let ExeDomain = SSEPackedSingle in 5988 defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps, 5989 VR128, memop, f128mem, 1, 5990 SchedWriteDPPS.XMM>; 5991 let ExeDomain = SSEPackedDouble in 5992 defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd, 5993 VR128, memop, f128mem, 1, 5994 SchedWriteDPPD.XMM>; 5995} 5996 5997/// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate 5998multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, 5999 ValueType OpVT, RegisterClass RC, PatFrag memop_frag, 6000 X86MemOperand x86memop, bit Is2Addr, Domain d, 6001 X86FoldableSchedWrite sched, SDNodeXForm commuteXForm> { 6002let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in { 6003 let isCommutable = 1 in 6004 def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), 6005 (ins RC:$src1, RC:$src2, u8imm:$src3), 6006 !if(Is2Addr, 6007 !strconcat(OpcodeStr, 6008 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6009 !strconcat(OpcodeStr, 6010 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6011 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>, 6012 Sched<[sched]>; 6013 def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), 6014 (ins RC:$src1, x86memop:$src2, u8imm:$src3), 6015 !if(Is2Addr, 6016 !strconcat(OpcodeStr, 6017 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 6018 !strconcat(OpcodeStr, 6019 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), 6020 [(set RC:$dst, 6021 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), imm:$src3)))]>, 6022 Sched<[sched.Folded, sched.ReadAfterFold]>; 6023} 6024 6025 // Pattern to commute if load is in 
first source. 6026 def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, imm:$src3)), 6027 (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2, 6028 (commuteXForm imm:$src3))>; 6029} 6030 6031let Predicates = [HasAVX] in { 6032 defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32, 6033 VR128, load, f128mem, 0, SSEPackedSingle, 6034 SchedWriteFBlend.XMM, BlendCommuteImm4>, 6035 VEX_4V, VEX_WIG; 6036 defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32, 6037 VR256, load, f256mem, 0, SSEPackedSingle, 6038 SchedWriteFBlend.YMM, BlendCommuteImm8>, 6039 VEX_4V, VEX_L, VEX_WIG; 6040 defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64, 6041 VR128, load, f128mem, 0, SSEPackedDouble, 6042 SchedWriteFBlend.XMM, BlendCommuteImm2>, 6043 VEX_4V, VEX_WIG; 6044 defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64, 6045 VR256, load, f256mem, 0, SSEPackedDouble, 6046 SchedWriteFBlend.YMM, BlendCommuteImm4>, 6047 VEX_4V, VEX_L, VEX_WIG; 6048 defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16, 6049 VR128, load, i128mem, 0, SSEPackedInt, 6050 SchedWriteBlend.XMM, BlendCommuteImm8>, 6051 VEX_4V, VEX_WIG; 6052} 6053 6054let Predicates = [HasAVX2] in { 6055 defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16, 6056 VR256, load, i256mem, 0, SSEPackedInt, 6057 SchedWriteBlend.YMM, BlendCommuteImm8>, 6058 VEX_4V, VEX_L, VEX_WIG; 6059} 6060 6061// Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw. 6062// ExecutionDomainFixPass will cleanup domains later on. 6063let Predicates = [HasAVX1Only] in { 6064def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), imm:$src3), 6065 (VBLENDPDYrri VR256:$src1, VR256:$src2, imm:$src3)>; 6066def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), imm:$src3), 6067 (VBLENDPDYrmi VR256:$src1, addr:$src2, imm:$src3)>; 6068def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, imm:$src3), 6069 (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 imm:$src3))>; 6070 6071// Use pblendw for 128-bit integer to keep it in the integer domain and prevent 6072// it from becoming movsd via commuting under optsize. 6073def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3), 6074 (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 imm:$src3))>; 6075def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), imm:$src3), 6076 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 imm:$src3))>; 6077def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, imm:$src3), 6078 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 imm:$src3))>; 6079 6080def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), imm:$src3), 6081 (VBLENDPSYrri VR256:$src1, VR256:$src2, imm:$src3)>; 6082def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), imm:$src3), 6083 (VBLENDPSYrmi VR256:$src1, addr:$src2, imm:$src3)>; 6084def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, imm:$src3), 6085 (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 imm:$src3))>; 6086 6087// Use pblendw for 128-bit integer to keep it in the integer domain and prevent 6088// it from becoming movss via commuting under optsize. 
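// E.g. a v4i32 blend immediate of 0b0001 becomes the pblendw mask 0x03 (each
// dword bit expands to two word bits); the commuted-load form uses the
// inverted mask 0xfc instead.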
6089def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3), 6090 (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 imm:$src3))>; 6091def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), imm:$src3), 6092 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>; 6093def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, imm:$src3), 6094 (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>; 6095} 6096 6097defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32, 6098 VR128, memop, f128mem, 1, SSEPackedSingle, 6099 SchedWriteFBlend.XMM, BlendCommuteImm4>; 6100defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64, 6101 VR128, memop, f128mem, 1, SSEPackedDouble, 6102 SchedWriteFBlend.XMM, BlendCommuteImm2>; 6103defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16, 6104 VR128, memop, i128mem, 1, SSEPackedInt, 6105 SchedWriteBlend.XMM, BlendCommuteImm8>; 6106 6107let Predicates = [UseSSE41] in { 6108// Use pblendw for 128-bit integer to keep it in the integer domain and prevent 6109// it from becoming movss via commuting under optsize. 6110def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3), 6111 (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 imm:$src3))>; 6112def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), imm:$src3), 6113 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 imm:$src3))>; 6114def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, imm:$src3), 6115 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 imm:$src3))>; 6116 6117def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), imm:$src3), 6118 (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 imm:$src3))>; 6119def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), imm:$src3), 6120 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>; 6121def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, imm:$src3), 6122 (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>; 6123} 6124 6125// For insertion into the zero index (low half) of a 256-bit vector, it is 6126// more efficient to generate a blend with immediate instead of an insert*128. 
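// E.g. for v4f64 the low half comes from the widened 128-bit value with a
// vblendpd immediate of 0x3 (or 0xc when the 256-bit operand is the folded
// load); for v8f32 the corresponding vblendps masks are 0xf and 0xf0.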
6127let Predicates = [HasAVX] in { 6128def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)), 6129 (VBLENDPDYrri VR256:$src1, 6130 (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), 6131 VR128:$src2, sub_xmm), 0x3)>; 6132def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)), 6133 (VBLENDPSYrri VR256:$src1, 6134 (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), 6135 VR128:$src2, sub_xmm), 0xf)>; 6136 6137def : Pat<(insert_subvector (loadv4f64 addr:$src2), (v2f64 VR128:$src1), (iPTR 0)), 6138 (VBLENDPDYrmi (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), 6139 VR128:$src1, sub_xmm), addr:$src2, 0xc)>; 6140def : Pat<(insert_subvector (loadv8f32 addr:$src2), (v4f32 VR128:$src1), (iPTR 0)), 6141 (VBLENDPSYrmi (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), 6142 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; 6143} 6144 6145/// SS41I_quaternary_vx - AVX SSE 4.1 with 4 operators 6146multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC, 6147 X86MemOperand x86memop, ValueType VT, 6148 PatFrag mem_frag, SDNode OpNode, 6149 X86FoldableSchedWrite sched> { 6150 def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst), 6151 (ins RC:$src1, RC:$src2, RC:$src3), 6152 !strconcat(OpcodeStr, 6153 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 6154 [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))], 6155 SSEPackedInt>, TAPD, VEX_4V, 6156 Sched<[sched]>; 6157 6158 def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst), 6159 (ins RC:$src1, x86memop:$src2, RC:$src3), 6160 !strconcat(OpcodeStr, 6161 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 6162 [(set RC:$dst, 6163 (OpNode RC:$src3, (mem_frag addr:$src2), 6164 RC:$src1))], SSEPackedInt>, TAPD, VEX_4V, 6165 Sched<[sched.Folded, sched.ReadAfterFold, 6166 // x86memop:$src2 6167 ReadDefault, ReadDefault, ReadDefault, ReadDefault, 6168 ReadDefault, 6169 // RC::$src3 6170 sched.ReadAfterFold]>; 6171} 6172 6173let Predicates = [HasAVX] in { 6174let ExeDomain = SSEPackedDouble in { 6175defm VBLENDVPD : SS41I_quaternary_avx<0x4B, "vblendvpd", VR128, f128mem, 6176 v2f64, loadv2f64, X86Blendv, 6177 SchedWriteFVarBlend.XMM>; 6178defm VBLENDVPDY : SS41I_quaternary_avx<0x4B, "vblendvpd", VR256, f256mem, 6179 v4f64, loadv4f64, X86Blendv, 6180 SchedWriteFVarBlend.YMM>, VEX_L; 6181} // ExeDomain = SSEPackedDouble 6182let ExeDomain = SSEPackedSingle in { 6183defm VBLENDVPS : SS41I_quaternary_avx<0x4A, "vblendvps", VR128, f128mem, 6184 v4f32, loadv4f32, X86Blendv, 6185 SchedWriteFVarBlend.XMM>; 6186defm VBLENDVPSY : SS41I_quaternary_avx<0x4A, "vblendvps", VR256, f256mem, 6187 v8f32, loadv8f32, X86Blendv, 6188 SchedWriteFVarBlend.YMM>, VEX_L; 6189} // ExeDomain = SSEPackedSingle 6190defm VPBLENDVB : SS41I_quaternary_avx<0x4C, "vpblendvb", VR128, i128mem, 6191 v16i8, loadv16i8, X86Blendv, 6192 SchedWriteVarBlend.XMM>; 6193} 6194 6195let Predicates = [HasAVX2] in { 6196defm VPBLENDVBY : SS41I_quaternary_avx<0x4C, "vpblendvb", VR256, i256mem, 6197 v32i8, loadv32i8, X86Blendv, 6198 SchedWriteVarBlend.YMM>, VEX_L; 6199} 6200 6201let Predicates = [HasAVX] in { 6202 def : Pat<(v4i32 (X86Blendv (v4i32 VR128:$mask), (v4i32 VR128:$src1), 6203 (v4i32 VR128:$src2))), 6204 (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>; 6205 def : Pat<(v2i64 (X86Blendv (v2i64 VR128:$mask), (v2i64 VR128:$src1), 6206 (v2i64 VR128:$src2))), 6207 (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>; 6208 def : Pat<(v8i32 (X86Blendv (v8i32 VR256:$mask), (v8i32 VR256:$src1), 6209 (v8i32 VR256:$src2))), 6210 (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>; 
  def : Pat<(v4i64 (X86Blendv (v4i64 VR256:$mask), (v4i64 VR256:$src1),
                              (v4i64 VR256:$src2))),
            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
}

// Only use blends for movss/movsd when optimizing for speed: blends have
// better throughput on Sandy Bridge and Haswell, but the movs[s/d] encodings
// are 1-2 bytes shorter, so under optsize we keep movss/movsd instead.
let Predicates = [HasAVX, OptForSpeed] in {
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;

  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;

  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;

  // Move low f32 and clear high bits.
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
                     (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
                     (i8 1))), sub_xmm)>;
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
                     (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
                     (i8 3))), sub_xmm)>;
}

// Only use blends for movss/movsd when optimizing for speed: blends have
// better throughput on Sandy Bridge and Haswell, but the movs[s/d] encodings
// are 1-2 bytes shorter, so under optsize we keep movss/movsd instead.
let Predicates = [UseSSE41, OptForSpeed] in {
  // With SSE41 we can use blends for these patterns.
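  // (X86vzmovl keeps element 0 and zeroes the rest, so it maps onto a blend
  // against an all-zeros register: blendps with immediate 1, or pblendw with
  // immediate 3 for the integer case.)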
6257 def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), 6258 (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; 6259 def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), 6260 (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>; 6261 6262 def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)), 6263 (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>; 6264 def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))), 6265 (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>; 6266 def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)), 6267 (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>; 6268 6269 def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)), 6270 (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>; 6271 def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))), 6272 (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>; 6273 def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)), 6274 (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>; 6275} 6276 6277 6278/// SS41I_ternary - SSE 4.1 ternary operator 6279let Uses = [XMM0], Constraints = "$src1 = $dst" in { 6280 multiclass SS41I_ternary<bits<8> opc, string OpcodeStr, ValueType VT, 6281 PatFrag mem_frag, X86MemOperand x86memop, 6282 SDNode OpNode, X86FoldableSchedWrite sched> { 6283 def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), 6284 (ins VR128:$src1, VR128:$src2), 6285 !strconcat(OpcodeStr, 6286 "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"), 6287 [(set VR128:$dst, 6288 (VT (OpNode XMM0, VR128:$src2, VR128:$src1)))]>, 6289 Sched<[sched]>; 6290 6291 def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst), 6292 (ins VR128:$src1, x86memop:$src2), 6293 !strconcat(OpcodeStr, 6294 "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"), 6295 [(set VR128:$dst, 6296 (OpNode XMM0, (mem_frag addr:$src2), VR128:$src1))]>, 6297 Sched<[sched.Folded, sched.ReadAfterFold]>; 6298 } 6299} 6300 6301let ExeDomain = SSEPackedDouble in 6302defm BLENDVPD : SS41I_ternary<0x15, "blendvpd", v2f64, memopv2f64, f128mem, 6303 X86Blendv, SchedWriteFVarBlend.XMM>; 6304let ExeDomain = SSEPackedSingle in 6305defm BLENDVPS : SS41I_ternary<0x14, "blendvps", v4f32, memopv4f32, f128mem, 6306 X86Blendv, SchedWriteFVarBlend.XMM>; 6307defm PBLENDVB : SS41I_ternary<0x10, "pblendvb", v16i8, memopv16i8, i128mem, 6308 X86Blendv, SchedWriteVarBlend.XMM>; 6309 6310// Aliases with the implicit xmm0 argument 6311def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}", 6312 (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>; 6313def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}", 6314 (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>; 6315def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}", 6316 (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>; 6317def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}", 6318 (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>; 6319def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}", 6320 (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>; 6321def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}", 6322 (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>; 6323 6324let Predicates = [UseSSE41] in { 6325 def : Pat<(v4i32 (X86Blendv (v4i32 XMM0), (v4i32 VR128:$src1), 6326 (v4i32 VR128:$src2))), 6327 (BLENDVPSrr0 VR128:$src2, VR128:$src1)>; 6328 def : Pat<(v2i64 (X86Blendv (v2i64 XMM0), (v2i64 VR128:$src1), 6329 (v2i64 VR128:$src2))), 6330 (BLENDVPDrr0 VR128:$src2, VR128:$src1)>; 6331} 6332 6333let AddedComplexity = 400 in { // Prefer non-temporal versions 6334 6335let Predicates = [HasAVX, NoVLX] in 6336def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 6337 "vmovntdqa\t{$src, 
$dst|$dst, $src}", []>, 6338 Sched<[SchedWriteVecMoveLSNT.XMM.RM]>, VEX, VEX_WIG; 6339let Predicates = [HasAVX2, NoVLX] in 6340def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), 6341 "vmovntdqa\t{$src, $dst|$dst, $src}", []>, 6342 Sched<[SchedWriteVecMoveLSNT.YMM.RM]>, VEX, VEX_L, VEX_WIG; 6343def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 6344 "movntdqa\t{$src, $dst|$dst, $src}", []>, 6345 Sched<[SchedWriteVecMoveLSNT.XMM.RM]>; 6346 6347let Predicates = [HasAVX2, NoVLX] in { 6348 def : Pat<(v8f32 (alignednontemporalload addr:$src)), 6349 (VMOVNTDQAYrm addr:$src)>; 6350 def : Pat<(v4f64 (alignednontemporalload addr:$src)), 6351 (VMOVNTDQAYrm addr:$src)>; 6352 def : Pat<(v4i64 (alignednontemporalload addr:$src)), 6353 (VMOVNTDQAYrm addr:$src)>; 6354 def : Pat<(v8i32 (alignednontemporalload addr:$src)), 6355 (VMOVNTDQAYrm addr:$src)>; 6356 def : Pat<(v16i16 (alignednontemporalload addr:$src)), 6357 (VMOVNTDQAYrm addr:$src)>; 6358 def : Pat<(v32i8 (alignednontemporalload addr:$src)), 6359 (VMOVNTDQAYrm addr:$src)>; 6360} 6361 6362let Predicates = [HasAVX, NoVLX] in { 6363 def : Pat<(v4f32 (alignednontemporalload addr:$src)), 6364 (VMOVNTDQArm addr:$src)>; 6365 def : Pat<(v2f64 (alignednontemporalload addr:$src)), 6366 (VMOVNTDQArm addr:$src)>; 6367 def : Pat<(v2i64 (alignednontemporalload addr:$src)), 6368 (VMOVNTDQArm addr:$src)>; 6369 def : Pat<(v4i32 (alignednontemporalload addr:$src)), 6370 (VMOVNTDQArm addr:$src)>; 6371 def : Pat<(v8i16 (alignednontemporalload addr:$src)), 6372 (VMOVNTDQArm addr:$src)>; 6373 def : Pat<(v16i8 (alignednontemporalload addr:$src)), 6374 (VMOVNTDQArm addr:$src)>; 6375} 6376 6377let Predicates = [UseSSE41] in { 6378 def : Pat<(v4f32 (alignednontemporalload addr:$src)), 6379 (MOVNTDQArm addr:$src)>; 6380 def : Pat<(v2f64 (alignednontemporalload addr:$src)), 6381 (MOVNTDQArm addr:$src)>; 6382 def : Pat<(v2i64 (alignednontemporalload addr:$src)), 6383 (MOVNTDQArm addr:$src)>; 6384 def : Pat<(v4i32 (alignednontemporalload addr:$src)), 6385 (MOVNTDQArm addr:$src)>; 6386 def : Pat<(v8i16 (alignednontemporalload addr:$src)), 6387 (MOVNTDQArm addr:$src)>; 6388 def : Pat<(v16i8 (alignednontemporalload addr:$src)), 6389 (MOVNTDQArm addr:$src)>; 6390} 6391 6392} // AddedComplexity 6393 6394//===----------------------------------------------------------------------===// 6395// SSE4.2 - Compare Instructions 6396//===----------------------------------------------------------------------===// 6397 6398/// SS42I_binop_rm - Simple SSE 4.2 binary operator 6399multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, 6400 ValueType OpVT, RegisterClass RC, PatFrag memop_frag, 6401 X86MemOperand x86memop, X86FoldableSchedWrite sched, 6402 bit Is2Addr = 1> { 6403 def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst), 6404 (ins RC:$src1, RC:$src2), 6405 !if(Is2Addr, 6406 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 6407 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 6408 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, 6409 Sched<[sched]>; 6410 def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst), 6411 (ins RC:$src1, x86memop:$src2), 6412 !if(Is2Addr, 6413 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 6414 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 6415 [(set RC:$dst, 6416 (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>, 6417 Sched<[sched.Folded, sched.ReadAfterFold]>; 6418} 6419 6420let Predicates = [HasAVX] in 6421 defm VPCMPGTQ : 
SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128, 6422 load, i128mem, SchedWriteVecALU.XMM, 0>, 6423 VEX_4V, VEX_WIG; 6424 6425let Predicates = [HasAVX2] in 6426 defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256, 6427 load, i256mem, SchedWriteVecALU.YMM, 0>, 6428 VEX_4V, VEX_L, VEX_WIG; 6429 6430let Constraints = "$src1 = $dst" in 6431 defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128, 6432 memop, i128mem, SchedWriteVecALU.XMM>; 6433 6434//===----------------------------------------------------------------------===// 6435// SSE4.2 - String/text Processing Instructions 6436//===----------------------------------------------------------------------===// 6437 6438multiclass pcmpistrm_SS42AI<string asm> { 6439 def rr : SS42AI<0x62, MRMSrcReg, (outs), 6440 (ins VR128:$src1, VR128:$src2, u8imm:$src3), 6441 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), 6442 []>, Sched<[WritePCmpIStrM]>; 6443 let mayLoad = 1 in 6444 def rm :SS42AI<0x62, MRMSrcMem, (outs), 6445 (ins VR128:$src1, i128mem:$src2, u8imm:$src3), 6446 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), 6447 []>, Sched<[WritePCmpIStrM.Folded, WritePCmpIStrM.ReadAfterFold]>; 6448} 6449 6450let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in { 6451 let Predicates = [HasAVX] in 6452 defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX; 6453 defm PCMPISTRM : pcmpistrm_SS42AI<"pcmpistrm"> ; 6454} 6455 6456multiclass SS42AI_pcmpestrm<string asm> { 6457 def rr : SS42AI<0x60, MRMSrcReg, (outs), 6458 (ins VR128:$src1, VR128:$src3, u8imm:$src5), 6459 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), 6460 []>, Sched<[WritePCmpEStrM]>; 6461 let mayLoad = 1 in 6462 def rm : SS42AI<0x60, MRMSrcMem, (outs), 6463 (ins VR128:$src1, i128mem:$src3, u8imm:$src5), 6464 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), 6465 []>, Sched<[WritePCmpEStrM.Folded, WritePCmpEStrM.ReadAfterFold]>; 6466} 6467 6468let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in { 6469 let Predicates = [HasAVX] in 6470 defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX; 6471 defm PCMPESTRM : SS42AI_pcmpestrm<"pcmpestrm">; 6472} 6473 6474multiclass SS42AI_pcmpistri<string asm> { 6475 def rr : SS42AI<0x63, MRMSrcReg, (outs), 6476 (ins VR128:$src1, VR128:$src2, u8imm:$src3), 6477 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), 6478 []>, Sched<[WritePCmpIStrI]>; 6479 let mayLoad = 1 in 6480 def rm : SS42AI<0x63, MRMSrcMem, (outs), 6481 (ins VR128:$src1, i128mem:$src2, u8imm:$src3), 6482 !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), 6483 []>, Sched<[WritePCmpIStrI.Folded, WritePCmpIStrI.ReadAfterFold]>; 6484} 6485 6486let Defs = [ECX, EFLAGS], hasSideEffects = 0 in { 6487 let Predicates = [HasAVX] in 6488 defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX; 6489 defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">; 6490} 6491 6492multiclass SS42AI_pcmpestri<string asm> { 6493 def rr : SS42AI<0x61, MRMSrcReg, (outs), 6494 (ins VR128:$src1, VR128:$src3, u8imm:$src5), 6495 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), 6496 []>, Sched<[WritePCmpEStrI]>; 6497 let mayLoad = 1 in 6498 def rm : SS42AI<0x61, MRMSrcMem, (outs), 6499 (ins VR128:$src1, i128mem:$src3, u8imm:$src5), 6500 !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), 6501 []>, Sched<[WritePCmpEStrI.Folded, WritePCmpEStrI.ReadAfterFold]>; 6502} 6503 6504let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in { 6505 let Predicates = 
[HasAVX] in 6506 defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX; 6507 defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">; 6508} 6509 6510//===----------------------------------------------------------------------===// 6511// SSE4.2 - CRC Instructions 6512//===----------------------------------------------------------------------===// 6513 6514// No CRC instructions have AVX equivalents 6515 6516// crc intrinsic instruction 6517// This set of instructions are only rm, the only difference is the size 6518// of r and m. 6519class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut, 6520 RegisterClass RCIn, SDPatternOperator Int> : 6521 SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2), 6522 !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"), 6523 [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))]>, 6524 Sched<[WriteCRC32]>; 6525 6526class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut, 6527 X86MemOperand x86memop, SDPatternOperator Int> : 6528 SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2), 6529 !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"), 6530 [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))]>, 6531 Sched<[WriteCRC32.Folded, WriteCRC32.ReadAfterFold]>; 6532 6533let Constraints = "$src1 = $dst" in { 6534 def CRC32r32m8 : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem, 6535 int_x86_sse42_crc32_32_8>; 6536 def CRC32r32r8 : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8, 6537 int_x86_sse42_crc32_32_8>; 6538 def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem, 6539 int_x86_sse42_crc32_32_16>, OpSize16; 6540 def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16, 6541 int_x86_sse42_crc32_32_16>, OpSize16; 6542 def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem, 6543 int_x86_sse42_crc32_32_32>, OpSize32; 6544 def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32, 6545 int_x86_sse42_crc32_32_32>, OpSize32; 6546 def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem, 6547 int_x86_sse42_crc32_64_64>, REX_W; 6548 def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64, 6549 int_x86_sse42_crc32_64_64>, REX_W; 6550 let hasSideEffects = 0 in { 6551 let mayLoad = 1 in 6552 def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem, 6553 null_frag>, REX_W; 6554 def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8, 6555 null_frag>, REX_W; 6556 } 6557} 6558 6559//===----------------------------------------------------------------------===// 6560// SHA-NI Instructions 6561//===----------------------------------------------------------------------===// 6562 6563// FIXME: Is there a better scheduler class for SHA than WriteVecIMul? 
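// A note on operands (context for the UsesXMM0 bit below and the sha1rnds4
// immediate): sha256rnds2 reads xmm0 as an implicit third source operand,
// and the 2-bit sha1rnds4 immediate selects which of the four SHA-1
// round-constant/function groups (rounds 0-19, 20-39, 40-59, 60-79) to apply.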
6564multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId, 6565 X86FoldableSchedWrite sched, bit UsesXMM0 = 0> { 6566 def rr : I<Opc, MRMSrcReg, (outs VR128:$dst), 6567 (ins VR128:$src1, VR128:$src2), 6568 !if(UsesXMM0, 6569 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"), 6570 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")), 6571 [!if(UsesXMM0, 6572 (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)), 6573 (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, 6574 T8, Sched<[sched]>; 6575 6576 def rm : I<Opc, MRMSrcMem, (outs VR128:$dst), 6577 (ins VR128:$src1, i128mem:$src2), 6578 !if(UsesXMM0, 6579 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"), 6580 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")), 6581 [!if(UsesXMM0, 6582 (set VR128:$dst, (IntId VR128:$src1, 6583 (memop addr:$src2), XMM0)), 6584 (set VR128:$dst, (IntId VR128:$src1, 6585 (memop addr:$src2))))]>, T8, 6586 Sched<[sched.Folded, sched.ReadAfterFold]>; 6587} 6588 6589let Constraints = "$src1 = $dst", Predicates = [HasSHA] in { 6590 def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst), 6591 (ins VR128:$src1, VR128:$src2, u8imm:$src3), 6592 "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}", 6593 [(set VR128:$dst, 6594 (int_x86_sha1rnds4 VR128:$src1, VR128:$src2, 6595 (i8 imm:$src3)))]>, TA, 6596 Sched<[SchedWriteVecIMul.XMM]>; 6597 def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst), 6598 (ins VR128:$src1, i128mem:$src2, u8imm:$src3), 6599 "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}", 6600 [(set VR128:$dst, 6601 (int_x86_sha1rnds4 VR128:$src1, 6602 (memop addr:$src2), 6603 (i8 imm:$src3)))]>, TA, 6604 Sched<[SchedWriteVecIMul.XMM.Folded, 6605 SchedWriteVecIMul.XMM.ReadAfterFold]>; 6606 6607 defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte, 6608 SchedWriteVecIMul.XMM>; 6609 defm SHA1MSG1 : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1, 6610 SchedWriteVecIMul.XMM>; 6611 defm SHA1MSG2 : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2, 6612 SchedWriteVecIMul.XMM>; 6613 6614 let Uses=[XMM0] in 6615 defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2, 6616 SchedWriteVecIMul.XMM, 1>; 6617 6618 defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1, 6619 SchedWriteVecIMul.XMM>; 6620 defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2, 6621 SchedWriteVecIMul.XMM>; 6622} 6623 6624// Aliases with explicit %xmm0 6625def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}", 6626 (SHA256RNDS2rr VR128:$dst, VR128:$src2), 0>; 6627def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}", 6628 (SHA256RNDS2rm VR128:$dst, i128mem:$src2), 0>; 6629 6630//===----------------------------------------------------------------------===// 6631// AES-NI Instructions 6632//===----------------------------------------------------------------------===// 6633 6634multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, 6635 Intrinsic IntId, PatFrag ld_frag, 6636 bit Is2Addr = 0, RegisterClass RC = VR128, 6637 X86MemOperand MemOp = i128mem> { 6638 let AsmString = OpcodeStr## 6639 !if(Is2Addr, "\t{$src2, $dst|$dst, $src2}", 6640 "\t{$src2, $src1, $dst|$dst, $src1, $src2}") in { 6641 def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst), 6642 (ins RC:$src1, RC:$src2), "", 6643 [(set RC:$dst, (IntId RC:$src1, RC:$src2))]>, 6644 Sched<[WriteAESDecEnc]>; 6645 def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst), 6646 (ins RC:$src1, MemOp:$src2), "", 6647 [(set RC:$dst, (IntId RC:$src1, (ld_frag addr:$src2)))]>, 6648 
Sched<[WriteAESDecEnc.Folded, WriteAESDecEnc.ReadAfterFold]>; 6649 } 6650} 6651 6652// Perform One Round of an AES Encryption/Decryption Flow 6653let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in { 6654 defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc", 6655 int_x86_aesni_aesenc, load>, VEX_4V, VEX_WIG; 6656 defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast", 6657 int_x86_aesni_aesenclast, load>, VEX_4V, VEX_WIG; 6658 defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec", 6659 int_x86_aesni_aesdec, load>, VEX_4V, VEX_WIG; 6660 defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast", 6661 int_x86_aesni_aesdeclast, load>, VEX_4V, VEX_WIG; 6662} 6663 6664let Predicates = [NoVLX, HasVAES] in { 6665 defm VAESENCY : AESI_binop_rm_int<0xDC, "vaesenc", 6666 int_x86_aesni_aesenc_256, load, 0, VR256, 6667 i256mem>, VEX_4V, VEX_L, VEX_WIG; 6668 defm VAESENCLASTY : AESI_binop_rm_int<0xDD, "vaesenclast", 6669 int_x86_aesni_aesenclast_256, load, 0, VR256, 6670 i256mem>, VEX_4V, VEX_L, VEX_WIG; 6671 defm VAESDECY : AESI_binop_rm_int<0xDE, "vaesdec", 6672 int_x86_aesni_aesdec_256, load, 0, VR256, 6673 i256mem>, VEX_4V, VEX_L, VEX_WIG; 6674 defm VAESDECLASTY : AESI_binop_rm_int<0xDF, "vaesdeclast", 6675 int_x86_aesni_aesdeclast_256, load, 0, VR256, 6676 i256mem>, VEX_4V, VEX_L, VEX_WIG; 6677} 6678 6679let Constraints = "$src1 = $dst" in { 6680 defm AESENC : AESI_binop_rm_int<0xDC, "aesenc", 6681 int_x86_aesni_aesenc, memop, 1>; 6682 defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast", 6683 int_x86_aesni_aesenclast, memop, 1>; 6684 defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec", 6685 int_x86_aesni_aesdec, memop, 1>; 6686 defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast", 6687 int_x86_aesni_aesdeclast, memop, 1>; 6688} 6689 6690// Perform the AES InvMixColumn Transformation 6691let Predicates = [HasAVX, HasAES] in { 6692 def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), 6693 (ins VR128:$src1), 6694 "vaesimc\t{$src1, $dst|$dst, $src1}", 6695 [(set VR128:$dst, 6696 (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>, 6697 VEX, VEX_WIG; 6698 def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), 6699 (ins i128mem:$src1), 6700 "vaesimc\t{$src1, $dst|$dst, $src1}", 6701 [(set VR128:$dst, (int_x86_aesni_aesimc (load addr:$src1)))]>, 6702 Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG; 6703} 6704def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), 6705 (ins VR128:$src1), 6706 "aesimc\t{$src1, $dst|$dst, $src1}", 6707 [(set VR128:$dst, 6708 (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>; 6709def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), 6710 (ins i128mem:$src1), 6711 "aesimc\t{$src1, $dst|$dst, $src1}", 6712 [(set VR128:$dst, (int_x86_aesni_aesimc (memop addr:$src1)))]>, 6713 Sched<[WriteAESIMC.Folded]>; 6714 6715// AES Round Key Generation Assist 6716let Predicates = [HasAVX, HasAES] in { 6717 def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), 6718 (ins VR128:$src1, u8imm:$src2), 6719 "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", 6720 [(set VR128:$dst, 6721 (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>, 6722 Sched<[WriteAESKeyGen]>, VEX, VEX_WIG; 6723 def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), 6724 (ins i128mem:$src1, u8imm:$src2), 6725 "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", 6726 [(set VR128:$dst, 6727 (int_x86_aesni_aeskeygenassist (load addr:$src1), imm:$src2))]>, 6728 Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG; 6729} 6730def AESKEYGENASSIST128rr : AESAI<0xDF, 
MRMSrcReg, (outs VR128:$dst), 6731 (ins VR128:$src1, u8imm:$src2), 6732 "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", 6733 [(set VR128:$dst, 6734 (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>, 6735 Sched<[WriteAESKeyGen]>; 6736def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), 6737 (ins i128mem:$src1, u8imm:$src2), 6738 "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", 6739 [(set VR128:$dst, 6740 (int_x86_aesni_aeskeygenassist (memop addr:$src1), imm:$src2))]>, 6741 Sched<[WriteAESKeyGen.Folded]>; 6742 6743//===----------------------------------------------------------------------===// 6744// PCLMUL Instructions 6745//===----------------------------------------------------------------------===// 6746 6747// Immediate transform to help with commuting. 6748def PCLMULCommuteImm : SDNodeXForm<imm, [{ 6749 uint8_t Imm = N->getZExtValue(); 6750 return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N)); 6751}]>; 6752 6753// SSE carry-less Multiplication instructions 6754let Predicates = [NoAVX, HasPCLMUL] in { 6755 let Constraints = "$src1 = $dst" in { 6756 let isCommutable = 1 in 6757 def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), 6758 (ins VR128:$src1, VR128:$src2, u8imm:$src3), 6759 "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", 6760 [(set VR128:$dst, 6761 (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>, 6762 Sched<[WriteCLMul]>; 6763 6764 def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), 6765 (ins VR128:$src1, i128mem:$src2, u8imm:$src3), 6766 "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", 6767 [(set VR128:$dst, 6768 (int_x86_pclmulqdq VR128:$src1, (memop addr:$src2), 6769 imm:$src3))]>, 6770 Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>; 6771 } // Constraints = "$src1 = $dst" 6772 6773 def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1, 6774 (i8 imm:$src3)), 6775 (PCLMULQDQrm VR128:$src1, addr:$src2, 6776 (PCLMULCommuteImm imm:$src3))>; 6777} // Predicates = [NoAVX, HasPCLMUL] 6778 6779// SSE aliases 6780foreach HI = ["hq","lq"] in 6781foreach LO = ["hq","lq"] in { 6782 def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}", 6783 (PCLMULQDQrr VR128:$dst, VR128:$src, 6784 !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>; 6785 def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}", 6786 (PCLMULQDQrm VR128:$dst, i128mem:$src, 6787 !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>; 6788} 6789 6790// AVX carry-less Multiplication instructions 6791multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp, 6792 PatFrag LdFrag, Intrinsic IntId> { 6793 let isCommutable = 1 in 6794 def rr : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst), 6795 (ins RC:$src1, RC:$src2, u8imm:$src3), 6796 "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 6797 [(set RC:$dst, 6798 (IntId RC:$src1, RC:$src2, imm:$src3))]>, 6799 Sched<[WriteCLMul]>; 6800 6801 def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst), 6802 (ins RC:$src1, MemOp:$src2, u8imm:$src3), 6803 "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 6804 [(set RC:$dst, 6805 (IntId RC:$src1, (LdFrag addr:$src2), imm:$src3))]>, 6806 Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>; 6807 6808 // We can commute a load in the first operand by swapping the sources and 6809 // rotating the immediate. 
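// Only imm[0] (quadword select for the first source) and imm[4] (quadword
// select for the second source) are significant, so exchanging the two
// nibbles, e.g. 0x01 <-> 0x10, has the same effect as swapping the operands.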
6810 def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 imm:$src3)), 6811 (!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2, 6812 (PCLMULCommuteImm imm:$src3))>; 6813} 6814 6815let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in 6816defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, load, 6817 int_x86_pclmulqdq>, VEX_4V, VEX_WIG; 6818 6819let Predicates = [NoVLX, HasVPCLMULQDQ] in 6820defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load, 6821 int_x86_pclmulqdq_256>, VEX_4V, VEX_L, VEX_WIG; 6822 6823multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC, 6824 X86MemOperand MemOp, string Hi, string Lo> { 6825 def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}", 6826 (!cast<Instruction>(InstStr # "rr") RC:$dst, RC:$src1, RC:$src2, 6827 !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>; 6828 def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}", 6829 (!cast<Instruction>(InstStr # "rm") RC:$dst, RC:$src1, MemOp:$src2, 6830 !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>; 6831} 6832 6833multiclass vpclmulqdq_aliases<string InstStr, RegisterClass RC, 6834 X86MemOperand MemOp> { 6835 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "hq">; 6836 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "lq">; 6837 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "hq">; 6838 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "lq">; 6839} 6840 6841// AVX aliases 6842defm : vpclmulqdq_aliases<"VPCLMULQDQ", VR128, i128mem>; 6843defm : vpclmulqdq_aliases<"VPCLMULQDQY", VR256, i256mem>; 6844 6845//===----------------------------------------------------------------------===// 6846// SSE4A Instructions 6847//===----------------------------------------------------------------------===// 6848 6849let Predicates = [HasSSE4A] in { 6850 6851let ExeDomain = SSEPackedInt in { 6852let Constraints = "$src = $dst" in { 6853def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst), 6854 (ins VR128:$src, u8imm:$len, u8imm:$idx), 6855 "extrq\t{$idx, $len, $src|$src, $len, $idx}", 6856 [(set VR128:$dst, (X86extrqi VR128:$src, imm:$len, 6857 imm:$idx))]>, 6858 PD, Sched<[SchedWriteVecALU.XMM]>; 6859def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst), 6860 (ins VR128:$src, VR128:$mask), 6861 "extrq\t{$mask, $src|$src, $mask}", 6862 [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src, 6863 VR128:$mask))]>, 6864 PD, Sched<[SchedWriteVecALU.XMM]>; 6865 6866def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst), 6867 (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx), 6868 "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}", 6869 [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2, 6870 imm:$len, imm:$idx))]>, 6871 XD, Sched<[SchedWriteVecALU.XMM]>; 6872def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst), 6873 (ins VR128:$src, VR128:$mask), 6874 "insertq\t{$mask, $src|$src, $mask}", 6875 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src, 6876 VR128:$mask))]>, 6877 XD, Sched<[SchedWriteVecALU.XMM]>; 6878} 6879} // ExeDomain = SSEPackedInt 6880 6881// Non-temporal (unaligned) scalar stores. 
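// movntss/movntsd (SSE4A) store a single f32/f64 with a non-temporal hint,
// bypassing the caches; unlike movntps/movntpd they impose no alignment
// requirement on the destination.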
6882let AddedComplexity = 400 in { // Prefer non-temporal versions 6883let hasSideEffects = 0, mayStore = 1, SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in { 6884def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src), 6885 "movntss\t{$src, $dst|$dst, $src}", []>, XS; 6886 6887def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), 6888 "movntsd\t{$src, $dst|$dst, $src}", []>, XD; 6889} // SchedRW 6890 6891def : Pat<(nontemporalstore FR32:$src, addr:$dst), 6892 (MOVNTSS addr:$dst, (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>; 6893 6894def : Pat<(nontemporalstore FR64:$src, addr:$dst), 6895 (MOVNTSD addr:$dst, (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>; 6896 6897} // AddedComplexity 6898} // HasSSE4A 6899 6900//===----------------------------------------------------------------------===// 6901// AVX Instructions 6902//===----------------------------------------------------------------------===// 6903 6904//===----------------------------------------------------------------------===// 6905// VBROADCAST - Load from memory and broadcast to all elements of the 6906// destination operand 6907// 6908class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC, 6909 X86MemOperand x86memop, ValueType VT, 6910 PatFrag ld_frag, SchedWrite Sched> : 6911 AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), 6912 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 6913 [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>, 6914 Sched<[Sched]>, VEX; 6915 6916// AVX2 adds register forms 6917class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC, 6918 ValueType ResVT, ValueType OpVT, SchedWrite Sched> : 6919 AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), 6920 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 6921 [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>, 6922 Sched<[Sched]>, VEX; 6923 6924let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in { 6925 def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128, 6926 f32mem, v4f32, loadf32, 6927 SchedWriteFShuffle.XMM.Folded>; 6928 def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256, 6929 f32mem, v8f32, loadf32, 6930 SchedWriteFShuffle.XMM.Folded>, VEX_L; 6931} 6932let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in 6933def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem, 6934 v4f64, loadf64, 6935 SchedWriteFShuffle.XMM.Folded>, VEX_L; 6936 6937let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in { 6938 def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128, 6939 v4f32, v4f32, SchedWriteFShuffle.XMM>; 6940 def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256, 6941 v8f32, v4f32, WriteFShuffle256>, VEX_L; 6942} 6943let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in 6944def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256, 6945 v4f64, v2f64, WriteFShuffle256>, VEX_L; 6946 6947let Predicates = [HasAVX, NoVLX] in { 6948 def : Pat<(v4f32 (X86VBroadcast (v4f32 (scalar_to_vector (loadf32 addr:$src))))), 6949 (VBROADCASTSSrm addr:$src)>; 6950 def : Pat<(v8f32 (X86VBroadcast (v4f32 (scalar_to_vector (loadf32 addr:$src))))), 6951 (VBROADCASTSSYrm addr:$src)>; 6952 def : Pat<(v4f64 (X86VBroadcast (v2f64 (scalar_to_vector (loadf64 addr:$src))))), 6953 (VBROADCASTSDYrm addr:$src)>; 6954} 6955 6956//===----------------------------------------------------------------------===// 6957// VBROADCAST*128 - Load from memory and broadcast 128-bit 
vector to both 6958// halves of a 256-bit vector. 6959// 6960let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in 6961def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst), 6962 (ins i128mem:$src), 6963 "vbroadcasti128\t{$src, $dst|$dst, $src}", []>, 6964 Sched<[WriteShuffleLd]>, VEX, VEX_L; 6965 6966let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX], 6967 ExeDomain = SSEPackedSingle in 6968def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst), 6969 (ins f128mem:$src), 6970 "vbroadcastf128\t{$src, $dst|$dst, $src}", []>, 6971 Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L; 6972 6973let Predicates = [HasAVX, NoVLX] in { 6974def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))), 6975 (VBROADCASTF128 addr:$src)>; 6976def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))), 6977 (VBROADCASTF128 addr:$src)>; 6978} 6979 6980// NOTE: We're using FP instructions here, but execution domain fixing can 6981// convert to integer when profitable. 6982let Predicates = [HasAVX, NoVLX] in { 6983def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))), 6984 (VBROADCASTF128 addr:$src)>; 6985def : Pat<(v8i32 (X86SubVBroadcast (loadv4i32 addr:$src))), 6986 (VBROADCASTF128 addr:$src)>; 6987def : Pat<(v16i16 (X86SubVBroadcast (loadv8i16 addr:$src))), 6988 (VBROADCASTF128 addr:$src)>; 6989def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))), 6990 (VBROADCASTF128 addr:$src)>; 6991} 6992 6993//===----------------------------------------------------------------------===// 6994// VINSERTF128 - Insert packed floating-point values 6995// 6996let hasSideEffects = 0, ExeDomain = SSEPackedSingle in { 6997def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst), 6998 (ins VR256:$src1, VR128:$src2, u8imm:$src3), 6999 "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 7000 []>, Sched<[WriteFShuffle256]>, VEX_4V, VEX_L; 7001let mayLoad = 1 in 7002def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), 7003 (ins VR256:$src1, f128mem:$src2, u8imm:$src3), 7004 "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 7005 []>, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>, VEX_4V, VEX_L; 7006} 7007 7008// To create a 256-bit all ones value, we should produce VCMPTRUEPS 7009// with YMM register containing zero. 7010// FIXME: Avoid producing vxorps to clear the fake inputs. 
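// Comparison predicate 0xf is TRUE_UQ, so the compare below produces all-ones
// in every element regardless of the zeroed inputs.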
7011let Predicates = [HasAVX1Only] in { 7012def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>; 7013} 7014 7015multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To, 7016 PatFrag memop_frag> { 7017 def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2), 7018 (iPTR imm)), 7019 (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2, 7020 (INSERT_get_vinsert128_imm VR256:$ins))>; 7021 def : Pat<(vinsert128_insert:$ins (To VR256:$src1), 7022 (From (memop_frag addr:$src2)), 7023 (iPTR imm)), 7024 (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2, 7025 (INSERT_get_vinsert128_imm VR256:$ins))>; 7026} 7027 7028let Predicates = [HasAVX, NoVLX] in { 7029 defm : vinsert_lowering<"VINSERTF128", v4f32, v8f32, loadv4f32>; 7030 defm : vinsert_lowering<"VINSERTF128", v2f64, v4f64, loadv2f64>; 7031} 7032 7033let Predicates = [HasAVX1Only] in { 7034 defm : vinsert_lowering<"VINSERTF128", v2i64, v4i64, loadv2i64>; 7035 defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32, loadv4i32>; 7036 defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv8i16>; 7037 defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8, loadv16i8>; 7038} 7039 7040//===----------------------------------------------------------------------===// 7041// VEXTRACTF128 - Extract packed floating-point values 7042// 7043let hasSideEffects = 0, ExeDomain = SSEPackedSingle in { 7044def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst), 7045 (ins VR256:$src1, u8imm:$src2), 7046 "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}", 7047 []>, Sched<[WriteFShuffle256]>, VEX, VEX_L; 7048let mayStore = 1 in 7049def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs), 7050 (ins f128mem:$dst, VR256:$src1, u8imm:$src2), 7051 "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}", 7052 []>, Sched<[WriteFStoreX]>, VEX, VEX_L; 7053} 7054 7055multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> { 7056 def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), 7057 (To (!cast<Instruction>(InstrStr#rr) 7058 (From VR256:$src1), 7059 (EXTRACT_get_vextract128_imm VR128:$ext)))>; 7060 def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1), 7061 (iPTR imm))), addr:$dst), 7062 (!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1, 7063 (EXTRACT_get_vextract128_imm VR128:$ext))>; 7064} 7065 7066// AVX1 patterns 7067let Predicates = [HasAVX, NoVLX] in { 7068 defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>; 7069 defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>; 7070} 7071 7072let Predicates = [HasAVX1Only] in { 7073 defm : vextract_lowering<"VEXTRACTF128", v4i64, v2i64>; 7074 defm : vextract_lowering<"VEXTRACTF128", v8i32, v4i32>; 7075 defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>; 7076 defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>; 7077} 7078 7079//===----------------------------------------------------------------------===// 7080// VMASKMOV - Conditional SIMD Packed Loads and Stores 7081// 7082multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr, 7083 Intrinsic IntLd, Intrinsic IntLd256, 7084 Intrinsic IntSt, Intrinsic IntSt256> { 7085 def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst), 7086 (ins VR128:$src1, f128mem:$src2), 7087 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7088 [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>, 7089 VEX_4V, Sched<[WriteFMaskedLoad]>; 7090 def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst), 7091 (ins VR256:$src1, f256mem:$src2), 7092 
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7093 [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>, 7094 VEX_4V, VEX_L, Sched<[WriteFMaskedLoadY]>; 7095 def mr : AVX8I<opc_mr, MRMDestMem, (outs), 7096 (ins f128mem:$dst, VR128:$src1, VR128:$src2), 7097 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7098 [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, 7099 VEX_4V, Sched<[WriteFMaskedStore]>; 7100 def Ymr : AVX8I<opc_mr, MRMDestMem, (outs), 7101 (ins f256mem:$dst, VR256:$src1, VR256:$src2), 7102 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7103 [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, 7104 VEX_4V, VEX_L, Sched<[WriteFMaskedStoreY]>; 7105} 7106 7107let ExeDomain = SSEPackedSingle in 7108defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps", 7109 int_x86_avx_maskload_ps, 7110 int_x86_avx_maskload_ps_256, 7111 int_x86_avx_maskstore_ps, 7112 int_x86_avx_maskstore_ps_256>; 7113let ExeDomain = SSEPackedDouble in 7114defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd", 7115 int_x86_avx_maskload_pd, 7116 int_x86_avx_maskload_pd_256, 7117 int_x86_avx_maskstore_pd, 7118 int_x86_avx_maskstore_pd_256>; 7119 7120//===----------------------------------------------------------------------===// 7121// VPERMIL - Permute Single and Double Floating-Point Values 7122// 7123 7124multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr, 7125 RegisterClass RC, X86MemOperand x86memop_f, 7126 X86MemOperand x86memop_i, 7127 ValueType f_vt, ValueType i_vt, 7128 X86FoldableSchedWrite sched, 7129 X86FoldableSchedWrite varsched> { 7130 let Predicates = [HasAVX, NoVLX] in { 7131 def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst), 7132 (ins RC:$src1, RC:$src2), 7133 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7134 [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V, 7135 Sched<[varsched]>; 7136 def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst), 7137 (ins RC:$src1, x86memop_i:$src2), 7138 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7139 [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, 7140 (i_vt (load addr:$src2)))))]>, VEX_4V, 7141 Sched<[varsched.Folded, sched.ReadAfterFold]>; 7142 7143 def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst), 7144 (ins RC:$src1, u8imm:$src2), 7145 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7146 [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX, 7147 Sched<[sched]>; 7148 def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst), 7149 (ins x86memop_f:$src1, u8imm:$src2), 7150 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7151 [(set RC:$dst, 7152 (f_vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX, 7153 Sched<[sched.Folded]>; 7154 }// Predicates = [HasAVX, NoVLX] 7155} 7156 7157let ExeDomain = SSEPackedSingle in { 7158 defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem, 7159 v4f32, v4i32, SchedWriteFShuffle.XMM, 7160 SchedWriteFVarShuffle.XMM>; 7161 defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem, 7162 v8f32, v8i32, SchedWriteFShuffle.YMM, 7163 SchedWriteFVarShuffle.YMM>, VEX_L; 7164} 7165let ExeDomain = SSEPackedDouble in { 7166 defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem, 7167 v2f64, v2i64, SchedWriteFShuffle.XMM, 7168 SchedWriteFVarShuffle.XMM>; 7169 defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem, 7170 v4f64, v4i64, 
                                 SchedWriteFShuffle.YMM,
                                 SchedWriteFVarShuffle.YMM>, VEX_L;
}

//===----------------------------------------------------------------------===//
// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
//

let ExeDomain = SSEPackedSingle in {
let isCommutable = 1 in
def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR256:$src2, u8imm:$src3),
          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
                              (i8 imm:$src3))))]>, VEX_4V, VEX_L,
          Sched<[WriteFShuffle256]>;
def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4f64 addr:$src2),
                              (i8 imm:$src3)))]>, VEX_4V, VEX_L,
          Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
}

// Immediate transform to help with commuting. imm[1:0] and imm[5:4] select
// the source of the low and high 128-bit halves of the result; flipping bits
// 1 and 5 (xor 0x22) swaps which input each half is taken from.
def Perm2XCommuteImm : SDNodeXForm<imm, [{
  return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N));
}]>;

let Predicates = [HasAVX] in {
// Pattern with load in other operand.
def : Pat<(v4f64 (X86VPerm2x128 (loadv4f64 addr:$src2),
                                VR256:$src1, (i8 imm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;
}

let Predicates = [HasAVX1Only] in {
def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
          (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
                  (loadv4i64 addr:$src2), (i8 imm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
// Pattern with load in other operand.
def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
                                VR256:$src1, (i8 imm:$imm))),
          (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;
}

//===----------------------------------------------------------------------===//
// VZERO - Zero YMM registers
// Note: These instructions do not affect YMM16-YMM31.
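// vzeroupper only clears bits 255:128 of YMM0-YMM15 and is the cheap way to
// avoid AVX/SSE transition penalties before executing legacy SSE code;
// vzeroall clears the full registers.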
7221// 7222 7223let SchedRW = [WriteSystem] in { 7224let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, 7225 YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in { 7226 // Zero All YMM registers 7227 def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall", 7228 [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, 7229 Requires<[HasAVX]>, VEX_WIG; 7230 7231 // Zero Upper bits of YMM registers 7232 def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper", 7233 [(int_x86_avx_vzeroupper)]>, PS, VEX, 7234 Requires<[HasAVX]>, VEX_WIG; 7235} // Defs 7236} // SchedRW 7237 7238//===----------------------------------------------------------------------===// 7239// Half precision conversion instructions 7240// 7241 7242multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, 7243 X86FoldableSchedWrite sched> { 7244 def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), 7245 "vcvtph2ps\t{$src, $dst|$dst, $src}", 7246 [(set RC:$dst, (X86cvtph2ps VR128:$src))]>, 7247 T8PD, VEX, Sched<[sched]>; 7248 let hasSideEffects = 0, mayLoad = 1 in 7249 def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), 7250 "vcvtph2ps\t{$src, $dst|$dst, $src}", 7251 [(set RC:$dst, (X86cvtph2ps (loadv8i16 addr:$src)))]>, 7252 T8PD, VEX, Sched<[sched.Folded]>; 7253} 7254 7255multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, 7256 SchedWrite RR, SchedWrite MR> { 7257 def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst), 7258 (ins RC:$src1, i32u8imm:$src2), 7259 "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", 7260 [(set VR128:$dst, (X86cvtps2ph RC:$src1, imm:$src2))]>, 7261 TAPD, VEX, Sched<[RR]>; 7262 let hasSideEffects = 0, mayStore = 1 in 7263 def mr : Ii8<0x1D, MRMDestMem, (outs), 7264 (ins x86memop:$dst, RC:$src1, i32u8imm:$src2), 7265 "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, 7266 TAPD, VEX, Sched<[MR]>; 7267} 7268 7269let Predicates = [HasF16C, NoVLX] in { 7270 defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>; 7271 defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L; 7272 defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH, 7273 WriteCvtPS2PHSt>; 7274 defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY, 7275 WriteCvtPS2PHYSt>, VEX_L; 7276 7277 // Pattern match vcvtph2ps of a scalar i64 load. 7278 def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))), 7279 (VCVTPH2PSrm addr:$src)>; 7280 def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 7281 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), 7282 (VCVTPH2PSrm addr:$src)>; 7283 7284 def : Pat<(store (f64 (extractelt 7285 (bc_v2f64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))), 7286 (iPTR 0))), addr:$dst), 7287 (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; 7288 def : Pat<(store (i64 (extractelt 7289 (bc_v2i64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))), 7290 (iPTR 0))), addr:$dst), 7291 (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; 7292 def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, i32:$src2)), addr:$dst), 7293 (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>; 7294} 7295 7296// Patterns for matching conversions from float to half-float and vice versa. 7297let Predicates = [HasF16C, NoVLX] in { 7298 // Use MXCSR.RC for rounding instead of explicitly specifying the default 7299 // rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the 7300 // configurations we support (the default). However, falling back to MXCSR is 7301 // more consistent with other instructions, which are always controlled by it. 7302 // It's encoded as 0b100. 
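// That is why the patterns below pass the immediate 4 (0b100) to vcvtps2ph:
// with bit 2 set, the rounding mode is taken from MXCSR.RC and imm[1:0] is
// ignored.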
  def : Pat<(fp_to_f16 FR32:$src),
            (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (v8i16 (VCVTPS2PHrr
              (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4))), sub_16bit))>;

  def : Pat<(f16_to_fp GR16:$src),
            (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr
             (v4i32 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)))), FR32))>;

  def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))),
            (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr
             (v8i16 (VCVTPS2PHrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4)))), FR32))>;
}

//===----------------------------------------------------------------------===//
// AVX2 Instructions
//===----------------------------------------------------------------------===//

/// AVX2_blend_rmi - AVX2 blend with 8-bit immediate
multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, X86FoldableSchedWrite sched,
                          RegisterClass RC,
                          X86MemOperand x86memop, SDNodeXForm commuteXForm> {
  let isCommutable = 1 in
  def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
             (ins RC:$src1, RC:$src2, u8imm:$src3),
             !strconcat(OpcodeStr,
                        "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
             [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
             Sched<[sched]>, VEX_4V;
  def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
             (ins RC:$src1, x86memop:$src2, u8imm:$src3),
             !strconcat(OpcodeStr,
                        "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
             [(set RC:$dst,
               (OpVT (OpNode RC:$src1, (load addr:$src2), imm:$src3)))]>,
             Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V;

  // Pattern to commute if load is in first source.
  def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, imm:$src3)),
            (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
                                            (commuteXForm imm:$src3))>;
}

let Predicates = [HasAVX2] in {
defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
                               SchedWriteBlend.XMM, VR128, i128mem,
                               BlendCommuteImm4>;
defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
                                SchedWriteBlend.YMM, VR256, i256mem,
                                BlendCommuteImm8>, VEX_L;

def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), imm:$src3),
          (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 imm:$src3))>;
def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), imm:$src3),
          (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 imm:$src3))>;
def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, imm:$src3),
          (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 imm:$src3))>;

def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), imm:$src3),
          (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 imm:$src3))>;
def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), imm:$src3),
          (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 imm:$src3))>;
def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, imm:$src3),
          (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 imm:$src3))>;
}

// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
// NOTE: We're using FP instructions here, but execution domain fixing should
// take care of using integer instructions when profitable.
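// In vblendps, immediate bit i set selects element i from the second source.
// The register forms below use 0xf to take the inserted 128-bit value for the
// low half; the memory forms use 0xf0 because there the wide vector is the
// second (memory) operand.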
7373let Predicates = [HasAVX] in { 7374def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)), 7375 (VBLENDPSYrri VR256:$src1, 7376 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7377 VR128:$src2, sub_xmm), 0xf)>; 7378def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)), 7379 (VBLENDPSYrri VR256:$src1, 7380 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7381 VR128:$src2, sub_xmm), 0xf)>; 7382def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)), 7383 (VBLENDPSYrri VR256:$src1, 7384 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7385 VR128:$src2, sub_xmm), 0xf)>; 7386def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)), 7387 (VBLENDPSYrri VR256:$src1, 7388 (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7389 VR128:$src2, sub_xmm), 0xf)>; 7390 7391def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1), (iPTR 0)), 7392 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7393 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; 7394def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0)), 7395 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7396 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; 7397def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)), 7398 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7399 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; 7400def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)), 7401 (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7402 VR128:$src1, sub_xmm), addr:$src2, 0xf0)>; 7403} 7404 7405//===----------------------------------------------------------------------===// 7406// VPBROADCAST - Load from memory and broadcast to all elements of the 7407// destination operand 7408// 7409multiclass avx2_broadcast<bits<8> opc, string OpcodeStr, 7410 X86MemOperand x86memop, PatFrag ld_frag, 7411 ValueType OpVT128, ValueType OpVT256, Predicate prd> { 7412 let Predicates = [HasAVX2, prd] in { 7413 def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 7414 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7415 [(set VR128:$dst, 7416 (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>, 7417 Sched<[SchedWriteShuffle.XMM]>, VEX; 7418 def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), 7419 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7420 [(set VR128:$dst, 7421 (OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>, 7422 Sched<[SchedWriteShuffle.XMM.Folded]>, VEX; 7423 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), 7424 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7425 [(set VR256:$dst, 7426 (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>, 7427 Sched<[WriteShuffle256]>, VEX, VEX_L; 7428 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src), 7429 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7430 [(set VR256:$dst, 7431 (OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>, 7432 Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L; 7433 7434 // Provide aliases for broadcast from the same register class that 7435 // automatically does the extract. 
7436 def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))), 7437 (!cast<Instruction>(NAME#"Yrr") 7438 (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>; 7439 } 7440} 7441 7442defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8, 7443 v16i8, v32i8, NoVLX_Or_NoBWI>; 7444defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16, 7445 v8i16, v16i16, NoVLX_Or_NoBWI>; 7446defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32, 7447 v4i32, v8i32, NoVLX>; 7448defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64, 7449 v2i64, v4i64, NoVLX>; 7450 7451let Predicates = [HasAVX2, NoVLX] in { 7452 // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD. 7453 def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))), 7454 (VPBROADCASTQrm addr:$src)>; 7455 def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))), 7456 (VPBROADCASTQYrm addr:$src)>; 7457 7458 def : Pat<(v4i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))), 7459 (VPBROADCASTDrm addr:$src)>; 7460 def : Pat<(v8i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))), 7461 (VPBROADCASTDYrm addr:$src)>; 7462 def : Pat<(v2i64 (X86VBroadcast (v2i64 (scalar_to_vector (loadi64 addr:$src))))), 7463 (VPBROADCASTQrm addr:$src)>; 7464 def : Pat<(v4i64 (X86VBroadcast (v2i64 (scalar_to_vector (loadi64 addr:$src))))), 7465 (VPBROADCASTQYrm addr:$src)>; 7466} 7467let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 7468 // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. 7469 // This means we'll encounter truncated i32 loads; match that here. 7470 def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), 7471 (VPBROADCASTWrm addr:$src)>; 7472 def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), 7473 (VPBROADCASTWYrm addr:$src)>; 7474 def : Pat<(v8i16 (X86VBroadcast 7475 (i16 (trunc (i32 (extloadi16 addr:$src)))))), 7476 (VPBROADCASTWrm addr:$src)>; 7477 def : Pat<(v8i16 (X86VBroadcast 7478 (i16 (trunc (i32 (zextloadi16 addr:$src)))))), 7479 (VPBROADCASTWrm addr:$src)>; 7480 def : Pat<(v16i16 (X86VBroadcast 7481 (i16 (trunc (i32 (extloadi16 addr:$src)))))), 7482 (VPBROADCASTWYrm addr:$src)>; 7483 def : Pat<(v16i16 (X86VBroadcast 7484 (i16 (trunc (i32 (zextloadi16 addr:$src)))))), 7485 (VPBROADCASTWYrm addr:$src)>; 7486} 7487 7488let Predicates = [HasAVX2, NoVLX] in { 7489 // Provide aliases for broadcast from the same register class that 7490 // automatically does the extract. 7491 def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))), 7492 (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), 7493 sub_xmm)))>; 7494 def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))), 7495 (VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), 7496 sub_xmm)))>; 7497} 7498 7499let Predicates = [HasAVX2, NoVLX] in { 7500 // Provide fallback in case the load node that is used in the patterns above 7501 // is used by additional users, which prevents the pattern selection. 
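  // (For example, when the broadcast scalar is also used directly, the load
  // cannot be folded into the broadcast, so broadcast from the register copy
  // instead.)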
7502 def : Pat<(v4f32 (X86VBroadcast FR32:$src)), 7503 (VBROADCASTSSrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>; 7504 def : Pat<(v8f32 (X86VBroadcast FR32:$src)), 7505 (VBROADCASTSSYrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>; 7506 def : Pat<(v4f64 (X86VBroadcast FR64:$src)), 7507 (VBROADCASTSDYrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>; 7508} 7509 7510let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 7511 def : Pat<(v16i8 (X86VBroadcast GR8:$src)), 7512 (VPBROADCASTBrr (v16i8 (COPY_TO_REGCLASS 7513 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), 7514 GR8:$src, sub_8bit)), 7515 VR128)))>; 7516 def : Pat<(v32i8 (X86VBroadcast GR8:$src)), 7517 (VPBROADCASTBYrr (v16i8 (COPY_TO_REGCLASS 7518 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), 7519 GR8:$src, sub_8bit)), 7520 VR128)))>; 7521 7522 def : Pat<(v8i16 (X86VBroadcast GR16:$src)), 7523 (VPBROADCASTWrr (v8i16 (COPY_TO_REGCLASS 7524 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), 7525 GR16:$src, sub_16bit)), 7526 VR128)))>; 7527 def : Pat<(v16i16 (X86VBroadcast GR16:$src)), 7528 (VPBROADCASTWYrr (v8i16 (COPY_TO_REGCLASS 7529 (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), 7530 GR16:$src, sub_16bit)), 7531 VR128)))>; 7532} 7533let Predicates = [HasAVX2, NoVLX] in { 7534 def : Pat<(v4i32 (X86VBroadcast GR32:$src)), 7535 (VPBROADCASTDrr (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>; 7536 def : Pat<(v8i32 (X86VBroadcast GR32:$src)), 7537 (VPBROADCASTDYrr (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>; 7538 def : Pat<(v2i64 (X86VBroadcast GR64:$src)), 7539 (VPBROADCASTQrr (v2i64 (COPY_TO_REGCLASS GR64:$src, VR128)))>; 7540 def : Pat<(v4i64 (X86VBroadcast GR64:$src)), 7541 (VPBROADCASTQYrr (v2i64 (COPY_TO_REGCLASS GR64:$src, VR128)))>; 7542} 7543 7544// AVX1 broadcast patterns 7545let Predicates = [HasAVX1Only] in { 7546def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), 7547 (VBROADCASTSSYrm addr:$src)>; 7548def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), 7549 (VBROADCASTSDYrm addr:$src)>; 7550def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), 7551 (VBROADCASTSSrm addr:$src)>; 7552} 7553 7554 // Provide fallback in case the load node that is used in the patterns above 7555 // is used by additional users, which prevents the pattern selection. 
7556let Predicates = [HasAVX, NoVLX] in { 7557 // 128bit broadcasts: 7558 def : Pat<(v2f64 (X86VBroadcast f64:$src)), 7559 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>; 7560 def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), 7561 (VMOVDDUPrm addr:$src)>; 7562 7563 def : Pat<(v2f64 (X86VBroadcast v2f64:$src)), 7564 (VMOVDDUPrr VR128:$src)>; 7565 def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))), 7566 (VMOVDDUPrm addr:$src)>; 7567 def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))), 7568 (VMOVDDUPrm addr:$src)>; 7569} 7570 7571let Predicates = [HasAVX1Only] in { 7572 def : Pat<(v4f32 (X86VBroadcast FR32:$src)), 7573 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>; 7574 def : Pat<(v8f32 (X86VBroadcast FR32:$src)), 7575 (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), 7576 (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm), 7577 (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>; 7578 def : Pat<(v4f64 (X86VBroadcast FR64:$src)), 7579 (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), 7580 (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm), 7581 (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>; 7582 7583 def : Pat<(v4i32 (X86VBroadcast GR32:$src)), 7584 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)>; 7585 def : Pat<(v8i32 (X86VBroadcast GR32:$src)), 7586 (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), 7587 (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)), sub_xmm), 7588 (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)), 1)>; 7589 def : Pat<(v4i64 (X86VBroadcast GR64:$src)), 7590 (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), 7591 (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)), sub_xmm), 7592 (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)), 1)>; 7593 7594 def : Pat<(v2i64 (X86VBroadcast i64:$src)), 7595 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)>; 7596 def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))), 7597 (VMOVDDUPrm addr:$src)>; 7598} 7599 7600//===----------------------------------------------------------------------===// 7601// VPERM - Permute instructions 7602// 7603 7604multiclass avx2_perm<bits<8> opc, string OpcodeStr, 7605 ValueType OpVT, X86FoldableSchedWrite Sched, 7606 X86MemOperand memOp> { 7607 let Predicates = [HasAVX2, NoVLX] in { 7608 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), 7609 (ins VR256:$src1, VR256:$src2), 7610 !strconcat(OpcodeStr, 7611 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7612 [(set VR256:$dst, 7613 (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>, 7614 Sched<[Sched]>, VEX_4V, VEX_L; 7615 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), 7616 (ins VR256:$src1, memOp:$src2), 7617 !strconcat(OpcodeStr, 7618 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7619 [(set VR256:$dst, 7620 (OpVT (X86VPermv VR256:$src1, 7621 (load addr:$src2))))]>, 7622 Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX_4V, VEX_L; 7623 } 7624} 7625 7626defm VPERMD : avx2_perm<0x36, "vpermd", v8i32, WriteVarShuffle256, i256mem>; 7627let ExeDomain = SSEPackedSingle in 7628defm VPERMPS : avx2_perm<0x16, "vpermps", v8f32, WriteFVarShuffle256, f256mem>; 7629 7630multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, 7631 ValueType OpVT, X86FoldableSchedWrite Sched, 7632 X86MemOperand memOp> { 7633 let Predicates = [HasAVX2, NoVLX] in { 7634 def Yri : AVX2AIi8<opc, MRMSrcReg, (outs 
VR256:$dst), 7635 (ins VR256:$src1, u8imm:$src2), 7636 !strconcat(OpcodeStr, 7637 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7638 [(set VR256:$dst, 7639 (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>, 7640 Sched<[Sched]>, VEX, VEX_L; 7641 def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst), 7642 (ins memOp:$src1, u8imm:$src2), 7643 !strconcat(OpcodeStr, 7644 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7645 [(set VR256:$dst, 7646 (OpVT (X86VPermi (mem_frag addr:$src1), 7647 (i8 imm:$src2))))]>, 7648 Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VEX_L; 7649 } 7650} 7651 7652defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64, 7653 WriteShuffle256, i256mem>, VEX_W; 7654let ExeDomain = SSEPackedDouble in 7655defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64, 7656 WriteFShuffle256, f256mem>, VEX_W; 7657 7658//===----------------------------------------------------------------------===// 7659// VPERM2I128 - Permute Floating-Point Values in 128-bit chunks 7660// 7661let isCommutable = 1 in 7662def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst), 7663 (ins VR256:$src1, VR256:$src2, u8imm:$src3), 7664 "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 7665 [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, 7666 (i8 imm:$src3))))]>, Sched<[WriteShuffle256]>, 7667 VEX_4V, VEX_L; 7668def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst), 7669 (ins VR256:$src1, f256mem:$src2, u8imm:$src3), 7670 "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 7671 [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2), 7672 (i8 imm:$src3)))]>, 7673 Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L; 7674 7675let Predicates = [HasAVX2] in 7676def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2), 7677 VR256:$src1, (i8 imm:$imm))), 7678 (VPERM2I128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>; 7679 7680 7681//===----------------------------------------------------------------------===// 7682// VINSERTI128 - Insert packed integer values 7683// 7684let hasSideEffects = 0 in { 7685def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst), 7686 (ins VR256:$src1, VR128:$src2, u8imm:$src3), 7687 "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 7688 []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L; 7689let mayLoad = 1 in 7690def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst), 7691 (ins VR256:$src1, i128mem:$src2, u8imm:$src3), 7692 "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 7693 []>, Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L; 7694} 7695 7696let Predicates = [HasAVX2, NoVLX] in { 7697 defm : vinsert_lowering<"VINSERTI128", v2i64, v4i64, loadv2i64>; 7698 defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32, loadv4i32>; 7699 defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv8i16>; 7700 defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8, loadv16i8>; 7701} 7702 7703//===----------------------------------------------------------------------===// 7704// VEXTRACTI128 - Extract packed integer values 7705// 7706def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst), 7707 (ins VR256:$src1, u8imm:$src2), 7708 "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, 7709 Sched<[WriteShuffle256]>, VEX, VEX_L; 7710let hasSideEffects = 0, mayStore = 1 in 7711def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs), 7712 (ins i128mem:$dst, VR256:$src1, u8imm:$src2), 
                "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_L;

let Predicates = [HasAVX2, NoVLX] in {
  defm : vextract_lowering<"VEXTRACTI128", v4i64, v2i64>;
  defm : vextract_lowering<"VEXTRACTI128", v8i32, v4i32>;
  defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
  defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>;
}

//===----------------------------------------------------------------------===//
// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
//
multiclass avx2_pmovmask<string OpcodeStr,
                         Intrinsic IntLd128, Intrinsic IntLd256,
                         Intrinsic IntSt128, Intrinsic IntSt256> {
  def rm  : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
            (ins VR128:$src1, i128mem:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>,
            VEX_4V, Sched<[WriteVecMaskedLoad]>;
  def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
            (ins VR256:$src1, i256mem:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
            VEX_4V, VEX_L, Sched<[WriteVecMaskedLoadY]>;
  def mr  : AVX28I<0x8e, MRMDestMem, (outs),
            (ins i128mem:$dst, VR128:$src1, VR128:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>,
            VEX_4V, Sched<[WriteVecMaskedStore]>;
  def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
            (ins i256mem:$dst, VR256:$src1, VR256:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
            VEX_4V, VEX_L, Sched<[WriteVecMaskedStoreY]>;
}

defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
                                int_x86_avx2_maskload_d,
                                int_x86_avx2_maskload_d_256,
                                int_x86_avx2_maskstore_d,
                                int_x86_avx2_maskstore_d_256>;
defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
                                int_x86_avx2_maskload_q,
                                int_x86_avx2_maskload_q_256,
                                int_x86_avx2_maskstore_q,
                                int_x86_avx2_maskstore_q_256>, VEX_W;

multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
                            ValueType MaskVT, string BlendStr, ValueType ZeroVT> {
  // masked store
  def: Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)),
           (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
  // masked load
  def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)),
           (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
  def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask),
                            (VT immAllZerosV))),
           (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
}
let Predicates = [HasAVX] in {
  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32, "VBLENDVPS", v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64, "VBLENDVPD", v4i32>;
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32, "VBLENDVPSY", v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64, "VBLENDVPDY", v8i32>;
}
let Predicates = [HasAVX1Only] in {
  // Integer masked loads/stores are not supported on AVX1; use the ps/pd forms.
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>;
  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32,
"VBLENDVPS", v4i32>; 7785 defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>; 7786} 7787let Predicates = [HasAVX2] in { 7788 defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>; 7789 defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>; 7790 defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>; 7791 defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>; 7792} 7793 7794//===----------------------------------------------------------------------===// 7795// SubVector Broadcasts 7796// Provide fallback in case the load node that is used in the patterns above 7797// is used by additional users, which prevents the pattern selection. 7798 7799let Predicates = [HasAVX, NoVLX] in { 7800def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128:$src))), 7801 (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm), 7802 (v2f64 VR128:$src), 1)>; 7803def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128:$src))), 7804 (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm), 7805 (v4f32 VR128:$src), 1)>; 7806} 7807 7808// NOTE: We're using FP instructions here, but execution domain fixing can 7809// convert to integer when profitable. 7810let Predicates = [HasAVX, NoVLX] in { 7811def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))), 7812 (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm), 7813 (v2i64 VR128:$src), 1)>; 7814def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))), 7815 (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm), 7816 (v4i32 VR128:$src), 1)>; 7817def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))), 7818 (VINSERTF128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm), 7819 (v8i16 VR128:$src), 1)>; 7820def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))), 7821 (VINSERTF128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm), 7822 (v16i8 VR128:$src), 1)>; 7823} 7824 7825//===----------------------------------------------------------------------===// 7826// Variable Bit Shifts 7827// 7828multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, 7829 ValueType vt128, ValueType vt256> { 7830 def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), 7831 (ins VR128:$src1, VR128:$src2), 7832 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7833 [(set VR128:$dst, 7834 (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>, 7835 VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>; 7836 def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), 7837 (ins VR128:$src1, i128mem:$src2), 7838 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7839 [(set VR128:$dst, 7840 (vt128 (OpNode VR128:$src1, 7841 (vt128 (load addr:$src2)))))]>, 7842 VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded, 7843 SchedWriteVarVecShift.XMM.ReadAfterFold]>; 7844 def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), 7845 (ins VR256:$src1, VR256:$src2), 7846 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7847 [(set VR256:$dst, 7848 (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>, 7849 VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>; 7850 def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), 7851 (ins VR256:$src1, i256mem:$src2), 7852 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7853 [(set VR256:$dst, 7854 (vt256 (OpNode VR256:$src1, 7855 (vt256 (load addr:$src2)))))]>, 7856 VEX_4V, VEX_L, 
//===----------------------------------------------------------------------===//
// Variable Bit Shifts
//
multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType vt128, ValueType vt256> {
  def rr  : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src1, VR128:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR128:$dst,
                     (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
                   VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>;
  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
                   (ins VR128:$src1, i128mem:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR128:$dst,
                     (vt128 (OpNode VR128:$src1,
                             (vt128 (load addr:$src2)))))]>,
                   VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded,
                                  SchedWriteVarVecShift.XMM.ReadAfterFold]>;
  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
                   (ins VR256:$src1, VR256:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR256:$dst,
                     (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
                   VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>;
  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
                   (ins VR256:$src1, i256mem:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR256:$dst,
                     (vt256 (OpNode VR256:$src1,
                             (vt256 (load addr:$src2)))))]>,
                   VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded,
                                         SchedWriteVarVecShift.YMM.ReadAfterFold]>;
}

let Predicates = [HasAVX2, NoVLX] in {
  defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", X86vshlv, v4i32, v8i32>;
  defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", X86vshlv, v2i64, v4i64>, VEX_W;
  defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", X86vsrlv, v4i32, v8i32>;
  defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", X86vsrlv, v2i64, v4i64>, VEX_W;
  defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", X86vsrav, v4i32, v8i32>;
}

//===----------------------------------------------------------------------===//
// VGATHER - GATHER Operations

// FIXME: Improve scheduling of gather instructions.
multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx,
                       ValueType VTy, PatFrag GatherNode128,
                       PatFrag GatherNode256, RegisterClass RC256,
                       X86MemOperand memop128, X86MemOperand memop256,
                       ValueType MTx = VTx, ValueType MTy = VTy> {
  def rm  : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
                   (ins VR128:$src1, memop128:$src2, VR128:$mask),
                   !strconcat(OpcodeStr,
                     "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
                   [(set (VTx VR128:$dst), (MTx VR128:$mask_wb),
                         (GatherNode128 VR128:$src1, VR128:$mask,
                                        vectoraddr:$src2))]>,
                   VEX, Sched<[WriteLoad]>;
  def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
                   (ins RC256:$src1, memop256:$src2, RC256:$mask),
                   !strconcat(OpcodeStr,
                     "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
                   [(set (VTy RC256:$dst), (MTy RC256:$mask_wb),
                         (GatherNode256 RC256:$src1, RC256:$mask,
                                        vectoraddr:$src2))]>,
                   VEX, VEX_L, Sched<[WriteLoad]>;
}

let Predicates = [HasAVX2] in {
  let mayLoad = 1, hasSideEffects = 0, Constraints
    = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
    in {
    defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64, mgatherv4i32,
                                  mgatherv4i32, VR256, vx128mem, vx256mem>, VEX_W;
    defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64, mgatherv2i64,
                                  mgatherv4i64, VR256, vx128mem, vy256mem>, VEX_W;
    defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32, mgatherv4i32,
                                  mgatherv8i32, VR256, vx128mem, vy256mem>;
    defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32, mgatherv2i64,
                                  mgatherv4i64, VR128, vx64mem, vy128mem>;

    let ExeDomain = SSEPackedDouble in {
      defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64, mgatherv4i32,
                                    mgatherv4i32, VR256, vx128mem, vx256mem,
                                    v2i64, v4i64>, VEX_W;
      defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64, mgatherv2i64,
                                    mgatherv4i64, VR256, vx128mem, vy256mem,
                                    v2i64, v4i64>, VEX_W;
    }

    let ExeDomain = SSEPackedSingle in {
      defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32, mgatherv4i32,
                                    mgatherv8i32, VR256, vx128mem, vy256mem,
                                    v4i32, v8i32>;
      defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32, mgatherv2i64,
                                    mgatherv4i64, VR128, vx64mem, vy128mem,
                                    v4i32, v4i32>;
    }
  }
}
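
// Usage sketch (informational comment; the operands shown are only an
// example): a dword-index single-precision gather such as the C intrinsic
// _mm256_mask_i32gather_ps maps onto VGATHERDPSYrm, which in AT&T syntax
// reads
//   vgatherdps %ymm2, (%rdi,%ymm1,4), %ymm0
// Both the destination and the mask are read and written by the instruction,
// which is why the definitions above tie $src1 to $dst and $mask to $mask_wb
// and mark the results @earlyclobber.
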
//===----------------------------------------------------------------------===//
// GFNI instructions
//===----------------------------------------------------------------------===//

multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
                        RegisterClass RC, PatFrag MemOpFrag,
                        X86MemOperand X86MemOp, bit Is2Addr = 0> {
  let ExeDomain = SSEPackedInt,
      AsmString = !if(Is2Addr,
        OpcodeStr##"\t{$src2, $dst|$dst, $src2}",
        OpcodeStr##"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
    let isCommutable = 1 in
    def rr : PDI<0xCF, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
                 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
             Sched<[SchedWriteVecALU.XMM]>, T8PD;

    def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
                 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1,
                                       (MemOpFrag addr:$src2))))]>,
             Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>, T8PD;
  }
}

multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
                           SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag,
                           X86MemOperand X86MemOp, bit Is2Addr = 0> {
  let AsmString = !if(Is2Addr,
        OpStr##"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
        OpStr##"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in {
    def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2, u8imm:$src3), "",
                  [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))],
                  SSEPackedInt>, Sched<[SchedWriteVecALU.XMM]>;
    def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
                  [(set RC:$dst, (OpVT (OpNode RC:$src1,
                                        (MemOpFrag addr:$src2),
                                        imm:$src3)))], SSEPackedInt>,
                  Sched<[SchedWriteVecALU.XMM.Folded, SchedWriteVecALU.XMM.ReadAfterFold]>;
  }
}

multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
  let Constraints = "$src1 = $dst",
      Predicates = [HasGFNI, UseSSE2] in
  defm NAME : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
                              VR128, load, i128mem, 1>;
  let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
    defm V##NAME    : GF2P8AFFINE_rmi<Op, "v"##OpStr, v16i8, OpNode, VR128,
                                      load, i128mem>, VEX_4V, VEX_W;
    defm V##NAME##Y : GF2P8AFFINE_rmi<Op, "v"##OpStr, v32i8, OpNode, VR256,
                                      load, i256mem>, VEX_4V, VEX_L, VEX_W;
  }
}

// GF2P8MULB
let Constraints = "$src1 = $dst",
    Predicates = [HasGFNI, UseSSE2] in
defm GF2P8MULB : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop,
                              i128mem, 1>;
let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
  defm VGF2P8MULB  : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load,
                                  i128mem>, VEX_4V;
  defm VGF2P8MULBY : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load,
                                  i256mem>, VEX_4V, VEX_L;
}
// GF2P8AFFINEINVQB, GF2P8AFFINEQB
let isCommutable = 0 in {
  defm GF2P8AFFINEINVQB : GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb",
                                             X86GF2P8affineinvqb>, TAPD;
  defm GF2P8AFFINEQB    : GF2P8AFFINE_common<0xCE, "gf2p8affineqb",
                                             X86GF2P8affineqb>, TAPD;
}
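
// Semantics reminder (informational comment only; none of this is used for
// selection): gf2p8mulb multiplies corresponding bytes as elements of GF(2^8)
// reduced modulo x^8 + x^4 + x^3 + x + 1, so for example 0x02 * 0x80 = 0x1B.
// gf2p8affineqb applies an 8x8 bit matrix, taken from each qword of the
// second source, to every byte of the first source and XORs in the immediate;
// gf2p8affineinvqb does the same after first inverting the byte in GF(2^8).
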