//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file describes the X86 SSE instruction set, defining the instructions
// and the instruction properties that are needed for code generation, machine
// code emission, and analysis.
//
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// SSE 1 & 2 Instruction Classes
//===----------------------------------------------------------------------===//

/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
                           RegisterClass RC, X86MemOperand x86memop,
                           Domain d, X86FoldableSchedWrite sched,
                           bit Is2Addr = 1> {
let isCodeGenOnly = 1 in {
  let isCommutable = 1 in {
    def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
                !if(Is2Addr,
                    !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                    !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], d>,
                Sched<[sched]>;
  }
  def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>,
              Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}

/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
multiclass sse12_fp_scalar_int<bits<8> opc,
                               SDPatternOperator OpNode, RegisterClass RC,
                               ValueType VT, string asm, Operand memopr,
                               PatFrags mem_frags, Domain d,
                               X86FoldableSchedWrite sched, bit Is2Addr = 1> {
let hasSideEffects = 0 in {
  def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
               !if(Is2Addr,
                   !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], d>,
               Sched<[sched]>;
  let mayLoad = 1 in
  def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
               !if(Is2Addr,
                   !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst, (VT (OpNode RC:$src1, (mem_frags addr:$src2))))], d>,
               Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
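// As an illustration (a hypothetical instantiation, not one defined in this
// file), a use of sse12_fp_scalar such as
//
//   defm FOO : sse12_fp_scalar<0x58, "foo", fadd, FR32, f32mem,
//                              SSEPackedSingle, WriteFAdd>;
//
// would expand to two records, FOOrr and FOOrm, with 2-address asm strings
// ("foo\t{$src2, $dst|$dst, $src2}") because Is2Addr defaults to 1; passing
// Is2Addr = 0 selects the 3-operand AVX syntax instead. FOO and opcode 0x58
// are placeholders here; any X86FoldableSchedWrite works for the sched slot.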
/// sse12_fp_packed - SSE 1 & 2 packed instructions class
multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
                           RegisterClass RC, ValueType vt,
                           X86MemOperand x86memop, PatFrag mem_frag,
                           Domain d, X86FoldableSchedWrite sched,
                           bit Is2Addr = 1> {
  let isCommutable = 1 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
                !if(Is2Addr,
                    !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                    !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>,
                Sched<[sched]>;
  let mayLoad = 1 in
    def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                !if(Is2Addr,
                    !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                    !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
                d>,
                Sched<[sched.Folded, sched.ReadAfterFold]>;
}

/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
                                      string OpcodeStr, X86MemOperand x86memop,
                                      X86FoldableSchedWrite sched,
                                      list<dag> pat_rr, list<dag> pat_rm,
                                      bit Is2Addr = 1> {
  let isCommutable = 1, hasSideEffects = 0 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
                !if(Is2Addr,
                    !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                    !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                pat_rr, d>,
                Sched<[sched]>;
  let hasSideEffects = 0, mayLoad = 1 in
    def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                !if(Is2Addr,
                    !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                    !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                pat_rm, d>,
                Sched<[sched.Folded, sched.ReadAfterFold]>;
}


// Alias instructions that map fld0 to xorps for SSE or vxorps for AVX.
// These are expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def FsFLD0SH : I<0, Pseudo, (outs FR16:$dst), (ins), "",
                   [(set FR16:$dst, fp16imm0)]>, Requires<[HasSSE2, NoAVX512]>;
  def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
  def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
                   [(set FR64:$dst, fp64imm0)]>, Requires<[HasSSE2, NoAVX512]>;
  def FsFLD0F128 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                     [(set VR128:$dst, fp128imm0)]>, Requires<[HasSSE1, NoAVX512]>;
}

//===----------------------------------------------------------------------===//
// AVX & SSE - Zero/One Vectors
//===----------------------------------------------------------------------===//

// Alias instruction that maps a zero vector to pxor / xorp* for SSE.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
// swizzled by ExecutionDomainFix to pxor.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
               [(set VR128:$dst, (v4f32 immAllZerosV))]>;
}

let Predicates = [NoAVX512] in {
def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
def : Pat<(v8f16 immAllZerosV), (V_SET0)>;
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
}
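// For reference (a sketch, with an example register allocation):
// rematerializing a zero vector through V_SET0 ends up, after
// ExpandPostRAPseudos, as a self-XOR such as
//
//   xorps %xmm0, %xmm0
//
// and ExecutionDomainFix may later rewrite it to pxor %xmm0, %xmm0 when the
// surrounding code runs in the integer domain.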
// The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support
// packed integer (PI) instructions, and doesn't need them, because on Sandy
// Bridge the register is set to zero at the rename stage without using any
// execution unit, so SET0PSY and SET0PDY can be used for vector int
// instructions without penalty.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                 [(set VR256:$dst, (v8i32 immAllZerosV))]>;
}

let Predicates = [NoAVX512] in {
def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
def : Pat<(v16f16 immAllZerosV), (AVX_SET0)>;
def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
def : Pat<(v8f32 immAllZerosV), (AVX_SET0)>;
def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
}

// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-ones value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
  let Predicates = [HasAVX1Only, OptForMinSize] in {
  def AVX1_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
  }
  let Predicates = [HasAVX2] in
  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
}
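// Analogous sketch for the all-ones pseudos: comparing a register with
// itself for equality yields all-ones lanes, so V_SETALLONES is typically
// expanded to something like
//
//   pcmpeqd %xmm0, %xmm0
//
// (vpcmpeqd for the AVX2 256-bit form). As above, %xmm0 is only an example
// allocation, not a fixed register.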
192//===----------------------------------------------------------------------===// 193 194multiclass sse12_move_rr<SDNode OpNode, ValueType vt, string base_opc, 195 string asm_opr, Domain d> { 196 let isCommutable = 1 in 197 def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst), 198 (ins VR128:$src1, VR128:$src2), 199 !strconcat(base_opc, asm_opr), 200 [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], d>, 201 Sched<[SchedWriteFShuffle.XMM]>; 202 203 // For the disassembler 204 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in 205 def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst), 206 (ins VR128:$src1, VR128:$src2), 207 !strconcat(base_opc, asm_opr), []>, 208 Sched<[SchedWriteFShuffle.XMM]>; 209} 210 211multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt, 212 X86MemOperand x86memop, string OpcodeStr, 213 Domain d, Predicate pred> { 214 // AVX 215 let Predicates = [UseAVX, OptForSize] in 216 defm V#NAME : sse12_move_rr<OpNode, vt, OpcodeStr, 217 "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d>, 218 VEX, VVVV, VEX_LIG, WIG; 219 220 def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), 221 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 222 [(store RC:$src, addr:$dst)], d>, 223 VEX, VEX_LIG, Sched<[WriteFStore]>, WIG; 224 // SSE1 & 2 225 let Constraints = "$src1 = $dst" in { 226 let Predicates = [pred, NoSSE41_Or_OptForSize] in 227 defm NAME : sse12_move_rr<OpNode, vt, OpcodeStr, 228 "\t{$src2, $dst|$dst, $src2}", d>; 229 } 230 231 def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), 232 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 233 [(store RC:$src, addr:$dst)], d>, 234 Sched<[WriteFStore]>; 235 236 def : InstAlias<"v"#OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}", 237 (!cast<Instruction>("V"#NAME#"rr_REV") 238 VR128:$dst, VR128:$src1, VR128:$src2), 0>; 239 def : InstAlias<OpcodeStr#".s\t{$src2, $dst|$dst, $src2}", 240 (!cast<Instruction>(NAME#"rr_REV") 241 VR128:$dst, VR128:$src2), 0>; 242} 243 244// Loading from memory automatically zeroing upper bits. 245multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop, 246 PatFrag mem_pat, PatFrag vzloadfrag, string OpcodeStr, 247 Domain d> { 248 def V#NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), 249 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 250 [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>, 251 VEX, VEX_LIG, Sched<[WriteFLoad]>, WIG; 252 def NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), 253 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 254 [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>, 255 Sched<[WriteFLoad]>; 256 257 // _alt version uses FR32/FR64 register class. 
// Load from memory, automatically zeroing the upper bits.
multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
                         PatFrag mem_pat, PatFrag vzloadfrag, string OpcodeStr,
                         Domain d> {
  def V#NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
                     VEX, VEX_LIG, Sched<[WriteFLoad]>, WIG;
  def NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
                   Sched<[WriteFLoad]>;

  // The _alt versions use the FR32/FR64 register classes.
  let isCodeGenOnly = 1 in {
    def V#NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                           !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                           [(set RC:$dst, (mem_pat addr:$src))], d>,
                           VEX, VEX_LIG, Sched<[WriteFLoad]>, WIG;
    def NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                         [(set RC:$dst, (mem_pat addr:$src))], d>,
                         Sched<[WriteFLoad]>;
  }
}

defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
                        SSEPackedSingle, UseSSE1>, TB, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
                        SSEPackedDouble, UseSSE2>, TB, XD;

let canFoldAsLoad = 1, isReMaterializable = 1 in {
  defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss",
                             SSEPackedSingle>, TB, XS;
  defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, X86vzload64, "movsd",
                             SSEPackedDouble>, TB, XD;
}

// Patterns
let Predicates = [UseAVX] in {
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (VMOVSSrm addr:$src)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (VMOVSDrm addr:$src)>;

  // Represent the same patterns as above, but in the form they appear for
  // 256-bit types.
  def : Pat<(v8f32 (X86vzload32 addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
  def : Pat<(v4f64 (X86vzload64 addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
}

let Predicates = [UseAVX, OptForSize] in {
  // Move scalar to XMM zero-extended: zero a VR128, then do a MOVSS to the
  // lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VMOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VMOVSSrr (v4i32 (V_SET0)), VR128:$src)>;

  // Move low f32 and clear high bits.
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4f32 (VMOVSSrr (v4f32 (V_SET0)),
             (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))), sub_xmm)>;
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4i32 (VMOVSSrr (v4i32 (V_SET0)),
             (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
}
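// Illustrative lowering for the X86vzmovl patterns above (a sketch, with
// example register allocations): zero-extending the low lane of %xmm1 into
// %xmm0 becomes the two-instruction sequence
//
//   vxorps %xmm0, %xmm0, %xmm0
//   vmovss %xmm1, %xmm0, %xmm0
//
// which is why the patterns are restricted to OptForSize; without that
// constraint, a blend with a zero vector (e.g. vblendps) is generally
// preferred.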
let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
// Move scalar to XMM zero-extended: zero a VR128, then do a MOVSS to the
// lower bits.
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
          (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
          (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
}

let Predicates = [UseSSE2] in
def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
          (MOVSDrm addr:$src)>;

let Predicates = [UseSSE1] in
def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
          (MOVSSrm addr:$src)>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
                            X86MemOperand x86memop, PatFrag ld_frag,
                            string asm, Domain d,
                            X86SchedWriteMoveLS sched> {
let hasSideEffects = 0, isMoveReg = 1 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
              Sched<[sched.RR]>;
let canFoldAsLoad = 1, isReMaterializable = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set RC:$dst, (ld_frag addr:$src))], d>,
              Sched<[sched.RM]>;
}

let Predicates = [HasAVX, NoVLX] in {
defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                                TB, VEX, WIG;
defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                                TB, PD, VEX, WIG;
defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
                                SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                                TB, VEX, WIG;
defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
                                SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                                TB, PD, VEX, WIG;

defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps",
                                 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
                                 TB, VEX, VEX_L, WIG;
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd",
                                 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
                                 TB, PD, VEX, VEX_L, WIG;
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups",
                                 SSEPackedSingle, SchedWriteFMoveLS.YMM>,
                                 TB, VEX, VEX_L, WIG;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
                                 SSEPackedDouble, SchedWriteFMoveLS.YMM>,
                                 TB, PD, VEX, VEX_L, WIG;
}

let Predicates = [UseSSE1] in {
defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
                               SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                               TB;
defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
                               SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                               TB;
}
let Predicates = [UseSSE2] in {
defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
                               SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                               TB, PD;
defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
                               SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                               TB, PD;
}

let Predicates = [HasAVX, NoVLX] in {
let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movaps\t{$src, $dst|$dst, $src}",
                     [(alignedstore (v4f32 VR128:$src), addr:$dst)]>,
                     VEX, WIG;
$dst|$dst, $src}", 405 [(alignedstore (v2f64 VR128:$src), addr:$dst)]>, 406 VEX, WIG; 407def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 408 "movups\t{$src, $dst|$dst, $src}", 409 [(store (v4f32 VR128:$src), addr:$dst)]>, 410 VEX, WIG; 411def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 412 "movupd\t{$src, $dst|$dst, $src}", 413 [(store (v2f64 VR128:$src), addr:$dst)]>, 414 VEX, WIG; 415} // SchedRW 416 417let SchedRW = [SchedWriteFMoveLS.YMM.MR] in { 418def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), 419 "movaps\t{$src, $dst|$dst, $src}", 420 [(alignedstore (v8f32 VR256:$src), addr:$dst)]>, 421 VEX, VEX_L, WIG; 422def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), 423 "movapd\t{$src, $dst|$dst, $src}", 424 [(alignedstore (v4f64 VR256:$src), addr:$dst)]>, 425 VEX, VEX_L, WIG; 426def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), 427 "movups\t{$src, $dst|$dst, $src}", 428 [(store (v8f32 VR256:$src), addr:$dst)]>, 429 VEX, VEX_L, WIG; 430def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), 431 "movupd\t{$src, $dst|$dst, $src}", 432 [(store (v4f64 VR256:$src), addr:$dst)]>, 433 VEX, VEX_L, WIG; 434} // SchedRW 435} // Predicate 436 437// For disassembler 438let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, 439 isMoveReg = 1 in { 440let SchedRW = [SchedWriteFMoveLS.XMM.RR] in { 441 def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst), 442 (ins VR128:$src), 443 "movaps\t{$src, $dst|$dst, $src}", []>, 444 VEX, WIG; 445 def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst), 446 (ins VR128:$src), 447 "movapd\t{$src, $dst|$dst, $src}", []>, 448 VEX, WIG; 449 def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst), 450 (ins VR128:$src), 451 "movups\t{$src, $dst|$dst, $src}", []>, 452 VEX, WIG; 453 def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst), 454 (ins VR128:$src), 455 "movupd\t{$src, $dst|$dst, $src}", []>, 456 VEX, WIG; 457} // SchedRW 458 459let SchedRW = [SchedWriteFMoveLS.YMM.RR] in { 460 def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst), 461 (ins VR256:$src), 462 "movaps\t{$src, $dst|$dst, $src}", []>, 463 VEX, VEX_L, WIG; 464 def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst), 465 (ins VR256:$src), 466 "movapd\t{$src, $dst|$dst, $src}", []>, 467 VEX, VEX_L, WIG; 468 def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst), 469 (ins VR256:$src), 470 "movups\t{$src, $dst|$dst, $src}", []>, 471 VEX, VEX_L, WIG; 472 def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst), 473 (ins VR256:$src), 474 "movupd\t{$src, $dst|$dst, $src}", []>, 475 VEX, VEX_L, WIG; 476} // SchedRW 477} // Predicate 478 479// Reversed version with ".s" suffix for GAS compatibility. 
// Reversed versions with a ".s" suffix for GAS compatibility.
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                (VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                (VMOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                (VMOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                (VMOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                (VMOVAPSYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                (VMOVAPDYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                (VMOVUPSYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                (VMOVUPDYrr_REV VR256:$dst, VR256:$src), 0>;

let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)]>;
def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)]>;
} // SchedRW

// For the disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movaps\t{$src, $dst|$dst, $src}", []>;
  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movapd\t{$src, $dst|$dst, $src}", []>;
  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movups\t{$src, $dst|$dst, $src}", []>;
  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movupd\t{$src, $dst|$dst, $src}", []>;
}

// Reversed versions with a ".s" suffix for GAS compatibility.
def : InstAlias<"movaps.s\t{$src, $dst|$dst, $src}",
                (MOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movapd.s\t{$src, $dst|$dst, $src}",
                (MOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movups.s\t{$src, $dst|$dst, $src}",
                (MOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movupd.s\t{$src, $dst|$dst, $src}",
                (MOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
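// Usage note (a sketch): the _REV forms encode the same register-to-register
// move with the store-form opcode (0x29/0x11 instead of 0x28/0x10), i.e. with
// the operands in the "reverse" ModRM direction. Writing
//
//   movaps.s %xmm0, %xmm1
//
// in GAS-compatible assembly selects MOVAPSrr_REV and thus the 0x29 encoding,
// while plain "movaps %xmm0, %xmm1" assembles to the 0x28 form; both have
// identical semantics.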
let Predicates = [HasAVX, NoVLX] in {
  // 256-bit loads/stores need to use floating-point load/store in case we
  // don't have AVX2. Execution domain fixing will convert to integer if AVX2
  // is available and changing the domain is beneficial.
  def : Pat<(alignedloadv4i64 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv8i32 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv16i16 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv32i8 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(loadv4i64 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv8i32 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv16i16 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv32i8 addr:$src),
            (VMOVUPSYrm addr:$src)>;

  def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v16i16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v32i8 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;

  def : Pat<(alignedloadv8f16 addr:$src),
            (VMOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv8bf16 addr:$src),
            (VMOVAPSrm addr:$src)>;
  def : Pat<(loadv8f16 addr:$src),
            (VMOVUPSrm addr:$src)>;
  def : Pat<(loadv8bf16 addr:$src),
            (VMOVUPSrm addr:$src)>;
  def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8bf16 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8f16 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8bf16 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;

  def : Pat<(alignedloadv16f16 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv16bf16 addr:$src),
            (VMOVAPSYrm addr:$src)>;
  def : Pat<(loadv16f16 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv16bf16 addr:$src),
            (VMOVUPSYrm addr:$src)>;
  def : Pat<(alignedstore (v16f16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v16bf16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16f16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16bf16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
}
// Use movaps / movups for SSE integer load / store (one byte shorter).
// The instructions selected below are then converted to MOVDQA/MOVDQU
// during the SSE domain pass.
let Predicates = [UseSSE1] in {
  def : Pat<(alignedloadv2i64 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv4i32 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv8i16 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv16i8 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv4i32 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv8i16 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(loadv16i8 addr:$src),
            (MOVUPSrm addr:$src)>;

  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(alignedloadv8f16 addr:$src),
            (MOVAPSrm addr:$src)>;
  def : Pat<(loadv8f16 addr:$src),
            (MOVUPSrm addr:$src)>;
  def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8f16 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
}
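// Encoding sketch behind the "one byte shorter" note above: the SSE2 integer
// moves carry a mandatory prefix byte that the SSE1 FP moves lack, e.g.
//
//   movaps (%rax), %xmm0   ; 0F 28 00    (3 bytes)
//   movdqa (%rax), %xmm0   ; 66 0F 6F 00 (4 bytes)
//
// so selecting MOVAPS/MOVUPS first and letting the domain pass upgrade them
// only when profitable never costs code size.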
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low packed FP Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_mov_hilo_packed_base<bits<8> opc, SDPatternOperator pdnode,
                                      string base_opc, string asm_opr> {
  // No patterns, as these need to be special-cased between high and low.
  let hasSideEffects = 0, mayLoad = 1 in
  def PSrm : PI<opc, MRMSrcMem,
                (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                !strconcat(base_opc, "s", asm_opr),
                [], SSEPackedSingle>, TB,
                Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;

  def PDrm : PI<opc, MRMSrcMem,
                (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                !strconcat(base_opc, "d", asm_opr),
                [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
                                  (scalar_to_vector (loadf64 addr:$src2)))))],
                SSEPackedDouble>, TB, PD,
                Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
}

multiclass sse12_mov_hilo_packed<bits<8> opc, SDPatternOperator pdnode,
                                 string base_opc> {
  let Predicates = [UseAVX] in
  defm V#NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
                                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
                                           VEX, VVVV, WIG;

  let Constraints = "$src1 = $dst" in
  defm NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
                                         "\t{$src2, $dst|$dst, $src2}">;
}

defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;

let SchedRW = [WriteFStore] in {
let Predicates = [UseAVX] in {
let mayStore = 1, hasSideEffects = 0 in
def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movlps\t{$src, $dst|$dst, $src}",
                     []>,
                     VEX, WIG;
def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movlpd\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt (v2f64 VR128:$src),
                                   (iPTR 0))), addr:$dst)]>,
                     VEX, WIG;
} // UseAVX
let mayStore = 1, hasSideEffects = 0 in
def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlps\t{$src, $dst|$dst, $src}",
                   []>;
def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movlpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt (v2f64 VR128:$src),
                                 (iPTR 0))), addr:$dst)]>;
} // SchedRW

let Predicates = [UseSSE1] in {
  // This pattern helps select MOVLPS on SSE1-only targets. With SSE2 we'll
  // end up with a movsd or blend instead of shufp.
  // No need for an aligned load here; we're only loading 64 bits.
  def : Pat<(X86Shufp (v4f32 (simple_load addr:$src2)), VR128:$src1,
                      (i8 -28)),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)), VR128:$src1, (i8 -28)),
            (MOVLPSrm VR128:$src1, addr:$src2)>;

  def : Pat<(v4f32 (X86vzload64 addr:$src)),
            (MOVLPSrm (v4f32 (V_SET0)), addr:$src)>;
  def : Pat<(X86vextractstore64 (v4f32 VR128:$src), addr:$dst),
            (MOVLPSmr addr:$dst, VR128:$src)>;
}
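// Semantics sketch for the movlp forms defined above: the load variant
// replaces only the low 64 bits of the destination and preserves the upper
// 64 bits, e.g.
//
//   movlps (%rax), %xmm0   ; xmm0[63:0] = mem64, xmm0[127:64] unchanged
//
// while the 0x13 store forms (MOVLPSmr/MOVLPDmr) write those low 64 bits
// back to memory. The registers and address are illustrative only.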
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Hi packed FP Instructions
//===----------------------------------------------------------------------===//

defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">;

let SchedRW = [WriteFStore] in {
// v2f64 extract element 1 is always custom lowered to unpack high to low
// and extract element 0, so the non-store version isn't too horrible.
let Predicates = [UseAVX] in {
let mayStore = 1, hasSideEffects = 0 in
def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movhps\t{$src, $dst|$dst, $src}",
                     []>, VEX, WIG;
def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movhpd\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt
                                   (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                   (iPTR 0))), addr:$dst)]>, VEX, WIG;
} // UseAVX
let mayStore = 1, hasSideEffects = 0 in
def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhps\t{$src, $dst|$dst, $src}",
                   []>;
def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                   "movhpd\t{$src, $dst|$dst, $src}",
                   [(store (f64 (extractelt
                                 (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                 (iPTR 0))), addr:$dst)]>;
} // SchedRW

let Predicates = [UseAVX] in {
  // MOVHPD patterns
  def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;

  def : Pat<(store (f64 (extractelt
                         (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
                         (iPTR 0))), addr:$dst),
            (VMOVHPDmr addr:$dst, VR128:$src)>;

  // MOVLPD patterns
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;
}

let Predicates = [UseSSE1] in {
  // This pattern helps select MOVHPS on SSE1-only targets. With SSE2 we'll
  // end up with a movsd or blend instead of shufp.
  // No need for an aligned load here; we're only loading 64 bits.
  def : Pat<(X86Movlhps VR128:$src1, (v4f32 (simple_load addr:$src2))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Movlhps VR128:$src1, (v4f32 (X86vzload64 addr:$src2))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;

  def : Pat<(X86vextractstore64 (v4f32 (X86Movhlps VR128:$src, VR128:$src)),
                                addr:$dst),
            (MOVHPSmr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE2] in {
  // MOVHPD patterns
  def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;

  def : Pat<(store (f64 (extractelt
                         (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
                         (iPTR 0))), addr:$dst),
            (MOVHPDmr addr:$dst, VR128:$src)>;

  // MOVLPD patterns
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
}
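// Sketch of the store-high idiom the patterns above implement: storing
// element 1 of a v2f64 without a separate extract, e.g.
//
//   movhpd %xmm0, (%rax)   ; mem64 = xmm0[127:64]
//
// which is why the extractelt-of-shuffle trees (X86Unpckh / X86VPermilpi /
// X86Shufp with index 1) fold into a single MOVHPDmr. Operands are
// illustrative.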
let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in {
  // Use MOVLPD to load into the low bits from a full vector unless we can use
  // BLENDPD.
  def : Pat<(X86Movsd VR128:$src1, (v2f64 (simple_load addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
//===----------------------------------------------------------------------===//

let Predicates = [UseAVX] in {
  def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
                        (ins VR128:$src1, VR128:$src2),
                        "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                          (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
                        VEX, VVVV, Sched<[SchedWriteFShuffle.XMM]>, WIG;
  let isCommutable = 1 in
  def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
                        (ins VR128:$src1, VR128:$src2),
                        "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                          (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
                        VEX, VVVV, Sched<[SchedWriteFShuffle.XMM]>, WIG;
}
let Constraints = "$src1 = $dst" in {
  def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
                      Sched<[SchedWriteFShuffle.XMM]>;
  let isCommutable = 1 in
  def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
                      Sched<[SchedWriteFShuffle.XMM]>;
}
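// Shuffle-semantics sketch for the two instructions above (SSE 2-operand
// form, where dst doubles as src1):
//
//   movlhps %xmm1, %xmm0   ; xmm0[127:64] = xmm1[63:0],  low half kept
//   movhlps %xmm1, %xmm0   ; xmm0[63:0]  = xmm1[127:64], high half kept
//
// i.e. they move the low half of src2 up, or the high half of src2 down,
// respectively.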
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Conversion Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                       SDPatternOperator OpNode, X86MemOperand x86memop, PatFrag ld_frag,
                       string asm, string mem, X86FoldableSchedWrite sched,
                       Domain d,
                       SchedRead Int2Fpu = ReadDefault> {
  let ExeDomain = d in {
    def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
                !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                [(set DstRC:$dst, (OpNode SrcRC:$src))]>,
                Sched<[sched, Int2Fpu]>;
    def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
                mem#"\t{$src, $dst|$dst, $src}",
                [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
                Sched<[sched.Folded]>;
  }
}

multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
                       ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
                       string asm, Domain d, X86FoldableSchedWrite sched> {
let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in {
  def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
             [(set RC:$dst, (DstTy (any_sint_to_fp (SrcTy RC:$src))))], d>,
             Sched<[sched]>;
  let mayLoad = 1 in
  def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
             [(set RC:$dst, (DstTy (any_sint_to_fp
                             (SrcTy (ld_frag addr:$src)))))], d>,
             Sched<[sched.Folded]>;
}
}

multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          X86MemOperand x86memop, string asm, string mem,
                          X86FoldableSchedWrite sched, Domain d> {
let hasSideEffects = 0, Predicates = [UseAVX], ExeDomain = d in {
  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
              !strconcat(asm, "\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
              Sched<[sched, ReadDefault, ReadInt2Fpu]>;
  let mayLoad = 1 in
  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
              (ins DstRC:$src1, x86memop:$src),
              asm#"{"#mem#"}\t{$src, $src1, $dst|$dst, $src1, $src}", []>,
              Sched<[sched.Folded, sched.ReadAfterFold]>;
} // hasSideEffects = 0
}

let isCodeGenOnly = 1, Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
                              "cvttss2si", "cvttss2si",
                              WriteCvtSS2I, SSEPackedSingle>,
                              TB, XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
                                "cvttss2si", "cvttss2si",
                                WriteCvtSS2I, SSEPackedSingle>,
                                TB, XS, VEX, REX_W, VEX_LIG;
defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
                              "cvttsd2si", "cvttsd2si",
                              WriteCvtSD2I, SSEPackedDouble>,
                              TB, XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
                                "cvttsd2si", "cvttsd2si",
                                WriteCvtSD2I, SSEPackedDouble>,
                                TB, XD, VEX, REX_W, VEX_LIG;

defm VCVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
                             "cvtss2si", "cvtss2si",
                             WriteCvtSS2I, SSEPackedSingle>,
                             TB, XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
                               "cvtss2si", "cvtss2si",
                               WriteCvtSS2I, SSEPackedSingle>,
                               TB, XS, VEX, REX_W, VEX_LIG;
defm VCVTSD2SI : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
                             "cvtsd2si", "cvtsd2si",
                             WriteCvtSD2I, SSEPackedDouble>,
                             TB, XD, VEX, VEX_LIG;
defm VCVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
                               "cvtsd2si", "cvtsd2si",
                               WriteCvtSD2I, SSEPackedDouble>,
                               TB, XD, VEX, REX_W, VEX_LIG;
}
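// Rounding-behavior sketch for the two opcode groups above: the 0x2C ("tt")
// forms truncate toward zero regardless of MXCSR, matching C-style casts,
// while the 0x2D forms round according to the current MXCSR rounding mode,
// which is why they map to lrint/llrint rather than fp_to_sint, e.g.
//
//   cvttss2si %xmm0, %eax   ; eax = (int)-cast-style, truncated
//   cvtss2si  %xmm0, %eax   ; eax = lrintf-style, MXCSR-rounded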
// The assembler can recognize rr 64-bit instructions by seeing an rxx
// register, but the same isn't true when only memory operands are used, so
// we provide explicit "l" and "q" assembly forms to disambiguate where it is
// appropriate to do so.
let isCodeGenOnly = 1 in {
defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss", "l",
                                WriteCvtI2SS, SSEPackedSingle>, TB, XS, VEX, VVVV,
                                VEX_LIG, SIMD_EXC;
defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss", "q",
                                  WriteCvtI2SS, SSEPackedSingle>, TB, XS, VEX, VVVV,
                                  REX_W, VEX_LIG, SIMD_EXC;
defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd", "l",
                                WriteCvtI2SD, SSEPackedDouble>, TB, XD, VEX, VVVV,
                                VEX_LIG;
defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd", "q",
                                  WriteCvtI2SD, SSEPackedDouble>, TB, XD, VEX, VVVV,
                                  REX_W, VEX_LIG, SIMD_EXC;
} // isCodeGenOnly = 1

let Predicates = [UseAVX] in {
  def : Pat<(f32 (any_sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f32 (any_sint_to_fp (loadi64 addr:$src))),
            (VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (any_sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (any_sint_to_fp (loadi64 addr:$src))),
            (VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;

  def : Pat<(f32 (any_sint_to_fp GR32:$src)),
            (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f32 (any_sint_to_fp GR64:$src)),
            (VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
  def : Pat<(f64 (any_sint_to_fp GR32:$src)),
            (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f64 (any_sint_to_fp GR64:$src)),
            (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;

  def : Pat<(i64 (lrint FR32:$src)), (VCVTSS2SI64rr FR32:$src)>;
  def : Pat<(i64 (lrint (loadf32 addr:$src))), (VCVTSS2SI64rm addr:$src)>;

  def : Pat<(i64 (lrint FR64:$src)), (VCVTSD2SI64rr FR64:$src)>;
  def : Pat<(i64 (lrint (loadf64 addr:$src))), (VCVTSD2SI64rm addr:$src)>;
}

let isCodeGenOnly = 1 in {
defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
                             "cvttss2si", "cvttss2si",
                             WriteCvtSS2I, SSEPackedSingle>, TB, XS, SIMD_EXC;
defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
                               "cvttss2si", "cvttss2si",
                               WriteCvtSS2I, SSEPackedSingle>, TB, XS, REX_W, SIMD_EXC;
defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
                             "cvttsd2si", "cvttsd2si",
                             WriteCvtSD2I, SSEPackedDouble>, TB, XD, SIMD_EXC;
defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
                               "cvttsd2si", "cvttsd2si",
                               WriteCvtSD2I, SSEPackedDouble>, TB, XD, REX_W, SIMD_EXC;

defm CVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
                            "cvtss2si", "cvtss2si",
                            WriteCvtSS2I, SSEPackedSingle>, TB, XS, SIMD_EXC;
defm CVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
                              "cvtss2si", "cvtss2si",
                              WriteCvtSS2I, SSEPackedSingle>, TB, XS, REX_W, SIMD_EXC;
defm CVTSD2SI : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
                            "cvtsd2si", "cvtsd2si",
                            WriteCvtSD2I, SSEPackedDouble>, TB, XD, SIMD_EXC;
defm CVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
                              "cvtsd2si", "cvtsd2si",
                              WriteCvtSD2I, SSEPackedDouble>, TB, XD, REX_W, SIMD_EXC;

defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, any_sint_to_fp, i32mem, loadi32,
                            "cvtsi2ss", "cvtsi2ss{l}",
                            WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, TB, XS, SIMD_EXC;
"cvtsi2ss{q}", 1015 WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, TB, XS, REX_W, SIMD_EXC; 1016defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, any_sint_to_fp, i32mem, loadi32, 1017 "cvtsi2sd", "cvtsi2sd{l}", 1018 WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, TB, XD; 1019defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, any_sint_to_fp, i64mem, loadi64, 1020 "cvtsi2sd", "cvtsi2sd{q}", 1021 WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, TB, XD, REX_W, SIMD_EXC; 1022} // isCodeGenOnly = 1 1023 1024let Predicates = [UseSSE1] in { 1025 def : Pat<(i64 (lrint FR32:$src)), (CVTSS2SI64rr FR32:$src)>; 1026 def : Pat<(i64 (lrint (loadf32 addr:$src))), (CVTSS2SI64rm addr:$src)>; 1027} 1028 1029let Predicates = [UseSSE2] in { 1030 def : Pat<(i64 (lrint FR64:$src)), (CVTSD2SI64rr FR64:$src)>; 1031 def : Pat<(i64 (lrint (loadf64 addr:$src))), (CVTSD2SI64rm addr:$src)>; 1032} 1033 1034// Conversion Instructions Intrinsics - Match intrinsics which expect MM 1035// and/or XMM operand(s). 1036 1037multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, 1038 ValueType DstVT, ValueType SrcVT, SDNode OpNode, 1039 Operand memop, PatFrags mem_frags, string asm, 1040 X86FoldableSchedWrite sched, Domain d> { 1041let ExeDomain = d in { 1042 def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), 1043 !strconcat(asm, "\t{$src, $dst|$dst, $src}"), 1044 [(set DstRC:$dst, (DstVT (OpNode (SrcVT SrcRC:$src))))]>, 1045 Sched<[sched]>; 1046 def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src), 1047 !strconcat(asm, "\t{$src, $dst|$dst, $src}"), 1048 [(set DstRC:$dst, (DstVT (OpNode (SrcVT (mem_frags addr:$src)))))]>, 1049 Sched<[sched.Folded]>; 1050} 1051} 1052 1053multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC, 1054 RegisterClass DstRC, X86MemOperand x86memop, 1055 string asm, string mem, X86FoldableSchedWrite sched, 1056 Domain d, bit Is2Addr = 1> { 1057let hasSideEffects = 0, ExeDomain = d in { 1058 def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2), 1059 !if(Is2Addr, 1060 !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), 1061 !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 1062 []>, Sched<[sched, ReadDefault, ReadInt2Fpu]>; 1063 let mayLoad = 1 in 1064 def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), 1065 (ins DstRC:$src1, x86memop:$src2), 1066 !if(Is2Addr, 1067 asm#"{"#mem#"}\t{$src2, $dst|$dst, $src2}", 1068 asm#"{"#mem#"}\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 1069 []>, Sched<[sched.Folded, sched.ReadAfterFold]>; 1070} 1071} 1072 1073let Uses = [MXCSR], mayRaiseFPException = 1 in { 1074let Predicates = [UseAVX] in { 1075defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, 1076 X86cvts2si, sdmem, sse_load_f64, "cvtsd2si", 1077 WriteCvtSD2I, SSEPackedDouble>, TB, XD, VEX, VEX_LIG; 1078defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, 1079 X86cvts2si, sdmem, sse_load_f64, "cvtsd2si", 1080 WriteCvtSD2I, SSEPackedDouble>, TB, XD, VEX, REX_W, VEX_LIG; 1081} 1082defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, X86cvts2si, 1083 sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I, 1084 SSEPackedDouble>, TB, XD; 1085defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si, 1086 sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I, 1087 SSEPackedDouble>, TB, XD, REX_W; 1088} 1089 1090let Predicates = [UseAVX] in { 1091defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, 1092 i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle, 0>, 1093 TB, XS, VEX, VVVV, VEX_LIG, 
defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                     i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle, 0>,
                     TB, XS, VEX, VVVV, VEX_LIG, REX_W, SIMD_EXC;
defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                   i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble, 0>,
                   TB, XD, VEX, VVVV, VEX_LIG;
defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                     i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble, 0>,
                     TB, XD, VEX, VVVV, VEX_LIG, REX_W, SIMD_EXC;
}
let Constraints = "$src1 = $dst" in {
  defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                    i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle>,
                    TB, XS, SIMD_EXC;
  defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                      i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle>,
                      TB, XS, REX_W, SIMD_EXC;
  defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                    i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble>,
                    TB, XD;
  defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                      i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble>,
                      TB, XD, REX_W, SIMD_EXC;
}

def : InstAlias<"vcvtsi2ss{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VCVTSI2SSrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
def : InstAlias<"vcvtsi2ss{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VCVTSI642SSrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
def : InstAlias<"vcvtsi2sd{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VCVTSI2SDrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
def : InstAlias<"vcvtsi2sd{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                (VCVTSI642SDrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;

def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
                (VCVTSI2SSrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
                (VCVTSI2SDrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;

def : InstAlias<"cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
                (CVTSI2SSrr_Int VR128:$dst, GR32:$src), 0, "att">;
def : InstAlias<"cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
                (CVTSI642SSrr_Int VR128:$dst, GR64:$src), 0, "att">;
def : InstAlias<"cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
                (CVTSI2SDrr_Int VR128:$dst, GR32:$src), 0, "att">;
def : InstAlias<"cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
                (CVTSI642SDrr_Int VR128:$dst, GR64:$src), 0, "att">;

def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
                (CVTSI2SSrm_Int VR128:$dst, i32mem:$src), 0, "att">;
def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
                (CVTSI2SDrm_Int VR128:$dst, i32mem:$src), 0, "att">;

/// SSE 1 Only

// Aliases for intrinsics
let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
                                 ssmem, sse_load_f32, "cvttss2si",
                                 WriteCvtSS2I, SSEPackedSingle>, TB, XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
                                   X86cvtts2Int, ssmem, sse_load_f32,
                                   "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
                                   TB, XS, VEX, VEX_LIG, REX_W;
defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
                                 sdmem, sse_load_f64, "cvttsd2si",
                                 WriteCvtSS2I, SSEPackedDouble>, TB, XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
                                   X86cvtts2Int,
                                   sdmem, sse_load_f64,
                                   "cvttsd2si", WriteCvtSS2I, SSEPackedDouble>,
                                   TB, XD, VEX, VEX_LIG, REX_W;
}
let Uses = [MXCSR], mayRaiseFPException = 1 in {
defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
                                ssmem, sse_load_f32, "cvttss2si",
                                WriteCvtSS2I, SSEPackedSingle>, TB, XS;
defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
                                  X86cvtts2Int, ssmem, sse_load_f32,
                                  "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
                                  TB, XS, REX_W;
defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
                                sdmem, sse_load_f64, "cvttsd2si",
                                WriteCvtSD2I, SSEPackedDouble>, TB, XD;
defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
                                  X86cvtts2Int, sdmem, sse_load_f64,
                                  "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>,
                                  TB, XD, REX_W;
}

def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;

def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;

let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
                                ssmem, sse_load_f32, "cvtss2si",
                                WriteCvtSS2I, SSEPackedSingle>, TB, XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  WriteCvtSS2I, SSEPackedSingle>, TB, XS, VEX, REX_W, VEX_LIG;
}
let Uses = [MXCSR], mayRaiseFPException = 1 in {
defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
                               ssmem, sse_load_f32, "cvtss2si",
                               WriteCvtSS2I, SSEPackedSingle>, TB, XS;
defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
                                 ssmem,
                                 sse_load_f32, "cvtss2si",
                                 WriteCvtSS2I, SSEPackedSingle>, TB, XS, REX_W;

defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load,
                             "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                             SSEPackedSingle, WriteCvtI2PS>,
                             TB, VEX, Requires<[HasAVX, NoVLX]>, WIG;
defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load,
                              "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                              SSEPackedSingle, WriteCvtI2PSY>,
                              TB, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, WIG;

defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop,
                            "cvtdq2ps\t{$src, $dst|$dst, $src}",
                            SSEPackedSingle, WriteCvtI2PS>,
                            TB, Requires<[UseSSE2]>;
}

// AVX aliases
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;

// SSE aliases
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;

/// SSE 2 Only

// Convert scalar double to scalar single.
let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX],
    ExeDomain = SSEPackedSingle in {
def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
                       (ins FR32:$src1, FR64:$src2),
                       "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                       VEX, VVVV, VEX_LIG, WIG,
                       Sched<[WriteCvtSD2SS]>, SIMD_EXC;
let mayLoad = 1 in
def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
                    (ins FR32:$src1, f64mem:$src2),
                    "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                    TB, XD, VEX, VVVV, VEX_LIG, WIG,
                    Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC;
}

def : Pat<(f32 (any_fpround FR64:$src)),
          (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
          Requires<[UseAVX]>;
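// Note on the IMPLICIT_DEF operand above (a sketch): the VEX form of
// cvtsd2ss takes an extra source whose upper bits flow into the result, e.g.
//
//   vcvtsd2ss %xmm1, %xmm0, %xmm0
//
// so the pattern feeds an undefined value as $src1. Tying it to a real
// register would create an artificial dependency; IMPLICIT_DEF lets the
// register allocator pick freely.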
let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in {
def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
                     "cvtsd2ss\t{$src, $dst|$dst, $src}",
                     [(set FR32:$dst, (any_fpround FR64:$src))]>,
                     Sched<[WriteCvtSD2SS]>, SIMD_EXC;
def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
                   "cvtsd2ss\t{$src, $dst|$dst, $src}",
                   [(set FR32:$dst, (any_fpround (loadf64 addr:$src)))]>,
                   TB, XD, Requires<[UseSSE2, OptForSize]>,
                   Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC;
}

let Uses = [MXCSR], mayRaiseFPException = 1, ExeDomain = SSEPackedSingle in {
def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
                       TB, XD, VEX, VVVV, VEX_LIG, WIG, Requires<[UseAVX]>,
                       Sched<[WriteCvtSD2SS]>;
def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
                       TB, XD, VEX, VVVV, VEX_LIG, WIG, Requires<[UseAVX]>,
                       Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
let Constraints = "$src1 = $dst" in {
def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                      "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
                      TB, XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;
def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                      "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
                      TB, XD, Requires<[UseSSE2]>,
                      Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
}
}
// Convert scalar single to scalar double
// SSE2 instructions with XS prefix
let isCodeGenOnly = 1, hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
                    (ins FR64:$src1, FR32:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                    TB, XS, VEX, VVVV, VEX_LIG, WIG,
                    Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>, SIMD_EXC;
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
                    (ins FR64:$src1, f32mem:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                    TB, XS, VEX, VVVV, VEX_LIG, WIG,
                    Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>,
                    Requires<[UseAVX, OptForSize]>, SIMD_EXC;
} // isCodeGenOnly = 1, hasSideEffects = 0

def : Pat<(f64 (any_fpextend FR32:$src)),
          (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
def : Pat<(any_fpextend (loadf32 addr:$src)),
          (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;

let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in {
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (any_fpextend FR32:$src))]>,
                   TB, XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>, SIMD_EXC;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (any_fpextend (loadf32 addr:$src)))]>,
                   TB, XS, Requires<[UseSSE2, OptForSize]>,
                   Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>, SIMD_EXC;
} // isCodeGenOnly = 1

let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1,
    ExeDomain = SSEPackedSingle in {
def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       []>, TB, XS, VEX, VVVV, VEX_LIG, WIG,
                       Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;
let mayLoad = 1 in
def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                       "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       []>, TB, XS, VEX, VVVV, VEX_LIG, WIG, Requires<[HasAVX]>,
                       Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                      "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                      []>, TB, XS, Requires<[UseSSE2]>,
                      Sched<[WriteCvtSS2SD]>;
let mayLoad = 1 in
def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                      "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                      []>, TB, XS, Requires<[UseSSE2]>,
                      Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
}
} // hasSideEffects = 0

// Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and
// (v)cvtss2sd intrinsic sequences from clang which produce unnecessary
// vmovs{s,d} instructions.
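// For example (an illustrative C-level sketch, not a fixed clang lowering):
//   __m128 r = _mm_move_ss(a, _mm_set_ss((float)b[0]));
// typically reaches instruction selection as roughly
//   (X86Movss a, (scalar_to_vector (fpround (extractelt b, 0))))
// and the first pattern below folds the whole tree into a single
// (V)CVTSD2SSrr_Int rather than a cvtsd2ss followed by a separate (v)movss.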
let Predicates = [UseAVX] in {
def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector
                   (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
          (VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
                  (v2f64 (scalar_to_vector
                   (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
          (VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
          (VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
          (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
          (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
          (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
                  (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
          (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
                  (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
          (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
                  (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
          (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
                  (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
          (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseAVX]

let Predicates = [UseSSE2] in {
def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector
                   (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
          (CVTSD2SSrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
                  (v2f64 (scalar_to_vector
                   (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
          (CVTSS2SDrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
                  (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
          (CVTSI642SDrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
                  (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
          (CVTSI642SDrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
                  (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
          (CVTSI2SDrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
                  (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
          (CVTSI2SDrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseSSE2]

let Predicates = [UseSSE1] in {
def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
          (CVTSI642SSrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
          (CVTSI642SSrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
          (CVTSI2SSrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
                  (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
          (CVTSI2SSrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseSSE1]

let Predicates = [HasAVX, NoVLX] in {
// Convert packed single/double fp to doubleword
def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
                       VEX, Sched<[WriteCvtPS2I]>, WIG, SIMD_EXC;
def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>,
                       VEX, Sched<[WriteCvtPS2ILd]>, WIG, SIMD_EXC;
def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtPS2IY]>, WIG, SIMD_EXC;
def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, WIG, SIMD_EXC;
}
def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
                     Sched<[WriteCvtPS2I]>, SIMD_EXC;
def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>,
                     Sched<[WriteCvtPS2ILd]>, SIMD_EXC;

// Convert Packed Double FP to Packed DW Integers
let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
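// Concretely: "vcvtpd2dq %ymm0, %xmm0" is unambiguous, but in
// "vcvtpd2dq (%rax), %xmm0" the memory operand does not say whether the
// source is 128-bit or 256-bit, so the memory forms below carry explicit
// {x}/{y} suffixes (and the vcvtpd2dqx/vcvtpd2dqy aliases cover the
// register forms).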
def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "vcvtpd2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
                      VEX, Sched<[WriteCvtPD2I]>, WIG;

// XMM only
def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                      "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
                      Sched<[WriteCvtPD2ILd]>, WIG;

// YMM only
def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
                       VEX, VEX_L, Sched<[WriteCvtPD2IY]>, WIG;
def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
                       VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, WIG;
}

let Predicates = [HasAVX] in {
  def : Pat<(v4i32 (lrint VR128:$src)), (VCVTPS2DQrr VR128:$src)>;
  def : Pat<(v4i32 (lrint (loadv4f32 addr:$src))), (VCVTPS2DQrm addr:$src)>;
  def : Pat<(v8i32 (lrint VR256:$src)), (VCVTPS2DQYrr VR256:$src)>;
  def : Pat<(v8i32 (lrint (loadv8f32 addr:$src))), (VCVTPS2DQYrm addr:$src)>;
  def : Pat<(v4i32 (lrint (v4f64 VR256:$src))), (VCVTPD2DQYrr VR256:$src)>;
  def : Pat<(v4i32 (lrint (loadv4f64 addr:$src))), (VCVTPD2DQYrm addr:$src)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v4i32 (lrint VR128:$src)), (CVTPS2DQrr VR128:$src)>;
  def : Pat<(v4i32 (lrint (loadv4f32 addr:$src))), (CVTPS2DQrm addr:$src)>;
}

def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
                (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;

def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtpd2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>,
                     Sched<[WriteCvtPD2ILd]>, SIMD_EXC;
def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtpd2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
                     Sched<[WriteCvtPD2I]>, SIMD_EXC;

// Convert with truncation packed single/double fp to doubleword
// SSE2 packed instructions with XS prefix
let Uses = [MXCSR], mayRaiseFPException = 1 in {
let Predicates = [HasAVX, NoVLX] in {
def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "cvttps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
                         VEX, Sched<[WriteCvtPS2I]>, WIG;
def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvttps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86any_cvttp2si (loadv4f32 addr:$src))))]>,
                         VEX, Sched<[WriteCvtPS2ILd]>, WIG;
"cvttps2dq\t{$src, $dst|$dst, $src}", 1634 [(set VR256:$dst, 1635 (v8i32 (X86any_cvttp2si (v8f32 VR256:$src))))]>, 1636 VEX, VEX_L, Sched<[WriteCvtPS2IY]>, WIG; 1637def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 1638 "cvttps2dq\t{$src, $dst|$dst, $src}", 1639 [(set VR256:$dst, 1640 (v8i32 (X86any_cvttp2si (loadv8f32 addr:$src))))]>, 1641 VEX, VEX_L, 1642 Sched<[WriteCvtPS2IYLd]>, WIG; 1643} 1644 1645def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1646 "cvttps2dq\t{$src, $dst|$dst, $src}", 1647 [(set VR128:$dst, 1648 (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>, 1649 Sched<[WriteCvtPS2I]>; 1650def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1651 "cvttps2dq\t{$src, $dst|$dst, $src}", 1652 [(set VR128:$dst, 1653 (v4i32 (X86any_cvttp2si (memopv4f32 addr:$src))))]>, 1654 Sched<[WriteCvtPS2ILd]>; 1655} 1656 1657// The assembler can recognize rr 256-bit instructions by seeing a ymm 1658// register, but the same isn't true when using memory operands instead. 1659// Provide other assembly rr and rm forms to address this explicitly. 1660let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in { 1661// XMM only 1662def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1663 "cvttpd2dq\t{$src, $dst|$dst, $src}", 1664 [(set VR128:$dst, 1665 (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>, 1666 VEX, Sched<[WriteCvtPD2I]>, WIG; 1667def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1668 "cvttpd2dq{x}\t{$src, $dst|$dst, $src}", 1669 [(set VR128:$dst, 1670 (v4i32 (X86any_cvttp2si (loadv2f64 addr:$src))))]>, 1671 VEX, Sched<[WriteCvtPD2ILd]>, WIG; 1672 1673// YMM only 1674def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), 1675 "cvttpd2dq\t{$src, $dst|$dst, $src}", 1676 [(set VR128:$dst, 1677 (v4i32 (X86any_cvttp2si (v4f64 VR256:$src))))]>, 1678 VEX, VEX_L, Sched<[WriteCvtPD2IY]>, WIG; 1679def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), 1680 "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", 1681 [(set VR128:$dst, 1682 (v4i32 (X86any_cvttp2si (loadv4f64 addr:$src))))]>, 1683 VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, WIG; 1684} // Predicates = [HasAVX, NoVLX] 1685 1686def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}", 1687 (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0, "att">; 1688def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}", 1689 (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">; 1690 1691let Predicates = [HasAVX, NoVLX] in { 1692 def : Pat<(v4i32 (any_fp_to_sint (v4f64 VR256:$src))), 1693 (VCVTTPD2DQYrr VR256:$src)>; 1694 def : Pat<(v4i32 (any_fp_to_sint (loadv4f64 addr:$src))), 1695 (VCVTTPD2DQYrm addr:$src)>; 1696} 1697 1698def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1699 "cvttpd2dq\t{$src, $dst|$dst, $src}", 1700 [(set VR128:$dst, 1701 (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>, 1702 Sched<[WriteCvtPD2I]>, SIMD_EXC; 1703def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src), 1704 "cvttpd2dq\t{$src, $dst|$dst, $src}", 1705 [(set VR128:$dst, 1706 (v4i32 (X86any_cvttp2si (memopv2f64 addr:$src))))]>, 1707 Sched<[WriteCvtPD2ILd]>, SIMD_EXC; 1708 1709// Convert packed single to packed double 1710let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in { 1711 // SSE2 instructions without OpSize prefix 1712def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1713 "vcvtps2pd\t{$src, $dst|$dst, 
$src}", 1714 [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>, 1715 TB, VEX, Sched<[WriteCvtPS2PD]>, WIG; 1716def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), 1717 "vcvtps2pd\t{$src, $dst|$dst, $src}", 1718 [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>, 1719 TB, VEX, Sched<[WriteCvtPS2PD.Folded]>, WIG; 1720def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), 1721 "vcvtps2pd\t{$src, $dst|$dst, $src}", 1722 [(set VR256:$dst, (v4f64 (any_fpextend (v4f32 VR128:$src))))]>, 1723 TB, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, WIG; 1724def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), 1725 "vcvtps2pd\t{$src, $dst|$dst, $src}", 1726 [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>, 1727 TB, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, WIG; 1728} 1729 1730let Predicates = [UseSSE2], Uses = [MXCSR], mayRaiseFPException = 1 in { 1731def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1732 "cvtps2pd\t{$src, $dst|$dst, $src}", 1733 [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>, 1734 TB, Sched<[WriteCvtPS2PD]>; 1735def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), 1736 "cvtps2pd\t{$src, $dst|$dst, $src}", 1737 [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>, 1738 TB, Sched<[WriteCvtPS2PD.Folded]>; 1739} 1740 1741// Convert Packed DW Integers to Packed Double FP 1742let Predicates = [HasAVX, NoVLX] in { 1743let hasSideEffects = 0, mayLoad = 1 in 1744def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 1745 "vcvtdq2pd\t{$src, $dst|$dst, $src}", 1746 [(set VR128:$dst, 1747 (v2f64 (X86any_VSintToFP 1748 (bc_v4i32 1749 (v2i64 (scalar_to_vector 1750 (loadi64 addr:$src)))))))]>, 1751 VEX, Sched<[WriteCvtI2PDLd]>, WIG; 1752def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1753 "vcvtdq2pd\t{$src, $dst|$dst, $src}", 1754 [(set VR128:$dst, 1755 (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>, 1756 VEX, Sched<[WriteCvtI2PD]>, WIG; 1757def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src), 1758 "vcvtdq2pd\t{$src, $dst|$dst, $src}", 1759 [(set VR256:$dst, 1760 (v4f64 (any_sint_to_fp (loadv4i32 addr:$src))))]>, 1761 VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>, 1762 WIG; 1763def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), 1764 "vcvtdq2pd\t{$src, $dst|$dst, $src}", 1765 [(set VR256:$dst, 1766 (v4f64 (any_sint_to_fp (v4i32 VR128:$src))))]>, 1767 VEX, VEX_L, Sched<[WriteCvtI2PDY]>, WIG; 1768} 1769 1770let hasSideEffects = 0, mayLoad = 1 in 1771def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 1772 "cvtdq2pd\t{$src, $dst|$dst, $src}", 1773 [(set VR128:$dst, 1774 (v2f64 (X86any_VSintToFP 1775 (bc_v4i32 1776 (v2i64 (scalar_to_vector 1777 (loadi64 addr:$src)))))))]>, 1778 Sched<[WriteCvtI2PDLd]>; 1779def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1780 "cvtdq2pd\t{$src, $dst|$dst, $src}", 1781 [(set VR128:$dst, 1782 (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>, 1783 Sched<[WriteCvtI2PD]>; 1784 1785// AVX register conversion intrinsics 1786let Predicates = [HasAVX, NoVLX] in { 1787 def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))), 1788 (VCVTDQ2PDrm addr:$src)>; 1789} // Predicates = [HasAVX, NoVLX] 1790 1791// SSE2 register conversion intrinsics 1792let Predicates = [UseSSE2] in { 1793 def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))), 1794 
// SSE2 register conversion intrinsics
let Predicates = [UseSSE2] in {
  def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
            (CVTDQ2PDrm addr:$src)>;
} // Predicates = [UseSSE2]

// Convert packed double to packed single
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
// XMM only
def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtpd2ps\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (v4f32 (X86any_vfpround (v2f64 VR128:$src))))]>,
                       VEX, Sched<[WriteCvtPD2PS]>, WIG;
def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (v4f32 (X86any_vfpround (loadv2f64 addr:$src))))]>,
                       VEX, Sched<[WriteCvtPD2PS.Folded]>, WIG;

def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                        "cvtpd2ps\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (v4f32 (X86any_vfpround (v4f64 VR256:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, WIG;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                        "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (v4f32 (X86any_vfpround (loadv4f64 addr:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, WIG;
} // Predicates = [HasAVX, NoVLX]

def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSrr VR128:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
                (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0, "att">;

def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (v4f32 (X86any_vfpround (v2f64 VR128:$src))))]>,
                     Sched<[WriteCvtPD2PS]>, SIMD_EXC;
def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtpd2ps\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (v4f32 (X86any_vfpround (memopv2f64 addr:$src))))]>,
                     Sched<[WriteCvtPD2PS.Folded]>, SIMD_EXC;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Compare Instructions
//===----------------------------------------------------------------------===//

// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
                            Operand memop, SDNode OpNode, ValueType VT,
                            PatFrag ld_frag, string asm,
                            X86FoldableSchedWrite sched,
                            PatFrags mem_frags> {
  def rri_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
                     (ins VR128:$src1, VR128:$src2, u8imm:$cc), asm,
                     [(set VR128:$dst, (OpNode (VT VR128:$src1),
                                               VR128:$src2, timm:$cc))]>,
                Sched<[sched]>, SIMD_EXC;
  let mayLoad = 1 in
  def rmi_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
                     (ins VR128:$src1, memop:$src2, u8imm:$cc), asm,
                     [(set VR128:$dst, (OpNode (VT VR128:$src1),
                                               (mem_frags addr:$src2), timm:$cc))]>,
                Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;

  let isCodeGenOnly = 1 in {
    let isCommutable = 1 in
    def rri : SIi8<0xC2, MRMSrcReg,
                   (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
                   [(set RC:$dst, (OpNode RC:$src1, RC:$src2, timm:$cc))]>,
              Sched<[sched]>, SIMD_EXC;
    def rmi : SIi8<0xC2, MRMSrcMem,
                   (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
                   [(set RC:$dst, (OpNode RC:$src1,
                                          (ld_frag addr:$src2), timm:$cc))]>,
              Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
  }
}
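// For orientation (descriptive note): each sse12_cmp_scalar instantiation
// expands to four instructions; e.g. the VCMPSS defm below produces
// VCMPSSrri_Int/VCMPSSrmi_Int (VR128 intrinsic forms) plus the
// isCodeGenOnly VCMPSSrri/VCMPSSrmi (FR32 forms used by regular isel).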
let ExeDomain = SSEPackedSingle in
defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
                 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>,
                 TB, XS, VEX, VVVV, VEX_LIG, WIG;
let ExeDomain = SSEPackedDouble in
defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
                 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
                 TB, XD, VEX, VVVV, VEX_LIG, WIG;

let Constraints = "$src1 = $dst" in {
  let ExeDomain = SSEPackedSingle in
  defm CMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
                  "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                  SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, TB, XS;
  let ExeDomain = SSEPackedDouble in
  defm CMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
                  "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                  SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, TB, XD;
}

// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDPatternOperator OpNode,
                         ValueType vt, X86MemOperand x86memop,
                         PatFrag ld_frag, string OpcodeStr, Domain d,
                         X86FoldableSchedWrite sched = WriteFComX> {
  let ExeDomain = d in {
  def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
             [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
          Sched<[sched]>, SIMD_EXC;
  let mayLoad = 1 in
  def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
             [(set EFLAGS, (OpNode (vt RC:$src1),
                                   (ld_frag addr:$src2)))]>,
          Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}
}

// sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
                             ValueType vt, Operand memop,
                             PatFrags mem_frags, string OpcodeStr,
                             Domain d,
                             X86FoldableSchedWrite sched = WriteFComX> {
let ExeDomain = d in {
  def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
                 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                 [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
              Sched<[sched]>, SIMD_EXC;
let mayLoad = 1 in
  def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
                 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                 [(set EFLAGS, (OpNode (vt RC:$src1),
                                       (mem_frags addr:$src2)))]>,
              Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}
}
let Defs = [EFLAGS] in {
  defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
                                "ucomiss", SSEPackedSingle>, TB, VEX, VEX_LIG, WIG;
  defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
                                "ucomisd", SSEPackedDouble>, TB, PD, VEX, VEX_LIG, WIG;
  defm VCOMISS : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
                               "comiss", SSEPackedSingle>, TB, VEX, VEX_LIG, WIG;
  defm VCOMISD : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
                               "comisd", SSEPackedDouble>, TB, PD, VEX, VEX_LIG, WIG;

  let isCodeGenOnly = 1 in {
    defm VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
                      sse_load_f32, "ucomiss", SSEPackedSingle>, TB, VEX, VEX_LIG, WIG;
    defm VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
                      sse_load_f64, "ucomisd", SSEPackedDouble>, TB, PD, VEX, VEX_LIG, WIG;

    defm VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
                     sse_load_f32, "comiss", SSEPackedSingle>, TB, VEX, VEX_LIG, WIG;
    defm VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
                     sse_load_f64, "comisd", SSEPackedDouble>, TB, PD, VEX, VEX_LIG, WIG;
  }
  defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
                               "ucomiss", SSEPackedSingle>, TB;
  defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
                               "ucomisd", SSEPackedDouble>, TB, PD;
  defm COMISS : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
                              "comiss", SSEPackedSingle>, TB;
  defm COMISD : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
                              "comisd", SSEPackedDouble>, TB, PD;

  let isCodeGenOnly = 1 in {
    defm UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
                     sse_load_f32, "ucomiss", SSEPackedSingle>, TB;
    defm UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
                     sse_load_f64, "ucomisd", SSEPackedDouble>, TB, PD;

    defm COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
                    sse_load_f32, "comiss", SSEPackedSingle>, TB;
    defm COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
                    sse_load_f64, "comisd", SSEPackedDouble>, TB, PD;
  }
} // Defs = [EFLAGS]

// sse12_cmp_packed - sse 1 & 2 compare packed instructions
multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
                            ValueType VT, string asm,
                            X86FoldableSchedWrite sched,
                            Domain d, PatFrag ld_frag> {
  let isCommutable = 1 in
  def rri : PIi8<0xC2, MRMSrcReg,
             (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
             [(set RC:$dst, (VT (X86any_cmpp RC:$src1, RC:$src2, timm:$cc)))], d>,
            Sched<[sched]>, SIMD_EXC;
  def rmi : PIi8<0xC2, MRMSrcMem,
             (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
             [(set RC:$dst,
               (VT (X86any_cmpp RC:$src1, (ld_frag addr:$src2), timm:$cc)))], d>,
            Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}

defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
               "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, TB, VEX, VVVV, WIG;
defm VCMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
               "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
               SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, TB, PD, VEX, VVVV, WIG;
defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, v8f32,
                "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, TB, VEX, VVVV, VEX_L, WIG;
defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, v4f64,
                "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, TB, PD, VEX, VVVV, VEX_L, WIG;
let Constraints = "$src1 = $dst" in {
  defm CMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
                "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, TB;
  defm CMPPD : sse12_cmp_packed<VR128, f128mem, v2f64,
                "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, TB, PD;
}

def CommutableCMPCC : PatLeaf<(timm), [{
  uint64_t Imm = N->getZExtValue() & 0x7;
  return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07);
}]>;
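// Why exactly these immediates (informal note): in the CMPPS/CMPPD predicate
// encoding, 0x00 (EQ), 0x03 (UNORD), 0x04 (NEQ) and 0x07 (ORD) are the
// symmetric predicates, so cmp(a, b, cc) == cmp(b, a, cc) and the operands
// can be swapped to fold a load, as the patterns below do.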
$src2, $cc}", 2011 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, TB, PD; 2012} 2013 2014def CommutableCMPCC : PatLeaf<(timm), [{ 2015 uint64_t Imm = N->getZExtValue() & 0x7; 2016 return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07); 2017}]>; 2018 2019// Patterns to select compares with loads in first operand. 2020let Predicates = [HasAVX] in { 2021 def : Pat<(v4f64 (X86any_cmpp (loadv4f64 addr:$src2), VR256:$src1, 2022 CommutableCMPCC:$cc)), 2023 (VCMPPDYrmi VR256:$src1, addr:$src2, timm:$cc)>; 2024 2025 def : Pat<(v8f32 (X86any_cmpp (loadv8f32 addr:$src2), VR256:$src1, 2026 CommutableCMPCC:$cc)), 2027 (VCMPPSYrmi VR256:$src1, addr:$src2, timm:$cc)>; 2028 2029 def : Pat<(v2f64 (X86any_cmpp (loadv2f64 addr:$src2), VR128:$src1, 2030 CommutableCMPCC:$cc)), 2031 (VCMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>; 2032 2033 def : Pat<(v4f32 (X86any_cmpp (loadv4f32 addr:$src2), VR128:$src1, 2034 CommutableCMPCC:$cc)), 2035 (VCMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>; 2036 2037 def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1, 2038 CommutableCMPCC:$cc)), 2039 (VCMPSDrmi FR64:$src1, addr:$src2, timm:$cc)>; 2040 2041 def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1, 2042 CommutableCMPCC:$cc)), 2043 (VCMPSSrmi FR32:$src1, addr:$src2, timm:$cc)>; 2044} 2045 2046let Predicates = [UseSSE2] in { 2047 def : Pat<(v2f64 (X86any_cmpp (memopv2f64 addr:$src2), VR128:$src1, 2048 CommutableCMPCC:$cc)), 2049 (CMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>; 2050 2051 def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1, 2052 CommutableCMPCC:$cc)), 2053 (CMPSDrmi FR64:$src1, addr:$src2, timm:$cc)>; 2054} 2055 2056let Predicates = [UseSSE1] in { 2057 def : Pat<(v4f32 (X86any_cmpp (memopv4f32 addr:$src2), VR128:$src1, 2058 CommutableCMPCC:$cc)), 2059 (CMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>; 2060 2061 def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1, 2062 CommutableCMPCC:$cc)), 2063 (CMPSSrmi FR32:$src1, addr:$src2, timm:$cc)>; 2064} 2065 2066//===----------------------------------------------------------------------===// 2067// SSE 1 & 2 - Shuffle Instructions 2068//===----------------------------------------------------------------------===// 2069 2070/// sse12_shuffle - sse 1 & 2 fp shuffle instructions 2071multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop, 2072 ValueType vt, string asm, PatFrag mem_frag, 2073 X86FoldableSchedWrite sched, Domain d, 2074 bit IsCommutable = 0> { 2075 def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst), 2076 (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm, 2077 [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2), 2078 (i8 timm:$src3))))], d>, 2079 Sched<[sched.Folded, sched.ReadAfterFold]>; 2080 let isCommutable = IsCommutable in 2081 def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst), 2082 (ins RC:$src1, RC:$src2, u8imm:$src3), asm, 2083 [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, 2084 (i8 timm:$src3))))], d>, 2085 Sched<[sched]>; 2086} 2087 2088let Predicates = [HasAVX, NoVLX] in { 2089 defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, 2090 "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 2091 loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, 2092 TB, VEX, VVVV, WIG; 2093 defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, 2094 "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 2095 loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>, 2096 TB, VEX, VVVV, VEX_L, WIG; 2097 defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, 2098 "shufpd\t{$src3, $src2, $src1, $dst|$dst, 
$src1, $src2, $src3}", 2099 loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>, 2100 TB, PD, VEX, VVVV, WIG; 2101 defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, 2102 "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 2103 loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>, 2104 TB, PD, VEX, VVVV, VEX_L, WIG; 2105} 2106let Constraints = "$src1 = $dst" in { 2107 defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32, 2108 "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}", 2109 memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, TB; 2110 defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64, 2111 "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}", 2112 memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, TB, PD; 2113} 2114 2115//===----------------------------------------------------------------------===// 2116// SSE 1 & 2 - Unpack FP Instructions 2117//===----------------------------------------------------------------------===// 2118 2119/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave 2120multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt, 2121 PatFrag mem_frag, RegisterClass RC, 2122 X86MemOperand x86memop, string asm, 2123 X86FoldableSchedWrite sched, Domain d, 2124 bit IsCommutable = 0> { 2125 let isCommutable = IsCommutable in 2126 def rr : PI<opc, MRMSrcReg, 2127 (outs RC:$dst), (ins RC:$src1, RC:$src2), 2128 asm, [(set RC:$dst, 2129 (vt (OpNode RC:$src1, RC:$src2)))], d>, 2130 Sched<[sched]>; 2131 def rm : PI<opc, MRMSrcMem, 2132 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 2133 asm, [(set RC:$dst, 2134 (vt (OpNode RC:$src1, 2135 (mem_frag addr:$src2))))], d>, 2136 Sched<[sched.Folded, sched.ReadAfterFold]>; 2137} 2138 2139let Predicates = [HasAVX, NoVLX] in { 2140defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load, 2141 VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2142 SchedWriteFShuffle.XMM, SSEPackedSingle>, TB, VEX, VVVV, WIG; 2143defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load, 2144 VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2145 SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, TB, PD, VEX, VVVV, WIG; 2146defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load, 2147 VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2148 SchedWriteFShuffle.XMM, SSEPackedSingle>, TB, VEX, VVVV, WIG; 2149defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load, 2150 VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2151 SchedWriteFShuffle.XMM, SSEPackedDouble>, TB, PD, VEX, VVVV, WIG; 2152 2153defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load, 2154 VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2155 SchedWriteFShuffle.YMM, SSEPackedSingle>, TB, VEX, VVVV, VEX_L, WIG; 2156defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load, 2157 VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2158 SchedWriteFShuffle.YMM, SSEPackedDouble>, TB, PD, VEX, VVVV, VEX_L, WIG; 2159defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load, 2160 VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2161 SchedWriteFShuffle.YMM, SSEPackedSingle>, TB, VEX, VVVV, VEX_L, WIG; 2162defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load, 2163 VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2164 SchedWriteFShuffle.YMM, SSEPackedDouble>, TB, PD, VEX, VVVV, VEX_L, WIG; 2165}// Predicates = 
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Extract Floating-Point Sign mask
//===----------------------------------------------------------------------===//

/// sse12_extr_sign_mask - sse 1 & 2 fp sign mask extraction
multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
                                string asm, Domain d> {
  def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], d>,
           Sched<[WriteFMOVMSK]>;
}
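// Usage sketch: "movmskps %xmm0, %eax" copies the sign bit of each of the
// four packed singles into bits 3:0 of %eax and zeroes the remaining bits;
// movmskpd does the same for the two doubles into bits 1:0.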
let Predicates = [HasAVX] in {
  defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
                                        SSEPackedSingle>, TB, VEX, WIG;
  defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
                                        SSEPackedDouble>, TB, PD, VEX, WIG;
  defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
                                         SSEPackedSingle>, TB, VEX, VEX_L, WIG;
  defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
                                         SSEPackedDouble>, TB, PD, VEX, VEX_L, WIG;

  // Also support integer VTs to avoid an int->fp bitcast in the DAG.
  def : Pat<(X86movmsk (v4i32 VR128:$src)),
            (VMOVMSKPSrr VR128:$src)>;
  def : Pat<(X86movmsk (v2i64 VR128:$src)),
            (VMOVMSKPDrr VR128:$src)>;
  def : Pat<(X86movmsk (v8i32 VR256:$src)),
            (VMOVMSKPSYrr VR256:$src)>;
  def : Pat<(X86movmsk (v4i64 VR256:$src)),
            (VMOVMSKPDYrr VR256:$src)>;
}

defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
                                     SSEPackedSingle>, TB;
defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
                                     SSEPackedDouble>, TB, PD;

let Predicates = [UseSSE2] in {
  // Also support integer VTs to avoid an int->fp bitcast in the DAG.
  def : Pat<(X86movmsk (v4i32 VR128:$src)),
            (MOVMSKPSrr VR128:$src)>;
  def : Pat<(X86movmsk (v2i64 VR128:$src)),
            (MOVMSKPDrr VR128:$src)>;
}

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Logical Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in { // SSE integer instructions

/// PDI_binop_rm - Simple SSE2 binary operator.
multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                        ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                        X86MemOperand x86memop, X86FoldableSchedWrite sched,
                        bit IsCommutable, bit Is2Addr> {
  let isCommutable = IsCommutable in
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
               (ins RC:$src1, RC:$src2),
               !if(Is2Addr,
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
           Sched<[sched]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
               (ins RC:$src1, x86memop:$src2),
               !if(Is2Addr,
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst, (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
           Sched<[sched.Folded, sched.ReadAfterFold]>;
}
} // ExeDomain = SSEPackedInt

multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
                         ValueType OpVT128, ValueType OpVT256,
                         X86SchedWriteWidths sched, bit IsCommutable,
                         Predicate prd> {
let Predicates = [HasAVX, prd] in
  defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
                             VR128, load, i128mem, sched.XMM,
                             IsCommutable, 0>, VEX, VVVV, WIG;

let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
                           memop, i128mem, sched.XMM, IsCommutable, 1>;

let Predicates = [HasAVX2, prd] in
  defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
                               OpVT256, VR256, load, i256mem, sched.YMM,
                               IsCommutable, 0>, VEX, VVVV, VEX_L, WIG;
}

// These are ordered here for pattern-ordering requirements with the fp versions.

defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
                          SchedWriteVecLogic, 1, NoVLX>;
defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
                         SchedWriteVecLogic, 1, NoVLX>;
defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
                          SchedWriteVecLogic, 1, NoVLX>;
defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
                           SchedWriteVecLogic, 0, NoVLX>;
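// As an illustration of the expansion above (descriptive note): the single
// "defm PAND" line yields PANDrr/PANDrm (SSE2 two-address forms), while the
// V#NAME and V#NAME#Y instantiations add VPANDrr/VPANDrm and
// VPANDYrr/VPANDYrm, the AVX/AVX2 three-operand forms referenced by the
// patterns further down.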
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Logical Instructions
//===----------------------------------------------------------------------===//

/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
///
/// There are no patterns here because isel prefers integer versions for SSE2
/// and later. There are SSE1 v4f32 patterns later.
multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
                                   X86SchedWriteWidths sched> {
  let Predicates = [HasAVX, NoVLX] in {
  defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
        !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM,
        [], [], 0>, TB, VEX, VVVV, VEX_L, WIG;

  defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
        !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM,
        [], [], 0>, TB, PD, VEX, VVVV, VEX_L, WIG;

  defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
        !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
        [], [], 0>, TB, VEX, VVVV, WIG;

  defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
        !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
        [], [], 0>, TB, PD, VEX, VVVV, WIG;
  }

  let Constraints = "$src1 = $dst" in {
  defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
        !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
        [], []>, TB;

  defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
        !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
        [], []>, TB, PD;
  }
}

defm AND : sse12_fp_packed_logical<0x54, "and", SchedWriteFLogic>;
defm OR : sse12_fp_packed_logical<0x56, "or", SchedWriteFLogic>;
defm XOR : sse12_fp_packed_logical<0x57, "xor", SchedWriteFLogic>;
let isCommutable = 0 in
  defm ANDN : sse12_fp_packed_logical<0x55, "andn", SchedWriteFLogic>;

let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
            (VPANDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
            (VPANDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
            (VPANDYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
            (VPORYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
            (VPORYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
            (VPORYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
            (VPXORYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
            (VPXORYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
            (VPXORYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
            (VPANDNYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
            (VPANDNYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
            (VPANDNYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
            (VPANDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
            (VPANDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
            (VPANDYrm VR256:$src1, addr:$src2)>;

  def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
            (VPORYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
            (VPORYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
            (VPORYrm VR256:$src1, addr:$src2)>;

  def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
            (VPXORYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
            (VPXORYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
            (VPXORYrm VR256:$src1, addr:$src2)>;

  def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
            (VPANDNYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
            (VPANDNYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
            (VPANDNYrm VR256:$src1, addr:$src2)>;
}

// If only AVX1 is supported, we need to handle integer operations with
// floating point instructions since the integer versions aren't available.
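// For example, (v8i32 (and VR256:$a, VR256:$b)) cannot be selected to
// VPANDYrr here because 256-bit VPAND requires AVX2, so the patterns below
// map it to VANDPSYrr instead and accept the cross-domain cost.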
let Predicates = [HasAVX1Only] in {
  def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
            (VANDPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
            (VANDPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
            (VANDPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)),
            (VANDPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
            (VORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
            (VORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
            (VORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)),
            (VORPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
            (VXORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
            (VXORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
            (VXORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)),
            (VXORPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)),
            (VANDNPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
            (VANDPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
            (VANDPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
            (VANDPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)),
            (VANDPSYrm VR256:$src1, addr:$src2)>;

  def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
            (VORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
            (VORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
            (VORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)),
            (VORPSYrm VR256:$src1, addr:$src2)>;

  def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
            (VXORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
            (VXORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
            (VXORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)),
            (VXORPSYrm VR256:$src1, addr:$src2)>;

  def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
}
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
            (VPANDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
            (VPANDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
            (VPANDrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
            (VPORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
            (VPORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
            (VPORrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
            (VPXORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
            (VPXORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
            (VPXORrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
            (VPANDNrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
            (VPANDNrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
            (VPANDNrr VR128:$src1, VR128:$src2)>;

  def : Pat<(and VR128:$src1, (loadv16i8 addr:$src2)),
            (VPANDrm VR128:$src1, addr:$src2)>;
  def : Pat<(and VR128:$src1, (loadv8i16 addr:$src2)),
            (VPANDrm VR128:$src1, addr:$src2)>;
  def : Pat<(and VR128:$src1, (loadv4i32 addr:$src2)),
            (VPANDrm VR128:$src1, addr:$src2)>;

  def : Pat<(or VR128:$src1, (loadv16i8 addr:$src2)),
            (VPORrm VR128:$src1, addr:$src2)>;
  def : Pat<(or VR128:$src1, (loadv8i16 addr:$src2)),
            (VPORrm VR128:$src1, addr:$src2)>;
  def : Pat<(or VR128:$src1, (loadv4i32 addr:$src2)),
            (VPORrm VR128:$src1, addr:$src2)>;

  def : Pat<(xor VR128:$src1, (loadv16i8 addr:$src2)),
            (VPXORrm VR128:$src1, addr:$src2)>;
  def : Pat<(xor VR128:$src1, (loadv8i16 addr:$src2)),
            (VPXORrm VR128:$src1, addr:$src2)>;
  def : Pat<(xor VR128:$src1, (loadv4i32 addr:$src2)),
            (VPXORrm VR128:$src1, addr:$src2)>;

  def : Pat<(X86andnp VR128:$src1, (loadv16i8 addr:$src2)),
            (VPANDNrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR128:$src1, (loadv8i16 addr:$src2)),
            (VPANDNrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR128:$src1, (loadv4i32 addr:$src2)),
            (VPANDNrm VR128:$src1, addr:$src2)>;
}
let Predicates = [UseSSE2] in {
  def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
            (PANDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
            (PANDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
            (PANDrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
            (PORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
            (PORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
            (PORrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
            (PXORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
            (PXORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
            (PXORrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
            (PANDNrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
            (PANDNrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
            (PANDNrr VR128:$src1, VR128:$src2)>;

  def : Pat<(and VR128:$src1, (memopv16i8 addr:$src2)),
            (PANDrm VR128:$src1, addr:$src2)>;
  def : Pat<(and VR128:$src1, (memopv8i16 addr:$src2)),
            (PANDrm VR128:$src1, addr:$src2)>;
  def : Pat<(and VR128:$src1, (memopv4i32 addr:$src2)),
            (PANDrm VR128:$src1, addr:$src2)>;

  def : Pat<(or VR128:$src1, (memopv16i8 addr:$src2)),
            (PORrm VR128:$src1, addr:$src2)>;
  def : Pat<(or VR128:$src1, (memopv8i16 addr:$src2)),
            (PORrm VR128:$src1, addr:$src2)>;
  def : Pat<(or VR128:$src1, (memopv4i32 addr:$src2)),
            (PORrm VR128:$src1, addr:$src2)>;

  def : Pat<(xor VR128:$src1, (memopv16i8 addr:$src2)),
            (PXORrm VR128:$src1, addr:$src2)>;
  def : Pat<(xor VR128:$src1, (memopv8i16 addr:$src2)),
            (PXORrm VR128:$src1, addr:$src2)>;
  def : Pat<(xor VR128:$src1, (memopv4i32 addr:$src2)),
            (PXORrm VR128:$src1, addr:$src2)>;

  def : Pat<(X86andnp VR128:$src1, (memopv16i8 addr:$src2)),
            (PANDNrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR128:$src1, (memopv8i16 addr:$src2)),
            (PANDNrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR128:$src1, (memopv4i32 addr:$src2)),
            (PANDNrm VR128:$src1, addr:$src2)>;
}

// Patterns for packed operations when we don't have an integer type available.
def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
          (ANDPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)),
          (ORPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)),
          (XORPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)),
          (ANDNPSrr VR128:$src1, VR128:$src2)>;

def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)),
          (ANDPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)),
          (ORPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)),
          (XORPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
          (ANDNPSrm VR128:$src1, addr:$src2)>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Arithmetic Instructions
//===----------------------------------------------------------------------===//

/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
/// vector forms.
///
/// In addition, we also have a special variant of the scalar form here to
/// represent the associated intrinsic operation. This form is unlike the
/// plain scalar form, in that it takes an entire vector (instead of a scalar)
/// and leaves the top elements unmodified (therefore these cannot be commuted).
///
/// These three forms can each be reg+reg or reg+mem.
///

/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
/// classes below
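// For instance (descriptive note), an addss _Int form computes
//   dst[0] = src1[0] + src2[0]
// and passes src1[1..3] through unchanged, so swapping src1 and src2 would
// change the pass-through lanes; that is why the intrinsic forms are not
// marked commutable.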
This form is unlike the 2638/// plain scalar form, in that it takes an entire vector (instead of a scalar) 2639/// and leaves the top elements unmodified (therefore these cannot be commuted). 2640/// 2641/// These three forms can each be reg+reg or reg+mem. 2642/// 2643 2644/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those 2645/// classes below 2646multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, 2647 SDPatternOperator OpNode, X86SchedWriteSizes sched> { 2648let Uses = [MXCSR], mayRaiseFPException = 1 in { 2649 let Predicates = [HasAVX, NoVLX] in { 2650 defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, 2651 VR128, v4f32, f128mem, loadv4f32, 2652 SSEPackedSingle, sched.PS.XMM, 0>, TB, VEX, VVVV, WIG; 2653 defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, 2654 VR128, v2f64, f128mem, loadv2f64, 2655 SSEPackedDouble, sched.PD.XMM, 0>, TB, PD, VEX, VVVV, WIG; 2656 2657 defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), 2658 OpNode, VR256, v8f32, f256mem, loadv8f32, 2659 SSEPackedSingle, sched.PS.YMM, 0>, TB, VEX, VVVV, VEX_L, WIG; 2660 defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), 2661 OpNode, VR256, v4f64, f256mem, loadv4f64, 2662 SSEPackedDouble, sched.PD.YMM, 0>, TB, PD, VEX, VVVV, VEX_L, WIG; 2663 } 2664 2665 let Constraints = "$src1 = $dst" in { 2666 defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, 2667 v4f32, f128mem, memopv4f32, SSEPackedSingle, 2668 sched.PS.XMM>, TB; 2669 defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, 2670 v2f64, f128mem, memopv2f64, SSEPackedDouble, 2671 sched.PD.XMM>, TB, PD; 2672 } 2673} 2674} 2675 2676multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, 2677 X86SchedWriteSizes sched> { 2678let Uses = [MXCSR], mayRaiseFPException = 1 in { 2679 defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), 2680 OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>, 2681 TB, XS, VEX, VVVV, VEX_LIG, WIG; 2682 defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), 2683 OpNode, FR64, f64mem, SSEPackedDouble, sched.PD.Scl, 0>, 2684 TB, XD, VEX, VVVV, VEX_LIG, WIG; 2685 2686 let Constraints = "$src1 = $dst" in { 2687 defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), 2688 OpNode, FR32, f32mem, SSEPackedSingle, 2689 sched.PS.Scl>, TB, XS; 2690 defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), 2691 OpNode, FR64, f64mem, SSEPackedDouble, 2692 sched.PD.Scl>, TB, XD; 2693 } 2694} 2695} 2696 2697multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr, 2698 SDPatternOperator OpNode, 2699 X86SchedWriteSizes sched> { 2700let Uses = [MXCSR], mayRaiseFPException = 1 in { 2701 defm V#NAME#SS : sse12_fp_scalar_int<opc, OpNode, VR128, v4f32, 2702 !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32, 2703 SSEPackedSingle, sched.PS.Scl, 0>, TB, XS, VEX, VVVV, VEX_LIG, WIG; 2704 defm V#NAME#SD : sse12_fp_scalar_int<opc, OpNode, VR128, v2f64, 2705 !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64, 2706 SSEPackedDouble, sched.PD.Scl, 0>, TB, XD, VEX, VVVV, VEX_LIG, WIG; 2707 2708 let Constraints = "$src1 = $dst" in { 2709 defm SS : sse12_fp_scalar_int<opc, OpNode, VR128, v4f32, 2710 !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32, 2711 SSEPackedSingle, sched.PS.Scl>, TB, XS; 2712 defm SD : sse12_fp_scalar_int<opc, OpNode, VR128, v2f64, 2713 !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64, 2714 SSEPackedDouble, sched.PD.Scl>, 
                              TB, XD;
  }
}
}

// Binary Arithmetic instructions
defm ADD : basic_sse12_fp_binop_p<0x58, "add", any_fadd, SchedWriteFAddSizes>,
           basic_sse12_fp_binop_s<0x58, "add", any_fadd, SchedWriteFAddSizes>,
           basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SchedWriteFAddSizes>;
defm MUL : basic_sse12_fp_binop_p<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
           basic_sse12_fp_binop_s<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
           basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SchedWriteFMulSizes>;
let isCommutable = 0 in {
  defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
             basic_sse12_fp_binop_s<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
             basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SchedWriteFAddSizes>;
  defm DIV : basic_sse12_fp_binop_p<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
             basic_sse12_fp_binop_s<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
             basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>;
  defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>;
  defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>;
}

let isCodeGenOnly = 1 in {
  defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>;
  defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SchedWriteFCmpSizes>;
}

// Patterns used to select SSE scalar fp arithmetic instructions from
// either:
//
// (1) a scalar fp operation followed by a blend
//
// The effect is that the backend no longer emits unnecessary vector
// insert instructions immediately after SSE scalar fp instructions
// like addss or mulss.
//
// For example, given the following code:
//   __m128 foo(__m128 A, __m128 B) {
//     A[0] += B[0];
//     return A;
//   }
//
// Previously we generated:
//   addss %xmm0, %xmm1
//   movss %xmm1, %xmm0
//
// We now generate:
//   addss %xmm1, %xmm0
//
// (2) a vector packed single/double fp operation followed by a vector insert
//
// The effect is that the backend converts the packed fp instruction
// followed by a vector insert into a single SSE scalar fp instruction.
//
// For example, given the following code:
//   __m128 foo(__m128 A, __m128 B) {
//     __m128 C = A + B;
//     return (__m128) {C[0], A[1], A[2], A[3]};
//   }
//
// Previously we generated:
//   addps %xmm0, %xmm1
//   movss %xmm1, %xmm0
//
// We now generate:
//   addss %xmm1, %xmm0

// TODO: Some canonicalization in lowering would simplify the number of
// patterns we have to try to match.
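
// As an illustrative sketch of case (1), the multiclass below matches a DAG
// of roughly this shape (types elided, using fadd as the example op):
//
//   (X86Movss (v4f32 VR128:$dst),
//             (scalar_to_vector
//               (fadd (extractelt (v4f32 VR128:$dst), 0), FR32:$src)))
//
// and selects a single ADDSSrr_Int for it, so no separate movss is emitted.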
2790multiclass scalar_math_patterns<SDPatternOperator Op, string OpcPrefix, SDNode Move, 2791 ValueType VT, ValueType EltTy, 2792 RegisterClass RC, PatFrag ld_frag, 2793 Predicate BasePredicate> { 2794 let Predicates = [BasePredicate] in { 2795 // extracted scalar math op with insert via movss/movsd 2796 def : Pat<(VT (Move (VT VR128:$dst), 2797 (VT (scalar_to_vector 2798 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))), 2799 RC:$src))))), 2800 (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst, 2801 (VT (COPY_TO_REGCLASS RC:$src, VR128)))>; 2802 def : Pat<(VT (Move (VT VR128:$dst), 2803 (VT (scalar_to_vector 2804 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))), 2805 (ld_frag addr:$src)))))), 2806 (!cast<Instruction>(OpcPrefix#rm_Int) VT:$dst, addr:$src)>; 2807 } 2808 2809 // Repeat for AVX versions of the instructions. 2810 let Predicates = [UseAVX] in { 2811 // extracted scalar math op with insert via movss/movsd 2812 def : Pat<(VT (Move (VT VR128:$dst), 2813 (VT (scalar_to_vector 2814 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))), 2815 RC:$src))))), 2816 (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst, 2817 (VT (COPY_TO_REGCLASS RC:$src, VR128)))>; 2818 def : Pat<(VT (Move (VT VR128:$dst), 2819 (VT (scalar_to_vector 2820 (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))), 2821 (ld_frag addr:$src)))))), 2822 (!cast<Instruction>("V"#OpcPrefix#rm_Int) VT:$dst, addr:$src)>; 2823 } 2824} 2825 2826defm : scalar_math_patterns<any_fadd, "ADDSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; 2827defm : scalar_math_patterns<any_fsub, "SUBSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; 2828defm : scalar_math_patterns<any_fmul, "MULSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; 2829defm : scalar_math_patterns<any_fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>; 2830 2831defm : scalar_math_patterns<any_fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; 2832defm : scalar_math_patterns<any_fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; 2833defm : scalar_math_patterns<any_fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; 2834defm : scalar_math_patterns<any_fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>; 2835 2836/// Unop Arithmetic 2837/// In addition, we also have a special variant of the scalar form here to 2838/// represent the associated intrinsic operation. This form is unlike the 2839/// plain scalar form, in that it takes an entire vector (instead of a 2840/// scalar) and leaves the top elements undefined. 2841/// 2842/// And, we have a special variant form for a full-vector intrinsic form. 2843 2844/// sse_fp_unop_s - SSE1 unops in scalar form 2845/// For the non-AVX defs, we need $src1 to be tied to $dst because 2846/// the HW instructions are 2 operand / destructive. 
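/// For example, "rcpss %xmm1, %xmm0" writes only element 0 of %xmm0 and
/// leaves its upper elements untouched, so %xmm0 is simultaneously a source
/// and the destination.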
2847multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, 2848 X86MemOperand x86memop, Operand intmemop, 2849 SDPatternOperator OpNode, Domain d, 2850 X86FoldableSchedWrite sched, Predicate target> { 2851 let isCodeGenOnly = 1, hasSideEffects = 0 in { 2852 def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1), 2853 !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"), 2854 [(set RC:$dst, (OpNode RC:$src1))], d>, Sched<[sched]>, 2855 Requires<[target]>; 2856 let mayLoad = 1 in 2857 def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1), 2858 !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"), 2859 [(set RC:$dst, (OpNode (load addr:$src1)))], d>, 2860 Sched<[sched.Folded]>, 2861 Requires<[target, OptForSize]>; 2862 } 2863 2864 let hasSideEffects = 0, Constraints = "$src1 = $dst", ExeDomain = d in { 2865 def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), 2866 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>, 2867 Sched<[sched]>; 2868 let mayLoad = 1 in 2869 def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2), 2870 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>, 2871 Sched<[sched.Folded, sched.ReadAfterFold]>; 2872 } 2873 2874} 2875 2876multiclass sse_fp_unop_s_intr<ValueType vt, PatFrags mem_frags, 2877 Intrinsic Intr, Predicate target> { 2878 let Predicates = [target] in { 2879 // These are unary operations, but they are modeled as having 2 source operands 2880 // because the high elements of the destination are unchanged in SSE. 2881 def : Pat<(Intr VR128:$src), 2882 (!cast<Instruction>(NAME#r_Int) VR128:$src, VR128:$src)>; 2883 } 2884 // We don't want to fold scalar loads into these instructions unless 2885 // optimizing for size. This is because the folded instruction will have a 2886 // partial register update, while the unfolded sequence will not, e.g. 2887 // movss mem, %xmm0 2888 // rcpss %xmm0, %xmm0 2889 // which has a clobber before the rcp, vs. 
2890 // rcpss mem, %xmm0 2891 let Predicates = [target, OptForSize] in { 2892 def : Pat<(Intr (mem_frags addr:$src2)), 2893 (!cast<Instruction>(NAME#m_Int) 2894 (vt (IMPLICIT_DEF)), addr:$src2)>; 2895 } 2896} 2897 2898multiclass avx_fp_unop_s_intr<ValueType vt, PatFrags mem_frags, 2899 Intrinsic Intr, Predicate target> { 2900 let Predicates = [target] in { 2901 def : Pat<(Intr VR128:$src), 2902 (!cast<Instruction>(NAME#r_Int) VR128:$src, 2903 VR128:$src)>; 2904 } 2905 let Predicates = [target, OptForSize] in { 2906 def : Pat<(Intr (mem_frags addr:$src2)), 2907 (!cast<Instruction>(NAME#m_Int) 2908 (vt (IMPLICIT_DEF)), addr:$src2)>; 2909 } 2910} 2911 2912multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, 2913 ValueType ScalarVT, X86MemOperand x86memop, 2914 Operand intmemop, SDPatternOperator OpNode, Domain d, 2915 X86FoldableSchedWrite sched, Predicate target> { 2916 let isCodeGenOnly = 1, hasSideEffects = 0 in { 2917 def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), 2918 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 2919 [], d>, Sched<[sched]>; 2920 let mayLoad = 1 in 2921 def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 2922 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 2923 [], d>, Sched<[sched.Folded, sched.ReadAfterFold]>; 2924 } 2925 let hasSideEffects = 0, ExeDomain = d in { 2926 def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), 2927 (ins VR128:$src1, VR128:$src2), 2928 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 2929 []>, Sched<[sched]>; 2930 let mayLoad = 1 in 2931 def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), 2932 (ins VR128:$src1, intmemop:$src2), 2933 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 2934 []>, Sched<[sched.Folded, sched.ReadAfterFold]>; 2935 } 2936 2937 // We don't want to fold scalar loads into these instructions unless 2938 // optimizing for size. This is because the folded instruction will have a 2939 // partial register update, while the unfolded sequence will not, e.g. 2940 // vmovss mem, %xmm0 2941 // vrcpss %xmm0, %xmm0, %xmm0 2942 // which has a clobber before the rcp, vs. 2943 // vrcpss mem, %xmm0, %xmm0 2944 // TODO: In theory, we could fold the load, and avoid the stall caused by 2945 // the partial register store, either in BreakFalseDeps or with smarter RA. 2946 let Predicates = [target] in { 2947 def : Pat<(OpNode RC:$src), (!cast<Instruction>(NAME#r) 2948 (ScalarVT (IMPLICIT_DEF)), RC:$src)>; 2949 } 2950 let Predicates = [target, OptForSize] in { 2951 def : Pat<(ScalarVT (OpNode (load addr:$src))), 2952 (!cast<Instruction>(NAME#m) (ScalarVT (IMPLICIT_DEF)), 2953 addr:$src)>; 2954 } 2955} 2956 2957/// sse1_fp_unop_p - SSE1 unops in packed form. 
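/// Unlike the scalar forms above, a packed unop such as "rsqrtps %xmm1, %xmm0"
/// rewrites all four f32 lanes of the destination, so no tied-operand
/// constraint is needed.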
2958multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, 2959 X86SchedWriteWidths sched, list<Predicate> prds> { 2960let Predicates = prds in { 2961 def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2962 !strconcat("v", OpcodeStr, 2963 "ps\t{$src, $dst|$dst, $src}"), 2964 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>, 2965 VEX, Sched<[sched.XMM]>, WIG; 2966 def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 2967 !strconcat("v", OpcodeStr, 2968 "ps\t{$src, $dst|$dst, $src}"), 2969 [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>, 2970 VEX, Sched<[sched.XMM.Folded]>, WIG; 2971 def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 2972 !strconcat("v", OpcodeStr, 2973 "ps\t{$src, $dst|$dst, $src}"), 2974 [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>, 2975 VEX, VEX_L, Sched<[sched.YMM]>, WIG; 2976 def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 2977 !strconcat("v", OpcodeStr, 2978 "ps\t{$src, $dst|$dst, $src}"), 2979 [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))]>, 2980 VEX, VEX_L, Sched<[sched.YMM.Folded]>, WIG; 2981} 2982 2983 def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2984 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), 2985 [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>, 2986 Sched<[sched.XMM]>; 2987 def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 2988 !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), 2989 [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>, 2990 Sched<[sched.XMM.Folded]>; 2991} 2992 2993/// sse2_fp_unop_p - SSE2 unops in vector forms. 2994multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr, 2995 SDPatternOperator OpNode, X86SchedWriteWidths sched> { 2996let Predicates = [HasAVX, NoVLX] in { 2997 def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 2998 !strconcat("v", OpcodeStr, 2999 "pd\t{$src, $dst|$dst, $src}"), 3000 [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>, 3001 VEX, Sched<[sched.XMM]>, WIG; 3002 def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 3003 !strconcat("v", OpcodeStr, 3004 "pd\t{$src, $dst|$dst, $src}"), 3005 [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>, 3006 VEX, Sched<[sched.XMM.Folded]>, WIG; 3007 def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 3008 !strconcat("v", OpcodeStr, 3009 "pd\t{$src, $dst|$dst, $src}"), 3010 [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>, 3011 VEX, VEX_L, Sched<[sched.YMM]>, WIG; 3012 def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 3013 !strconcat("v", OpcodeStr, 3014 "pd\t{$src, $dst|$dst, $src}"), 3015 [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))]>, 3016 VEX, VEX_L, Sched<[sched.YMM.Folded]>, WIG; 3017} 3018 3019 def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3020 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), 3021 [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>, 3022 Sched<[sched.XMM]>; 3023 def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 3024 !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), 3025 [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>, 3026 Sched<[sched.XMM.Folded]>; 3027} 3028 3029multiclass sse1_fp_unop_s_intr<string OpcodeStr, Predicate AVXTarget> { 3030 defm SS : sse_fp_unop_s_intr<v4f32, sse_load_f32, 3031 !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss), 3032 UseSSE1>, TB, XS; 3033 defm V#NAME#SS : avx_fp_unop_s_intr<v4f32, 
sse_load_f32, 3034 !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss), 3035 AVXTarget>, 3036 TB, XS, VEX, VVVV, VEX_LIG, WIG; 3037} 3038 3039multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, 3040 X86SchedWriteWidths sched, Predicate AVXTarget> { 3041 defm SS : sse_fp_unop_s<opc, OpcodeStr#ss, FR32, f32mem, 3042 ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, TB, XS; 3043 defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr#ss, FR32, f32, 3044 f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>, 3045 TB, XS, VEX, VVVV, VEX_LIG, WIG; 3046} 3047 3048multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, 3049 X86SchedWriteWidths sched, Predicate AVXTarget> { 3050 defm SD : sse_fp_unop_s<opc, OpcodeStr#sd, FR64, f64mem, 3051 sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, TB, XD; 3052 defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr#sd, FR64, f64, 3053 f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>, 3054 TB, XD, VEX, VVVV, VEX_LIG, WIG; 3055} 3056 3057// Square root. 3058defm SQRT : sse1_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, UseAVX>, 3059 sse1_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>, 3060 sse2_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64, UseAVX>, 3061 sse2_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64>, SIMD_EXC; 3062 3063// Reciprocal approximations. Note that these typically require refinement 3064// in order to obtain suitable precision. 3065defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>, 3066 sse1_fp_unop_s_intr<"rsqrt", HasAVX>, 3067 sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>; 3068defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>, 3069 sse1_fp_unop_s_intr<"rcp", HasAVX>, 3070 sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>; 3071 3072// There is no f64 version of the reciprocal approximation instructions. 3073 3074multiclass scalar_unary_math_patterns<SDPatternOperator OpNode, string OpcPrefix, SDNode Move, 3075 ValueType VT, Predicate BasePredicate> { 3076 let Predicates = [BasePredicate] in { 3077 def : Pat<(VT (Move VT:$dst, (scalar_to_vector 3078 (OpNode (extractelt VT:$src, 0))))), 3079 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>; 3080 } 3081 3082 // Repeat for AVX versions of the instructions. 3083 let Predicates = [UseAVX] in { 3084 def : Pat<(VT (Move VT:$dst, (scalar_to_vector 3085 (OpNode (extractelt VT:$src, 0))))), 3086 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>; 3087 } 3088} 3089 3090defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>; 3091defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>; 3092 3093multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix, 3094 SDNode Move, ValueType VT, 3095 Predicate BasePredicate> { 3096 let Predicates = [BasePredicate] in { 3097 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))), 3098 (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>; 3099 } 3100 3101 // Repeat for AVX versions of the instructions. 
3102 let Predicates = [HasAVX] in { 3103 def : Pat<(VT (Move VT:$dst, (Intr VT:$src))), 3104 (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>; 3105 } 3106} 3107 3108defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss, 3109 v4f32, UseSSE1>; 3110defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss, 3111 v4f32, UseSSE1>; 3112 3113 3114//===----------------------------------------------------------------------===// 3115// SSE 1 & 2 - Non-temporal stores 3116//===----------------------------------------------------------------------===// 3117 3118let AddedComplexity = 400 in { // Prefer non-temporal versions 3119let Predicates = [HasAVX, NoVLX] in { 3120let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in { 3121def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs), 3122 (ins f128mem:$dst, VR128:$src), 3123 "movntps\t{$src, $dst|$dst, $src}", 3124 [(alignednontemporalstore (v4f32 VR128:$src), 3125 addr:$dst)]>, VEX, WIG; 3126def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs), 3127 (ins f128mem:$dst, VR128:$src), 3128 "movntpd\t{$src, $dst|$dst, $src}", 3129 [(alignednontemporalstore (v2f64 VR128:$src), 3130 addr:$dst)]>, VEX, WIG; 3131} // SchedRW 3132 3133let SchedRW = [SchedWriteFMoveLSNT.YMM.MR] in { 3134def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs), 3135 (ins f256mem:$dst, VR256:$src), 3136 "movntps\t{$src, $dst|$dst, $src}", 3137 [(alignednontemporalstore (v8f32 VR256:$src), 3138 addr:$dst)]>, VEX, VEX_L, WIG; 3139def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs), 3140 (ins f256mem:$dst, VR256:$src), 3141 "movntpd\t{$src, $dst|$dst, $src}", 3142 [(alignednontemporalstore (v4f64 VR256:$src), 3143 addr:$dst)]>, VEX, VEX_L, WIG; 3144} // SchedRW 3145 3146let ExeDomain = SSEPackedInt in { 3147def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs), 3148 (ins i128mem:$dst, VR128:$src), 3149 "movntdq\t{$src, $dst|$dst, $src}", 3150 [(alignednontemporalstore (v2i64 VR128:$src), 3151 addr:$dst)]>, VEX, WIG, 3152 Sched<[SchedWriteVecMoveLSNT.XMM.MR]>; 3153def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs), 3154 (ins i256mem:$dst, VR256:$src), 3155 "movntdq\t{$src, $dst|$dst, $src}", 3156 [(alignednontemporalstore (v4i64 VR256:$src), 3157 addr:$dst)]>, VEX, VEX_L, WIG, 3158 Sched<[SchedWriteVecMoveLSNT.YMM.MR]>; 3159} // ExeDomain 3160} // Predicates 3161 3162let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in { 3163def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 3164 "movntps\t{$src, $dst|$dst, $src}", 3165 [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>; 3166def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 3167 "movntpd\t{$src, $dst|$dst, $src}", 3168 [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>; 3169} // SchedRW 3170 3171let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLSNT.XMM.MR] in 3172def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), 3173 "movntdq\t{$src, $dst|$dst, $src}", 3174 [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)]>; 3175 3176let SchedRW = [WriteStoreNT] in { 3177// There is no AVX form for instructions below this point 3178def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), 3179 "movnti{l}\t{$src, $dst|$dst, $src}", 3180 [(nontemporalstore (i32 GR32:$src), addr:$dst)]>, 3181 TB, Requires<[HasSSE2]>; 3182def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), 3183 "movnti{q}\t{$src, $dst|$dst, $src}", 3184 [(nontemporalstore (i64 GR64:$src), addr:$dst)]>, 3185 TB, Requires<[HasSSE2]>; 
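// Illustrative source-level mapping (assuming the usual SSE2 intrinsics):
//   _mm_stream_si32(p, x)  ->  movnti{l}
//   _mm_stream_si64(p, x)  ->  movnti{q}   (64-bit targets only)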
3186} // SchedRW = [WriteStoreNT] 3187 3188let Predicates = [HasAVX, NoVLX] in { 3189 def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst), 3190 (VMOVNTDQYmr addr:$dst, VR256:$src)>; 3191 def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst), 3192 (VMOVNTDQYmr addr:$dst, VR256:$src)>; 3193 def : Pat<(alignednontemporalstore (v16f16 VR256:$src), addr:$dst), 3194 (VMOVNTDQYmr addr:$dst, VR256:$src)>; 3195 def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst), 3196 (VMOVNTDQYmr addr:$dst, VR256:$src)>; 3197 3198 def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), 3199 (VMOVNTDQmr addr:$dst, VR128:$src)>; 3200 def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst), 3201 (VMOVNTDQmr addr:$dst, VR128:$src)>; 3202 def : Pat<(alignednontemporalstore (v8f16 VR128:$src), addr:$dst), 3203 (VMOVNTDQmr addr:$dst, VR128:$src)>; 3204 def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst), 3205 (VMOVNTDQmr addr:$dst, VR128:$src)>; 3206} 3207 3208let Predicates = [UseSSE2] in { 3209 def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst), 3210 (MOVNTDQmr addr:$dst, VR128:$src)>; 3211 def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst), 3212 (MOVNTDQmr addr:$dst, VR128:$src)>; 3213 def : Pat<(alignednontemporalstore (v8f16 VR128:$src), addr:$dst), 3214 (MOVNTDQmr addr:$dst, VR128:$src)>; 3215 def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst), 3216 (MOVNTDQmr addr:$dst, VR128:$src)>; 3217} 3218 3219} // AddedComplexity 3220 3221//===----------------------------------------------------------------------===// 3222// SSE 1 & 2 - Prefetch and memory fence 3223//===----------------------------------------------------------------------===// 3224 3225// Prefetch intrinsic. 3226let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in { 3227def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src), 3228 "prefetcht0\t$src", [(prefetch addr:$src, timm, (i32 3), (i32 1))]>, TB; 3229def PREFETCHT1 : I<0x18, MRM2m, (outs), (ins i8mem:$src), 3230 "prefetcht1\t$src", [(prefetch addr:$src, timm, (i32 2), (i32 1))]>, TB; 3231def PREFETCHT2 : I<0x18, MRM3m, (outs), (ins i8mem:$src), 3232 "prefetcht2\t$src", [(prefetch addr:$src, timm, (i32 1), (i32 1))]>, TB; 3233def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src), 3234 "prefetchnta\t$src", [(prefetch addr:$src, timm, (i32 0), (i32 1))]>, TB; 3235} 3236 3237// FIXME: How should flush instruction be modeled? 3238let SchedRW = [WriteLoad] in { 3239// Flush cache 3240def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src), 3241 "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>, 3242 TB, Requires<[HasCLFLUSH]>; 3243} 3244 3245let SchedRW = [WriteNop] in { 3246// Pause. This "instruction" is encoded as "rep; nop", so even though it 3247// was introduced with SSE2, it's backward compatible. 3248def PAUSE : I<0x90, RawFrm, (outs), (ins), 3249 "pause", [(int_x86_sse2_pause)]>, XS; 3250} 3251 3252let SchedRW = [WriteFence] in { 3253// Load, store, and memory fence 3254// TODO: As with mfence, we may want to ease the availability of sfence/lfence 3255// to include any 64-bit target. 
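// Illustrative source-level mapping (assuming <x86intrin.h>):
//   _mm_sfence()  ->  sfence   (SSE1: orders earlier stores)
//   _mm_lfence()  ->  lfence   (SSE2: orders earlier loads)
//   _mm_mfence()  ->  mfence   (orders both loads and stores)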
def SFENCE : I<0xAE, MRM7X, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
             TB, Requires<[HasSSE1]>;
def LFENCE : I<0xAE, MRM5X, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>,
             TB, Requires<[HasSSE2]>;
def MFENCE : I<0xAE, MRM6X, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>,
             TB, Requires<[HasMFence]>;
} // SchedRW

def : Pat<(X86MFence), (MFENCE)>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Load/Store MXCSR register
//===----------------------------------------------------------------------===//

let mayLoad=1, hasSideEffects=1, Defs=[MXCSR] in
def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
               "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
               VEX, Sched<[WriteLDMXCSR]>, WIG;
let mayStore=1, hasSideEffects=1, Uses=[MXCSR] in
def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
               "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
               VEX, Sched<[WriteSTMXCSR]>, WIG;

let mayLoad=1, hasSideEffects=1, Defs=[MXCSR] in
def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
                "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
                TB, Sched<[WriteLDMXCSR]>;
let mayStore=1, hasSideEffects=1, Uses=[MXCSR] in
def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
               "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
               TB, Sched<[WriteSTMXCSR]>;

//===---------------------------------------------------------------------===//
// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in { // SSE integer instructions

let hasSideEffects = 0 in {
def VMOVDQArr  : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, WIG;
def VMOVDQUrr  : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "movdqu\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, WIG;
def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, WIG;
def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                      "movdqu\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, WIG;
}

// For Disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def VMOVDQArr_REV  : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                          "movdqa\t{$src, $dst|$dst, $src}", []>,
                          Sched<[SchedWriteVecMoveLS.XMM.RR]>,
                          VEX, WIG;
def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                          "movdqa\t{$src, $dst|$dst, $src}", []>,
                          Sched<[SchedWriteVecMoveLS.YMM.RR]>,
                          VEX, VEX_L, WIG;
def VMOVDQUrr_REV  : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                          "movdqu\t{$src, $dst|$dst, $src}", []>,
                          Sched<[SchedWriteVecMoveLS.XMM.RR]>,
                          VEX, WIG;
def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                          "movdqu\t{$src, $dst|$dst, $src}", []>,
                          Sched<[SchedWriteVecMoveLS.YMM.RR]>,
                          VEX, VEX_L, WIG;
}

let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
    hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in {
def VMOVDQArm :
VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3332 "movdqa\t{$src, $dst|$dst, $src}", 3333 [(set VR128:$dst, (alignedloadv2i64 addr:$src))]>, 3334 Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, WIG; 3335def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), 3336 "movdqa\t{$src, $dst|$dst, $src}", []>, 3337 Sched<[SchedWriteVecMoveLS.YMM.RM]>, 3338 VEX, VEX_L, WIG; 3339def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3340 "vmovdqu\t{$src, $dst|$dst, $src}", 3341 [(set VR128:$dst, (loadv2i64 addr:$src))]>, 3342 Sched<[SchedWriteVecMoveLS.XMM.RM]>, 3343 TB, XS, VEX, WIG; 3344def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), 3345 "vmovdqu\t{$src, $dst|$dst, $src}", []>, 3346 Sched<[SchedWriteVecMoveLS.YMM.RM]>, 3347 TB, XS, VEX, VEX_L, WIG; 3348} 3349 3350let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in { 3351def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), 3352 (ins i128mem:$dst, VR128:$src), 3353 "movdqa\t{$src, $dst|$dst, $src}", 3354 [(alignedstore (v2i64 VR128:$src), addr:$dst)]>, 3355 Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, WIG; 3356def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs), 3357 (ins i256mem:$dst, VR256:$src), 3358 "movdqa\t{$src, $dst|$dst, $src}", []>, 3359 Sched<[SchedWriteVecMoveLS.YMM.MR]>, VEX, VEX_L, WIG; 3360def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), 3361 "vmovdqu\t{$src, $dst|$dst, $src}", 3362 [(store (v2i64 VR128:$src), addr:$dst)]>, 3363 Sched<[SchedWriteVecMoveLS.XMM.MR]>, TB, XS, VEX, WIG; 3364def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), 3365 "vmovdqu\t{$src, $dst|$dst, $src}",[]>, 3366 Sched<[SchedWriteVecMoveLS.YMM.MR]>, TB, XS, VEX, VEX_L, WIG; 3367} 3368 3369let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in { 3370let hasSideEffects = 0 in { 3371def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3372 "movdqa\t{$src, $dst|$dst, $src}", []>; 3373 3374def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 3375 "movdqu\t{$src, $dst|$dst, $src}", []>, 3376 TB, XS, Requires<[UseSSE2]>; 3377} 3378 3379// For Disassembler 3380let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { 3381def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 3382 "movdqa\t{$src, $dst|$dst, $src}", []>; 3383 3384def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), 3385 "movdqu\t{$src, $dst|$dst, $src}", []>, 3386 TB, XS, Requires<[UseSSE2]>; 3387} 3388} // SchedRW 3389 3390let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, 3391 hasSideEffects = 0, SchedRW = [SchedWriteVecMoveLS.XMM.RM] in { 3392def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3393 "movdqa\t{$src, $dst|$dst, $src}", 3394 [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>; 3395def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), 3396 "movdqu\t{$src, $dst|$dst, $src}", 3397 [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>, 3398 TB, XS, Requires<[UseSSE2]>; 3399} 3400 3401let mayStore = 1, hasSideEffects = 0, 3402 SchedRW = [SchedWriteVecMoveLS.XMM.MR] in { 3403def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), 3404 "movdqa\t{$src, $dst|$dst, $src}", 3405 [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>; 3406def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), 3407 "movdqu\t{$src, $dst|$dst, $src}", 3408 [/*(store (v2i64 VR128:$src), addr:$dst)*/]>, 3409 TB, XS, 
Requires<[UseSSE2]>; 3410} 3411 3412} // ExeDomain = SSEPackedInt 3413 3414// Reversed version with ".s" suffix for GAS compatibility. 3415def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}", 3416 (VMOVDQArr_REV VR128:$dst, VR128:$src), 0>; 3417def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}", 3418 (VMOVDQAYrr_REV VR256:$dst, VR256:$src), 0>; 3419def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}", 3420 (VMOVDQUrr_REV VR128:$dst, VR128:$src), 0>; 3421def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}", 3422 (VMOVDQUYrr_REV VR256:$dst, VR256:$src), 0>; 3423 3424// Reversed version with ".s" suffix for GAS compatibility. 3425def : InstAlias<"movdqa.s\t{$src, $dst|$dst, $src}", 3426 (MOVDQArr_REV VR128:$dst, VR128:$src), 0>; 3427def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}", 3428 (MOVDQUrr_REV VR128:$dst, VR128:$src), 0>; 3429 3430let Predicates = [HasAVX, NoVLX] in { 3431 // Additional patterns for other integer sizes. 3432 def : Pat<(alignedloadv4i32 addr:$src), 3433 (VMOVDQArm addr:$src)>; 3434 def : Pat<(alignedloadv8i16 addr:$src), 3435 (VMOVDQArm addr:$src)>; 3436 def : Pat<(alignedloadv8f16 addr:$src), 3437 (VMOVDQArm addr:$src)>; 3438 def : Pat<(alignedloadv16i8 addr:$src), 3439 (VMOVDQArm addr:$src)>; 3440 def : Pat<(loadv4i32 addr:$src), 3441 (VMOVDQUrm addr:$src)>; 3442 def : Pat<(loadv8i16 addr:$src), 3443 (VMOVDQUrm addr:$src)>; 3444 def : Pat<(loadv8f16 addr:$src), 3445 (VMOVDQUrm addr:$src)>; 3446 def : Pat<(loadv16i8 addr:$src), 3447 (VMOVDQUrm addr:$src)>; 3448 3449 def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst), 3450 (VMOVDQAmr addr:$dst, VR128:$src)>; 3451 def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst), 3452 (VMOVDQAmr addr:$dst, VR128:$src)>; 3453 def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst), 3454 (VMOVDQAmr addr:$dst, VR128:$src)>; 3455 def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst), 3456 (VMOVDQAmr addr:$dst, VR128:$src)>; 3457 def : Pat<(store (v4i32 VR128:$src), addr:$dst), 3458 (VMOVDQUmr addr:$dst, VR128:$src)>; 3459 def : Pat<(store (v8i16 VR128:$src), addr:$dst), 3460 (VMOVDQUmr addr:$dst, VR128:$src)>; 3461 def : Pat<(store (v8f16 VR128:$src), addr:$dst), 3462 (VMOVDQUmr addr:$dst, VR128:$src)>; 3463 def : Pat<(store (v16i8 VR128:$src), addr:$dst), 3464 (VMOVDQUmr addr:$dst, VR128:$src)>; 3465} 3466 3467//===---------------------------------------------------------------------===// 3468// SSE2 - Packed Integer Arithmetic Instructions 3469//===---------------------------------------------------------------------===// 3470 3471let ExeDomain = SSEPackedInt in { // SSE integer instructions 3472 3473/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types 3474multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode, 3475 ValueType DstVT, ValueType SrcVT, RegisterClass RC, 3476 PatFrag memop_frag, X86MemOperand x86memop, 3477 X86FoldableSchedWrite sched, bit Is2Addr = 1> { 3478 let isCommutable = 1 in 3479 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), 3480 (ins RC:$src1, RC:$src2), 3481 !if(Is2Addr, 3482 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3483 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3484 [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>, 3485 Sched<[sched]>; 3486 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), 3487 (ins RC:$src1, x86memop:$src2), 3488 !if(Is2Addr, 3489 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3490 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3491 [(set RC:$dst, (DstVT (OpNode 
(SrcVT RC:$src1), 3492 (memop_frag addr:$src2))))]>, 3493 Sched<[sched.Folded, sched.ReadAfterFold]>; 3494} 3495} // ExeDomain = SSEPackedInt 3496 3497defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8, 3498 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3499defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16, 3500 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3501defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32, 3502 SchedWriteVecALU, 1, NoVLX>; 3503defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64, 3504 SchedWriteVecALU, 1, NoVLX>; 3505defm PADDSB : PDI_binop_all<0xEC, "paddsb", saddsat, v16i8, v32i8, 3506 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3507defm PADDSW : PDI_binop_all<0xED, "paddsw", saddsat, v8i16, v16i16, 3508 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3509defm PADDUSB : PDI_binop_all<0xDC, "paddusb", uaddsat, v16i8, v32i8, 3510 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3511defm PADDUSW : PDI_binop_all<0xDD, "paddusw", uaddsat, v8i16, v16i16, 3512 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3513defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16, 3514 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>; 3515defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16, 3516 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>; 3517defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16, 3518 SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>; 3519defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8, 3520 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3521defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16, 3522 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3523defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32, 3524 SchedWriteVecALU, 0, NoVLX>; 3525defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64, 3526 SchedWriteVecALU, 0, NoVLX>; 3527defm PSUBSB : PDI_binop_all<0xE8, "psubsb", ssubsat, v16i8, v32i8, 3528 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3529defm PSUBSW : PDI_binop_all<0xE9, "psubsw", ssubsat, v8i16, v16i16, 3530 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3531defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", usubsat, v16i8, v32i8, 3532 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3533defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", usubsat, v8i16, v16i16, 3534 SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; 3535defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8, 3536 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3537defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16, 3538 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3539defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8, 3540 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3541defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16, 3542 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3543defm PAVGB : PDI_binop_all<0xE0, "pavgb", avgceilu, v16i8, v32i8, 3544 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3545defm PAVGW : PDI_binop_all<0xE3, "pavgw", avgceilu, v8i16, v16i16, 3546 SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; 3547defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64, 3548 SchedWriteVecIMul, 1, NoVLX>; 3549 3550let Predicates = [HasAVX, NoVLX_Or_NoBWI] in 3551defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128, 3552 load, i128mem, SchedWriteVecIMul.XMM, 0>, 3553 VEX, VVVV, WIG; 3554 3555let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in 3556defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16, 3557 VR256, load, i256mem, SchedWriteVecIMul.YMM, 3558 0>, VEX, VVVV, VEX_L, WIG; 3559let Constraints = "$src1 = $dst" in 3560defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", 
X86vpmaddwd, v4i32, v8i16, VR128, 3561 memop, i128mem, SchedWriteVecIMul.XMM>; 3562 3563let Predicates = [HasAVX, NoVLX_Or_NoBWI] in 3564defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128, 3565 load, i128mem, SchedWritePSADBW.XMM, 0>, 3566 VEX, VVVV, WIG; 3567let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in 3568defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256, 3569 load, i256mem, SchedWritePSADBW.YMM, 0>, 3570 VEX, VVVV, VEX_L, WIG; 3571let Constraints = "$src1 = $dst" in 3572defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128, 3573 memop, i128mem, SchedWritePSADBW.XMM>; 3574 3575//===---------------------------------------------------------------------===// 3576// SSE2 - Packed Integer Logical Instructions 3577//===---------------------------------------------------------------------===// 3578 3579multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm, 3580 string OpcodeStr, SDNode OpNode, 3581 SDNode OpNode2, RegisterClass RC, 3582 X86FoldableSchedWrite sched, 3583 X86FoldableSchedWrite schedImm, 3584 ValueType DstVT, ValueType SrcVT, 3585 PatFrag ld_frag, bit Is2Addr = 1> { 3586 // src2 is always 128-bit 3587 def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), 3588 (ins RC:$src1, VR128:$src2), 3589 !if(Is2Addr, 3590 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3591 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3592 [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))]>, 3593 Sched<[sched]>; 3594 def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), 3595 (ins RC:$src1, i128mem:$src2), 3596 !if(Is2Addr, 3597 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3598 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3599 [(set RC:$dst, (DstVT (OpNode RC:$src1, 3600 (SrcVT (ld_frag addr:$src2)))))]>, 3601 Sched<[sched.Folded, sched.ReadAfterFold]>; 3602 def ri : PDIi8<opc2, ImmForm, (outs RC:$dst), 3603 (ins RC:$src1, u8imm:$src2), 3604 !if(Is2Addr, 3605 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3606 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3607 [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 timm:$src2))))]>, 3608 Sched<[schedImm]>; 3609} 3610 3611multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm, 3612 string OpcodeStr, SDNode OpNode, 3613 SDNode OpNode2, ValueType DstVT128, 3614 ValueType DstVT256, ValueType SrcVT, 3615 X86SchedWriteWidths sched, 3616 X86SchedWriteWidths schedImm, Predicate prd> { 3617let Predicates = [HasAVX, prd] in 3618 defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr), 3619 OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM, 3620 DstVT128, SrcVT, load, 0>, VEX, VVVV, WIG; 3621let Predicates = [HasAVX2, prd] in 3622 defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr), 3623 OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM, 3624 DstVT256, SrcVT, load, 0>, VEX, VVVV, VEX_L, 3625 WIG; 3626let Constraints = "$src1 = $dst" in 3627 defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2, 3628 VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT, 3629 memop>; 3630} 3631 3632multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr, 3633 SDNode OpNode, RegisterClass RC, ValueType VT, 3634 X86FoldableSchedWrite sched, bit Is2Addr = 1> { 3635 def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2), 3636 !if(Is2Addr, 3637 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3638 !strconcat(OpcodeStr, 
"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3639 [(set RC:$dst, (VT (OpNode RC:$src1, (i8 timm:$src2))))]>, 3640 Sched<[sched]>; 3641} 3642 3643multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr, 3644 SDNode OpNode, X86SchedWriteWidths sched> { 3645let Predicates = [HasAVX, NoVLX_Or_NoBWI] in 3646 defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode, 3647 VR128, v16i8, sched.XMM, 0>, VEX, VVVV, WIG; 3648let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in 3649 defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode, 3650 VR256, v32i8, sched.YMM, 0>, 3651 VEX, VVVV, VEX_L, WIG; 3652let Constraints = "$src1 = $dst" in 3653 defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8, 3654 sched.XMM>; 3655} 3656 3657let ExeDomain = SSEPackedInt in { 3658 defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli, 3659 v8i16, v16i16, v8i16, SchedWriteVecShift, 3660 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; 3661 defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli, 3662 v4i32, v8i32, v4i32, SchedWriteVecShift, 3663 SchedWriteVecShiftImm, NoVLX>; 3664 defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli, 3665 v2i64, v4i64, v2i64, SchedWriteVecShift, 3666 SchedWriteVecShiftImm, NoVLX>; 3667 3668 defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli, 3669 v8i16, v16i16, v8i16, SchedWriteVecShift, 3670 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; 3671 defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli, 3672 v4i32, v8i32, v4i32, SchedWriteVecShift, 3673 SchedWriteVecShiftImm, NoVLX>; 3674 defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli, 3675 v2i64, v4i64, v2i64, SchedWriteVecShift, 3676 SchedWriteVecShiftImm, NoVLX>; 3677 3678 defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai, 3679 v8i16, v16i16, v8i16, SchedWriteVecShift, 3680 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; 3681 defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai, 3682 v4i32, v8i32, v4i32, SchedWriteVecShift, 3683 SchedWriteVecShiftImm, NoVLX>; 3684 3685 defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq, 3686 SchedWriteShuffle>; 3687 defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq, 3688 SchedWriteShuffle>; 3689} // ExeDomain = SSEPackedInt 3690 3691//===---------------------------------------------------------------------===// 3692// SSE2 - Packed Integer Comparison Instructions 3693//===---------------------------------------------------------------------===// 3694 3695defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8, 3696 SchedWriteVecALU, 1, TruePredicate>; 3697defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16, 3698 SchedWriteVecALU, 1, TruePredicate>; 3699defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32, 3700 SchedWriteVecALU, 1, TruePredicate>; 3701defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8, 3702 SchedWriteVecALU, 0, TruePredicate>; 3703defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16, 3704 SchedWriteVecALU, 0, TruePredicate>; 3705defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32, 3706 SchedWriteVecALU, 0, TruePredicate>; 3707 3708//===---------------------------------------------------------------------===// 3709// SSE2 - Packed Integer Shuffle Instructions 
3710//===---------------------------------------------------------------------===// 3711 3712let ExeDomain = SSEPackedInt in { 3713multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256, 3714 SDNode OpNode, X86SchedWriteWidths sched, 3715 Predicate prd> { 3716let Predicates = [HasAVX, prd] in { 3717 def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst), 3718 (ins VR128:$src1, u8imm:$src2), 3719 !strconcat("v", OpcodeStr, 3720 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3721 [(set VR128:$dst, 3722 (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>, 3723 VEX, Sched<[sched.XMM]>, WIG; 3724 def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst), 3725 (ins i128mem:$src1, u8imm:$src2), 3726 !strconcat("v", OpcodeStr, 3727 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3728 [(set VR128:$dst, 3729 (vt128 (OpNode (load addr:$src1), 3730 (i8 timm:$src2))))]>, VEX, 3731 Sched<[sched.XMM.Folded]>, WIG; 3732} 3733 3734let Predicates = [HasAVX2, prd] in { 3735 def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst), 3736 (ins VR256:$src1, u8imm:$src2), 3737 !strconcat("v", OpcodeStr, 3738 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3739 [(set VR256:$dst, 3740 (vt256 (OpNode VR256:$src1, (i8 timm:$src2))))]>, 3741 VEX, VEX_L, Sched<[sched.YMM]>, WIG; 3742 def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst), 3743 (ins i256mem:$src1, u8imm:$src2), 3744 !strconcat("v", OpcodeStr, 3745 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3746 [(set VR256:$dst, 3747 (vt256 (OpNode (load addr:$src1), 3748 (i8 timm:$src2))))]>, VEX, VEX_L, 3749 Sched<[sched.YMM.Folded]>, WIG; 3750} 3751 3752let Predicates = [UseSSE2] in { 3753 def ri : Ii8<0x70, MRMSrcReg, 3754 (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), 3755 !strconcat(OpcodeStr, 3756 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3757 [(set VR128:$dst, 3758 (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>, 3759 Sched<[sched.XMM]>; 3760 def mi : Ii8<0x70, MRMSrcMem, 3761 (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2), 3762 !strconcat(OpcodeStr, 3763 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 3764 [(set VR128:$dst, 3765 (vt128 (OpNode (memop addr:$src1), 3766 (i8 timm:$src2))))]>, 3767 Sched<[sched.XMM.Folded]>; 3768} 3769} 3770} // ExeDomain = SSEPackedInt 3771 3772defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd, 3773 SchedWriteShuffle, NoVLX>, TB, PD; 3774defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw, 3775 SchedWriteShuffle, NoVLX_Or_NoBWI>, TB, XS; 3776defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw, 3777 SchedWriteShuffle, NoVLX_Or_NoBWI>, TB, XD; 3778 3779//===---------------------------------------------------------------------===// 3780// Packed Integer Pack Instructions (SSE & AVX) 3781//===---------------------------------------------------------------------===// 3782 3783let ExeDomain = SSEPackedInt in { 3784multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, 3785 ValueType ArgVT, SDNode OpNode, RegisterClass RC, 3786 X86MemOperand x86memop, X86FoldableSchedWrite sched, 3787 PatFrag ld_frag, bit Is2Addr = 1> { 3788 def rr : PDI<opc, MRMSrcReg, 3789 (outs RC:$dst), (ins RC:$src1, RC:$src2), 3790 !if(Is2Addr, 3791 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3792 !strconcat(OpcodeStr, 3793 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3794 [(set RC:$dst, 3795 (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>, 3796 Sched<[sched]>; 3797 def rm : PDI<opc, MRMSrcMem, 3798 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 3799 !if(Is2Addr, 3800 
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3801 !strconcat(OpcodeStr, 3802 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3803 [(set RC:$dst, 3804 (OutVT (OpNode (ArgVT RC:$src1), 3805 (ld_frag addr:$src2))))]>, 3806 Sched<[sched.Folded, sched.ReadAfterFold]>; 3807} 3808 3809multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, 3810 ValueType ArgVT, SDNode OpNode, RegisterClass RC, 3811 X86MemOperand x86memop, X86FoldableSchedWrite sched, 3812 PatFrag ld_frag, bit Is2Addr = 1> { 3813 def rr : SS48I<opc, MRMSrcReg, 3814 (outs RC:$dst), (ins RC:$src1, RC:$src2), 3815 !if(Is2Addr, 3816 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3817 !strconcat(OpcodeStr, 3818 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3819 [(set RC:$dst, 3820 (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>, 3821 Sched<[sched]>; 3822 def rm : SS48I<opc, MRMSrcMem, 3823 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 3824 !if(Is2Addr, 3825 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3826 !strconcat(OpcodeStr, 3827 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3828 [(set RC:$dst, 3829 (OutVT (OpNode (ArgVT RC:$src1), 3830 (ld_frag addr:$src2))))]>, 3831 Sched<[sched.Folded, sched.ReadAfterFold]>; 3832} 3833 3834let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 3835 defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128, 3836 i128mem, SchedWriteShuffle.XMM, load, 0>, 3837 VEX, VVVV, WIG; 3838 defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128, 3839 i128mem, SchedWriteShuffle.XMM, load, 0>, 3840 VEX, VVVV, WIG; 3841 3842 defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128, 3843 i128mem, SchedWriteShuffle.XMM, load, 0>, 3844 VEX, VVVV, WIG; 3845 defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128, 3846 i128mem, SchedWriteShuffle.XMM, load, 0>, 3847 VEX, VVVV, WIG; 3848} 3849 3850let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 3851 defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256, 3852 i256mem, SchedWriteShuffle.YMM, load, 0>, 3853 VEX, VVVV, VEX_L, WIG; 3854 defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256, 3855 i256mem, SchedWriteShuffle.YMM, load, 0>, 3856 VEX, VVVV, VEX_L, WIG; 3857 3858 defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256, 3859 i256mem, SchedWriteShuffle.YMM, load, 0>, 3860 VEX, VVVV, VEX_L, WIG; 3861 defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256, 3862 i256mem, SchedWriteShuffle.YMM, load, 0>, 3863 VEX, VVVV, VEX_L, WIG; 3864} 3865 3866let Constraints = "$src1 = $dst" in { 3867 defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128, 3868 i128mem, SchedWriteShuffle.XMM, memop>; 3869 defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128, 3870 i128mem, SchedWriteShuffle.XMM, memop>; 3871 3872 defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128, 3873 i128mem, SchedWriteShuffle.XMM, memop>; 3874 3875 defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128, 3876 i128mem, SchedWriteShuffle.XMM, memop>; 3877} 3878} // ExeDomain = SSEPackedInt 3879 3880//===---------------------------------------------------------------------===// 3881// SSE2 - Packed Integer Unpack Instructions 3882//===---------------------------------------------------------------------===// 3883 3884let ExeDomain = SSEPackedInt in { 3885multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt, 3886 
SDNode OpNode, RegisterClass RC, X86MemOperand x86memop, 3887 X86FoldableSchedWrite sched, PatFrag ld_frag, 3888 bit Is2Addr = 1> { 3889 def rr : PDI<opc, MRMSrcReg, 3890 (outs RC:$dst), (ins RC:$src1, RC:$src2), 3891 !if(Is2Addr, 3892 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), 3893 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3894 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>, 3895 Sched<[sched]>; 3896 def rm : PDI<opc, MRMSrcMem, 3897 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 3898 !if(Is2Addr, 3899 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), 3900 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3901 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>, 3902 Sched<[sched.Folded, sched.ReadAfterFold]>; 3903} 3904 3905let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 3906 defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128, 3907 i128mem, SchedWriteShuffle.XMM, load, 0>, 3908 VEX, VVVV, WIG; 3909 defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128, 3910 i128mem, SchedWriteShuffle.XMM, load, 0>, 3911 VEX, VVVV, WIG; 3912 defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128, 3913 i128mem, SchedWriteShuffle.XMM, load, 0>, 3914 VEX, VVVV, WIG; 3915 defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128, 3916 i128mem, SchedWriteShuffle.XMM, load, 0>, 3917 VEX, VVVV, WIG; 3918} 3919 3920let Predicates = [HasAVX, NoVLX] in { 3921 defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128, 3922 i128mem, SchedWriteShuffle.XMM, load, 0>, 3923 VEX, VVVV, WIG; 3924 defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128, 3925 i128mem, SchedWriteShuffle.XMM, load, 0>, 3926 VEX, VVVV, WIG; 3927 defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128, 3928 i128mem, SchedWriteShuffle.XMM, load, 0>, 3929 VEX, VVVV, WIG; 3930 defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128, 3931 i128mem, SchedWriteShuffle.XMM, load, 0>, 3932 VEX, VVVV, WIG; 3933} 3934 3935let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 3936 defm VPUNPCKLBWY : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256, 3937 i256mem, SchedWriteShuffle.YMM, load, 0>, 3938 VEX, VVVV, VEX_L, WIG; 3939 defm VPUNPCKLWDY : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256, 3940 i256mem, SchedWriteShuffle.YMM, load, 0>, 3941 VEX, VVVV, VEX_L, WIG; 3942 defm VPUNPCKHBWY : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256, 3943 i256mem, SchedWriteShuffle.YMM, load, 0>, 3944 VEX, VVVV, VEX_L, WIG; 3945 defm VPUNPCKHWDY : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256, 3946 i256mem, SchedWriteShuffle.YMM, load, 0>, 3947 VEX, VVVV, VEX_L, WIG; 3948} 3949 3950let Predicates = [HasAVX2, NoVLX] in { 3951 defm VPUNPCKLDQY : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256, 3952 i256mem, SchedWriteShuffle.YMM, load, 0>, 3953 VEX, VVVV, VEX_L, WIG; 3954 defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256, 3955 i256mem, SchedWriteShuffle.YMM, load, 0>, 3956 VEX, VVVV, VEX_L, WIG; 3957 defm VPUNPCKHDQY : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256, 3958 i256mem, SchedWriteShuffle.YMM, load, 0>, 3959 VEX, VVVV, VEX_L, WIG; 3960 defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256, 3961 i256mem, SchedWriteShuffle.YMM, load, 0>, 3962 VEX, VVVV, VEX_L, WIG; 3963} 3964 3965let Constraints = "$src1 = $dst" in { 3966 defm PUNPCKLBW : 

let Constraints = "$src1 = $dst" in {
  defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128,
                               i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128,
                               i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128,
                               i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128,
                                i128mem, SchedWriteShuffle.XMM, memop>;

  defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128,
                               i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128,
                               i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128,
                               i128mem, SchedWriteShuffle.XMM, memop>;
  defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128,
                                i128mem, SchedWriteShuffle.XMM, memop>;
}
} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Extract and Insert
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {
multiclass sse2_pinsrw<bit Is2Addr = 1> {
  def rr : Ii8<0xC4, MRMSrcReg,
               (outs VR128:$dst), (ins VR128:$src1,
                GR32orGR64:$src2, u8imm:$src3),
               !if(Is2Addr,
                   "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                   "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
               [(set VR128:$dst,
                     (X86pinsrw VR128:$src1, GR32orGR64:$src2, timm:$src3))]>,
               Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  def rm : Ii8<0xC4, MRMSrcMem,
               (outs VR128:$dst), (ins VR128:$src1,
                i16mem:$src2, u8imm:$src3),
               !if(Is2Addr,
                   "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                   "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
               [(set VR128:$dst,
                     (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
                                timm:$src3))]>,
               Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}

// Extract
let Predicates = [HasAVX, NoBWI] in
def VPEXTRWrr : Ii8<0xC5, MRMSrcReg,
                    (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
                    "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
                                                      timm:$src2))]>,
                    TB, PD, VEX, WIG, Sched<[WriteVecExtract]>;
def PEXTRWrr : PDIi8<0xC5, MRMSrcReg,
                     (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
                     "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
                                                       timm:$src2))]>,
                     Sched<[WriteVecExtract]>;

// Insert
let Predicates = [HasAVX, NoBWI] in
defm VPINSRW : sse2_pinsrw<0>, TB, PD, VEX, VVVV, WIG;

let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
defm PINSRW : sse2_pinsrw, TB, PD;

} // ExeDomain = SSEPackedInt

// Always select FP16 instructions if available.
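// The patterns below emulate f16 load/store/bitcast using PINSRW/PEXTRW; the
// negative AddedComplexity keeps them from overriding real FP16 instruction
// patterns when those are available.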
let Predicates = [UseSSE2], AddedComplexity = -10 in {
  def : Pat<(f16 (load addr:$src)),
            (COPY_TO_REGCLASS (PINSRWrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0),
                              FR16)>;
  def : Pat<(store f16:$src, addr:$dst),
            (MOV16mr addr:$dst,
                     (EXTRACT_SUBREG (PEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0),
                                     sub_16bit))>;
  def : Pat<(i16 (bitconvert f16:$src)),
            (EXTRACT_SUBREG (PEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0),
                            sub_16bit)>;
  def : Pat<(f16 (bitconvert i16:$src)),
            (COPY_TO_REGCLASS (PINSRWrr (v8i16 (IMPLICIT_DEF)),
                                        (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit),
                                        0),
                              FR16)>;
}

let Predicates = [HasAVX, NoBWI] in {
  def : Pat<(f16 (load addr:$src)),
            (COPY_TO_REGCLASS (VPINSRWrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0),
                              FR16)>;
  def : Pat<(i16 (bitconvert f16:$src)),
            (EXTRACT_SUBREG (VPEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0),
                            sub_16bit)>;
  def : Pat<(f16 (bitconvert i16:$src)),
            (COPY_TO_REGCLASS (VPINSRWrr (v8i16 (IMPLICIT_DEF)),
                                         (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit),
                                         0),
                              FR16)>;
}

//===---------------------------------------------------------------------===//
// SSE2 - Packed Mask Creation
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {

def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
                       (ins VR128:$src),
                       "pmovmskb\t{$src, $dst|$dst, $src}",
                       [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
                       Sched<[WriteVecMOVMSK]>, VEX, WIG;

let Predicates = [HasAVX2] in {
def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
                        (ins VR256:$src),
                        "pmovmskb\t{$src, $dst|$dst, $src}",
                        [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>,
                        Sched<[WriteVecMOVMSKY]>, VEX, VEX_L, WIG;
}

def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
                     "pmovmskb\t{$src, $dst|$dst, $src}",
                     [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
                     Sched<[WriteVecMOVMSK]>;

} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// SSE2 - Conditional Store
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
// As VEX does not have separate instruction contexts for address size
// overrides, VMASKMOVDQU and VMASKMOVDQU64 would have a decode conflict.
// Prefer VMASKMOVDQU64.
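
// MASKMOVDQU performs a byte-masked store of $src to [EDI]/[RDI]: only the
// bytes whose corresponding mask byte has its most significant bit set are
// written, with a non-temporal hint.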
let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
                         (ins VR128:$src, VR128:$mask),
                         "maskmovdqu\t{$mask, $src|$src, $mask}",
                         [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>,
                         VEX, WIG;
let Uses = [EDI], Predicates = [HasAVX], isAsmParserOnly = 1 in
def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
                       (ins VR128:$src, VR128:$mask),
                       "maskmovdqu\t{$mask, $src|$src, $mask}",
                       [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
                       VEX, WIG;

let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
                       "maskmovdqu\t{$mask, $src|$src, $mask}",
                       [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;
let Uses = [EDI], Predicates = [UseSSE2] in
def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
                     "maskmovdqu\t{$mask, $src|$src, $mask}",
                     [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>;

} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// SSE2 - Move Doubleword/Quadword
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//
// Move Int Doubleword to Packed Double Int
//
let ExeDomain = SSEPackedInt in {
def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                              (v4i32 (scalar_to_vector GR32:$src)))]>,
                        VEX, Sched<[WriteVecMoveFromGpr]>;
def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                              (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
                        VEX, Sched<[WriteVecLoad]>;
def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst,
                                (v2i64 (scalar_to_vector GR64:$src)))]>,
                          VEX, Sched<[WriteVecMoveFromGpr]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                          "movq\t{$src, $dst|$dst, $src}", []>,
                          VEX, Sched<[WriteVecLoad]>;
let isCodeGenOnly = 1 in
def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(set FR64:$dst, (bitconvert GR64:$src))]>,
                         VEX, Sched<[WriteVecMoveFromGpr]>;

def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                            (v4i32 (scalar_to_vector GR32:$src)))]>,
                      Sched<[WriteVecMoveFromGpr]>;
def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                            (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
                      Sched<[WriteVecLoad]>;
def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                              (v2i64 (scalar_to_vector GR64:$src)))]>,
                        Sched<[WriteVecMoveFromGpr]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                        "movq\t{$src, $dst|$dst, $src}", []>,
                        Sched<[WriteVecLoad]>;
let isCodeGenOnly = 1 in
def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                       "movq\t{$src, $dst|$dst, $src}",
                       [(set FR64:$dst, (bitconvert GR64:$src))]>,
                       Sched<[WriteVecMoveFromGpr]>;
} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// Move Int Doubleword to Single Scalar
//
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
  def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                         "movd\t{$src, $dst|$dst, $src}",
                         [(set FR32:$dst, (bitconvert GR32:$src))]>,
                         VEX, Sched<[WriteVecMoveFromGpr]>;

  def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set FR32:$dst, (bitconvert GR32:$src))]>,
                       Sched<[WriteVecMoveFromGpr]>;

} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1

//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int to Packed Double Int
//
let ExeDomain = SSEPackedInt in {
def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
                                                     (iPTR 0)))]>, VEX,
                        Sched<[WriteVecMoveToGpr]>;
def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs),
                        (ins i32mem:$dst, VR128:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(store (i32 (extractelt (v4i32 VR128:$src),
                                                 (iPTR 0))), addr:$dst)]>,
                        VEX, Sched<[WriteVecStore]>;
def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
                                                   (iPTR 0)))]>,
                      Sched<[WriteVecMoveToGpr]>;
def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
                      "movd\t{$src, $dst|$dst, $src}",
                      [(store (i32 (extractelt (v4i32 VR128:$src),
                                               (iPTR 0))), addr:$dst)]>,
                      Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int first element to Doubleword Int
//
let ExeDomain = SSEPackedInt in {
let SchedRW = [WriteVecMoveToGpr] in {
def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
                                                       (iPTR 0)))]>,
                          VEX;

def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
                                                     (iPTR 0)))]>;
} // SchedRW

let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs),
                          (ins i64mem:$dst, VR128:$src),
                          "movq\t{$src, $dst|$dst, $src}", []>,
                          VEX, Sched<[WriteVecStore]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}", []>,
                        Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// Bitcast FR64 <-> GR64
//
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
  def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst),
                           (ins FR64:$src),
                           "movq\t{$src, $dst|$dst, $src}",
                           [(set GR64:$dst, (bitconvert FR64:$src))]>,
                           VEX, Sched<[WriteVecMoveToGpr]>;

  def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(set GR64:$dst, (bitconvert FR64:$src))]>,
                         Sched<[WriteVecMoveToGpr]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1

//===---------------------------------------------------------------------===//
// Move Scalar Single to Double Int
//
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
  def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                         "movd\t{$src, $dst|$dst, $src}",
                         [(set GR32:$dst, (bitconvert FR32:$src))]>,
                         VEX, Sched<[WriteVecMoveToGpr]>;
  def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set GR32:$dst, (bitconvert FR32:$src))]>,
                       Sched<[WriteVecMoveToGpr]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1

let Predicates = [UseAVX] in {
  def : Pat<(v4i32 (scalar_to_vector (i32 (anyext GR8:$src)))),
            (VMOVDI2PDIrr (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                              GR8:$src, sub_8bit)))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
            (VMOVDI2PDIrr GR32:$src)>;

  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
            (VMOV64toPQIrr GR64:$src)>;

  // AVX 128-bit movd/movq instructions write zeros in the high part of the
  // 128-bit result, and also zero the high part of a 256-bit register.
  def : Pat<(v4i32 (X86vzload32 addr:$src)),
            (VMOVDI2PDIrm addr:$src)>;
  def : Pat<(v8i32 (X86vzload32 addr:$src)),
            (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
            (MOVDI2PDIrr GR32:$src)>;

  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
            (MOV64toPQIrr GR64:$src)>;
  def : Pat<(v4i32 (X86vzload32 addr:$src)),
            (MOVDI2PDIrm addr:$src)>;
}

// Before the MC layer of LLVM existed, clang emitted "movd" assembly instead
// of "movq" due to a MacOS parsing limitation. To keep parsing that old
// assembly, we add these aliases.
def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
                (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
                (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
// Allow "vmovd", but print "vmovq" since we don't need compatibility for AVX.
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;

//===---------------------------------------------------------------------===//
// SSE2 - Move Quadword
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//
// Move Quadword Int to Packed Quadword Int
//

let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in {
def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "vmovq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                           (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, TB, XS,
                     VEX, Requires<[UseAVX]>, WIG;
def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                    "movq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst,
                          (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
                    TB, XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
} // ExeDomain, SchedRW

//===---------------------------------------------------------------------===//
// Move Packed Quadword Int to Quadword Int
//
let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in {
def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(store (i64 (extractelt (v2i64 VR128:$src),
                                                 (iPTR 0))), addr:$dst)]>,
                        VEX, WIG;
def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(store (i64 (extractelt (v2i64 VR128:$src),
                                               (iPTR 0))), addr:$dst)]>;
} // ExeDomain, SchedRW

// For disassembler only
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [SchedWriteVecLogic.XMM] in {
def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}", []>, VEX, WIG;
def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}", []>;
}

def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
                (VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
                (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;

let Predicates = [UseAVX] in {
  def : Pat<(v2i64 (X86vzload64 addr:$src)),
            (VMOVQI2PQIrm addr:$src)>;
  def : Pat<(v4i64 (X86vzload64 addr:$src)),
            (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;

  def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
            (VMOVPQI2QImr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v2i64 (X86vzload64 addr:$src)), (MOVQI2PQIrm addr:$src)>;

  def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
            (MOVPQI2QImr addr:$dst, VR128:$src)>;
}

//===---------------------------------------------------------------------===//
// Moving from XMM to XMM and clearing the upper 64 bits. Note: the IA-32
// documentation has a bug here; movq xmm1, xmm2 does clear the high bits.
//
let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "vmovq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
                         TB, XS, VEX, Requires<[UseAVX]>, WIG;
def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
                        TB, XS, Requires<[UseSSE2]>;
} // ExeDomain, SchedRW

let Predicates = [UseAVX] in {
  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
            (VMOVZPQILo2PQIrr VR128:$src)>;
}
let Predicates = [UseSSE2] in {
  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
            (MOVZPQILo2PQIrr VR128:$src)>;
}

let Predicates = [UseAVX] in {
  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
                           (v2f64 (VMOVZPQILo2PQIrr
                                   (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
                           sub_xmm)>;
  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
                           (v2i64 (VMOVZPQILo2PQIrr
                                   (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
                           sub_xmm)>;
}

//===---------------------------------------------------------------------===//
// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
//===---------------------------------------------------------------------===//
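
// MOVSHDUP duplicates the odd-index (high) element of each pair of 32-bit
// elements; MOVSLDUP duplicates the even-index (low) element.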

multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
                              ValueType vt, RegisterClass RC, PatFrag mem_frag,
                              X86MemOperand x86memop, X86FoldableSchedWrite sched> {
def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set RC:$dst, (vt (OpNode RC:$src)))]>,
              Sched<[sched]>;
def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>,
              Sched<[sched.Folded]>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                      v4f32, VR128, loadv4f32, f128mem,
                                      SchedWriteFShuffle.XMM>, VEX, WIG;
  defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                      v4f32, VR128, loadv4f32, f128mem,
                                      SchedWriteFShuffle.XMM>, VEX, WIG;
  defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                       v8f32, VR256, loadv8f32, f256mem,
                                       SchedWriteFShuffle.YMM>, VEX, VEX_L, WIG;
  defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                       v8f32, VR256, loadv8f32, f256mem,
                                       SchedWriteFShuffle.YMM>, VEX, VEX_L, WIG;
}
defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
                                   memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
                                   memopv4f32, f128mem, SchedWriteFShuffle.XMM>;

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (VMOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (load addr:$src))),
            (VMOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (VMOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (load addr:$src))),
            (VMOVSLDUPrm addr:$src)>;
  def : Pat<(v8i32 (X86Movshdup VR256:$src)),
            (VMOVSHDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movshdup (load addr:$src))),
            (VMOVSHDUPYrm addr:$src)>;
  def : Pat<(v8i32 (X86Movsldup VR256:$src)),
            (VMOVSLDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movsldup (load addr:$src))),
            (VMOVSLDUPYrm addr:$src)>;
}

let Predicates = [UseSSE3] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (MOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (memop addr:$src))),
            (MOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (MOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (memop addr:$src))),
            (MOVSLDUPrm addr:$src)>;
}

//===---------------------------------------------------------------------===//
// SSE3 - Replicate Double FP - MOVDDUP
//===---------------------------------------------------------------------===//

multiclass sse3_replicate_dfp<string OpcodeStr, X86SchedWriteWidths sched> {
def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))]>,
              Sched<[sched.XMM]>;
def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set VR128:$dst,
                    (v2f64 (X86Movddup
                            (scalar_to_vector (loadf64 addr:$src)))))]>,
              Sched<[sched.XMM.Folded]>;
}

// FIXME: Merge with above classes when there are patterns for the ymm version
multiclass sse3_replicate_dfp_y<string OpcodeStr, X86SchedWriteWidths sched> {
def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
              Sched<[sched.YMM]>;
def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set VR256:$dst,
                    (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
              Sched<[sched.YMM.Folded]>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm VMOVDDUP : sse3_replicate_dfp<"vmovddup", SchedWriteFShuffle>,
                  VEX, WIG;
  defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup", SchedWriteFShuffle>,
                   VEX, VEX_L, WIG;
}

defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
            (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
}

let Predicates = [UseSSE3] in {
  def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
            (MOVDDUPrm addr:$src)>;
}

//===---------------------------------------------------------------------===//
// SSE3 - Move Unaligned Integer
//===---------------------------------------------------------------------===//
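
// lddqu loads 16 unaligned bytes; on some implementations it may over-read
// (fetching aligned chunks around the address), so it is only reachable
// through the int_x86_sse3_ldu_dq intrinsic rather than ordinary load
// patterns.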

let Predicates = [HasAVX] in {
  def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                      "vlddqu\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
                      Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, WIG;
  def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                       "vlddqu\t{$src, $dst|$dst, $src}",
                       [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
                       Sched<[SchedWriteVecMoveLS.YMM.RM]>, VEX, VEX_L, WIG;
} // Predicates

def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "lddqu\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
                   Sched<[SchedWriteVecMoveLS.XMM.RM]>;

//===---------------------------------------------------------------------===//
// SSE3 - Arithmetic
//===---------------------------------------------------------------------===//
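
// ADDSUBPS/ADDSUBPD subtract in the even-numbered lanes and add in the
// odd-numbered lanes.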

multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC,
                       X86MemOperand x86memop, X86FoldableSchedWrite sched,
                       PatFrag ld_frag, bit Is2Addr = 1> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  def rr : I<0xD0, MRMSrcReg,
             (outs RC:$dst), (ins RC:$src1, RC:$src2),
             !if(Is2Addr,
                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
             [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))]>,
             Sched<[sched]>;
  def rm : I<0xD0, MRMSrcMem,
             (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
             !if(Is2Addr,
                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
             [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>,
             Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}

let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem,
                                 SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>,
                                 TB, XD, VEX, VVVV, WIG;
    defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem,
                                  SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>,
                                  TB, XD, VEX, VVVV, VEX_L, WIG;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem,
                                 SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>,
                                 TB, PD, VEX, VVVV, WIG;
    defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem,
                                  SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>,
                                  TB, PD, VEX, VVVV, VEX_L, WIG;
  }
}
let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
  let ExeDomain = SSEPackedSingle in
  defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem,
                              SchedWriteFAddSizes.PS.XMM, memopv4f32>, TB, XD;
  let ExeDomain = SSEPackedDouble in
  defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem,
                              SchedWriteFAddSizes.PD.XMM, memopv2f64>, TB, PD;
}

//===---------------------------------------------------------------------===//
// SSE3 Instructions
//===---------------------------------------------------------------------===//

// Horizontal ops
multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                   X86MemOperand x86memop, SDNode OpNode,
                   X86FoldableSchedWrite sched, PatFrag ld_frag,
                   bit Is2Addr = 1> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
                !if(Is2Addr,
                    !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                    !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
                Sched<[sched]>;

  def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                !if(Is2Addr,
                    !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                    !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
                Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                  X86MemOperand x86memop, SDNode OpNode,
                  X86FoldableSchedWrite sched, PatFrag ld_frag,
                  bit Is2Addr = 1> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
               !if(Is2Addr,
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
               Sched<[sched]>;

  def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
               !if(Is2Addr,
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
               [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
               Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}

let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
                           X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX, VVVV, WIG;
    defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
                           X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX, VVVV, WIG;
    defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
                            X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX, VVVV, VEX_L, WIG;
    defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
                            X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX, VVVV, VEX_L, WIG;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VHADDPD : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem,
                          X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX, VVVV, WIG;
    defm VHSUBPD : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem,
                          X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX, VVVV, WIG;
    defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem,
                           X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX, VVVV, VEX_L, WIG;
    defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem,
                           X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX, VVVV, VEX_L, WIG;
  }
}

let Constraints = "$src1 = $dst" in {
  let ExeDomain = SSEPackedSingle in {
    defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
                          WriteFHAdd, memopv4f32>;
    defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
                          WriteFHAdd, memopv4f32>;
  }
  let ExeDomain = SSEPackedDouble in {
    defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
                         WriteFHAdd, memopv2f64>;
    defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
                         WriteFHAdd, memopv2f64>;
  }
}

//===---------------------------------------------------------------------===//
// SSSE3 - Packed Absolute Instructions
//===---------------------------------------------------------------------===//
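
// PABSB/PABSW/PABSD compute the per-element absolute value of the source.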

/// SS3I_unop_rm - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
                        SDNode OpNode, X86SchedWriteWidths sched, PatFrag ld_frag> {
  def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (vt (OpNode VR128:$src)))]>,
                 Sched<[sched.XMM]>;

  def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
                 (ins i128mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst,
                       (vt (OpNode (ld_frag addr:$src))))]>,
                 Sched<[sched.XMM.Folded]>;
}

/// SS3I_unop_rm_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
                          SDNode OpNode, X86SchedWriteWidths sched> {
  def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
                  (ins VR256:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
                  Sched<[sched.YMM]>;

  def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
                  (ins i256mem:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR256:$dst,
                        (vt (OpNode (load addr:$src))))]>,
                  Sched<[sched.YMM.Folded]>;
}

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU,
                             load>, VEX, WIG;
  defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU,
                             load>, VEX, WIG;
}
let Predicates = [HasAVX, NoVLX] in {
  defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU,
                             load>, VEX, WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>,
                VEX, VEX_L, WIG;
  defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SchedWriteVecALU>,
                VEX, VEX_L, WIG;
}
let Predicates = [HasAVX2, NoVLX] in {
  defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SchedWriteVecALU>,
                VEX, VEX_L, WIG;
}

defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU,
                          memop>;
defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU,
                          memop>;
defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU,
                          memop>;

//===---------------------------------------------------------------------===//
// SSSE3 - Packed Binary Operator Instructions
//===---------------------------------------------------------------------===//

/// SS3I_binop_rm - Simple SSSE3 bin op
multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType DstVT, ValueType OpVT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst),
                 (ins RC:$src1, RC:$src2),
                 !if(Is2Addr,
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))]>,
                 Sched<[sched]>;
  def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
                 (ins RC:$src1, x86memop:$src2),
                 !if(Is2Addr,
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set RC:$dst,
                       (DstVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))]>,
                 Sched<[sched.Folded, sched.ReadAfterFold]>;
}

/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
                             Intrinsic IntId128, X86FoldableSchedWrite sched,
                             PatFrag ld_frag, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src1, VR128:$src2),
                 !if(Is2Addr,
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
                 Sched<[sched]>;
  def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
                 (ins VR128:$src1, i128mem:$src2),
                 !if(Is2Addr,
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set VR128:$dst,
                       (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
                 Sched<[sched.Folded, sched.ReadAfterFold]>;
}

multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
                               Intrinsic IntId256,
                               X86FoldableSchedWrite sched> {
  let isCommutable = 1 in
  def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
                  (ins VR256:$src1, VR256:$src2),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
                  Sched<[sched]>;
  def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
                  (ins VR256:$src1, i256mem:$src2),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set VR256:$dst,
                        (IntId256 VR256:$src1, (load addr:$src2)))]>,
                  Sched<[sched.Folded, sched.ReadAfterFold]>;
}

let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
let isCommutable = 0 in {
  defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
                               VR128, load, i128mem,
                               SchedWriteVarShuffle.XMM, 0>, VEX, VVVV, WIG;
  defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
                                  v16i8, VR128, load, i128mem,
                                  SchedWriteVecIMul.XMM, 0>, VEX, VVVV, WIG;
}
defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
                               VR128, load, i128mem,
                               SchedWriteVecIMul.XMM, 0>, VEX, VVVV, WIG;
}

let ImmT = NoImm, Predicates = [HasAVX] in {
let isCommutable = 0 in {
  defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
                               load, i128mem,
                               SchedWritePHAdd.XMM, 0>, VEX, VVVV, WIG;
  defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
                               load, i128mem,
                               SchedWritePHAdd.XMM, 0>, VEX, VVVV, WIG;
  defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
                               load, i128mem,
                               SchedWritePHAdd.XMM, 0>, VEX, VVVV, WIG;
  defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
                               load, i128mem,
                               SchedWritePHAdd.XMM, 0>, VEX, VVVV, WIG;
  defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb",
                                   int_x86_ssse3_psign_b_128,
                                   SchedWriteVecALU.XMM, load, 0>, VEX, VVVV, WIG;
  defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw",
                                   int_x86_ssse3_psign_w_128,
                                   SchedWriteVecALU.XMM, load, 0>, VEX, VVVV, WIG;
  defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd",
                                   int_x86_ssse3_psign_d_128,
                                   SchedWriteVecALU.XMM, load, 0>, VEX, VVVV, WIG;
  defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw",
                                    int_x86_ssse3_phadd_sw_128,
                                    SchedWritePHAdd.XMM, load, 0>, VEX, VVVV, WIG;
  defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw",
                                    int_x86_ssse3_phsub_sw_128,
                                    SchedWritePHAdd.XMM, load, 0>, VEX, VVVV, WIG;
}
}

let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
let isCommutable = 0 in {
  defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
                                VR256, load, i256mem,
                                SchedWriteVarShuffle.YMM, 0>, VEX, VVVV, VEX_L, WIG;
  defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
                                   v32i8, VR256, load, i256mem,
                                   SchedWriteVecIMul.YMM, 0>, VEX, VVVV, VEX_L, WIG;
}
defm VPMULHRSWY : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
                                VR256, load, i256mem,
                                SchedWriteVecIMul.YMM, 0>, VEX, VVVV, VEX_L, WIG;
}

let ImmT = NoImm, Predicates = [HasAVX2] in {
let isCommutable = 0 in {
  defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
                                VR256, load, i256mem,
                                SchedWritePHAdd.YMM, 0>, VEX, VVVV, VEX_L, WIG;
  defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
                                load, i256mem,
                                SchedWritePHAdd.YMM, 0>, VEX, VVVV, VEX_L, WIG;
  defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
                                VR256, load, i256mem,
                                SchedWritePHAdd.YMM, 0>, VEX, VVVV, VEX_L, WIG;
  defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
                                load, i256mem,
                                SchedWritePHAdd.YMM, 0>, VEX, VVVV, VEX_L, WIG;
  defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
                                     SchedWriteVecALU.YMM>, VEX, VVVV, VEX_L, WIG;
  defm VPSIGNW : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
                                     SchedWriteVecALU.YMM>, VEX, VVVV, VEX_L, WIG;
  defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
                                     SchedWriteVecALU.YMM>, VEX, VVVV, VEX_L, WIG;
  defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
                                      int_x86_avx2_phadd_sw,
                                      SchedWritePHAdd.YMM>, VEX, VVVV, VEX_L, WIG;
  defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
                                      int_x86_avx2_phsub_sw,
                                      SchedWritePHAdd.YMM>, VEX, VVVV, VEX_L, WIG;
}
}
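
// PSIGNB/W/D negate, zero out, or pass through each element of the first
// operand according to whether the corresponding element of the second is
// negative, zero, or positive.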

// None of these have i8 immediate fields.
let ImmT = NoImm, Constraints = "$src1 = $dst" in {
let isCommutable = 0 in {
  defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
                              memop, i128mem, SchedWritePHAdd.XMM>;
  defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
                              memop, i128mem, SchedWritePHAdd.XMM>;
  defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
                              memop, i128mem, SchedWritePHAdd.XMM>;
  defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
                              memop, i128mem, SchedWritePHAdd.XMM>;
  defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
                                  SchedWriteVecALU.XMM, memop>;
  defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
                                  SchedWriteVecALU.XMM, memop>;
  defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
                                  SchedWriteVecALU.XMM, memop>;
  defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
                              memop, i128mem, SchedWriteVarShuffle.XMM>;
  defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw",
                                   int_x86_ssse3_phadd_sw_128,
                                   SchedWritePHAdd.XMM, memop>;
  defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw",
                                   int_x86_ssse3_phsub_sw_128,
                                   SchedWritePHAdd.XMM, memop>;
  defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
                                 v16i8, VR128, memop, i128mem,
                                 SchedWriteVecIMul.XMM>;
}
defm PMULHRSW : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
                              VR128, memop, i128mem, SchedWriteVecIMul.XMM>;
}

//===---------------------------------------------------------------------===//
// SSSE3 - Packed Align Instruction Patterns
//===---------------------------------------------------------------------===//
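
// PALIGNR concatenates the two sources and extracts a byte-aligned window:
// for the 128-bit form, dst = low 16 bytes of ((src1:src2) >> (imm8 * 8));
// the 256-bit form does this independently in each 128-bit lane.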

multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let hasSideEffects = 0 in {
  def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2, u8imm:$src3),
                  !if(Is2Addr,
                      !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                      !strconcat(asm,
                                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                  [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 timm:$src3))))]>,
                  Sched<[sched]>;
  let mayLoad = 1 in
  def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, x86memop:$src2, u8imm:$src3),
                  !if(Is2Addr,
                      !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                      !strconcat(asm,
                                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
                  [(set RC:$dst, (VT (X86PAlignr RC:$src1,
                                                 (memop_frag addr:$src2),
                                                 (i8 timm:$src3))))]>,
                  Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
  defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem,
                                SchedWriteShuffle.XMM, 0>, VEX, VVVV, WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
  defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem,
                                 SchedWriteShuffle.YMM, 0>, VEX, VVVV, VEX_L, WIG;
let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
  defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem,
                               SchedWriteShuffle.XMM>;

//===---------------------------------------------------------------------===//
// SSE3 - Thread synchronization
//===---------------------------------------------------------------------===//

let SchedRW = [WriteSystem] in {
let Uses = [EAX, ECX, EDX] in
def MONITOR32rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
                   TB, Requires<[HasSSE3, Not64BitMode]>;
let Uses = [RAX, ECX, EDX] in
def MONITOR64rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
                   TB, Requires<[HasSSE3, In64BitMode]>;

let Uses = [ECX, EAX] in
def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",
                [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
} // SchedRW

def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;

def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITOR32rrr)>,
                Requires<[Not64BitMode]>;
def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITOR64rrr)>,
                Requires<[In64BitMode]>;

//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Move with Sign/Zero Extend
// NOTE: Any Extend is promoted to Zero Extend in X86ISelDAGToDAG.cpp
//===----------------------------------------------------------------------===//

multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
                            RegisterClass OutRC, RegisterClass InRC,
                            X86FoldableSchedWrite sched> {
  def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
                 Sched<[sched]>;

  def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
                 Sched<[sched.Folded]>;
}

multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
                              X86MemOperand MemOp, X86MemOperand MemYOp,
                              Predicate prd> {
  defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128,
                               SchedWriteShuffle.XMM>;
  let Predicates = [HasAVX, prd] in
  defm V#NAME : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
                                 VR128, VR128, SchedWriteVecExtend.XMM>,
                                 VEX, WIG;
  let Predicates = [HasAVX2, prd] in
  defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
                                   VR256, VR128, SchedWriteVecExtend.YMM>,
                                   VEX, VEX_L, WIG;
}

multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
                          X86MemOperand MemYOp, Predicate prd> {
  defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
                                        MemOp, MemYOp, prd>;
  defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
                                        !strconcat("pmovzx", OpcodeStr),
                                        MemOp, MemYOp, prd>;
}

defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>;
defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>;

defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>;
defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;

defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;
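
// The PMOVSX/PMOVZX memory forms load only as many source bytes as the
// widened result consumes (e.g. the 128-bit pmovzxbw reads just 8 bytes),
// which is why the memory operand sizes above shrink as the extension ratio
// grows.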

// AVX2 Patterns
multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy,
                                     SDNode ExtOp, SDNode InVecOp> {
  // Register-Register patterns
  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
  }
  let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
  def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;

  def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
  def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;

  def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
            (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
  }

  // Simple Register-Memory patterns
  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;

  def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  }

  let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;

  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;

  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
  }

  // AVX2 Register-Memory patterns
  let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;

  def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;

  def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;

  def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
  def : Pat<(v4i64 (InVecOp (bc_v16i8 (v2i64 (X86vzload32 addr:$src))))),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;

  def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  }
}

defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>;
defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>;

// SSE4.1/AVX patterns.
multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
                                SDNode ExtOp> {
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
  def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BQrr) VR128:$src)>;

  def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
  def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WQrr) VR128:$src)>;

  def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
            (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;
  }
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;

  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;

  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;

  def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;

  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;

  def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;

  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  }
}

defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec>;
defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec>;

let Predicates = [UseSSE41] in {
  defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec>;
  defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec>;
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Extract Instructions
//===----------------------------------------------------------------------===//

/// SS41I_extract8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
                                                     timm:$src2))]>,
                   Sched<[WriteVecExtract]>;
  let hasSideEffects = 0, mayStore = 1 in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), timm:$src2))),
                           addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoBWI] in
  defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX, WIG;

defm PEXTRB : SS41I_extract8<0x14, "pextrb">;

/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                       (ins VR128:$src1, u8imm:$src2),
                       !strconcat(OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
                       Sched<[WriteVecExtract]>;

  let hasSideEffects = 0, mayStore = 1 in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), timm:$src2))),
                           addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoBWI] in
  defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX, WIG;

defm PEXTRW : SS41I_extract16<0x15, "pextrw">;

let Predicates = [UseSSE41] in
  def : Pat<(store f16:$src, addr:$dst),
            (PEXTRWmr addr:$dst,
                      (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>;

let Predicates = [HasAVX, NoBWI] in
  def : Pat<(store f16:$src, addr:$dst),
            (VPEXTRWmr addr:$dst,
                       (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>;

//===----------------------------------------------------------------------===//
// SSE4.1 - Extract Instructions
//===----------------------------------------------------------------------===//

/// SS41I_extract8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
                                                     timm:$src2))]>,
                   Sched<[WriteVecExtract]>;
  let hasSideEffects = 0, mayStore = 1 in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), timm:$src2))),
                           addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoBWI] in
  defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX, WIG;

defm PEXTRB : SS41I_extract8<0x14, "pextrb">;

/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
                   Sched<[WriteVecExtract]>;

  let hasSideEffects = 0, mayStore = 1 in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), timm:$src2))),
                           addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoBWI] in
  defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX, WIG;

defm PEXTRW : SS41I_extract16<0x15, "pextrw">;

let Predicates = [UseSSE41] in
  def : Pat<(store f16:$src, addr:$dst),
            (PEXTRWmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>;

let Predicates = [HasAVX, NoBWI] in
  def : Pat<(store f16:$src, addr:$dst),
            (VPEXTRWmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>;

/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR32:$dst,
                     (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
                   Sched<[WriteVecExtract]>;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
                           addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;

defm PEXTRD : SS41I_extract32<0x16, "pextrd">;

/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR64:$dst,
                     (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
                   Sched<[WriteVecExtract]>;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
                           addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, REX_W;

defm PEXTRQ : SS41I_extract64<0x16, "pextrq">, REX_W;

/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
/// destination
multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR32orGR64:$dst,
                     (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
                   Sched<[WriteVecExtract]>;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins f32mem:$dst, VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
                           addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let ExeDomain = SSEPackedSingle in {
  let Predicates = [UseAVX] in
  defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, WIG;
  defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">;
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Insert Instructions
//===----------------------------------------------------------------------===//

multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
      !if(Is2Addr,
          !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
          !strconcat(asm,
                     "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, GR32orGR64:$src2, timm:$src3))]>,
      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
      !if(Is2Addr,
          !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
          !strconcat(asm,
                     "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), timm:$src3))]>,
      Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoBWI] in {
  defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX, VVVV, WIG;
  def : Pat<(X86pinsrb VR128:$src1, (i32 (anyext (i8 GR8:$src2))), timm:$src3),
            (VPINSRBrr VR128:$src1, (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                                   GR8:$src2, sub_8bit),
                       timm:$src3)>;
}
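
// Note on the anyext pattern above: the INSERT_SUBREG of GR8:$src2 into an
// IMPLICIT_DEF i32 builds a 32-bit register whose upper bits are undefined;
// that is fine here because pinsrb only reads the low byte of its GPR operand.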

let Constraints = "$src1 = $dst" in
  defm PINSRB : SS41I_insert8<0x20, "pinsrb">;

multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32:$src2, u8imm:$src3),
      !if(Is2Addr,
          !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
          !strconcat(asm,
                     "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
      !if(Is2Addr,
          !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
          !strconcat(asm,
                     "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), imm:$src3)))]>,
      Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX, VVVV;
let Constraints = "$src1 = $dst" in
  defm PINSRD : SS41I_insert32<0x22, "pinsrd">;

multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR64:$src2, u8imm:$src3),
      !if(Is2Addr,
          !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
          !strconcat(asm,
                     "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
      !if(Is2Addr,
          !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
          !strconcat(asm,
                     "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), imm:$src3)))]>,
      Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX, VVVV, REX_W;
let Constraints = "$src1 = $dst" in
  defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;

// insertps has a few different modes. The first two patterns below are
// optimized inserts that won't zero arbitrary elements in the destination
// vector; the next one matches the intrinsic and may zero arbitrary elements
// in the target vector.
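
// For reference (Intel SDM), the insertps immediate is laid out as:
//   bits [7:6] = COUNT_S, the source element (register form only)
//   bits [5:4] = COUNT_D, the destination element to write
//   bits [3:0] = ZMASK, destination elements to zero afterwards
// e.g. immediate 0x10 copies element 0 of $src2 into element 1 of $src1.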
multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, VR128:$src2, u8imm:$src3),
      !if(Is2Addr,
          !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
          !strconcat(asm,
                     "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86insertps VR128:$src1, VR128:$src2, timm:$src3))]>,
      Sched<[SchedWriteFShuffle.XMM]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, f32mem:$src2, u8imm:$src3),
      !if(Is2Addr,
          !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
          !strconcat(asm,
                     "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86insertps VR128:$src1,
                     (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
                     timm:$src3))]>,
      Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
}

let ExeDomain = SSEPackedSingle in {
  let Predicates = [UseAVX] in
    defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>,
                     VEX, VVVV, WIG;
  let Constraints = "$src1 = $dst" in
    defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>;
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Round Instructions
//===----------------------------------------------------------------------===//

multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
                           X86MemOperand x86memop, RegisterClass RC,
                           ValueType VT, PatFrag mem_frag, SDPatternOperator OpNode,
                           X86FoldableSchedWrite sched> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  // Vector intrinsic operation, reg
  def ri : SS4AIi8<opc, MRMSrcReg,
                   (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set RC:$dst, (VT (OpNode RC:$src1, timm:$src2)))]>,
                   Sched<[sched]>;

  // Vector intrinsic operation, mem
  def mi : SS4AIi8<opc, MRMSrcMem,
                   (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
                   !strconcat(OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set RC:$dst,
                     (VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>,
                   Sched<[sched.Folded]>;
}
}

multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
                          string OpcodeStr, X86FoldableSchedWrite sched> {
let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
  def SSri : SS4AIi8<opcss, MRMSrcReg,
        (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
                   "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SSmi : SS4AIi8<opcss, MRMSrcMem,
        (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
                   "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle, hasSideEffects = 0

let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
  def SDri : SS4AIi8<opcsd, MRMSrcReg,
        (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
                   "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SDmi : SS4AIi8<opcsd, MRMSrcMem,
        (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
                   "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedDouble, hasSideEffects = 0
}

multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
                           string OpcodeStr, X86FoldableSchedWrite sched> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
  def SSri : SS4AIi8<opcss, MRMSrcReg,
                 (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
                 !strconcat(OpcodeStr,
                            "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SSmi : SS4AIi8<opcss, MRMSrcMem,
                 (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
                 !strconcat(OpcodeStr,
                            "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle, hasSideEffects = 0

let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
  def SDri : SS4AIi8<opcsd, MRMSrcReg,
                 (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
                 !strconcat(OpcodeStr,
                            "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SDmi : SS4AIi8<opcsd, MRMSrcMem,
                 (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
                 !strconcat(OpcodeStr,
                            "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedDouble, hasSideEffects = 0
}
}

multiclass sse41_fp_unop_s_int<bits<8> opcss, bits<8> opcsd,
                               string OpcodeStr, X86FoldableSchedWrite sched,
                               ValueType VT32, ValueType VT64,
                               SDNode OpNode, bit Is2Addr = 1> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
let ExeDomain = SSEPackedSingle in {
  def SSri_Int : SS4AIi8<opcss, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                       "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                       "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
        Sched<[sched]>;

  def SSmi_Int : SS4AIi8<opcss, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                       "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                       "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
          (OpNode VR128:$src1, (sse_load_f32 addr:$src2), timm:$src3))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle

let ExeDomain = SSEPackedDouble in {
  def SDri_Int : SS4AIi8<opcsd, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                       "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                       "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
        Sched<[sched]>;

  def SDmi_Int : SS4AIi8<opcsd, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                       "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                       "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
          (OpNode VR128:$src1, (sse_load_f64 addr:$src2), timm:$src3))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedDouble
}
}

// FP round - roundss, roundps, roundsd, roundpd
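
// For reference (Intel SDM), the round-control immediate used below is:
//   bits [1:0] = rounding mode (00 nearest, 01 down, 10 up, 11 truncate)
//   bit  [2]   = when set, round using MXCSR.RC instead of bits [1:0]
//   bit  [3]   = when set, suppress the precision (inexact) exception
// e.g. an immediate of 0xb truncates and never signals an inexact result.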
let Predicates = [HasAVX, NoVLX] in {
  let ExeDomain = SSEPackedSingle, Uses = [MXCSR], mayRaiseFPException = 1 in {
    // Intrinsic form
    defm VROUNDPS  : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32,
                                     loadv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>,
                                     VEX, WIG;
    defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32,
                                     loadv8f32, X86any_VRndScale, SchedWriteFRnd.YMM>,
                                     VEX, VEX_L, WIG;
  }

  let ExeDomain = SSEPackedDouble, Uses = [MXCSR], mayRaiseFPException = 1 in {
    defm VROUNDPD  : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64,
                                     loadv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>,
                                     VEX, WIG;
    defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64,
                                     loadv4f64, X86any_VRndScale, SchedWriteFRnd.YMM>,
                                     VEX, VEX_L, WIG;
  }
}
let Predicates = [UseAVX] in {
  defm VROUND : sse41_fp_unop_s_int<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
                                    v4f32, v2f64, X86RndScales, 0>,
                                    VEX, VVVV, VEX_LIG, WIG, SIMD_EXC;
  defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>,
                               VEX, VVVV, VEX_LIG, WIG, SIMD_EXC;
}

let Predicates = [UseAVX] in {
  def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
            (VROUNDSSri (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>;
  def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
            (VROUNDSDri (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>;
}

let Predicates = [UseAVX, OptForSize] in {
  def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
            (VROUNDSSmi (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
  def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
            (VROUNDSDmi (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
}

let ExeDomain = SSEPackedSingle in
defm ROUNDPS : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32,
                               memopv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>;
let ExeDomain = SSEPackedDouble in
defm ROUNDPD : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64,
                               memopv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>;

defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>;

let Constraints = "$src1 = $dst" in
defm ROUND : sse41_fp_unop_s_int<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
                                 v4f32, v2f64, X86RndScales>;

let Predicates = [UseSSE41] in {
  def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
            (ROUNDSSri FR32:$src1, timm:$src2)>;
  def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
            (ROUNDSDri FR64:$src1, timm:$src2)>;
}

let Predicates = [UseSSE41, OptForSize] in {
  def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
            (ROUNDSSmi addr:$src1, timm:$src2)>;
  def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
            (ROUNDSDmi addr:$src1, timm:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Bit Test
//===----------------------------------------------------------------------===//

// ptest is commutable if only the Z flag is used. If the C flag is used,
// commuting would change which operand is inverted.
def X86ptest_commutable : PatFrag<(ops node:$src1, node:$src2),
                                  (X86ptest node:$src1, node:$src2), [{
  return onlyUsesZeroFlag(SDValue(Node, 0));
}]>;
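
// For reference (Intel SDM): ptest sets ZF when (src1 AND src2) is all zeros,
// which is symmetric, and CF when (src2 AND NOT src1) is all zeros, which is
// not; hence the PatFrag above only allows commutation when just ZF is read.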

// We lower the ptest intrinsic to this node in X86ISelLowering, primarily for
// the Intel intrinsic that corresponds to it.
let Defs = [EFLAGS], Predicates = [HasAVX] in {
def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
                     "vptest\t{$src2, $src1|$src1, $src2}",
                     [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
                     Sched<[SchedWriteVecTest.XMM]>, VEX, WIG;
def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
                     "vptest\t{$src2, $src1|$src1, $src2}",
                     [(set EFLAGS, (X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
                     Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>,
                     VEX, WIG;

def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
                      "vptest\t{$src2, $src1|$src1, $src2}",
                      [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
                      Sched<[SchedWriteVecTest.YMM]>, VEX, VEX_L, WIG;
def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
                      "vptest\t{$src2, $src1|$src1, $src2}",
                      [(set EFLAGS, (X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
                      Sched<[SchedWriteVecTest.YMM.Folded, SchedWriteVecTest.YMM.ReadAfterFold]>,
                      VEX, VEX_L, WIG;
}

let Defs = [EFLAGS] in {
def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
                    "ptest\t{$src2, $src1|$src1, $src2}",
                    [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
                    Sched<[SchedWriteVecTest.XMM]>;
def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
                    "ptest\t{$src2, $src1|$src1, $src2}",
                    [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
                    Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>;
}

let Predicates = [HasAVX] in {
  def : Pat<(X86ptest_commutable (loadv2i64 addr:$src2), VR128:$src1),
            (VPTESTrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86ptest_commutable (loadv4i64 addr:$src2), VR256:$src1),
            (VPTESTYrm VR256:$src1, addr:$src2)>;
}
let Predicates = [UseSSE41] in {
  def : Pat<(X86ptest_commutable (memopv2i64 addr:$src2), VR128:$src1),
            (PTESTrm VR128:$src1, addr:$src2)>;
}

// The bit test instructions below are AVX-only.
multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, PatFrag mem_frag, ValueType vt,
                       X86FoldableSchedWrite sched> {
  def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
                 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                 [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
                 Sched<[sched]>, VEX;
  def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
                 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                 [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
                 Sched<[sched.Folded, sched.ReadAfterFold]>, VEX;
}
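
// For reference (Intel SDM): vtestps/vtestpd are the sign-bit analogue of
// ptest; ZF reflects the sign bits of (src1 AND src2) and CF those of
// (src2 AND NOT src1), so the same commutability restriction applies.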

// testps/testpd are commutable if only the Z flag is used. If the C flag is
// used, commuting would change which operand is inverted.
def X86testp_commutable : PatFrag<(ops node:$src1, node:$src2),
                                  (X86testp node:$src1, node:$src2), [{
  return onlyUsesZeroFlag(SDValue(Node, 0));
}]>;

let Defs = [EFLAGS], Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32,
                            SchedWriteFTest.XMM>;
defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32,
                            SchedWriteFTest.YMM>, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64,
                            SchedWriteFTest.XMM>;
defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64,
                            SchedWriteFTest.YMM>, VEX_L;
}
}

let Predicates = [HasAVX] in {
  def : Pat<(X86testp_commutable (loadv4f32 addr:$src2), VR128:$src),
            (VTESTPSrm VR128:$src, addr:$src2)>;
  def : Pat<(X86testp_commutable (loadv8f32 addr:$src2), VR256:$src),
            (VTESTPSYrm VR256:$src, addr:$src2)>;

  def : Pat<(X86testp_commutable (loadv2f64 addr:$src2), VR128:$src),
            (VTESTPDrm VR128:$src, addr:$src2)>;
  def : Pat<(X86testp_commutable (loadv4f64 addr:$src2), VR256:$src),
            (VTESTPDYrm VR256:$src, addr:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Misc Instructions
//===----------------------------------------------------------------------===//

let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
  defm POPCNT16 : Lzcnt<0xB8, "popcnt", ctpop, Xi16, WritePOPCNT, WritePOPCNT.Folded>, OpSize16, XS;
  defm POPCNT32 : Lzcnt<0xB8, "popcnt", ctpop, Xi32, WritePOPCNT, WritePOPCNT.Folded>, OpSize32, XS;
  defm POPCNT64 : Lzcnt<0xB8, "popcnt", ctpop, Xi64, WritePOPCNT, WritePOPCNT.Folded>, XS;

  defm POPCNT16 : Lzcnt<0x88, "popcnt", null_frag, Xi16, WritePOPCNT, WritePOPCNT.Folded, "_EVEX">, PL, PD;
  defm POPCNT32 : Lzcnt<0x88, "popcnt", null_frag, Xi32, WritePOPCNT, WritePOPCNT.Folded, "_EVEX">, PL;
  defm POPCNT64 : Lzcnt<0x88, "popcnt", null_frag, Xi64, WritePOPCNT, WritePOPCNT.Folded, "_EVEX">, PL;
}

defm POPCNT16 : Lzcnt<0x88, "popcnt", null_frag, Xi16, WritePOPCNT, WritePOPCNT.Folded, "_NF">, NF, PD;
defm POPCNT32 : Lzcnt<0x88, "popcnt", null_frag, Xi32, WritePOPCNT, WritePOPCNT.Folded, "_NF">, NF;
defm POPCNT64 : Lzcnt<0x88, "popcnt", null_frag, Xi64, WritePOPCNT, WritePOPCNT.Folded, "_NF">, NF;

// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
                                 SDNode OpNode, PatFrag ld_frag,
                                 X86FoldableSchedWrite Sched> {
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>,
                 Sched<[Sched]>;
  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                 (ins i128mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst,
                   (v8i16 (OpNode (ld_frag addr:$src))))]>,
                 Sched<[Sched.Folded]>;
}

// PHMIN has the same profile as PSAD, thus we use the same scheduling
// model, although the naming is misleading.
let Predicates = [HasAVX] in
defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw",
                                         X86phminpos, load,
                                         WritePHMINPOS>, VEX, WIG;
defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
                                        X86phminpos, memop,
                                        WritePHMINPOS>;

/// SS48I_binop_rm - Simple SSE41 binary operator.
multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, X86FoldableSchedWrite sched,
                          bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
       Sched<[sched]>;
  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm VPMINSD  : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX, VVVV, WIG;
  defm VPMINUD  : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX, VVVV, WIG;
  defm VPMAXSD  : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX, VVVV, WIG;
  defm VPMAXUD  : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX, VVVV, WIG;
  defm VPMULDQ  : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128,
                                 load, i128mem, SchedWriteVecIMul.XMM, 0>,
                                 VEX, VVVV, WIG;
}
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPMINSB  : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX, VVVV, WIG;
  defm VPMINUW  : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX, VVVV, WIG;
  defm VPMAXSB  : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX, VVVV, WIG;
  defm VPMAXUW  : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX, VVVV, WIG;
}

let Predicates = [HasAVX2, NoVLX] in {
  defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX, VVVV, VEX_L, WIG;
  defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX, VVVV, VEX_L, WIG;
  defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX, VVVV, VEX_L, WIG;
  defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX, VVVV, VEX_L, WIG;
  defm VPMULDQY : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256,
                                 load, i256mem, SchedWriteVecIMul.YMM, 0>,
                                 VEX, VVVV, VEX_L, WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX, VVVV, VEX_L, WIG;
  defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX, VVVV, VEX_L, WIG;
  defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX, VVVV, VEX_L, WIG;
  defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
                                 load, i256mem, SchedWriteVecALU.YMM, 0>,
                                 VEX, VVVV, VEX_L, WIG;
}

let Constraints = "$src1 = $dst" in {
  defm PMINSB : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
                               memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMINSD : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
                               memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMINUD : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
                               memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
                               memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
                               memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
                               memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
                               memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
                               memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMULDQ : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128,
                               memop, i128mem, SchedWriteVecIMul.XMM, 1>;
}

let Predicates = [HasAVX, NoVLX] in
  defm VPMULLD  : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
                                 load, i128mem, SchedWritePMULLD.XMM, 0>,
                                 VEX, VVVV, WIG;
let Predicates = [HasAVX] in
  defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX, VVVV, WIG;

let Predicates = [HasAVX2, NoVLX] in
  defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
                                  load, i256mem, SchedWritePMULLD.YMM, 0>,
                                  VEX, VVVV, VEX_L, WIG;
let Predicates = [HasAVX2] in
  defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX, VVVV, VEX_L, WIG;

let Constraints = "$src1 = $dst" in {
  defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
                                memop, i128mem, SchedWritePMULLD.XMM, 1>;
  defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
                                memop, i128mem, SchedWriteVecALU.XMM, 1>;
}

/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
                               Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
                               X86MemOperand x86memop, bit Is2Addr,
                               X86FoldableSchedWrite sched> {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                       "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                       "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst, (IntId RC:$src1, RC:$src2, timm:$src3))]>,
        Sched<[sched]>;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                       "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                       "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
          (IntId RC:$src1, (memop_frag addr:$src2), timm:$src3))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
}

/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                           X86MemOperand x86memop, bit Is2Addr,
                           X86FoldableSchedWrite sched> {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                       "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                       "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
        Sched<[sched]>;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                       "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                       "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
          (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
}

def BlendCommuteImm2 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue() & 0x03;
  return getI8Imm(Imm ^ 0x03, SDLoc(N));
}]>;

def BlendCommuteImm4 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue() & 0x0f;
  return getI8Imm(Imm ^ 0x0f, SDLoc(N));
}]>;

def BlendCommuteImm8 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue() & 0xff;
  return getI8Imm(Imm ^ 0xff, SDLoc(N));
}]>;

// Turn a 4-bit blendi immediate to 8-bit for use with pblendw.
def BlendScaleImm4 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 4; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0x3 << (i * 2);
  }
  return getI8Imm(NewImm, SDLoc(N));
}]>;

// Turn a 2-bit blendi immediate to 8-bit for use with pblendw.
def BlendScaleImm2 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 2; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0xf << (i * 4);
  }
  return getI8Imm(NewImm, SDLoc(N));
}]>;

// Turn a 2-bit blendi immediate to 4-bit for use with pblendd.
def BlendScaleImm2to4 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 2; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0x3 << (i * 2);
  }
  return getI8Imm(NewImm, SDLoc(N));
}]>;

// Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it.
def BlendScaleCommuteImm4 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 4; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0x3 << (i * 2);
  }
  return getI8Imm(NewImm ^ 0xff, SDLoc(N));
}]>;

// Turn a 2-bit blendi immediate to 8-bit for use with pblendw and invert it.
def BlendScaleCommuteImm2 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 2; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0xf << (i * 4);
  }
  return getI8Imm(NewImm ^ 0xff, SDLoc(N));
}]>;

// Turn a 2-bit blendi immediate to 4-bit for use with pblendd and invert it.
def BlendScaleCommuteImm2to4 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 2; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0x3 << (i * 2);
  }
  return getI8Imm(NewImm ^ 0xf, SDLoc(N));
}]>;
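
// A worked example of the transforms above: a v2f64 blendpd immediate of
// 0b01 (element 0 from the second operand) commutes to 0b10 (XOR with 0x3),
// scales to the pblendw immediate 0x0f (the low four words), and the
// scale-plus-commute form yields 0xf0.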

let Predicates = [HasAVX] in {
  let isCommutable = 0 in {
    defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
                                        VR128, load, i128mem, 0,
                                        SchedWriteMPSAD.XMM>, VEX, VVVV, WIG;
  }

let Uses = [MXCSR], mayRaiseFPException = 1 in {
  let ExeDomain = SSEPackedSingle in
  defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
                                   VR128, load, f128mem, 0,
                                   SchedWriteDPPS.XMM>, VEX, VVVV, WIG;
  let ExeDomain = SSEPackedDouble in
  defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
                                   VR128, load, f128mem, 0,
                                   SchedWriteDPPD.XMM>, VEX, VVVV, WIG;
  let ExeDomain = SSEPackedSingle in
  defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
                                    VR256, load, i256mem, 0,
                                    SchedWriteDPPS.YMM>, VEX, VVVV, VEX_L, WIG;
}
}

let Predicates = [HasAVX2] in {
  let isCommutable = 0 in {
    defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
                                         VR256, load, i256mem, 0,
                                         SchedWriteMPSAD.YMM>, VEX, VVVV, VEX_L, WIG;
  }
}

let Constraints = "$src1 = $dst" in {
  let isCommutable = 0 in {
    defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
                                       VR128, memop, i128mem, 1,
                                       SchedWriteMPSAD.XMM>;
  }

  let ExeDomain = SSEPackedSingle in
  defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
                                  VR128, memop, f128mem, 1,
                                  SchedWriteDPPS.XMM>, SIMD_EXC;
  let ExeDomain = SSEPackedDouble in
  defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
                                  VR128, memop, f128mem, 1,
                                  SchedWriteDPPD.XMM>, SIMD_EXC;
}

/// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate
multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                           X86MemOperand x86memop, bit Is2Addr, Domain d,
                           X86FoldableSchedWrite sched, SDNodeXForm commuteXForm> {
let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                       "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                       "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
        Sched<[sched]>;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                       "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                       "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
          (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
}

  // Pattern to commute if load is in first source.
  def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, timm:$src3)),
            (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
                                            (commuteXForm timm:$src3))>;
}

let Predicates = [HasAVX] in {
  defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
                                  VR128, load, f128mem, 0, SSEPackedSingle,
                                  SchedWriteFBlend.XMM, BlendCommuteImm4>,
                                  VEX, VVVV, WIG;
  defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
                                   VR256, load, f256mem, 0, SSEPackedSingle,
                                   SchedWriteFBlend.YMM, BlendCommuteImm8>,
                                   VEX, VVVV, VEX_L, WIG;
  defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
                                  VR128, load, f128mem, 0, SSEPackedDouble,
                                  SchedWriteFBlend.XMM, BlendCommuteImm2>,
                                  VEX, VVVV, WIG;
  defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
                                   VR256, load, f256mem, 0, SSEPackedDouble,
                                   SchedWriteFBlend.YMM, BlendCommuteImm4>,
                                   VEX, VVVV, VEX_L, WIG;
  defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
                                  VR128, load, i128mem, 0, SSEPackedInt,
                                  SchedWriteBlend.XMM, BlendCommuteImm8>,
                                  VEX, VVVV, WIG;
}

let Predicates = [HasAVX2] in {
  defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
                                   VR256, load, i256mem, 0, SSEPackedInt,
                                   SchedWriteBlend.YMM, BlendCommuteImm8>,
                                   VEX, VVVV, VEX_L, WIG;
}

// Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw.
// ExecutionDomainFixPass will clean up domains later on.
let Predicates = [HasAVX1Only] in {
def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
          (VBLENDPDYrri VR256:$src1, VR256:$src2, timm:$src3)>;
def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
          (VBLENDPDYrmi VR256:$src1, addr:$src2, timm:$src3)>;
def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
          (VBLENDPDYrmi VR256:$src1, addr:$src2, (BlendCommuteImm4 timm:$src3))>;

// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
// it from becoming movsd via commuting under optsize.
def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
          (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;

def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), timm:$src3),
          (VBLENDPSYrri VR256:$src1, VR256:$src2, timm:$src3)>;
def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), timm:$src3),
          (VBLENDPSYrmi VR256:$src1, addr:$src2, timm:$src3)>;
def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, timm:$src3),
          (VBLENDPSYrmi VR256:$src1, addr:$src2, (BlendCommuteImm8 timm:$src3))>;

// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
// it from becoming movss via commuting under optsize.
def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
          (VPBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), timm:$src3),
          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, timm:$src3),
          (VPBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
}

defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
                               VR128, memop, f128mem, 1, SSEPackedSingle,
                               SchedWriteFBlend.XMM, BlendCommuteImm4>;
defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64,
                               VR128, memop, f128mem, 1, SSEPackedDouble,
                               SchedWriteFBlend.XMM, BlendCommuteImm2>;
defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
                               VR128, memop, i128mem, 1, SSEPackedInt,
                               SchedWriteBlend.XMM, BlendCommuteImm8>;

let Predicates = [UseSSE41] in {
// Use pblendw for 128-bit integer to keep it in the integer domain and prevent
// it from becoming movss via commuting under optsize.
def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
          (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm2 timm:$src3))>;
def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), timm:$src3),
          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm2 timm:$src3))>;
def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, timm:$src3),
          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2 timm:$src3))>;

def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
          (PBLENDWrri VR128:$src1, VR128:$src2, (BlendScaleImm4 timm:$src3))>;
def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), timm:$src3),
          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, timm:$src3),
          (PBLENDWrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;
}

// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
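
// e.g. in the v4f64 patterns below, blend immediate 0x3 takes the two low
// f64 elements from the widened xmm operand and the two high elements from
// $src1, which is exactly an insertion into the low 128-bit half.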
let Predicates = [HasAVX] in {
def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)),
          (VBLENDPDYrri VR256:$src1,
                        (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0x3)>;
def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;

def : Pat<(insert_subvector (loadv4f64 addr:$src2), (v2f64 VR128:$src1), (iPTR 0)),
          (VBLENDPDYrmi (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xc)>;
def : Pat<(insert_subvector (loadv8f32 addr:$src2), (v4f32 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
}

/// SS41I_quaternary_avx - AVX SSE 4.1 with 4 operands
multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC,
                                X86MemOperand x86memop, ValueType VT,
                                PatFrag mem_frag, SDNode OpNode,
                                X86FoldableSchedWrite sched> {
  def rrr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
                   (ins RC:$src1, RC:$src2, RC:$src3),
                   !strconcat(OpcodeStr,
                     "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                   [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))],
                   SSEPackedInt>, TA, PD, VEX, VVVV,
                   Sched<[sched]>;

  def rmr : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
                   (ins RC:$src1, x86memop:$src2, RC:$src3),
                   !strconcat(OpcodeStr,
                     "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                   [(set RC:$dst,
                     (OpNode RC:$src3, (mem_frag addr:$src2),
                             RC:$src1))], SSEPackedInt>, TA, PD, VEX, VVVV,
                   Sched<[sched.Folded, sched.ReadAfterFold,
                          // x86memop:$src2
                          ReadDefault, ReadDefault, ReadDefault, ReadDefault,
                          ReadDefault,
                          // RC:$src3
                          sched.ReadAfterFold]>;
}

let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedDouble in {
defm VBLENDVPD  : SS41I_quaternary_avx<0x4B, "vblendvpd", VR128, f128mem,
                                       v2f64, loadv2f64, X86Blendv,
                                       SchedWriteFVarBlend.XMM>;
defm VBLENDVPDY : SS41I_quaternary_avx<0x4B, "vblendvpd", VR256, f256mem,
                                       v4f64, loadv4f64, X86Blendv,
                                       SchedWriteFVarBlend.YMM>, VEX_L;
} // ExeDomain = SSEPackedDouble
let ExeDomain = SSEPackedSingle in {
defm VBLENDVPS  : SS41I_quaternary_avx<0x4A, "vblendvps", VR128, f128mem,
                                       v4f32, loadv4f32, X86Blendv,
                                       SchedWriteFVarBlend.XMM>;
defm VBLENDVPSY : SS41I_quaternary_avx<0x4A, "vblendvps", VR256, f256mem,
                                       v8f32, loadv8f32, X86Blendv,
                                       SchedWriteFVarBlend.YMM>, VEX_L;
} // ExeDomain = SSEPackedSingle
defm VPBLENDVB : SS41I_quaternary_avx<0x4C, "vpblendvb", VR128, i128mem,
                                      v16i8, loadv16i8, X86Blendv,
                                      SchedWriteVarBlend.XMM>;
}

let Predicates = [HasAVX2] in {
defm VPBLENDVBY : SS41I_quaternary_avx<0x4C, "vpblendvb", VR256, i256mem,
                                       v32i8, loadv32i8, X86Blendv,
                                       SchedWriteVarBlend.YMM>, VEX_L;
}
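
// For reference (Intel SDM): blendv* selects each destination element from
// the second source operand when the sign bit of the corresponding mask
// element is set. X86Blendv carries the mask as its first operand, which is
// why the patterns below reverse the operand order.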

let Predicates = [HasAVX] in {
  def : Pat<(v4i32 (X86Blendv (v4i32 VR128:$mask), (v4i32 VR128:$src1),
                              (v4i32 VR128:$src2))),
            (VBLENDVPSrrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v2i64 (X86Blendv (v2i64 VR128:$mask), (v2i64 VR128:$src1),
                              (v2i64 VR128:$src2))),
            (VBLENDVPDrrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v8i32 (X86Blendv (v8i32 VR256:$mask), (v8i32 VR256:$src1),
                              (v8i32 VR256:$src2))),
            (VBLENDVPSYrrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v4i64 (X86Blendv (v4i64 VR256:$mask), (v4i64 VR256:$src1),
                              (v4i64 VR256:$src2))),
            (VBLENDVPDYrrr VR256:$src2, VR256:$src1, VR256:$mask)>;
}

// Prefer a movss or movsd over a blendps when optimizing for size; movs[s/d]
// are 1-2 bytes shorter. These patterns were changed to use blends because
// blends have better throughput on Sandy Bridge and Haswell.
let Predicates = [HasAVX, OptForSpeed] in {
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;

  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;

  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;

  // Move low f32 and clear high bits.
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4f32 (VBLENDPSrri (v4f32 (V_SET0)),
                      (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
                      (i8 1))), sub_xmm)>;
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
                      (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
                      (i8 3))), sub_xmm)>;
}
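
// Note on the immediates used in these patterns: blendps with imm 1 takes
// element 0 from the second operand and elements 1-3 from the first, which
// matches movss' register semantics; imm 0xe is the commuted form used when
// the load folds into the other operand.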

// Prefer a movss or movsd over a blendps when optimizing for size; movs[s/d]
// are 1-2 bytes shorter. These patterns were changed to use blends because
// blends have better throughput on Sandy Bridge and Haswell.
let Predicates = [UseSSE41, OptForSpeed] in {
  // With SSE41 we can use blends for these patterns.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;

  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))),
            (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
            (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;

  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))),
            (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
            (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
}

/// SS41I_ternary - SSE 4.1 ternary operator
let Uses = [XMM0], Constraints = "$src1 = $dst" in {
  multiclass SS41I_ternary<bits<8> opc, string OpcodeStr, ValueType VT,
                           PatFrag mem_frag, X86MemOperand x86memop,
                           SDNode OpNode, X86FoldableSchedWrite sched> {
    def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2),
                    !strconcat(OpcodeStr,
                               "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
                    [(set VR128:$dst,
                      (VT (OpNode XMM0, VR128:$src2, VR128:$src1)))]>,
                    Sched<[sched]>;

    def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, x86memop:$src2),
                    !strconcat(OpcodeStr,
                               "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
                    [(set VR128:$dst,
                      (OpNode XMM0, (mem_frag addr:$src2), VR128:$src1))]>,
                    Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}

let ExeDomain = SSEPackedDouble in
defm BLENDVPD : SS41I_ternary<0x15, "blendvpd", v2f64, memopv2f64, f128mem,
                              X86Blendv, SchedWriteFVarBlend.XMM>;
let ExeDomain = SSEPackedSingle in
defm BLENDVPS : SS41I_ternary<0x14, "blendvps", v4f32, memopv4f32, f128mem,
                              X86Blendv, SchedWriteFVarBlend.XMM>;
defm PBLENDVB : SS41I_ternary<0x10, "pblendvb", v16i8, memopv16i8, i128mem,
                              X86Blendv, SchedWriteVarBlend.XMM>;

// Aliases with the implicit xmm0 argument
def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
                (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>;
def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
                (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>;
def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
                (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>;
def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
                (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>;
def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
                (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>;
def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
                (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>;

let Predicates = [UseSSE41] in {
  def : Pat<(v4i32 (X86Blendv (v4i32 XMM0), (v4i32 VR128:$src1),
                              (v4i32 VR128:$src2))),
            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v2i64 (X86Blendv (v2i64 XMM0), (v2i64 VR128:$src1),
                              (v2i64 VR128:$src2))),
            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
}
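
// For reference (Intel SDM): movntdqa is a streaming-load hint; the address
// must be 16-byte aligned (32-byte for the ymm form), and the hint mainly
// pays off when reading from write-combining memory, hence only the aligned
// nontemporal-load patterns below use it.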

let AddedComplexity = 400 in { // Prefer non-temporal versions

let Predicates = [HasAVX, NoVLX] in
def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                        "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
                        Sched<[SchedWriteVecMoveLSNT.XMM.RM]>, VEX, WIG;
let Predicates = [HasAVX2, NoVLX] in
def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                         "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
                         Sched<[SchedWriteVecMoveLSNT.YMM.RM]>, VEX, VEX_L, WIG;
def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                       "movntdqa\t{$src, $dst|$dst, $src}", []>,
                       Sched<[SchedWriteVecMoveLSNT.XMM.RM]>;

let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(v8f32 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
  def : Pat<(v4f64 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
  def : Pat<(v4i64 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
  def : Pat<(v8i32 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
  def : Pat<(v16i16 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
  def : Pat<(v16f16 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
  def : Pat<(v32i8 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
}

let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4f32 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
  def : Pat<(v2f64 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
  def : Pat<(v2i64 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
  def : Pat<(v4i32 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
  def : Pat<(v8f16 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
}

let Predicates = [UseSSE41] in {
  def : Pat<(v4f32 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
  def : Pat<(v2f64 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
  def : Pat<(v2i64 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
  def : Pat<(v4i32 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
  def : Pat<(v8f16 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
}

} // AddedComplexity

//===----------------------------------------------------------------------===//
// SSE4.2 - Compare Instructions
//===----------------------------------------------------------------------===//

/// SS42I_binop_rm - Simple SSE 4.2 binary operator
multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, X86FoldableSchedWrite sched,
                          bit Is2Addr = 1> {
  def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
       Sched<[sched]>;
  def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}

let Predicates = [HasAVX] in
  defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX, VVVV, WIG;

let Predicates = [HasAVX2] in
  defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX, VVVV, VEX_L, WIG;

let Constraints = "$src1 = $dst" in
  defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
                                memop, i128mem, SchedWriteVecALU.XMM>;

//===----------------------------------------------------------------------===//
// SSE4.2 - String/text Processing Instructions
//===----------------------------------------------------------------------===//

multiclass pcmpistrm_SS42AI<string asm> {
  def rri : SS42AI<0x62, MRMSrcReg, (outs),
                   (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                   !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
                   []>, Sched<[WritePCmpIStrM]>;
  let mayLoad = 1 in
  def rmi : SS42AI<0x62, MRMSrcMem, (outs),
                   (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
                   !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
                   []>, Sched<[WritePCmpIStrM.Folded, WritePCmpIStrM.ReadAfterFold]>;
}

let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX, WIG;
  defm PCMPISTRM : pcmpistrm_SS42AI<"pcmpistrm">;
}

multiclass SS42AI_pcmpestrm<string asm> {
  def rri : SS42AI<0x60, MRMSrcReg, (outs),
                   (ins VR128:$src1, VR128:$src3, u8imm:$src5),
                   !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
                   []>, Sched<[WritePCmpEStrM]>;
  let mayLoad = 1 in
  def rmi : SS42AI<0x60, MRMSrcMem, (outs),
                   (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
                   !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
                   []>, Sched<[WritePCmpEStrM.Folded, WritePCmpEStrM.ReadAfterFold]>;
}

let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX, WIG;
  defm PCMPESTRM : SS42AI_pcmpestrm<"pcmpestrm">;
}

multiclass SS42AI_pcmpistri<string asm> {
  def rri : SS42AI<0x63, MRMSrcReg, (outs),
                   (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                   !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
                   []>, Sched<[WritePCmpIStrI]>;
  let mayLoad = 1 in
  def rmi : SS42AI<0x63, MRMSrcMem, (outs),
                   (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
                   !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
                   []>, Sched<[WritePCmpIStrI.Folded, WritePCmpIStrI.ReadAfterFold]>;
}

let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX, WIG;
  defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">;
}
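
// For reference (Intel SDM): the "i" (implicit-length) forms stop at a NUL
// element, while the "e" (explicit-length) forms take the operand lengths in
// EAX and EDX, matching the Uses lists here; the *m forms return a mask in
// XMM0 and the *i forms an index in ECX, matching the Defs lists.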
                   (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
                   !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
                   []>, Sched<[WritePCmpEStrI.Folded, WritePCmpEStrI.ReadAfterFold]>;
}

let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in
  defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX, WIG;
  defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">;
}

//===----------------------------------------------------------------------===//
// SSE4.2 - CRC Instructions
//===----------------------------------------------------------------------===//

// NOTE: 'HasCRC32' is used because the CRC32 instructions are GPR-only and
// not directly controlled by the SSE42 flag.
//
// No CRC instructions have AVX equivalents.

class Crc32r<X86TypeInfo t, RegisterClass rc, SDPatternOperator node>
  : ITy<0xF1, MRMSrcReg, t, (outs rc:$dst), (ins rc:$src1, t.RegClass:$src2),
        "crc32", binop_args, [(set rc:$dst, (node rc:$src1, t.RegClass:$src2))]>,
    Sched<[WriteCRC32]> {
  let Constraints = "$src1 = $dst";
}

class Crc32m<X86TypeInfo t, RegisterClass rc, SDPatternOperator node>
  : ITy<0xF1, MRMSrcMem, t, (outs rc:$dst), (ins rc:$src1, t.MemOperand:$src2),
        "crc32", binop_args, [(set rc:$dst, (node rc:$src1, (load addr:$src2)))]>,
    Sched<[WriteCRC32.Folded, WriteCRC32.ReadAfterFold]> {
  let Constraints = "$src1 = $dst";
}

let Predicates = [HasCRC32, NoEGPR], OpMap = T8, OpPrefix = XD in {
  def CRC32r32r8  : Crc32r<Xi8, GR32, int_x86_sse42_crc32_32_8>;
  def CRC32r32m8  : Crc32m<Xi8, GR32, int_x86_sse42_crc32_32_8>;
  def CRC32r32r16 : Crc32r<Xi16, GR32, int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32m16 : Crc32m<Xi16, GR32, int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32r32 : Crc32r<Xi32, GR32, int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r32m32 : Crc32m<Xi32, GR32, int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r64r64 : Crc32r<Xi64, GR64, int_x86_sse42_crc32_64_64>;
  def CRC32r64m64 : Crc32m<Xi64, GR64, int_x86_sse42_crc32_64_64>;
  def CRC32r64r8  : Crc32r<Xi8, GR64, null_frag>, REX_W;
  let mayLoad = 1 in
  def CRC32r64m8  : Crc32m<Xi8, GR64, null_frag>, REX_W;
}

let Predicates = [HasCRC32, HasEGPR, In64BitMode], OpMap = T_MAP4, OpEnc = EncEVEX in {
  def CRC32r32r8_EVEX  : Crc32r<Xi8, GR32, int_x86_sse42_crc32_32_8>;
  def CRC32r32m8_EVEX  : Crc32m<Xi8, GR32, int_x86_sse42_crc32_32_8>;
  def CRC32r32r16_EVEX : Crc32r<Xi16, GR32, int_x86_sse42_crc32_32_16>, PD;
  def CRC32r32m16_EVEX : Crc32m<Xi16, GR32, int_x86_sse42_crc32_32_16>, PD;
  def CRC32r32r32_EVEX : Crc32r<Xi32, GR32, int_x86_sse42_crc32_32_32>;
  def CRC32r32m32_EVEX : Crc32m<Xi32, GR32, int_x86_sse42_crc32_32_32>;
  def CRC32r64r64_EVEX : Crc32r<Xi64, GR64, int_x86_sse42_crc32_64_64>;
  def CRC32r64m64_EVEX : Crc32m<Xi64, GR64, int_x86_sse42_crc32_64_64>;
  def CRC32r64r8_EVEX  : Crc32r<Xi8, GR64, null_frag>, REX_W;
  let mayLoad = 1 in
  def CRC32r64m8_EVEX  : Crc32m<Xi8, GR64, null_frag>, REX_W;
}

//===----------------------------------------------------------------------===//
// SHA-NI Instructions
//===----------------------------------------------------------------------===//

// FIXME: Is there a better scheduler class for SHA than WriteVecIMul?
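// sha256rnds2 takes an implicit round-key input in XMM0; the UsesXMM0 flag
// below threads that operand through both the selection pattern and the
// assembly string.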
multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
                      X86FoldableSchedWrite sched, bit UsesXMM0 = 0> {
  def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2),
             !if(UsesXMM0,
                 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
             [!if(UsesXMM0,
                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>,
             T8, Sched<[sched]>;

  def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !if(UsesXMM0,
                 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
             [!if(UsesXMM0,
                  (set VR128:$dst, (IntId VR128:$src1,
                                    (memop addr:$src2), XMM0)),
                  (set VR128:$dst, (IntId VR128:$src1,
                                    (memop addr:$src2))))]>, T8,
             Sched<[sched.Folded, sched.ReadAfterFold]>;
}

let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
  def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
                         (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         [(set VR128:$dst,
                           (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
                                              (i8 timm:$src3)))]>, TA,
                         Sched<[SchedWriteVecIMul.XMM]>;
  def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
                         (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         [(set VR128:$dst,
                           (int_x86_sha1rnds4 VR128:$src1,
                                              (memop addr:$src2),
                                              (i8 timm:$src3)))]>, TA,
                         Sched<[SchedWriteVecIMul.XMM.Folded,
                                SchedWriteVecIMul.XMM.ReadAfterFold]>;

  defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte,
                              SchedWriteVecIMul.XMM>;
  defm SHA1MSG1  : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1,
                              SchedWriteVecIMul.XMM>;
  defm SHA1MSG2  : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2,
                              SchedWriteVecIMul.XMM>;

  let Uses = [XMM0] in
  defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2,
                                SchedWriteVecIMul.XMM, 1>;

  defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1,
                               SchedWriteVecIMul.XMM>;
  defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2,
                               SchedWriteVecIMul.XMM>;
}

//===----------------------------------------------------------------------===//
// AES-NI Instructions
//===----------------------------------------------------------------------===//

multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
                             Intrinsic IntId, PatFrag ld_frag,
                             bit Is2Addr = 0, RegisterClass RC = VR128,
                             X86MemOperand MemOp = i128mem> {
  let AsmString = OpcodeStr#
                  !if(Is2Addr, "\t{$src2, $dst|$dst, $src2}",
                      "\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
    def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst),
                   (ins RC:$src1, RC:$src2), "",
                   [(set RC:$dst, (IntId RC:$src1, RC:$src2))]>,
                   Sched<[WriteAESDecEnc]>;
    def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst),
                   (ins RC:$src1, MemOp:$src2), "",
                   [(set RC:$dst, (IntId RC:$src1, (ld_frag addr:$src2)))]>,
                   Sched<[WriteAESDecEnc.Folded, WriteAESDecEnc.ReadAfterFold]>;
  }
}

// Perform One Round of an AES Encryption/Decryption Flow
let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in {
  defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc",
                                   int_x86_aesni_aesenc, load>, VEX, VVVV, WIG;
  defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast",
                                       int_x86_aesni_aesenclast, load>, VEX, VVVV, WIG;
  defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec",
                                   int_x86_aesni_aesdec, load>, VEX, VVVV, WIG;
  defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast",
                                       int_x86_aesni_aesdeclast, load>, VEX, VVVV, WIG;
}

let Predicates = [NoVLX, HasVAES] in {
  defm VAESENCY : AESI_binop_rm_int<0xDC, "vaesenc",
                                    int_x86_aesni_aesenc_256, load, 0, VR256,
                                    i256mem>, VEX, VVVV, VEX_L, WIG;
  defm VAESENCLASTY : AESI_binop_rm_int<0xDD, "vaesenclast",
                                        int_x86_aesni_aesenclast_256, load, 0, VR256,
                                        i256mem>, VEX, VVVV, VEX_L, WIG;
  defm VAESDECY : AESI_binop_rm_int<0xDE, "vaesdec",
                                    int_x86_aesni_aesdec_256, load, 0, VR256,
                                    i256mem>, VEX, VVVV, VEX_L, WIG;
  defm VAESDECLASTY : AESI_binop_rm_int<0xDF, "vaesdeclast",
                                        int_x86_aesni_aesdeclast_256, load, 0, VR256,
                                        i256mem>, VEX, VVVV, VEX_L, WIG;
}

let Constraints = "$src1 = $dst" in {
  defm AESENC : AESI_binop_rm_int<0xDC, "aesenc",
                                  int_x86_aesni_aesenc, memop, 1>;
  defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast",
                                      int_x86_aesni_aesenclast, memop, 1>;
  defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec",
                                  int_x86_aesni_aesdec, memop, 1>;
  defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast",
                                      int_x86_aesni_aesdeclast, memop, 1>;
}

// Perform the AES InvMixColumn Transformation
let Predicates = [HasAVX, HasAES] in {
  def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
                        (ins VR128:$src1),
                        "vaesimc\t{$src1, $dst|$dst, $src1}",
                        [(set VR128:$dst,
                          (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
                        VEX, WIG;
  def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
                        (ins i128mem:$src1),
                        "vaesimc\t{$src1, $dst|$dst, $src1}",
                        [(set VR128:$dst, (int_x86_aesni_aesimc (load addr:$src1)))]>,
                        Sched<[WriteAESIMC.Folded]>, VEX, WIG;
}
def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
                     (ins VR128:$src1),
                     "aesimc\t{$src1, $dst|$dst, $src1}",
                     [(set VR128:$dst,
                       (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>;
def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
                     (ins i128mem:$src1),
                     "aesimc\t{$src1, $dst|$dst, $src1}",
                     [(set VR128:$dst, (int_x86_aesni_aesimc (memop addr:$src1)))]>,
                     Sched<[WriteAESIMC.Folded]>;

// AES Round Key Generation Assist
let Predicates = [HasAVX, HasAES] in {
  def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, u8imm:$src2),
      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set VR128:$dst,
        (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
      Sched<[WriteAESKeyGen]>, VEX, WIG;
  def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
      (ins i128mem:$src1, u8imm:$src2),
      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set VR128:$dst,
        (int_x86_aesni_aeskeygenassist (load addr:$src1), timm:$src2))]>,
      Sched<[WriteAESKeyGen.Folded]>, VEX, WIG;
}
def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
    (ins VR128:$src1, u8imm:$src2),
    "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
    [(set VR128:$dst,
      (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
    Sched<[WriteAESKeyGen]>;
def AESKEYGENASSIST128rm :
    AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
          (ins i128mem:$src1, u8imm:$src2),
          "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          [(set VR128:$dst,
            (int_x86_aesni_aeskeygenassist (memop addr:$src1), timm:$src2))]>,
          Sched<[WriteAESKeyGen.Folded]>;

//===----------------------------------------------------------------------===//
// PCLMUL Instructions
//===----------------------------------------------------------------------===//

// Immediate transform to help with commuting.
def PCLMULCommuteImm : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N));
}]>;

// SSE carry-less multiplication instructions
let Predicates = [NoAVX, HasPCLMUL] in {
  let Constraints = "$src1 = $dst" in {
    let isCommutable = 1 in
    def PCLMULQDQrri : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src1, VR128:$src2, u8imm:$src3),
              "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
              [(set VR128:$dst,
                (int_x86_pclmulqdq VR128:$src1, VR128:$src2, timm:$src3))]>,
              Sched<[WriteCLMul]>;

    def PCLMULQDQrmi : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
              (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
              "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
              [(set VR128:$dst,
                (int_x86_pclmulqdq VR128:$src1, (memop addr:$src2),
                                   timm:$src3))]>,
              Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;
  } // Constraints = "$src1 = $dst"

  def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1,
                               (i8 timm:$src3)),
            (PCLMULQDQrmi VR128:$src1, addr:$src2,
                          (PCLMULCommuteImm timm:$src3))>;
} // Predicates = [NoAVX, HasPCLMUL]

// SSE aliases
foreach HI = ["hq","lq"] in
foreach LO = ["hq","lq"] in {
  def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
                  (PCLMULQDQrri VR128:$dst, VR128:$src,
                   !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
  def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
                  (PCLMULQDQrmi VR128:$dst, i128mem:$src,
                   !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
}

// AVX carry-less multiplication instructions
multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
                      PatFrag LdFrag, Intrinsic IntId> {
  let isCommutable = 1 in
  def rri : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst),
            (ins RC:$src1, RC:$src2, u8imm:$src3),
            "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
            [(set RC:$dst,
              (IntId RC:$src1, RC:$src2, timm:$src3))]>,
            Sched<[WriteCLMul]>;

  def rmi : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst),
            (ins RC:$src1, MemOp:$src2, u8imm:$src3),
            "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
            [(set RC:$dst,
              (IntId RC:$src1, (LdFrag addr:$src2), timm:$src3))]>,
            Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>;

  // We can commute a load in the first operand by swapping the sources and
  // rotating the immediate.
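  // (Only imm bits 0 and 4 select source quadwords for [v]pclmulqdq, so the
  // 4-bit rotate done by PCLMULCommuteImm swaps exactly those two selectors.)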
  def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 timm:$src3)),
            (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
             (PCLMULCommuteImm timm:$src3))>;
}

let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in
defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, load,
                             int_x86_pclmulqdq>, VEX, VVVV, WIG;

let Predicates = [NoVLX, HasVPCLMULQDQ] in
defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load,
                              int_x86_pclmulqdq_256>, VEX, VVVV, VEX_L, WIG;

multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC,
                                   X86MemOperand MemOp, string Hi, string Lo> {
  def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  (!cast<Instruction>(InstStr # "rri") RC:$dst, RC:$src1, RC:$src2,
                   !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
  def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  (!cast<Instruction>(InstStr # "rmi") RC:$dst, RC:$src1, MemOp:$src2,
                   !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
}

multiclass vpclmulqdq_aliases<string InstStr, RegisterClass RC,
                              X86MemOperand MemOp> {
  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "hq">;
  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "lq">;
  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "hq">;
  defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "lq">;
}

// AVX aliases
defm : vpclmulqdq_aliases<"VPCLMULQDQ", VR128, i128mem>;
defm : vpclmulqdq_aliases<"VPCLMULQDQY", VR256, i256mem>;

//===----------------------------------------------------------------------===//
// SSE4A Instructions
//===----------------------------------------------------------------------===//

let Predicates = [HasSSE4A] in {

let ExeDomain = SSEPackedInt in {
let Constraints = "$src = $dst" in {
def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
                 (ins VR128:$src, u8imm:$len, u8imm:$idx),
                 "extrq\t{$idx, $len, $src|$src, $len, $idx}",
                 [(set VR128:$dst, (X86extrqi VR128:$src, timm:$len,
                                              timm:$idx))]>,
                 TB, PD, Sched<[SchedWriteVecALU.XMM]>;
def EXTRQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
               (ins VR128:$src, VR128:$mask),
               "extrq\t{$mask, $src|$src, $mask}",
               [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
                                                      VR128:$mask))]>,
               TB, PD, Sched<[SchedWriteVecALU.XMM]>;

def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
                   "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
                   [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
                                                  timm:$len, timm:$idx))]>,
                   TB, XD, Sched<[SchedWriteVecALU.XMM]>;
def INSERTQ  : I<0x79, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src, VR128:$mask),
                 "insertq\t{$mask, $src|$src, $mask}",
                 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
                                                          VR128:$mask))]>,
                 TB, XD, Sched<[SchedWriteVecALU.XMM]>;
}
} // ExeDomain = SSEPackedInt

// Non-temporal (unaligned) scalar stores.
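// MOVNTSS/MOVNTSD only accept an XMM source, so the patterns below move
// FR32/FR64 values into VR128 with COPY_TO_REGCLASS before storing.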
let AddedComplexity = 400 in { // Prefer non-temporal versions
let hasSideEffects = 0, mayStore = 1, SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in {
def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
                "movntss\t{$src, $dst|$dst, $src}", []>, TB, XS;

def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                "movntsd\t{$src, $dst|$dst, $src}", []>, TB, XD;
} // SchedRW

def : Pat<(nontemporalstore FR32:$src, addr:$dst),
          (MOVNTSS addr:$dst, (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;

def : Pat<(nontemporalstore FR64:$src, addr:$dst),
          (MOVNTSD addr:$dst, (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;

} // AddedComplexity
} // HasSSE4A

//===----------------------------------------------------------------------===//
// AVX Instructions
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// VBROADCAST - Load from memory and broadcast to all elements of the
// destination operand
//
class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, ValueType VT,
                       PatFrag bcast_frag, SchedWrite Sched> :
  AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
        !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
        [(set RC:$dst, (VT (bcast_frag addr:$src)))]>,
  Sched<[Sched]>, VEX {
  let isReMaterializable = 1;
}

// AVX2 adds register forms
class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
                        ValueType ResVT, ValueType OpVT, SchedWrite Sched> :
  AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
         [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
  Sched<[Sched]>, VEX;

let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in {
  def VBROADCASTSSrm  : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
                                         f32mem, v4f32, X86VBroadcastld32,
                                         SchedWriteFShuffle.XMM.Folded>;
  def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
                                         f32mem, v8f32, X86VBroadcastld32,
                                         SchedWriteFShuffle.XMM.Folded>, VEX_L;
}
let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in
def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
                                       v4f64, X86VBroadcastld64,
                                       SchedWriteFShuffle.XMM.Folded>, VEX_L;

let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in {
  def VBROADCASTSSrr  : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
                                          v4f32, v4f32, SchedWriteFShuffle.XMM>;
  def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
                                          v8f32, v4f32, WriteFShuffle256>, VEX_L;
}
let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in
def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
                                        v4f64, v2f64, WriteFShuffle256>, VEX_L;

//===----------------------------------------------------------------------===//
// VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both
// halves of a 256-bit vector.
//
let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in
def VBROADCASTI128rm : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
                             (ins i128mem:$src),
                             "vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
                             Sched<[WriteShuffleLd]>, VEX, VEX_L;

let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX],
    ExeDomain = SSEPackedSingle in
def VBROADCASTF128rm : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
                             (ins f128mem:$src),
                             "vbroadcastf128\t{$src, $dst|$dst, $src}", []>,
                             Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L;

let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4f64 (X86SubVBroadcastld128 addr:$src)),
          (VBROADCASTF128rm addr:$src)>;
def : Pat<(v8f32 (X86SubVBroadcastld128 addr:$src)),
          (VBROADCASTF128rm addr:$src)>;
// NOTE: We're using FP instructions here, but execution domain fixing can
// convert to integer when profitable.
def : Pat<(v4i64 (X86SubVBroadcastld128 addr:$src)),
          (VBROADCASTF128rm addr:$src)>;
def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)),
          (VBROADCASTF128rm addr:$src)>;
def : Pat<(v16i16 (X86SubVBroadcastld128 addr:$src)),
          (VBROADCASTF128rm addr:$src)>;
def : Pat<(v16f16 (X86SubVBroadcastld128 addr:$src)),
          (VBROADCASTF128rm addr:$src)>;
def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)),
          (VBROADCASTF128rm addr:$src)>;
}

let Predicates = [HasAVXNECONVERT, NoVLX] in
  def : Pat<(v16bf16 (X86SubVBroadcastld128 addr:$src)),
            (VBROADCASTF128rm addr:$src)>;

//===----------------------------------------------------------------------===//
// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
//

let ExeDomain = SSEPackedSingle in {
let isCommutable = 1 in
def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR256:$src2, u8imm:$src3),
          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
          VEX, VVVV, VEX_L, Sched<[WriteFShuffle256]>;
def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
          VEX, VVVV, VEX_L, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
}

// Immediate transform to help with commuting.
def Perm2XCommuteImm : SDNodeXForm<timm, [{
  return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N));
}]>;

multiclass vperm2x128_lowering<string InstrStr, ValueType VT, PatFrag memop_frag> {
  def : Pat<(VT (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))),
            (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR256:$src2, timm:$imm)>;
  def : Pat<(VT (X86VPerm2x128 VR256:$src1, (memop_frag addr:$src2), (i8 timm:$imm))),
            (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2, timm:$imm)>;
  // Pattern with load in other operand.
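  // (Perm2XCommuteImm XORs the immediate with 0x22, flipping the
  // source-select bit of each 128-bit lane selector, which is equivalent to
  // swapping the two sources.)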
  def : Pat<(VT (X86VPerm2x128 (memop_frag addr:$src2), VR256:$src1, (i8 timm:$imm))),
            (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
             (Perm2XCommuteImm timm:$imm))>;
}

let Predicates = [HasAVX] in {
  defm : vperm2x128_lowering<"VPERM2F128", v4f64, loadv4f64>;
  defm : vperm2x128_lowering<"VPERM2F128", v8f32, loadv8f32>;
}

let Predicates = [HasAVX1Only] in {
  defm : vperm2x128_lowering<"VPERM2F128", v4i64,  loadv4i64>;
  defm : vperm2x128_lowering<"VPERM2F128", v8i32,  loadv8i32>;
  defm : vperm2x128_lowering<"VPERM2F128", v16i16, loadv16i16>;
  defm : vperm2x128_lowering<"VPERM2F128", v16f16, loadv16f16>;
  defm : vperm2x128_lowering<"VPERM2F128", v32i8,  loadv32i8>;
}

//===----------------------------------------------------------------------===//
// VINSERTF128 - Insert packed floating-point values
//
let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR128:$src2, u8imm:$src3),
          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteFShuffle256]>, VEX, VVVV, VEX_L;
let mayLoad = 1 in
def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f128mem:$src2, u8imm:$src3),
          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>, VEX, VVVV, VEX_L;
}

// To create a 256-bit all-ones value, we should produce VCMPTRUEPS
// with a YMM register containing zero.
// FIXME: Avoid producing vxorps to clear the fake inputs.
let Predicates = [HasAVX1Only] in {
def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>;
}

multiclass vinsert_lowering<string InstrStr, string PermStr,
                            ValueType From, ValueType To,
                            PatFrag frommemop_frag, PatFrag tomemop_frag> {
  def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2),
                                    (iPTR imm)),
            (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2,
                                             (INSERT_get_vinsert128_imm VR256:$ins))>;
  def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
                                    (From (frommemop_frag addr:$src2)),
                                    (iPTR imm)),
            (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
                                             (INSERT_get_vinsert128_imm VR256:$ins))>;
  // Folding "To" vector - convert to perm2x128 and commute inputs.
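  // (vinsert cannot take its 256-bit input from memory, so place the 128-bit
  // value in a fresh YMM register and let perm2x128 pull the memory half with
  // a commuted immediate.)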
  def : Pat<(vinsert128_insert:$ins (To (tomemop_frag addr:$src1)),
                                    (From VR128:$src2),
                                    (iPTR imm)),
            (!cast<Instruction>(PermStr#rm)
             (INSERT_SUBREG (To (IMPLICIT_DEF)), VR128:$src2, sub_xmm),
             addr:$src1, (INSERT_get_vperm2x128_commutedimm VR256:$ins))>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v4f32, v8f32, loadv4f32, loadv8f32>;
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v2f64, v4f64, loadv2f64, loadv4f64>;
}

let Predicates = [HasAVX1Only] in {
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v2i64, v4i64,  loadv2i64,  loadv4i64>;
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v4i32, v8i32,  loadv4i32,  loadv8i32>;
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v8i16, v16i16, loadv8i16,  loadv16i16>;
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v8f16, v16f16, loadv8f16,  loadv16f16>;
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v16i8, v32i8,  loadv16i8,  loadv32i8>;
}

//===----------------------------------------------------------------------===//
// VEXTRACTF128 - Extract packed floating-point values
//
let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
          (ins VR256:$src1, u8imm:$src2),
          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          []>, Sched<[WriteFShuffle256]>, VEX, VEX_L;
let mayStore = 1 in
def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
          (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          []>, Sched<[WriteFStoreX]>, VEX, VEX_L;
}

multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
  def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
            (To (!cast<Instruction>(InstrStr#rr)
                 (From VR256:$src1),
                 (EXTRACT_get_vextract128_imm VR128:$ext)))>;
  def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1),
                                                 (iPTR imm))), addr:$dst),
            (!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1,
                                             (EXTRACT_get_vextract128_imm VR128:$ext))>;
}

// AVX1 patterns
let Predicates = [HasAVX, NoVLX] in {
  defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>;
  defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>;
}

let Predicates = [HasAVX1Only] in {
  defm : vextract_lowering<"VEXTRACTF128", v4i64,  v2i64>;
  defm : vextract_lowering<"VEXTRACTF128", v8i32,  v4i32>;
  defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
  defm : vextract_lowering<"VEXTRACTF128", v16f16, v8f16>;
  defm : vextract_lowering<"VEXTRACTF128", v32i8,  v16i8>;
}

//===----------------------------------------------------------------------===//
// VMASKMOV - Conditional SIMD Packed Loads and Stores
//
multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
                          Intrinsic IntLd, Intrinsic IntLd256,
                          Intrinsic IntSt, Intrinsic IntSt256,
                          X86SchedWriteMaskMove schedX,
                          X86SchedWriteMaskMove schedY> {
  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
                  (ins VR128:$src1, f128mem:$src2),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
                  VEX, VVVV, Sched<[schedX.RM]>;
  def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
                  (ins VR256:$src1, f256mem:$src2),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
                  VEX, VVVV, VEX_L, Sched<[schedY.RM]>;
  def mr  : AVX8I<opc_mr, MRMDestMem, (outs),
                  (ins f128mem:$dst, VR128:$src1, VR128:$src2),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>,
                  VEX, VVVV, Sched<[schedX.MR]>;
  def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
                  (ins f256mem:$dst, VR256:$src1, VR256:$src2),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
                  VEX, VVVV, VEX_L, Sched<[schedY.MR]>;
}

let ExeDomain = SSEPackedSingle in
defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
                                 int_x86_avx_maskload_ps,
                                 int_x86_avx_maskload_ps_256,
                                 int_x86_avx_maskstore_ps,
                                 int_x86_avx_maskstore_ps_256,
                                 WriteFMaskMove32, WriteFMaskMove32Y>;
let ExeDomain = SSEPackedDouble in
defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
                                 int_x86_avx_maskload_pd,
                                 int_x86_avx_maskload_pd_256,
                                 int_x86_avx_maskstore_pd,
                                 int_x86_avx_maskstore_pd_256,
                                 WriteFMaskMove64, WriteFMaskMove64Y>;

//===----------------------------------------------------------------------===//
// AVX_VNNI
//===----------------------------------------------------------------------===//
let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI], Constraints = "$src1 = $dst",
    explicitOpPrefix = ExplicitVEX in
multiclass avx_vnni_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                       bit IsCommutable> {
  let isCommutable = IsCommutable in
  def rr : AVX8I<opc, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src1, VR128:$src2, VR128:$src3),
                 !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                 [(set VR128:$dst, (v4i32 (OpNode VR128:$src1,
                                           VR128:$src2, VR128:$src3)))]>,
                 VEX, VVVV, Sched<[SchedWriteVecIMul.XMM]>;

  def rm : AVX8I<opc, MRMSrcMem, (outs VR128:$dst),
                 (ins VR128:$src1, VR128:$src2, i128mem:$src3),
                 !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                 [(set VR128:$dst, (v4i32 (OpNode VR128:$src1, VR128:$src2,
                                           (loadv4i32 addr:$src3))))]>,
                 VEX, VVVV, Sched<[SchedWriteVecIMul.XMM.Folded,
                                   SchedWriteVecIMul.XMM.ReadAfterFold,
                                   SchedWriteVecIMul.XMM.ReadAfterFold]>;

  let isCommutable = IsCommutable in
  def Yrr : AVX8I<opc, MRMSrcReg, (outs VR256:$dst),
                  (ins VR256:$src1, VR256:$src2, VR256:$src3),
                  !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                  [(set VR256:$dst, (v8i32 (OpNode VR256:$src1,
                                            VR256:$src2, VR256:$src3)))]>,
                  VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;

  def Yrm : AVX8I<opc, MRMSrcMem, (outs VR256:$dst),
                  (ins VR256:$src1, VR256:$src2, i256mem:$src3),
                  !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                  [(set VR256:$dst, (v8i32 (OpNode VR256:$src1, VR256:$src2,
                                            (loadv8i32 addr:$src3))))]>,
                  VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM.Folded,
                                           SchedWriteVecIMul.YMM.ReadAfterFold,
                                           SchedWriteVecIMul.YMM.ReadAfterFold]>;
}

defm VPDPBUSD  : avx_vnni_rm<0x50, "vpdpbusd", X86Vpdpbusd, 0>;
defm VPDPBUSDS : avx_vnni_rm<0x51, "vpdpbusds", X86Vpdpbusds, 0>;
defm VPDPWSSD  : avx_vnni_rm<0x52, "vpdpwssd", X86Vpdpwssd, 1>;
defm VPDPWSSDS : avx_vnni_rm<0x53, "vpdpwssds", X86Vpdpwssds, 1>;

let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI] in {
  def : Pat<(v8i32 (add VR256:$src1,
                        (X86vpmaddwd_su VR256:$src2, VR256:$src3))),
            (VPDPWSSDYrr VR256:$src1, VR256:$src2, VR256:$src3)>;
  def : Pat<(v8i32 (add VR256:$src1,
                        (X86vpmaddwd_su VR256:$src2, (load addr:$src3)))),
            (VPDPWSSDYrm VR256:$src1, VR256:$src2, addr:$src3)>;
  def : Pat<(v4i32 (add VR128:$src1,
                        (X86vpmaddwd_su VR128:$src2, VR128:$src3))),
            (VPDPWSSDrr VR128:$src1, VR128:$src2, VR128:$src3)>;
  def : Pat<(v4i32 (add VR128:$src1,
                        (X86vpmaddwd_su VR128:$src2, (load addr:$src3)))),
            (VPDPWSSDrm VR128:$src1, VR128:$src2, addr:$src3)>;
}

//===----------------------------------------------------------------------===//
// VPERMIL - Permute Single and Double Floating-Point Values
//

multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
                      RegisterClass RC, X86MemOperand x86memop_f,
                      X86MemOperand x86memop_i,
                      ValueType f_vt, ValueType i_vt,
                      X86FoldableSchedWrite sched,
                      X86FoldableSchedWrite varsched> {
  let Predicates = [HasAVX, NoVLX] in {
    def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
                   (ins RC:$src1, RC:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX, VVVV,
                   Sched<[varsched]>;
    def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
                   (ins RC:$src1, x86memop_i:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
                                         (i_vt (load addr:$src2)))))]>, VEX, VVVV,
                   Sched<[varsched.Folded, sched.ReadAfterFold]>;

    def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
                     (ins RC:$src1, u8imm:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 timm:$src2))))]>, VEX,
                     Sched<[sched]>;
    def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
                     (ins x86memop_f:$src1, u8imm:$src2),
                     !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set RC:$dst,
                       (f_vt (X86VPermilpi (load addr:$src1), (i8 timm:$src2))))]>, VEX,
                     Sched<[sched.Folded]>;
  } // Predicates = [HasAVX, NoVLX]
}

let ExeDomain = SSEPackedSingle in {
  defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
                               v4f32, v4i32, SchedWriteFShuffle.XMM,
                               SchedWriteFVarShuffle.XMM>;
  defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
                               v8f32, v8i32, SchedWriteFShuffle.YMM,
                               SchedWriteFVarShuffle.YMM>, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
  defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
                               v2f64, v2i64, SchedWriteFShuffle.XMM,
                               SchedWriteFVarShuffle.XMM>;
  defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
                               v4f64, v4i64, SchedWriteFShuffle.YMM,
                               SchedWriteFVarShuffle.YMM>, VEX_L;
}

//===----------------------------------------------------------------------===//
// VZERO - Zero YMM registers
// Note: These instructions do not affect YMM16-YMM31.
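// (vzeroupper is typically emitted before transitions into legacy SSE code,
// since mixing dirty upper YMM state with SSE incurs penalties on many
// microarchitectures.)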
//

let SchedRW = [WriteSystem] in {
let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
            YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
  // Zero All YMM registers
  def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
                   [(int_x86_avx_vzeroall)]>, TB, VEX, VEX_L,
                   Requires<[HasAVX]>, WIG;

  // Zero Upper bits of YMM registers
  def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
                     [(int_x86_avx_vzeroupper)]>, TB, VEX,
                     Requires<[HasAVX]>, WIG;
} // Defs
} // SchedRW

//===----------------------------------------------------------------------===//
// Half precision conversion instructions
//

multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop,
                      X86FoldableSchedWrite sched> {
  def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
             "vcvtph2ps\t{$src, $dst|$dst, $src}",
             [(set RC:$dst, (X86any_cvtph2ps VR128:$src))]>,
             T8, PD, VEX, Sched<[sched]>;
  let hasSideEffects = 0, mayLoad = 1 in
  def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
             "vcvtph2ps\t{$src, $dst|$dst, $src}",
             []>, T8, PD, VEX, Sched<[sched.Folded]>;
}

multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
                      SchedWrite RR, SchedWrite MR> {
  def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
               (ins RC:$src1, i32u8imm:$src2),
               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               [(set VR128:$dst, (X86any_cvtps2ph RC:$src1, timm:$src2))]>,
               TA, PD, VEX, Sched<[RR]>;
  let hasSideEffects = 0, mayStore = 1 in
  def mr : Ii8<0x1D, MRMDestMem, (outs),
               (ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
               TA, PD, VEX, Sched<[MR]>;
}

let Predicates = [HasF16C, NoVLX] in {
  defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>, SIMD_EXC;
  defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L, SIMD_EXC;
  defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH,
                               WriteCvtPS2PHSt>, SIMD_EXC;
  defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY,
                               WriteCvtPS2PHYSt>, VEX_L, SIMD_EXC;

  // Pattern match vcvtph2ps of a scalar i64 load.
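  // (The 128-bit form converts four f16 values and thus reads only 64 bits
  // of memory, which is why a vzload64 or scalar i64 load can fold here.)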
  def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
            (VCVTPH2PSrm addr:$src)>;
  def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16
                    (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (VCVTPH2PSrm addr:$src)>;
  def : Pat<(v8f32 (X86any_cvtph2ps (loadv8i16 addr:$src))),
            (VCVTPH2PSYrm addr:$src)>;

  def : Pat<(store (f64 (extractelt
                         (bc_v2f64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))),
                         (iPTR 0))), addr:$dst),
            (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
  def : Pat<(store (i64 (extractelt
                         (bc_v2i64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))),
                         (iPTR 0))), addr:$dst),
            (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
  def : Pat<(store (v8i16 (X86any_cvtps2ph VR256:$src1, timm:$src2)), addr:$dst),
            (VCVTPS2PHYmr addr:$dst, VR256:$src1, timm:$src2)>;
}

//===----------------------------------------------------------------------===//
// AVX2 Instructions
//===----------------------------------------------------------------------===//

/// AVX2_blend_rmi - AVX2 blend with 8-bit immediate
multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, X86FoldableSchedWrite sched,
                          RegisterClass RC,
                          X86MemOperand x86memop, SDNodeXForm commuteXForm> {
  let isCommutable = 1 in
  def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
                     (ins RC:$src1, RC:$src2, u8imm:$src3),
                     !strconcat(OpcodeStr,
                                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                     [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
                     Sched<[sched]>, VEX, VVVV;
  def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
                     (ins RC:$src1, x86memop:$src2, u8imm:$src3),
                     !strconcat(OpcodeStr,
                                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                     [(set RC:$dst,
                       (OpVT (OpNode RC:$src1, (load addr:$src2), timm:$src3)))]>,
                     Sched<[sched.Folded, sched.ReadAfterFold]>, VEX, VVVV;

  // Pattern to commute if load is in first source.
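  // (Each blend-immediate bit selects src2 over src1 for one element, so
  // commuting the sources is just a bitwise inversion of the mask; that is
  // what the commuteXForm transforms supply.)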
  def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, timm:$src3)),
            (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
             (commuteXForm timm:$src3))>;
}

let Predicates = [HasAVX2] in {
defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
                               SchedWriteBlend.XMM, VR128, i128mem,
                               BlendCommuteImm4>;
defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
                                SchedWriteBlend.YMM, VR256, i256mem,
                                BlendCommuteImm8>, VEX_L;

def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
          (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 timm:$src3))>;
def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
          (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
          (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;

def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
          (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 timm:$src3))>;
def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
          (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 timm:$src3))>;
def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
          (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 timm:$src3))>;
}

// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
// NOTE: We're using FP instructions here, but execution domain fixing should
// take care of using integer instructions when profitable.
let Predicates = [HasAVX] in {
def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v16f16 VR256:$src1), (v8f16 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;

def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
def : Pat<(insert_subvector (loadv16f16 addr:$src2), (v8f16 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
}

//===----------------------------------------------------------------------===//
// VPBROADCAST - Load from memory and broadcast to all elements of the
// destination operand
//
multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
                          X86MemOperand x86memop, PatFrag bcast_frag,
                          ValueType OpVT128, ValueType OpVT256, Predicate prd> {
  let Predicates = [HasAVX2, prd] in {
    def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst,
                      (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
                    Sched<[SchedWriteShuffle.XMM]>, VEX;
    def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst,
                      (OpVT128 (bcast_frag addr:$src)))]>,
                    Sched<[SchedWriteShuffle.XMM.Folded]>, VEX;
    def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR256:$dst,
                       (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>,
                     Sched<[WriteShuffle256]>, VEX, VEX_L;
    def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR256:$dst,
                       (OpVT256 (bcast_frag addr:$src)))]>,
                     Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L;

    // Provide aliases for broadcast from the same register class that
    // automatically does the extract.
    def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))),
              (!cast<Instruction>(NAME#"Yrr")
                (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src), sub_xmm)))>;
  }
}

defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, X86VBroadcastld8,
                                   v16i8, v32i8, NoVLX_Or_NoBWI>;
defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, X86VBroadcastld16,
                                   v8i16, v16i16, NoVLX_Or_NoBWI>;
defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, X86VBroadcastld32,
                                   v4i32, v8i32, NoVLX>;
defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, X86VBroadcastld64,
                                   v2i64, v4i64, NoVLX>;

let Predicates = [HasAVX2, NoVLX] in {
  // Provide fallback in case the load node that is used in the patterns above
  // is used by additional users, which prevents the pattern selection.
  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
            (VBROADCASTSSrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
            (VBROADCASTSSYrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
            (VBROADCASTSDYrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
}

let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
            (VPBROADCASTBrr (VMOVDI2PDIrr
                             (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                                 GR8:$src, sub_8bit))))>;
  def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
            (VPBROADCASTBYrr (VMOVDI2PDIrr
                              (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                                  GR8:$src, sub_8bit))))>;

  def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
            (VPBROADCASTWrr (VMOVDI2PDIrr
                             (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                                 GR16:$src, sub_16bit))))>;
  def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
            (VPBROADCASTWYrr (VMOVDI2PDIrr
                              (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                                  GR16:$src, sub_16bit))))>;

  def : Pat<(v8f16 (X86VBroadcastld16 addr:$src)),
            (VPBROADCASTWrm addr:$src)>;
  def : Pat<(v16f16 (X86VBroadcastld16 addr:$src)),
            (VPBROADCASTWYrm addr:$src)>;

  def : Pat<(v8f16 (X86VBroadcast (v8f16 VR128:$src))),
            (VPBROADCASTWrr VR128:$src)>;
  def : Pat<(v16f16 (X86VBroadcast (v8f16 VR128:$src))),
            (VPBROADCASTWYrr VR128:$src)>;

  def : Pat<(v8f16 (X86VBroadcast (f16 FR16:$src))),
            (VPBROADCASTWrr (COPY_TO_REGCLASS FR16:$src, VR128))>;
  def : Pat<(v16f16 (X86VBroadcast (f16 FR16:$src))),
            (VPBROADCASTWYrr (COPY_TO_REGCLASS FR16:$src, VR128))>;
}
let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
            (VPBROADCASTDrr (VMOVDI2PDIrr GR32:$src))>;
  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
            (VPBROADCASTDYrr (VMOVDI2PDIrr GR32:$src))>;
  def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
            (VPBROADCASTQrr (VMOV64toPQIrr GR64:$src))>;
  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
            (VPBROADCASTQYrr (VMOV64toPQIrr GR64:$src))>;
}

// AVX1 broadcast patterns
let Predicates = [HasAVX1Only] in {
def : Pat<(v8i32 (X86VBroadcastld32 addr:$src)),
          (VBROADCASTSSYrm addr:$src)>;
def : Pat<(v4i64 (X86VBroadcastld64 addr:$src)),
          (VBROADCASTSDYrm addr:$src)>;
def : Pat<(v4i32 (X86VBroadcastld32 addr:$src)),
          (VBROADCASTSSrm addr:$src)>;
}

// Provide fallback in case the load node that is used in the patterns above
// is used by additional users, which prevents the pattern selection.
let Predicates = [HasAVX, NoVLX] in {
  // 128-bit broadcasts:
  def : Pat<(v2f64 (X86VBroadcast f64:$src)),
            (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
  def : Pat<(v2f64 (X86VBroadcastld64 addr:$src)),
            (VMOVDDUPrm addr:$src)>;

  def : Pat<(v2f64 (X86VBroadcast v2f64:$src)),
            (VMOVDDUPrr VR128:$src)>;
}

let Predicates = [HasAVX1Only] in {
  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
            (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>;
  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
              (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm),
              (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>;
  def : Pat<(v8f32 (X86VBroadcast v4f32:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
              (v4f32 (VPERMILPSri VR128:$src, 0)), sub_xmm),
              (v4f32 (VPERMILPSri VR128:$src, 0)), 1)>;
  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
              (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm),
              (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>;
  def : Pat<(v4f64 (X86VBroadcast v2f64:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
              (v2f64 (VMOVDDUPrr VR128:$src)), sub_xmm),
              (v2f64 (VMOVDDUPrr VR128:$src)), 1)>;

  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
            (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)>;
  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
              (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), sub_xmm),
              (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), 1)>;
  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
              (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), sub_xmm),
              (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), 1)>;

  def : Pat<(v2i64 (X86VBroadcast i64:$src)),
            (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)>;
  def : Pat<(v2i64 (X86VBroadcastld64 addr:$src)),
            (VMOVDDUPrm addr:$src)>;
}

//===----------------------------------------------------------------------===//
// VPERM - Permute instructions
//

multiclass avx2_perm<bits<8> opc, string OpcodeStr,
                     ValueType OpVT, X86FoldableSchedWrite Sched,
                     X86MemOperand memOp> {
  let Predicates = [HasAVX2, NoVLX] in {
    def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
                     (ins VR256:$src1, VR256:$src2),
                     !strconcat(OpcodeStr,
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR256:$dst,
                       (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
                     Sched<[Sched]>, VEX, VVVV, VEX_L;
    def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
                     (ins VR256:$src1, memOp:$src2),
                     !strconcat(OpcodeStr,
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR256:$dst,
                       (OpVT (X86VPermv VR256:$src1,
                              (load addr:$src2))))]>,
                     Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VVVV, VEX_L;
  }
}

defm VPERMD : avx2_perm<0x36, "vpermd", v8i32, WriteVarShuffle256, i256mem>;
let ExeDomain = SSEPackedSingle in
defm VPERMPS : avx2_perm<0x16, "vpermps", v8f32, WriteFVarShuffle256, f256mem>;

multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                         ValueType OpVT, X86FoldableSchedWrite Sched,
                         X86MemOperand memOp> {
  let Predicates = [HasAVX2, NoVLX] in {
    def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
                       (ins VR256:$src1, u8imm:$src2),
                       !strconcat(OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
                         (OpVT (X86VPermi VR256:$src1, (i8 timm:$src2))))]>,
                       Sched<[Sched]>, VEX, VEX_L;
    def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
                       (ins memOp:$src1, u8imm:$src2),
                       !strconcat(OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
                         (OpVT (X86VPermi (mem_frag addr:$src1),
                                (i8 timm:$src2))))]>,
                       Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VEX_L;
  }
}

defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
                            WriteShuffle256, i256mem>, REX_W;
let ExeDomain = SSEPackedDouble in
defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
                             WriteFShuffle256, f256mem>, REX_W;

//===----------------------------------------------------------------------===//
// VPERM2I128 - Permute Integer vector Values in 128-bit chunks
//
let isCommutable = 1 in
def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR256:$src2, u8imm:$src3),
          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
          Sched<[WriteShuffle256]>, VEX, VVVV, VEX_L;
def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
          Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX, VVVV, VEX_L;

let Predicates = [HasAVX2] in {
  defm : vperm2x128_lowering<"VPERM2I128", v4i64,  loadv4i64>;
  defm : vperm2x128_lowering<"VPERM2I128", v8i32,  loadv8i32>;
  defm : vperm2x128_lowering<"VPERM2I128", v16i16, loadv16i16>;
  defm : vperm2x128_lowering<"VPERM2I128", v16f16, loadv16f16>;
  defm : vperm2x128_lowering<"VPERM2I128", v32i8,  loadv32i8>;
}

//===----------------------------------------------------------------------===//
// VINSERTI128 - Insert packed integer values
//
let hasSideEffects = 0 in {
def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
                             (ins VR256:$src1, VR128:$src2, u8imm:$src3),
                             "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                             []>, Sched<[WriteShuffle256]>, VEX, VVVV, VEX_L;
let mayLoad = 1 in
def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
                             (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
                             "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                             []>, Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX, VVVV, VEX_L;
}

let Predicates = [HasAVX2, NoVLX] in {
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v2i64, v4i64,  loadv2i64,  loadv4i64>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v4i32, v8i32,  loadv4i32,  loadv8i32>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8i16, v16i16, loadv8i16,  loadv16i16>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8f16, v16f16, loadv8f16,  loadv16f16>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v16i8, v32i8,  loadv16i8,  loadv32i8>;
}

let Predicates = [HasAVXNECONVERT, NoVLX] in
vinsert_lowering<"VINSERTI128", "VPERM2I128", v8bf16, v16bf16, loadv8bf16, loadv16bf16>; 7908 7909//===----------------------------------------------------------------------===// 7910// VEXTRACTI128 - Extract packed integer values 7911// 7912def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst), 7913 (ins VR256:$src1, u8imm:$src2), 7914 "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, 7915 Sched<[WriteShuffle256]>, VEX, VEX_L; 7916let hasSideEffects = 0, mayStore = 1 in 7917def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs), 7918 (ins i128mem:$dst, VR256:$src1, u8imm:$src2), 7919 "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, 7920 Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_L; 7921 7922let Predicates = [HasAVX2, NoVLX] in { 7923 defm : vextract_lowering<"VEXTRACTI128", v4i64, v2i64>; 7924 defm : vextract_lowering<"VEXTRACTI128", v8i32, v4i32>; 7925 defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>; 7926 defm : vextract_lowering<"VEXTRACTI128", v16f16, v8f16>; 7927 defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>; 7928 defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>; 7929} 7930 7931let Predicates = [HasAVXNECONVERT, NoVLX] in 7932 defm : vextract_lowering<"VEXTRACTI128", v16bf16, v8bf16>; 7933 7934//===----------------------------------------------------------------------===// 7935// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores 7936// 7937multiclass avx2_pmovmask<string OpcodeStr, 7938 Intrinsic IntLd128, Intrinsic IntLd256, 7939 Intrinsic IntSt128, Intrinsic IntSt256, 7940 X86SchedWriteMaskMove schedX, 7941 X86SchedWriteMaskMove schedY> { 7942 def rm : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst), 7943 (ins VR128:$src1, i128mem:$src2), 7944 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7945 [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>, 7946 VEX, VVVV, Sched<[schedX.RM]>; 7947 def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst), 7948 (ins VR256:$src1, i256mem:$src2), 7949 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7950 [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>, 7951 VEX, VVVV, VEX_L, Sched<[schedY.RM]>; 7952 def mr : AVX28I<0x8e, MRMDestMem, (outs), 7953 (ins i128mem:$dst, VR128:$src1, VR128:$src2), 7954 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7955 [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>, 7956 VEX, VVVV, Sched<[schedX.MR]>; 7957 def Ymr : AVX28I<0x8e, MRMDestMem, (outs), 7958 (ins i256mem:$dst, VR256:$src1, VR256:$src2), 7959 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 7960 [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, 7961 VEX, VVVV, VEX_L, Sched<[schedY.MR]>; 7962} 7963 7964defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd", 7965 int_x86_avx2_maskload_d, 7966 int_x86_avx2_maskload_d_256, 7967 int_x86_avx2_maskstore_d, 7968 int_x86_avx2_maskstore_d_256, 7969 WriteVecMaskMove32, WriteVecMaskMove32Y>; 7970defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq", 7971 int_x86_avx2_maskload_q, 7972 int_x86_avx2_maskload_q_256, 7973 int_x86_avx2_maskstore_q, 7974 int_x86_avx2_maskstore_q_256, 7975 WriteVecMaskMove64, WriteVecMaskMove64Y>, REX_W; 7976 7977multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT, 7978 ValueType MaskVT> { 7979 // masked store 7980 def: Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)), 7981 (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>; 7982 // masked load 7983 def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), 

//===----------------------------------------------------------------------===//
// VINSERTI128 - Insert packed integer values
//
let hasSideEffects = 0 in {
def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR128:$src2, u8imm:$src3),
          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteShuffle256]>, VEX, VVVV, VEX_L;
let mayLoad = 1 in
def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX, VVVV, VEX_L;
}

let Predicates = [HasAVX2, NoVLX] in {
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v2i64, v4i64, loadv2i64, loadv4i64>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v4i32, v8i32, loadv4i32, loadv8i32>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8i16, v16i16, loadv8i16, loadv16i16>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8f16, v16f16, loadv8f16, loadv16f16>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v16i8, v32i8, loadv16i8, loadv32i8>;
}

let Predicates = [HasAVXNECONVERT, NoVLX] in
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8bf16, v16bf16, loadv8bf16, loadv16bf16>;
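
// The immediate selects which 128-bit half of $src1 is replaced by $src2,
// e.g. (AT&T syntax):
//   vinserti128 $1, %xmm1, %ymm0, %ymm2  # ymm2.lo = ymm0.lo, ymm2.hi = xmm1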

//===----------------------------------------------------------------------===//
// VEXTRACTI128 - Extract packed integer values
//
def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
          (ins VR256:$src1, u8imm:$src2),
          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
          Sched<[WriteShuffle256]>, VEX, VEX_L;
let hasSideEffects = 0, mayStore = 1 in
def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
          (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
          Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_L;

let Predicates = [HasAVX2, NoVLX] in {
  defm : vextract_lowering<"VEXTRACTI128", v4i64, v2i64>;
  defm : vextract_lowering<"VEXTRACTI128", v8i32, v4i32>;
  defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
  defm : vextract_lowering<"VEXTRACTI128", v16f16, v8f16>;
  defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>;
}

let Predicates = [HasAVXNECONVERT, NoVLX] in
  defm : vextract_lowering<"VEXTRACTI128", v16bf16, v8bf16>;
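
// The immediate selects which 128-bit half of $src1 is extracted,
// e.g. (AT&T syntax):
//   vextracti128 $1, %ymm0, %xmm1  # xmm1 = ymm0.hi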

//===----------------------------------------------------------------------===//
// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
//
multiclass avx2_pmovmask<string OpcodeStr,
                         Intrinsic IntLd128, Intrinsic IntLd256,
                         Intrinsic IntSt128, Intrinsic IntSt256,
                         X86SchedWriteMaskMove schedX,
                         X86SchedWriteMaskMove schedY> {
  def rm  : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
                   (ins VR128:$src1, i128mem:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>,
                   VEX, VVVV, Sched<[schedX.RM]>;
  def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
                   (ins VR256:$src1, i256mem:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
                   VEX, VVVV, VEX_L, Sched<[schedY.RM]>;
  def mr  : AVX28I<0x8e, MRMDestMem, (outs),
                   (ins i128mem:$dst, VR128:$src1, VR128:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>,
                   VEX, VVVV, Sched<[schedX.MR]>;
  def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
                   (ins i256mem:$dst, VR256:$src1, VR256:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
                   VEX, VVVV, VEX_L, Sched<[schedY.MR]>;
}

defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
                                int_x86_avx2_maskload_d,
                                int_x86_avx2_maskload_d_256,
                                int_x86_avx2_maskstore_d,
                                int_x86_avx2_maskstore_d_256,
                                WriteVecMaskMove32, WriteVecMaskMove32Y>;
defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
                                int_x86_avx2_maskload_q,
                                int_x86_avx2_maskload_q_256,
                                int_x86_avx2_maskstore_q,
                                int_x86_avx2_maskstore_q_256,
                                WriteVecMaskMove64, WriteVecMaskMove64Y>, REX_W;
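
// An element is loaded or stored only when the sign bit of the corresponding
// mask element is set, and masked-off destination elements are zeroed on
// loads; that zeroing is what makes the undef- and zero-passthrough
// masked_load patterns below legal. E.g. (AT&T syntax, mask in ymm1):
//   vpmaskmovd (%rax), %ymm1, %ymm0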

multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
                            ValueType MaskVT> {
  // masked store
  def : Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)),
            (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
  // masked load
  def : Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)),
            (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
  def : Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask),
                 (VT immAllZerosV))),
            (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
}
let Predicates = [HasAVX] in {
  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64>;
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64>;
}
let Predicates = [HasAVX1Only] in {
  // i32/i64 masked load/store is not supported; use the ps/pd versions.
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64>;
  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64>;
}
let Predicates = [HasAVX2] in {
  defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32>;
  defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64>;
  defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32>;
  defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64>;
}

//===----------------------------------------------------------------------===//
// Variable Bit Shifts
//
multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType vt128, ValueType vt256> {
  def rr  : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src1, VR128:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR128:$dst,
                     (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
                   VEX, VVVV, Sched<[SchedWriteVarVecShift.XMM]>;
  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
                   (ins VR128:$src1, i128mem:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR128:$dst,
                     (vt128 (OpNode VR128:$src1,
                             (vt128 (load addr:$src2)))))]>,
                   VEX, VVVV, Sched<[SchedWriteVarVecShift.XMM.Folded,
                                     SchedWriteVarVecShift.XMM.ReadAfterFold]>;
  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
                   (ins VR256:$src1, VR256:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR256:$dst,
                     (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
                   VEX, VVVV, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>;
  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
                   (ins VR256:$src1, i256mem:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR256:$dst,
                     (vt256 (OpNode VR256:$src1,
                             (vt256 (load addr:$src2)))))]>,
                   VEX, VVVV, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded,
                                            SchedWriteVarVecShift.YMM.ReadAfterFold]>;
}

let Predicates = [HasAVX2, NoVLX] in {
  defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", X86vshlv, v4i32, v8i32>;
  defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", X86vshlv, v2i64, v4i64>, REX_W;
  defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", X86vsrlv, v4i32, v8i32>;
  defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", X86vsrlv, v2i64, v4i64>, REX_W;
  defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", X86vsrav, v4i32, v8i32>;
}
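
// Each element of $src1 is shifted by the count in the corresponding element
// of $src2. Out-of-range counts are not masked: they produce zero for
// vpsllv/vpsrlv and all sign bits for vpsrav. E.g. (AT&T syntax):
//   vpsllvd %ymm1, %ymm0, %ymm2  # ymm2[i] = ymm0[i] << ymm1[i]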

//===----------------------------------------------------------------------===//
// VGATHER - GATHER Operations

// FIXME: Improve scheduling of gather instructions.
multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
                       X86MemOperand memop128, X86MemOperand memop256> {
let mayLoad = 1, hasSideEffects = 0 in {
  def rm  : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
            (ins VR128:$src1, memop128:$src2, VR128:$mask),
            !strconcat(OpcodeStr,
                       "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
            []>, VEX, Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
  def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
            (ins RC256:$src1, memop256:$src2, RC256:$mask),
            !strconcat(OpcodeStr,
                       "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
            []>, VEX, VEX_L, Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
}
}

let Predicates = [HasAVX2] in {
  let mayLoad = 1, hasSideEffects = 0, Constraints
    = "@earlyclobber $dst, @earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
    in {
    defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq",
                                  VR256, vx128mem, vx256mem>, REX_W;
    defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq",
                                  VR256, vx128mem, vy256mem>, REX_W;
    defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd",
                                  VR256, vx128mem, vy256mem>;
    defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd",
                                  VR128, vx64mem, vy128mem>;

    let ExeDomain = SSEPackedDouble in {
      defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd",
                                    VR256, vx128mem, vx256mem>, REX_W;
      defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd",
                                    VR256, vx128mem, vy256mem>, REX_W;
    }

    let ExeDomain = SSEPackedSingle in {
      defm VGATHERDPS : avx2_gather<0x92, "vgatherdps",
                                    VR256, vx128mem, vy256mem>;
      defm VGATHERQPS : avx2_gather<0x93, "vgatherqps",
                                    VR128, vx64mem, vy128mem>;
    }
  }
}
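
// Elements are gathered only where the sign bit of the corresponding mask
// element is set, and the mask register is cleared as elements complete
// (modeled by the $mask_wb writeback output). The earlyclobber constraints
// reflect that the destination and mask must not overlap the index register.
// E.g. (AT&T syntax, dword indices in ymm1, mask in ymm2):
//   vpgatherdd %ymm2, (%rax,%ymm1,4), %ymm0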

//===----------------------------------------------------------------------===//
// GFNI instructions
//===----------------------------------------------------------------------===//

multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
                        RegisterClass RC, PatFrag MemOpFrag,
                        X86MemOperand X86MemOp, X86FoldableSchedWrite sched,
                        bit Is2Addr = 0> {
  let ExeDomain = SSEPackedInt,
      AsmString = !if(Is2Addr,
                      OpcodeStr#"\t{$src2, $dst|$dst, $src2}",
                      OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
    let isCommutable = 1 in
    def rr : PDI<0xCF, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
                 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
             Sched<[sched]>, T8;

    def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
                 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1,
                                       (MemOpFrag addr:$src2))))]>,
             Sched<[sched.Folded, sched.ReadAfterFold]>, T8;
  }
}

multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
                           SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag,
                           X86MemOperand X86MemOp, X86FoldableSchedWrite sched,
                           bit Is2Addr = 0> {
  let AsmString = !if(Is2Addr,
                      OpStr#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                      OpStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in {
    def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst),
                  (ins RC:$src1, RC:$src2, u8imm:$src3), "",
                  [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))],
                  SSEPackedInt>, Sched<[sched]>;
    def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst),
                  (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
                  [(set RC:$dst, (OpVT (OpNode RC:$src1,
                                        (MemOpFrag addr:$src2),
                                        timm:$src3)))], SSEPackedInt>,
                  Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}

multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
  let Constraints = "$src1 = $dst",
      Predicates = [HasGFNI, UseSSE2] in
  defm NAME : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
                              VR128, load, i128mem, SchedWriteVecIMul.XMM, 1>;
  let Predicates = [HasGFNI, HasAVX, NoVLX] in {
    defm V#NAME : GF2P8AFFINE_rmi<Op, "v"#OpStr, v16i8, OpNode, VR128,
                                  load, i128mem, SchedWriteVecIMul.XMM>,
                  VEX, VVVV, REX_W;
    defm V#NAME#Y : GF2P8AFFINE_rmi<Op, "v"#OpStr, v32i8, OpNode, VR256,
                                    load, i256mem, SchedWriteVecIMul.YMM>,
                    VEX, VVVV, VEX_L, REX_W;
  }
}

// GF2P8MULB
let Constraints = "$src1 = $dst",
    Predicates = [HasGFNI, UseSSE2] in
defm GF2P8MULB : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop,
                              i128mem, SchedWriteVecALU.XMM, 1>;
let Predicates = [HasGFNI, HasAVX, NoVLX] in {
  defm VGF2P8MULB : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load,
                                 i128mem, SchedWriteVecALU.XMM>, VEX, VVVV;
  defm VGF2P8MULBY : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load,
                                  i256mem, SchedWriteVecALU.YMM>, VEX, VVVV, VEX_L;
}
// GF2P8AFFINEINVQB, GF2P8AFFINEQB
let isCommutable = 0 in {
  defm GF2P8AFFINEINVQB : GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb",
                                             X86GF2P8affineinvqb>, TA, PD;
  defm GF2P8AFFINEQB : GF2P8AFFINE_common<0xCE, "gf2p8affineqb",
                                          X86GF2P8affineqb>, TA, PD;
}
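
// gf2p8mulb multiplies bytes as elements of GF(2^8), reduced modulo the AES
// polynomial x^8 + x^4 + x^3 + x + 1. gf2p8affine[inv]qb applies the 8x8
// bit matrix held in each $src2 qword to the bytes of $src1 (inverting each
// byte in GF(2^8) first for the "inv" form) and XORs in the immediate byte.
// E.g. (SSE two-address form):
//   gf2p8mulb %xmm1, %xmm0  # xmm0[i] = xmm0[i] * xmm1[i] in GF(2^8)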

// AVX-IFMA
let Predicates = [HasAVXIFMA, NoVLX_Or_NoIFMA], Constraints = "$src1 = $dst" in
multiclass avx_ifma_rm<bits<8> opc, string OpcodeStr, SDNode OpNode> {
  // NOTE: The SDNode has the multiply operands first with the add last.
  // This enables commuted load patterns to be autogenerated by tablegen.
  let isCommutable = 1 in {
    def rr : AVX8I<opc, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src1, VR128:$src2, VR128:$src3),
                   !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                   [(set VR128:$dst, (v2i64 (OpNode VR128:$src2,
                                             VR128:$src3, VR128:$src1)))]>,
                   VEX, VVVV, Sched<[SchedWriteVecIMul.XMM]>;
  }
  def rm : AVX8I<opc, MRMSrcMem, (outs VR128:$dst),
                 (ins VR128:$src1, VR128:$src2, i128mem:$src3),
                 !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                 [(set VR128:$dst, (v2i64 (OpNode VR128:$src2,
                                           (loadv2i64 addr:$src3), VR128:$src1)))]>,
                 VEX, VVVV, Sched<[SchedWriteVecIMul.XMM]>;
  let isCommutable = 1 in {
    def Yrr : AVX8I<opc, MRMSrcReg, (outs VR256:$dst),
                    (ins VR256:$src1, VR256:$src2, VR256:$src3),
                    !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                    [(set VR256:$dst, (v4i64 (OpNode VR256:$src2,
                                              VR256:$src3, VR256:$src1)))]>,
                    VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
  }
  def Yrm : AVX8I<opc, MRMSrcMem, (outs VR256:$dst),
                  (ins VR256:$src1, VR256:$src2, i256mem:$src3),
                  !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                  [(set VR256:$dst, (v4i64 (OpNode VR256:$src2,
                                            (loadv4i64 addr:$src3), VR256:$src1)))]>,
                  VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
}

defm VPMADD52HUQ : avx_ifma_rm<0xb5, "vpmadd52huq", x86vpmadd52h>, REX_W, ExplicitVEXPrefix;
defm VPMADD52LUQ : avx_ifma_rm<0xb4, "vpmadd52luq", x86vpmadd52l>, REX_W, ExplicitVEXPrefix;

// AVX-VNNI-INT8
let Constraints = "$src1 = $dst" in
multiclass avx_dotprod_rm<bits<8> Opc, string OpcodeStr, ValueType OpVT,
                          RegisterClass RC, PatFrag MemOpFrag,
                          X86MemOperand X86memop, SDNode OpNode,
                          X86FoldableSchedWrite Sched,
                          bit IsCommutable> {
  let isCommutable = IsCommutable in
  def rr : I<Opc, MRMSrcReg, (outs RC:$dst),
             (ins RC:$src1, RC:$src2, RC:$src3),
             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
             [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>,
             VEX, VVVV, Sched<[Sched]>;
  def rm : I<Opc, MRMSrcMem, (outs RC:$dst),
             (ins RC:$src1, RC:$src2, X86memop:$src3),
             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
             [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2,
                                   (MemOpFrag addr:$src3))))]>,
             VEX, VVVV, Sched<[Sched.Folded, Sched.ReadAfterFold]>;
}

let Predicates = [HasAVXVNNIINT8] in {
  defm VPDPBSSD : avx_dotprod_rm<0x50, "vpdpbssd", v4i32, VR128, loadv4i32,
                                 i128mem, X86vpdpbssd, SchedWriteVecIMul.XMM,
                                 1>, T8, XD;
  defm VPDPBSSDY : avx_dotprod_rm<0x50, "vpdpbssd", v8i32, VR256, loadv8i32,
                                  i256mem, X86vpdpbssd, SchedWriteVecIMul.YMM,
                                  1>, VEX_L, T8, XD;
  defm VPDPBUUD : avx_dotprod_rm<0x50, "vpdpbuud", v4i32, VR128, loadv4i32,
                                 i128mem, X86vpdpbuud, SchedWriteVecIMul.XMM,
                                 1>, T8;
  defm VPDPBUUDY : avx_dotprod_rm<0x50, "vpdpbuud", v8i32, VR256, loadv8i32,
                                  i256mem, X86vpdpbuud, SchedWriteVecIMul.YMM,
                                  1>, VEX_L, T8;
  defm VPDPBSSDS : avx_dotprod_rm<0x51, "vpdpbssds", v4i32, VR128, loadv4i32,
                                  i128mem, X86vpdpbssds, SchedWriteVecIMul.XMM,
                                  1>, T8, XD;
  defm VPDPBSSDSY : avx_dotprod_rm<0x51, "vpdpbssds", v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbssds, SchedWriteVecIMul.YMM,
                                   1>, VEX_L, T8, XD;
  defm VPDPBUUDS : avx_dotprod_rm<0x51, "vpdpbuuds", v4i32, VR128, loadv4i32,
                                  i128mem, X86vpdpbuuds, SchedWriteVecIMul.XMM,
                                  1>, T8;
  defm VPDPBUUDSY : avx_dotprod_rm<0x51, "vpdpbuuds", v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbuuds, SchedWriteVecIMul.YMM,
                                   1>, VEX_L, T8;
  defm VPDPBSUD : avx_dotprod_rm<0x50, "vpdpbsud", v4i32, VR128, loadv4i32,
                                 i128mem, X86vpdpbsud, SchedWriteVecIMul.XMM,
                                 0>, T8, XS;
  defm VPDPBSUDY : avx_dotprod_rm<0x50, "vpdpbsud", v8i32, VR256, loadv8i32,
                                  i256mem, X86vpdpbsud, SchedWriteVecIMul.YMM,
                                  0>, VEX_L, T8, XS;
  defm VPDPBSUDS : avx_dotprod_rm<0x51, "vpdpbsuds", v4i32, VR128, loadv4i32,
                                  i128mem, X86vpdpbsuds, SchedWriteVecIMul.XMM,
                                  0>, T8, XS;
  defm VPDPBSUDSY : avx_dotprod_rm<0x51, "vpdpbsuds", v8i32, VR256, loadv8i32,
                                   i256mem, X86vpdpbsuds, SchedWriteVecIMul.YMM,
                                   0>, VEX_L, T8, XS;
}

// AVX-NE-CONVERT
multiclass AVX_NE_CONVERT_BASE<bits<8> Opcode, string OpcodeStr,
                               X86MemOperand MemOp128, X86MemOperand MemOp256> {
  def rm : I<Opcode, MRMSrcMem, (outs VR128:$dst), (ins MemOp128:$src),
             !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
             [(set VR128:$dst,
               (!cast<Intrinsic>("int_x86_"#OpcodeStr#"128") addr:$src))]>,
             Sched<[WriteCvtPH2PS]>, VEX;
  def Yrm : I<Opcode, MRMSrcMem, (outs VR256:$dst), (ins MemOp256:$src),
              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
              [(set VR256:$dst,
                (!cast<Intrinsic>("int_x86_"#OpcodeStr#"256") addr:$src))]>,
              Sched<[WriteCvtPH2PSY]>, VEX, VEX_L;
}

multiclass VCVTNEPS2BF16_BASE {
  def rr : I<0x72, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
             "vcvtneps2bf16\t{$src, $dst|$dst, $src}",
             [(set VR128:$dst, (int_x86_vcvtneps2bf16128 VR128:$src))]>,
             Sched<[WriteCvtPH2PS]>;
  def rm : I<0x72, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
             "vcvtneps2bf16{x}\t{$src, $dst|$dst, $src}",
             [(set VR128:$dst, (int_x86_vcvtneps2bf16128 (loadv4f32 addr:$src)))]>,
             Sched<[WriteCvtPH2PS]>;
  def Yrr : I<0x72, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
              "vcvtneps2bf16\t{$src, $dst|$dst, $src}",
              [(set VR128:$dst, (int_x86_vcvtneps2bf16256 VR256:$src))]>,
              Sched<[WriteCvtPH2PSY]>, VEX_L;
  def Yrm : I<0x72, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
              "vcvtneps2bf16{y}\t{$src, $dst|$dst, $src}",
              [(set VR128:$dst, (int_x86_vcvtneps2bf16256 (loadv8f32 addr:$src)))]>,
              Sched<[WriteCvtPH2PSY]>, VEX_L;
}

let Predicates = [HasAVXNECONVERT] in {
  defm VBCSTNEBF162PS : AVX_NE_CONVERT_BASE<0xb1, "vbcstnebf162ps", f16mem,
                                            f16mem>, T8, XS;
  defm VBCSTNESH2PS : AVX_NE_CONVERT_BASE<0xb1, "vbcstnesh2ps", f16mem, f16mem>,
                      T8, PD;
  defm VCVTNEEBF162PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneebf162ps", f128mem,
                                            f256mem>, T8, XS;
  defm VCVTNEEPH2PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneeph2ps", f128mem,
                                          f256mem>, T8, PD;
  defm VCVTNEOBF162PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneobf162ps", f128mem,
                                            f256mem>, T8, XD;
  defm VCVTNEOPH2PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneoph2ps", f128mem,
                                          f256mem>, T8;
  defm VCVTNEPS2BF16 : VCVTNEPS2BF16_BASE, VEX, T8, XS, ExplicitVEXPrefix;

  def : Pat<(v8bf16 (X86cvtneps2bf16 (v4f32 VR128:$src))),
            (VCVTNEPS2BF16rr VR128:$src)>;
  def : Pat<(v8bf16 (X86cvtneps2bf16 (loadv4f32 addr:$src))),
            (VCVTNEPS2BF16rm addr:$src)>;
  def : Pat<(v8bf16 (X86vfpround (v8f32 VR256:$src))),
            (VCVTNEPS2BF16Yrr VR256:$src)>;
  def : Pat<(v8bf16 (X86vfpround (loadv8f32 addr:$src))),
            (VCVTNEPS2BF16Yrm addr:$src)>;
}

def : InstAlias<"vcvtneps2bf16x\t{$src, $dst|$dst, $src}",
                (VCVTNEPS2BF16rr VR128:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtneps2bf16y\t{$src, $dst|$dst, $src}",
                (VCVTNEPS2BF16Yrr VR128:$dst, VR256:$src), 0, "att">;

// FIXME: Is there a better scheduler class for SHA512 than WriteVecIMul?
let Predicates = [HasSHA512], Constraints = "$src1 = $dst" in {
def VSHA512MSG1rr : I<0xcc, MRMSrcReg, (outs VR256:$dst),
                      (ins VR256:$src1, VR128:$src2),
                      "vsha512msg1\t{$src2, $dst|$dst, $src2}",
                      [(set VR256:$dst,
                        (int_x86_vsha512msg1 VR256:$src1, VR128:$src2))]>, VEX_L,
                      VEX, T8, XD, Sched<[WriteVecIMul]>;
def VSHA512MSG2rr : I<0xcd, MRMSrcReg, (outs VR256:$dst),
                      (ins VR256:$src1, VR256:$src2),
                      "vsha512msg2\t{$src2, $dst|$dst, $src2}",
                      [(set VR256:$dst,
                        (int_x86_vsha512msg2 VR256:$src1, VR256:$src2))]>, VEX_L,
                      VEX, T8, XD, Sched<[WriteVecIMul]>;
def VSHA512RNDS2rr : I<0xcb, MRMSrcReg, (outs VR256:$dst),
                       (ins VR256:$src1, VR256:$src2, VR128:$src3),
                       "vsha512rnds2\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                       [(set VR256:$dst,
                         (int_x86_vsha512rnds2 VR256:$src1, VR256:$src2, VR128:$src3))]>,
                       VEX_L, VEX, VVVV, T8, XD, Sched<[WriteVecIMul]>;
}

// FIXME: Is there a better scheduler class for SM3 than WriteVecIMul?
let Predicates = [HasSM3], Constraints = "$src1 = $dst" in {
  multiclass SM3_Base<string OpStr> {
    def rr : I<0xda, MRMSrcReg, (outs VR128:$dst),
               (ins VR128:$src1, VR128:$src2, VR128:$src3),
               !strconcat(OpStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
               [(set VR128:$dst,
                 (!cast<Intrinsic>("int_x86_"#OpStr) VR128:$src1,
                  VR128:$src2, VR128:$src3))]>,
               Sched<[WriteVecIMul]>, VEX, VVVV;
    def rm : I<0xda, MRMSrcMem, (outs VR128:$dst),
               (ins VR128:$src1, VR128:$src2, i128mem:$src3),
               !strconcat(OpStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
               [(set VR128:$dst,
                 (!cast<Intrinsic>("int_x86_"#OpStr) VR128:$src1,
                  VR128:$src2, (loadv4i32 addr:$src3)))]>,
               Sched<[WriteVecIMul]>, VEX, VVVV;
  }

  multiclass VSM3RNDS2_Base {
    def rr : Ii8<0xde, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src1, VR128:$src2, VR128:$src3, i32u8imm:$src4),
                 "vsm3rnds2\t{$src4, $src3, $src2, $dst|$dst, $src2, $src3, $src4}",
                 [(set VR128:$dst,
                   (int_x86_vsm3rnds2 VR128:$src1,
                    VR128:$src2, VR128:$src3, timm:$src4))]>,
                 Sched<[WriteVecIMul]>;
    def rm : Ii8<0xde, MRMSrcMem, (outs VR128:$dst),
                 (ins VR128:$src1, VR128:$src2, i128mem:$src3, i32u8imm:$src4),
                 "vsm3rnds2\t{$src4, $src3, $src2, $dst|$dst, $src2, $src3, $src4}",
                 [(set VR128:$dst,
                   (int_x86_vsm3rnds2 VR128:$src1,
                    VR128:$src2, (loadv4i32 addr:$src3), timm:$src4))]>,
                 Sched<[WriteVecIMul]>;
  }
}

defm VSM3MSG1 : SM3_Base<"vsm3msg1">, T8;
defm VSM3MSG2 : SM3_Base<"vsm3msg2">, T8, PD;
defm VSM3RNDS2 : VSM3RNDS2_Base, VEX, VVVV, TA, PD;

// FIXME: Is there a better scheduler class for SM4 than WriteVecIMul?
let Predicates = [HasSM4] in {
  multiclass SM4_Base<string OpStr, RegisterClass RC, string VL,
                      PatFrag LD, X86MemOperand MemOp> {
    def rr : I<0xda, MRMSrcReg, (outs RC:$dst),
               (ins RC:$src1, RC:$src2),
               !strconcat(OpStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set RC:$dst, (!cast<Intrinsic>("int_x86_"#OpStr#VL) RC:$src1,
                               RC:$src2))]>,
               Sched<[WriteVecIMul]>;
    def rm : I<0xda, MRMSrcMem, (outs RC:$dst),
               (ins RC:$src1, MemOp:$src2),
               !strconcat(OpStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set RC:$dst, (!cast<Intrinsic>("int_x86_"#OpStr#VL) RC:$src1,
                               (LD addr:$src2)))]>,
               Sched<[WriteVecIMul]>;
  }
}

defm VSM4KEY4 : SM4_Base<"vsm4key4", VR128, "128", loadv4i32, i128mem>, T8, XS, VEX, VVVV;
defm VSM4KEY4Y : SM4_Base<"vsm4key4", VR256, "256", loadv8i32, i256mem>, T8, XS, VEX_L, VEX, VVVV;
defm VSM4RNDS4 : SM4_Base<"vsm4rnds4", VR128, "128", loadv4i32, i128mem>, T8, XD, VEX, VVVV;
defm VSM4RNDS4Y : SM4_Base<"vsm4rnds4", VR256, "256", loadv8i32, i256mem>, T8, XD, VEX_L, VEX, VVVV;

let Predicates = [HasAVXVNNIINT16], Constraints = "$src1 = $dst" in
multiclass avx_vnni_int16<bits<8> opc, string OpcodeStr, bit IsCommutable> {
  let isCommutable = IsCommutable in
  def rr : I<opc, MRMSrcReg, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2, VR128:$src3),
             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
             [(set VR128:$dst,
               (v4i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_128")
                       VR128:$src1, VR128:$src2, VR128:$src3)))]>,
             VEX, VVVV, Sched<[SchedWriteVecIMul.XMM]>;

  def rm : I<opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2, i128mem:$src3),
             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
             [(set VR128:$dst,
               (v4i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_128")
                       VR128:$src1, VR128:$src2, (loadv4i32 addr:$src3))))]>,
             VEX, VVVV, Sched<[SchedWriteVecIMul.XMM]>;

  let isCommutable = IsCommutable in
  def Yrr : I<opc, MRMSrcReg, (outs VR256:$dst),
              (ins VR256:$src1, VR256:$src2, VR256:$src3),
              !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
              [(set VR256:$dst,
                (v8i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_256")
                        VR256:$src1, VR256:$src2, VR256:$src3)))]>,
              VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;

  def Yrm : I<opc, MRMSrcMem, (outs VR256:$dst),
              (ins VR256:$src1, VR256:$src2, i256mem:$src3),
              !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
              [(set VR256:$dst,
                (v8i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_256")
                        VR256:$src1, VR256:$src2, (loadv8i32 addr:$src3))))]>,
              VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
}

defm VPDPWSUD : avx_vnni_int16<0xd2, "vpdpwsud", 0>, T8, XS;
defm VPDPWSUDS : avx_vnni_int16<0xd3, "vpdpwsuds", 0>, T8, XS;
defm VPDPWUSD : avx_vnni_int16<0xd2, "vpdpwusd", 0>, T8, PD;
defm VPDPWUSDS : avx_vnni_int16<0xd3, "vpdpwusds", 0>, T8, PD;
defm VPDPWUUD : avx_vnni_int16<0xd2, "vpdpwuud", 1>, T8;
defm VPDPWUUDS : avx_vnni_int16<0xd3, "vpdpwuuds", 1>, T8;
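
// The vpdpw{su,us,uu}d[s] forms above multiply corresponding 16-bit elements
// (the two infix letters give the signedness of the two sources), add each
// pair of adjacent products into the matching 32-bit accumulator lane, and
// saturate the result in the trailing-"s" variants.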