//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file describes the X86 SSE instruction set, defining the instructions,
// and properties of the instructions which are needed for code generation,
// machine code emission, and analysis.
//
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// SSE 1 & 2 Instructions Classes
//===----------------------------------------------------------------------===//

/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class.
/// Emits a commutable register form (rr) and a folded-load form (rm); both
/// are isCodeGenOnly. Is2Addr chooses between the two-operand and the
/// three-operand assembly string.
multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr,
                           SDPatternOperator OpNode, RegisterClass RC,
                           X86MemOperand x86memop, Domain d,
                           X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let isCodeGenOnly = 1 in {
    let isCommutable = 1 in {
      def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
                  !if(Is2Addr,
                      !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                      !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                  [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], d>,
               Sched<[sched]>;
    }
    def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
                !if(Is2Addr,
                    !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                    !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>,
             Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}

/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class.
/// Emits rr_Int and rm_Int with hasSideEffects cleared; rm_Int additionally
/// sets mayLoad and matches its memory operand through mem_frags.
multiclass sse12_fp_scalar_int<bits<8> opc,
                               SDPatternOperator OpNode, RegisterClass RC,
                               ValueType VT, string asm, Operand memopr,
                               PatFrags mem_frags, Domain d,
                               X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let hasSideEffects = 0 in {
    def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
                        !if(Is2Addr,
                            !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                            !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                        [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], d>,
                 Sched<[sched]>;
    let mayLoad = 1 in
    def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
                        !if(Is2Addr,
                            !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                            !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                        [(set RC:$dst, (VT (OpNode RC:$src1, (mem_frags addr:$src2))))], d>,
                 Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}

/// sse12_fp_packed - SSE 1 & 2 packed instructions class.
/// Emits a commutable register form (rr) and a mayLoad memory form (rm) whose
/// second operand is matched through mem_frag.
multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr,
                           SDPatternOperator OpNode, RegisterClass RC,
                           ValueType vt, X86MemOperand x86memop,
                           PatFrag mem_frag, Domain d,
                           X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>,
           Sched<[sched]>;
  let mayLoad = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
              d>,
           Sched<[sched.Folded, sched.ReadAfterFold]>;
}

/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class.
/// Unlike sse12_fp_packed, the selection patterns are supplied by the caller
/// as pat_rr / pat_rm, and both forms clear hasSideEffects.
multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
                                      string OpcodeStr, X86MemOperand x86memop,
                                      X86FoldableSchedWrite sched,
                                      list<dag> pat_rr, list<dag> pat_rm,
                                      bit Is2Addr = 1> {
  let isCommutable = 1, hasSideEffects = 0 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              pat_rr, d>,
           Sched<[sched]>;
  let hasSideEffects = 0, mayLoad = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
              !if(Is2Addr,
                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                  !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
              pat_rm, d>,
           Sched<[sched.Folded, sched.ReadAfterFold]>;
}


// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
// This is expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  def FsFLD0SH : I<0, Pseudo, (outs FR16:$dst), (ins), "",
                   [(set FR16:$dst, fp16imm0)]>,
                 Requires<[HasSSE2, NoAVX512]>;
  def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
                   [(set FR32:$dst, fp32imm0)]>,
                 Requires<[HasSSE1, NoAVX512]>;
  def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
                   [(set FR64:$dst, fp64imm0)]>,
                 Requires<[HasSSE2, NoAVX512]>;
  def FsFLD0F128 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                     [(set VR128:$dst, fp128imm0)]>,
                   Requires<[HasSSE1, NoAVX512]>;
}

//===----------------------------------------------------------------------===//
// AVX & SSE - Zero/One Vectors
//===----------------------------------------------------------------------===//

// Alias instruction that maps zero vector to pxor / xorp* for sse.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
// swizzled by ExecutionDomainFix to pxor.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
  def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                 [(set VR128:$dst, (v4f32 immAllZerosV))]>;
}

// Every other 128-bit all-zeros vector type selects the same pseudo.
let Predicates = [NoAVX512] in {
  def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
  def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
  def : Pat<(v8f16 immAllZerosV), (V_SET0)>;
  def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
  def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
  def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
}


// The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support PI,
// and doesn't need it because on sandy bridge the register is set to zero
// at the rename stage without using any execution unit, so SET0PSY
// and SET0PDY can be used for vector int instructions without penalty
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
  def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                   [(set VR256:$dst, (v8i32 immAllZerosV))]>;
}

// Every other 256-bit all-zeros vector type selects the same pseudo.
let Predicates = [NoAVX512] in {
  def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v16f16 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v8f32 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
}

// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-ones value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, SchedRW = [WriteZero] in {
  // 128-bit all-ones pseudo; no extra predicate is attached here.
  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;

  // 256-bit all-ones pseudo for AVX1-only targets when optimizing for
  // minimum size.
  let Predicates = [HasAVX1Only, OptForMinSize] in {
    def AVX1_SETALLONES: I<0, Pseudo, (outs VR256:$dst), (ins), "",
                           [(set VR256:$dst, (v8i32 immAllOnesV))]>;
  }

  // 256-bit all-ones pseudo for AVX2 targets.
  let Predicates = [HasAVX2] in
  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move FP Scalar Instructions
//
// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
// register copies because it's a partial register update; Register-to-register
// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
// that the insert be implementable in terms of a copy, and, as just mentioned,
// we don't use movss/movsd for copies.
//===----------------------------------------------------------------------===//

/// sse12_move_rr - Register-to-register scalar move (opcode 0x10, commutable)
/// plus the reversed-encoding rr_REV record (opcode 0x11, MRMDestReg), which
/// carries no pattern and exists for the disassembler.
multiclass sse12_move_rr<SDNode OpNode, ValueType vt, string base_opc,
                         string asm_opr, Domain d> {
  let isCommutable = 1 in
  def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src1, VR128:$src2),
              !strconcat(base_opc, asm_opr),
              [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], d>,
           Sched<[SchedWriteFShuffle.XMM]>;

  // For the disassembler
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                  (ins VR128:$src1, VR128:$src2),
                  !strconcat(base_opc, asm_opr), []>,
               Sched<[SchedWriteFShuffle.XMM]>;
}

/// sse12_move - For one scalar move mnemonic, instantiate the VEX-encoded
/// (V#NAME) and legacy (NAME) register forms through sse12_move_rr, the
/// scalar store forms (mr), and the ".s" assembler aliases that resolve to
/// the rr_REV records.
multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
                      X86MemOperand x86memop, string OpcodeStr,
                      Domain d, Predicate pred> {
  // AVX
  let Predicates = [UseAVX, OptForSize] in
  defm V#NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d>,
                VEX, VVVV, VEX_LIG, WIG;

  def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(store RC:$src, addr:$dst)], d>,
                  VEX, VEX_LIG, Sched<[WriteFStore]>, WIG;

  // SSE1 & 2
  let Constraints = "$src1 = $dst" in {
    let Predicates = [pred, NoSSE41_Or_OptForSize] in
    defm NAME : sse12_move_rr<OpNode, vt, OpcodeStr,
                              "\t{$src2, $dst|$dst, $src2}", d>;
  }

  def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(store RC:$src, addr:$dst)], d>,
                Sched<[WriteFStore]>;

  // ".s" suffixed spellings select the reversed encodings.
  def : InstAlias<"v"#OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  (!cast<Instruction>("V"#NAME#"rr_REV")
                     VR128:$dst, VR128:$src1, VR128:$src2), 0>;
  def : InstAlias<OpcodeStr#".s\t{$src2, $dst|$dst, $src2}",
                  (!cast<Instruction>(NAME#"rr_REV")
                     VR128:$dst, VR128:$src2), 0>;
}

// Loading from memory automatically zeroing upper bits.
multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
                         PatFrag mem_pat, PatFrag vzloadfrag, string OpcodeStr,
                         Domain d> {
  // VR128 destinations, matched through vzloadfrag.
  def V#NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
                  VEX, VEX_LIG, Sched<[WriteFLoad]>, WIG;
  def NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                   [(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
                Sched<[WriteFLoad]>;

  // _alt version uses FR32/FR64 register class.
  let isCodeGenOnly = 1 in {
    def V#NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                           !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                           [(set RC:$dst, (mem_pat addr:$src))], d>,
                        VEX, VEX_LIG, Sched<[WriteFLoad]>, WIG;
    def NAME#rm_alt : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                         [(set RC:$dst, (mem_pat addr:$src))], d>,
                      Sched<[WriteFLoad]>;
  }
}

defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
                        SSEPackedSingle, UseSSE1>, TB, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
                        SSEPackedDouble, UseSSE2>, TB, XD;

let canFoldAsLoad = 1, isReMaterializable = 1 in {
  defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss",
                             SSEPackedSingle>, TB, XS;
  defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, X86vzload64, "movsd",
                             SSEPackedDouble>, TB, XD;
}

// Patterns
let Predicates = [UseAVX] in {
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (VMOVSSrm addr:$src)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (VMOVSDrm addr:$src)>;

  // Represent the same patterns above but in the form they appear for
  // 256-bit types
  def : Pat<(v8f32 (X86vzload32 addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
  def : Pat<(v4f64 (X86vzload64 addr:$src)),
            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
}

let Predicates = [UseAVX, OptForSize] in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVSS to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VMOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VMOVSSrr (v4i32 (V_SET0)), VR128:$src)>;

  // Move low f32 and clear high bits.
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4f32 (VMOVSSrr (v4f32 (V_SET0)),
                     (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))),
             sub_xmm)>;
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v4i32 (VMOVSSrr (v4i32 (V_SET0)),
                     (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))),
             sub_xmm)>;
}

let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVSS to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
}

let Predicates = [UseSSE2] in
def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
          (MOVSDrm addr:$src)>;

let Predicates = [UseSSE1] in
def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
          (MOVSSrm addr:$src)>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
//===----------------------------------------------------------------------===//

/// sse12_mov_packed - Packed move: a pattern-less register copy (rr, marked
/// isMoveReg) and a load form (rm) that is foldable and rematerializable and
/// matches ld_frag.
multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
                            X86MemOperand x86memop, PatFrag ld_frag,
                            string asm, Domain d,
                            X86SchedWriteMoveLS sched> {
  let hasSideEffects = 0, isMoveReg = 1 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
           Sched<[sched.RR]>;
  let canFoldAsLoad = 1, isReMaterializable = 1 in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
              [(set RC:$dst, (ld_frag addr:$src))], d>,
           Sched<[sched.RM]>;
}

let Predicates = [HasAVX, NoVLX] in {
  // 128-bit VEX forms.
  defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
                                  SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                 TB, VEX, WIG;
  defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
                                  SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                 TB, PD, VEX, WIG;
  defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
                                  SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                 TB, VEX, WIG;
  defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
                                  SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                 TB, PD, VEX, WIG;

  // 256-bit VEX forms.
  defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps",
                                   SSEPackedSingle, SchedWriteFMoveLS.YMM>,
                  TB, VEX, VEX_L, WIG;
  defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd",
                                   SSEPackedDouble, SchedWriteFMoveLS.YMM>,
                  TB, PD, VEX, VEX_L, WIG;
  defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups",
                                   SSEPackedSingle, SchedWriteFMoveLS.YMM>,
                  TB, VEX, VEX_L, WIG;
  defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd",
                                   SSEPackedDouble, SchedWriteFMoveLS.YMM>,
                  TB, PD, VEX, VEX_L, WIG;
}

let Predicates = [UseSSE1] in {
  defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps",
                                 SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                TB;
  defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups",
                                 SSEPackedSingle, SchedWriteFMoveLS.XMM>,
                TB;
}
let Predicates = [UseSSE2] in {
  defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd",
                                 SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                TB, PD;
  defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd",
                                 SSEPackedDouble, SchedWriteFMoveLS.XMM>,
                TB, PD;
}

// VEX-encoded packed stores.
let Predicates = [HasAVX, NoVLX] in {
  let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
    def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                         "movaps\t{$src, $dst|$dst, $src}",
                         [(alignedstore (v4f32 VR128:$src), addr:$dst)]>,
                    VEX, WIG;
    def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                         "movapd\t{$src, $dst|$dst, $src}",
                         [(alignedstore (v2f64 VR128:$src), addr:$dst)]>,
                    VEX, WIG;
    def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                         "movups\t{$src, $dst|$dst, $src}",
                         [(store (v4f32 VR128:$src), addr:$dst)]>,
                    VEX, WIG;
    def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                         "movupd\t{$src, $dst|$dst, $src}",
                         [(store (v2f64 VR128:$src), addr:$dst)]>,
                    VEX, WIG;
  } // SchedRW

  let SchedRW = [SchedWriteFMoveLS.YMM.MR] in {
    def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                          "movaps\t{$src, $dst|$dst, $src}",
                          [(alignedstore (v8f32 VR256:$src), addr:$dst)]>,
                     VEX, VEX_L, WIG;
    def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                          "movapd\t{$src, $dst|$dst, $src}",
                          [(alignedstore (v4f64 VR256:$src), addr:$dst)]>,
                     VEX, VEX_L, WIG;
    def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                          "movups\t{$src, $dst|$dst, $src}",
                          [(store (v8f32 VR256:$src), addr:$dst)]>,
                     VEX, VEX_L, WIG;
    def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                          "movupd\t{$src, $dst|$dst, $src}",
                          [(store (v4f64 VR256:$src), addr:$dst)]>,
                     VEX, VEX_L, WIG;
  } // SchedRW
} // Predicates = [HasAVX, NoVLX]

// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    isMoveReg = 1 in {
  let SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
    def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
                             (ins VR128:$src),
                             "movaps\t{$src, $dst|$dst, $src}", []>,
                        VEX, WIG;
    def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
                             (ins VR128:$src),
                             "movapd\t{$src, $dst|$dst, $src}", []>,
                        VEX, WIG;
    def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
                             (ins VR128:$src),
                             "movups\t{$src, $dst|$dst, $src}", []>,
                        VEX, WIG;
    def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
                             (ins VR128:$src),
                             "movupd\t{$src, $dst|$dst, $src}", []>,
                        VEX, WIG;
  } // SchedRW

  let SchedRW = [SchedWriteFMoveLS.YMM.RR] in {
    def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
                              (ins VR256:$src),
                              "movaps\t{$src, $dst|$dst, $src}", []>,
                         VEX, VEX_L, WIG;
    def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
                              (ins VR256:$src),
                              "movapd\t{$src, $dst|$dst, $src}", []>,
                         VEX, VEX_L, WIG;
    def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
                              (ins VR256:$src),
                              "movups\t{$src, $dst|$dst, $src}", []>,
                         VEX, VEX_L, WIG;
    def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
                              (ins VR256:$src),
                              "movupd\t{$src, $dst|$dst, $src}", []>,
                         VEX, VEX_L, WIG;
  } // SchedRW
} // isCodeGenOnly, ForceDisassemble, hasSideEffects, isMoveReg

// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                (VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                (VMOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                (VMOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                (VMOVUPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}",
                (VMOVAPSYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}",
                (VMOVAPDYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}",
                (VMOVUPSYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}",
                (VMOVUPDYrr_REV VR256:$dst, VR256:$src), 0>;

// Legacy-encoded packed stores.
let SchedRW = [SchedWriteFMoveLS.XMM.MR] in {
  def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movaps\t{$src, $dst|$dst, $src}",
                     [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
  def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movapd\t{$src, $dst|$dst, $src}",
                     [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
  def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movups\t{$src, $dst|$dst, $src}",
                     [(store (v4f32 VR128:$src), addr:$dst)]>;
  def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movupd\t{$src, $dst|$dst, $src}",
                     [(store (v2f64 VR128:$src), addr:$dst)]>;
} // SchedRW

// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.XMM.RR] in {
  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movaps\t{$src, $dst|$dst, $src}", []>;
  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movapd\t{$src, $dst|$dst, $src}", []>;
  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movups\t{$src, $dst|$dst, $src}", []>;
  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movupd\t{$src, $dst|$dst, $src}", []>;
}

// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"movaps.s\t{$src, $dst|$dst, $src}",
                (MOVAPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movapd.s\t{$src, $dst|$dst, $src}",
                (MOVAPDrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movups.s\t{$src, $dst|$dst, $src}",
                (MOVUPSrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movupd.s\t{$src, $dst|$dst, $src}",
                (MOVUPDrr_REV VR128:$dst, VR128:$src), 0>;

let Predicates = [HasAVX, NoVLX] in {
  // 256-bit load/store need to use floating point load/store in case we don't
  // have AVX2. Execution domain fixing will convert to integer if AVX2 is
  // available and changing the domain is beneficial.
  def : Pat<(alignedloadv4i64 addr:$src), (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv8i32 addr:$src), (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv16i16 addr:$src), (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv32i8 addr:$src), (VMOVAPSYrm addr:$src)>;
  def : Pat<(loadv4i64 addr:$src), (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv8i32 addr:$src), (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv16i16 addr:$src), (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv32i8 addr:$src), (VMOVUPSYrm addr:$src)>;

  def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v16i16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v32i8 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v4i64 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v8i32 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;

  // 128-bit f16 / bf16 vectors go through the packed-single moves.
  def : Pat<(alignedloadv8f16 addr:$src), (VMOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv8bf16 addr:$src), (VMOVAPSrm addr:$src)>;
  def : Pat<(loadv8f16 addr:$src), (VMOVUPSrm addr:$src)>;
  def : Pat<(loadv8bf16 addr:$src), (VMOVUPSrm addr:$src)>;
  def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8bf16 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8f16 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8bf16 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;

  // 256-bit f16 / bf16 vectors likewise.
  def : Pat<(alignedloadv16f16 addr:$src), (VMOVAPSYrm addr:$src)>;
  def : Pat<(alignedloadv16bf16 addr:$src), (VMOVAPSYrm addr:$src)>;
  def : Pat<(loadv16f16 addr:$src), (VMOVUPSYrm addr:$src)>;
  def : Pat<(loadv16bf16 addr:$src), (VMOVUPSYrm addr:$src)>;
  def : Pat<(alignedstore (v16f16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignedstore (v16bf16 VR256:$src), addr:$dst),
            (VMOVAPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16f16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
  def : Pat<(store (v16bf16 VR256:$src), addr:$dst),
            (VMOVUPSYmr addr:$dst, VR256:$src)>;
}

// Use movaps / movups for SSE integer load / store (one byte shorter).
// The instructions selected below are then converted to MOVDQA/MOVDQU
// during the SSE domain pass.
let Predicates = [UseSSE1] in {
  def : Pat<(alignedloadv2i64 addr:$src), (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv4i32 addr:$src), (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv8i16 addr:$src), (MOVAPSrm addr:$src)>;
  def : Pat<(alignedloadv16i8 addr:$src), (MOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src), (MOVUPSrm addr:$src)>;
  def : Pat<(loadv4i32 addr:$src), (MOVUPSrm addr:$src)>;
  def : Pat<(loadv8i16 addr:$src), (MOVUPSrm addr:$src)>;
  def : Pat<(loadv16i8 addr:$src), (MOVUPSrm addr:$src)>;

  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(alignedloadv8f16 addr:$src), (MOVAPSrm addr:$src)>;
  def : Pat<(loadv8f16 addr:$src), (MOVUPSrm addr:$src)>;
  def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
            (MOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8f16 VR128:$src), addr:$dst),
            (MOVUPSmr addr:$dst, VR128:$src)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low packed FP Instructions
//===----------------------------------------------------------------------===//

/// sse12_mov_hilo_packed_base - 64-bit memory-to-vector-half loads: PSrm
/// (pattern-less) and PDrm (matches pdnode applied to $src1 and a
/// scalar_to_vector of the loaded f64).
multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDPatternOperator pdnode,
                                      string base_opc, string asm_opr> {
  // No pattern as they need be special cased between high and low.
  let hasSideEffects = 0, mayLoad = 1 in
  def PSrm : PI<opc, MRMSrcMem,
                (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                !strconcat(base_opc, "s", asm_opr),
                [], SSEPackedSingle>, TB,
             Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;

  def PDrm : PI<opc, MRMSrcMem,
                (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
                !strconcat(base_opc, "d", asm_opr),
                [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
                                   (scalar_to_vector (loadf64 addr:$src2)))))],
                SSEPackedDouble>, TB, PD,
             Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
}

/// sse12_mov_hilo_packed - Instantiates sse12_mov_hilo_packed_base twice: a
/// VEX three-operand flavour (V#NAME, UseAVX) and a legacy flavour (NAME)
/// whose $src1 is tied to $dst.
multiclass sse12_mov_hilo_packed<bits<8>opc, SDPatternOperator pdnode,
                                 string base_opc> {
  let Predicates = [UseAVX] in
  defm V#NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
                VEX, VVVV, WIG;

  let Constraints = "$src1 = $dst" in
  defm NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc,
                                "\t{$src2, $dst|$dst, $src2}">;
}

defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">;

let SchedRW = [WriteFStore] in {
  let Predicates = [UseAVX] in {
    // The brace-less let applies to VMOVLPSmr only.
    let mayStore = 1, hasSideEffects = 0 in
    def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                         "movlps\t{$src, $dst|$dst, $src}",
                         []>,
                    VEX, WIG;
    def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                         "movlpd\t{$src, $dst|$dst, $src}",
                         [(store (f64 (extractelt (v2f64 VR128:$src),
                                       (iPTR 0))), addr:$dst)]>,
                    VEX, WIG;
  } // UseAVX
  // The brace-less let applies to MOVLPSmr only.
  let mayStore = 1, hasSideEffects = 0 in
  def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movlps\t{$src, $dst|$dst, $src}",
                     []>;
  def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movlpd\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt (v2f64 VR128:$src),
                                   (iPTR 0))), addr:$dst)]>;
} // SchedRW

let Predicates = [UseSSE1] in {
  // This pattern helps select MOVLPS on SSE1 only targets. With SSE2 we'll
  // end up with a movsd or blend instead of shufp.
  // No need for aligned load, we're only loading 64-bits.
  def : Pat<(X86Shufp (v4f32 (simple_load addr:$src2)), VR128:$src1,
                      (i8 -28)),
            (MOVLPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)), VR128:$src1, (i8 -28)),
            (MOVLPSrm VR128:$src1, addr:$src2)>;

  def : Pat<(v4f32 (X86vzload64 addr:$src)),
            (MOVLPSrm (v4f32 (V_SET0)), addr:$src)>;
  def : Pat<(X86vextractstore64 (v4f32 VR128:$src), addr:$dst),
            (MOVLPSmr addr:$dst, VR128:$src)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Hi packed FP Instructions
//===----------------------------------------------------------------------===//

defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">;

let SchedRW = [WriteFStore] in {
  // v2f64 extract element 1 is always custom lowered to unpack high to low
  // and extract element 0 so the non-store version isn't too horrible.
  let Predicates = [UseAVX] in {
    // The brace-less let applies to VMOVHPSmr only.
    let mayStore = 1, hasSideEffects = 0 in
    def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                         "movhps\t{$src, $dst|$dst, $src}",
                         []>, VEX, WIG;
    def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                         "movhpd\t{$src, $dst|$dst, $src}",
                         [(store (f64 (extractelt
                                       (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                       (iPTR 0))), addr:$dst)]>, VEX, WIG;
  } // UseAVX
  // The brace-less let applies to MOVHPSmr only.
  let mayStore = 1, hasSideEffects = 0 in
  def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movhps\t{$src, $dst|$dst, $src}",
                     []>;
  def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                     "movhpd\t{$src, $dst|$dst, $src}",
                     [(store (f64 (extractelt
                                   (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
                                   (iPTR 0))), addr:$dst)]>;
} // SchedRW

let Predicates = [UseAVX] in {
  // MOVHPD patterns
  def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
            (VMOVHPDrm VR128:$src1, addr:$src2)>;

  def : Pat<(store (f64 (extractelt
                         (v2f64 (X86VPermilpi VR128:$src, (i8 1))),
                         (iPTR 0))), addr:$dst),
            (VMOVHPDmr addr:$dst, VR128:$src)>;

  // MOVLPD patterns
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
            (VMOVLPDrm VR128:$src1, addr:$src2)>;
}

let Predicates = [UseSSE1] in {
  // This pattern helps select MOVHPS on SSE1 only targets. With SSE2 we'll
  // end up with a movsd or blend instead of shufp.
  // No need for aligned load, we're only loading 64-bits.
  def : Pat<(X86Movlhps VR128:$src1, (v4f32 (simple_load addr:$src2))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86Movlhps VR128:$src1, (v4f32 (X86vzload64 addr:$src2))),
            (MOVHPSrm VR128:$src1, addr:$src2)>;

  def : Pat<(X86vextractstore64 (v4f32 (X86Movhlps VR128:$src, VR128:$src)),
                                addr:$dst),
            (MOVHPSmr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE2] in {
  // MOVHPD patterns
  def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
            (MOVHPDrm VR128:$src1, addr:$src2)>;

  def : Pat<(store (f64 (extractelt
                         (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
                         (iPTR 0))), addr:$dst),
            (MOVHPDmr addr:$dst, VR128:$src)>;

  // MOVLPD patterns
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
}

let Predicates = [UseSSE2, NoSSE41_Or_OptForSize] in {
  // Use MOVLPD to load into the low bits from a full vector unless we can use
  // BLENDPD.
  def : Pat<(X86Movsd VR128:$src1, (v2f64 (simple_load addr:$src2))),
            (MOVLPDrm VR128:$src1, addr:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
//===----------------------------------------------------------------------===//

// VEX three-operand forms. Only the MOVHLPS record is marked commutable.
let Predicates = [UseAVX] in {
  def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
                        (ins VR128:$src1, VR128:$src2),
                        "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                          (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
                   VEX, VVVV, Sched<[SchedWriteFShuffle.XMM]>, WIG;
  let isCommutable = 1 in
  def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
                        (ins VR128:$src1, VR128:$src2),
                        "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                          (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
                   VEX, VVVV, Sched<[SchedWriteFShuffle.XMM]>, WIG;
}
// Legacy two-operand forms with $src1 tied to $dst.
let Constraints = "$src1 = $dst" in {
  def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2),
                      "movlhps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
                  Sched<[SchedWriteFShuffle.XMM]>;
  let isCommutable = 1 in
  def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $dst|$dst, $src2}",
                      [(set VR128:$dst,
                        (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>,
                  Sched<[SchedWriteFShuffle.XMM]>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Conversion Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                     SDPatternOperator OpNode, X86MemOperand x86memop, PatFrag ld_frag,
                     string asm, string mem, X86FoldableSchedWrite
sched, 858 Domain d, 859 SchedRead Int2Fpu = ReadDefault> { 860 let ExeDomain = d in { 861 def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), 862 !strconcat(asm,"\t{$src, $dst|$dst, $src}"), 863 [(set DstRC:$dst, (OpNode SrcRC:$src))]>, 864 Sched<[sched, Int2Fpu]>; 865 def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), 866 mem#"\t{$src, $dst|$dst, $src}", 867 [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>, 868 Sched<[sched.Folded]>; 869 } 870} 871 872multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop, 873 ValueType DstTy, ValueType SrcTy, PatFrag ld_frag, 874 string asm, Domain d, X86FoldableSchedWrite sched> { 875let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in { 876 def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm, 877 [(set RC:$dst, (DstTy (any_sint_to_fp (SrcTy RC:$src))))], d>, 878 Sched<[sched]>; 879 let mayLoad = 1 in 880 def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm, 881 [(set RC:$dst, (DstTy (any_sint_to_fp 882 (SrcTy (ld_frag addr:$src)))))], d>, 883 Sched<[sched.Folded]>; 884} 885} 886 887multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, 888 X86MemOperand x86memop, string asm, string mem, 889 X86FoldableSchedWrite sched, Domain d> { 890let hasSideEffects = 0, Predicates = [UseAVX], ExeDomain = d in { 891 def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src), 892 !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, 893 Sched<[sched, ReadDefault, ReadInt2Fpu]>; 894 let mayLoad = 1 in 895 def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), 896 (ins DstRC:$src1, x86memop:$src), 897 asm#"{"#mem#"}\t{$src, $src1, $dst|$dst, $src1, $src}", []>, 898 Sched<[sched.Folded, sched.ReadAfterFold]>; 899} // hasSideEffects = 0 900} 901 902let isCodeGenOnly = 1, Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in { 903defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, 
f32mem, loadf32, 904 "cvttss2si", "cvttss2si", 905 WriteCvtSS2I, SSEPackedSingle>, 906 TB, XS, VEX, VEX_LIG; 907defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32, 908 "cvttss2si", "cvttss2si", 909 WriteCvtSS2I, SSEPackedSingle>, 910 TB, XS, VEX, REX_W, VEX_LIG; 911defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64, 912 "cvttsd2si", "cvttsd2si", 913 WriteCvtSD2I, SSEPackedDouble>, 914 TB, XD, VEX, VEX_LIG; 915defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64, 916 "cvttsd2si", "cvttsd2si", 917 WriteCvtSD2I, SSEPackedDouble>, 918 TB, XD, VEX, REX_W, VEX_LIG; 919 920defm VCVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32, 921 "cvtss2si", "cvtss2si", 922 WriteCvtSS2I, SSEPackedSingle>, 923 TB, XS, VEX, VEX_LIG; 924defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32, 925 "cvtss2si", "cvtss2si", 926 WriteCvtSS2I, SSEPackedSingle>, 927 TB, XS, VEX, REX_W, VEX_LIG; 928defm VCVTSD2SI : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64, 929 "cvtsd2si", "cvtsd2si", 930 WriteCvtSD2I, SSEPackedDouble>, 931 TB, XD, VEX, VEX_LIG; 932defm VCVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64, 933 "cvtsd2si", "cvtsd2si", 934 WriteCvtSD2I, SSEPackedDouble>, 935 TB, XD, VEX, REX_W, VEX_LIG; 936} 937 938// The assembler can recognize rr 64-bit instructions by seeing a rxx 939// register, but the same isn't true when only using memory operands, 940// provide other assembly "l" and "q" forms to address this explicitly 941// where appropriate to do so. 
// Codegen-only VEX integer -> scalar fp conversions on FR32/FR64. These carry
// no patterns; they are selected through the Pat<> records that follow.
// NOTE(review): the 32-bit cvtsi2sd forms are the only ones without SIMD_EXC,
// here and in the SSE defms below -- presumably because i32 -> f64 is always
// exact; confirm.
let isCodeGenOnly = 1 in {
defm VCVTSI2SS   : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss", "l",
                                  WriteCvtI2SS, SSEPackedSingle>, TB, XS, VEX, VVVV,
                                  VEX_LIG, SIMD_EXC;
defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss", "q",
                                  WriteCvtI2SS, SSEPackedSingle>, TB, XS, VEX, VVVV,
                                  REX_W, VEX_LIG, SIMD_EXC;
defm VCVTSI2SD   : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd", "l",
                                  WriteCvtI2SD, SSEPackedDouble>, TB, XD, VEX, VVVV,
                                  VEX_LIG;
defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd", "q",
                                  WriteCvtI2SD, SSEPackedDouble>, TB, XD, VEX, VVVV,
                                  REX_W, VEX_LIG, SIMD_EXC;
} // isCodeGenOnly = 1

let Predicates = [UseAVX] in {
  // The pass-through $src1 operand of the VEX forms is left undefined.
  def : Pat<(f32 (any_sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f32 (any_sint_to_fp (loadi64 addr:$src))),
            (VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (any_sint_to_fp (loadi32 addr:$src))),
            (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
  def : Pat<(f64 (any_sint_to_fp (loadi64 addr:$src))),
            (VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;

  def : Pat<(f32 (any_sint_to_fp GR32:$src)),
            (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f32 (any_sint_to_fp GR64:$src)),
            (VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
  def : Pat<(f64 (any_sint_to_fp GR32:$src)),
            (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
  def : Pat<(f64 (any_sint_to_fp GR64:$src)),
            (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;

  // lrint producing an i64 result uses the 64-bit conversion.
  def : Pat<(i64 (lrint FR32:$src)), (VCVTSS2SI64rr FR32:$src)>;
  def : Pat<(i64 (lrint (loadf32 addr:$src))), (VCVTSS2SI64rm addr:$src)>;

  def : Pat<(i64 (lrint FR64:$src)), (VCVTSD2SI64rr FR64:$src)>;
  def : Pat<(i64 (lrint (loadf64 addr:$src))), (VCVTSD2SI64rm addr:$src)>;
}

// Codegen-only legacy SSE scalar conversions on FR32/FR64/GR32/GR64.
let isCodeGenOnly = 1 in {
defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
                      "cvttss2si", "cvttss2si",
                      WriteCvtSS2I, SSEPackedSingle>, TB, XS, SIMD_EXC;
defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
                      "cvttss2si", "cvttss2si",
                      WriteCvtSS2I, SSEPackedSingle>, TB, XS, REX_W, SIMD_EXC;
defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
                      "cvttsd2si", "cvttsd2si",
                      WriteCvtSD2I, SSEPackedDouble>, TB, XD, SIMD_EXC;
defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
                      "cvttsd2si", "cvttsd2si",
                      WriteCvtSD2I, SSEPackedDouble>, TB, XD, REX_W, SIMD_EXC;

defm CVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
                     "cvtss2si", "cvtss2si",
                     WriteCvtSS2I, SSEPackedSingle>, TB, XS, SIMD_EXC;
defm CVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
                     "cvtss2si", "cvtss2si",
                     WriteCvtSS2I, SSEPackedSingle>, TB, XS, REX_W, SIMD_EXC;
defm CVTSD2SI : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
                     "cvtsd2si", "cvtsd2si",
                     WriteCvtSD2I, SSEPackedDouble>, TB, XD, SIMD_EXC;
defm CVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
                     "cvtsd2si", "cvtsd2si",
                     WriteCvtSD2I, SSEPackedDouble>, TB, XD, REX_W, SIMD_EXC;

// The memory forms spell the optional AT&T size suffix ("{l}" / "{q}").
defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, any_sint_to_fp, i32mem, loadi32,
                      "cvtsi2ss", "cvtsi2ss{l}",
                      WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, TB, XS, SIMD_EXC;
defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, any_sint_to_fp, i64mem, loadi64,
                      "cvtsi2ss", "cvtsi2ss{q}",
                      WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, TB, XS, REX_W, SIMD_EXC;
defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, any_sint_to_fp, i32mem, loadi32,
                      "cvtsi2sd", "cvtsi2sd{l}",
                      WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, TB, XD;
defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, any_sint_to_fp, i64mem, loadi64,
                      "cvtsi2sd", "cvtsi2sd{q}",
                      WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, TB, XD, REX_W, SIMD_EXC;
} // isCodeGenOnly = 1

// lrint producing an i64 result uses the 64-bit conversion.
let Predicates = [UseSSE1] in {
  def : Pat<(i64 (lrint FR32:$src)), (CVTSS2SI64rr FR32:$src)>;
  def : Pat<(i64 (lrint (loadf32 addr:$src))), (CVTSS2SI64rm addr:$src)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(i64 (lrint FR64:$src)), (CVTSD2SI64rr FR64:$src)>;
  def : Pat<(i64 (lrint (loadf64 addr:$src))), (CVTSD2SI64rm addr:$src)>;
}

// Conversion Instructions Intrinsics - Match intrinsics which expect MM
// and/or XMM operand(s).

/// sse12_cvt_sint - intrinsic-style conversion with a single source. Emits
/// rr_Int matching (DstVT (OpNode (SrcVT SrcRC))) and rm_Int matching the same
/// node applied to mem_frags.
multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                          ValueType DstVT, ValueType SrcVT, SDNode OpNode,
                          Operand memop, PatFrags mem_frags, string asm,
                          X86FoldableSchedWrite sched, Domain d> {
let ExeDomain = d in {
  def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
                  !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                  [(set DstRC:$dst, (DstVT (OpNode (SrcVT SrcRC:$src))))]>,
               Sched<[sched]>;
  def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
                  !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                  [(set DstRC:$dst, (DstVT (OpNode (SrcVT (mem_frags addr:$src)))))]>,
               Sched<[sched.Folded]>;
}
}

/// sse12_cvt_sint_3addr - pattern-less intrinsic-style conversions that take
/// the destination-class register as $src1. Is2Addr selects the two-operand
/// assembly string (the default) or the three-operand one; the rm_Int
/// mnemonic gets `mem` appended as an optional AT&T-only suffix.
multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
                    RegisterClass DstRC, X86MemOperand x86memop,
                    string asm, string mem, X86FoldableSchedWrite sched,
                    Domain d, bit Is2Addr = 1> {
let hasSideEffects = 0, ExeDomain = d in {
  def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
                  !if(Is2Addr,
                      !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
                      !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                  []>, Sched<[sched, ReadDefault, ReadInt2Fpu]>;
  let mayLoad = 1 in
  def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst),
                  (ins DstRC:$src1, x86memop:$src2),
                  !if(Is2Addr,
                      asm#"{"#mem#"}\t{$src2, $dst|$dst, $src2}",
                      asm#"{"#mem#"}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}

let Uses = [MXCSR], mayRaiseFPException = 1 in {
let Predicates = [UseAVX] in {
defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64,
                  X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
                  WriteCvtSD2I, SSEPackedDouble>, TB, XD, VEX, VEX_LIG;
defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64,
                    X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
                    WriteCvtSD2I, SSEPackedDouble>, TB, XD, VEX, REX_W, VEX_LIG;
}
defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, X86cvts2si,
                 sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I,
                 SSEPackedDouble>, TB, XD;
defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si,
                   sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I,
                   SSEPackedDouble>, TB, XD, REX_W;
}

// VEX forms: three-operand assembly (Is2Addr = 0).
let Predicates = [UseAVX] in {
defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
          i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle, 0>,
          TB, XS, VEX, VVVV, VEX_LIG, SIMD_EXC;
defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
          i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle, 0>,
          TB, XS, VEX, VVVV, VEX_LIG, REX_W, SIMD_EXC;
defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
          i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble, 0>,
          TB, XD, VEX, VVVV, VEX_LIG;
defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
          i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble, 0>,
          TB, XD, VEX, VVVV, VEX_LIG, REX_W, SIMD_EXC;
}
// Legacy forms: $src1 is tied to $dst.
let Constraints = "$src1 = $dst" in {
  defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                        i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle>,
                        TB, XS, SIMD_EXC;
  defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                        i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle>,
                        TB, XS, REX_W, SIMD_EXC;
  defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                        i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble>,
                        TB, XD;
  defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
                        i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble>,
                        TB, XD, REX_W, SIMD_EXC;
}

// AT&T-only aliases: suffixed mnemonics for the register forms ...
def : InstAlias<"vcvtsi2ss{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               (VCVTSI2SSrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
def : InstAlias<"vcvtsi2ss{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               (VCVTSI642SSrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;
def : InstAlias<"vcvtsi2sd{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               (VCVTSI2SDrr_Int VR128:$dst, VR128:$src1, GR32:$src2), 0, "att">;
def : InstAlias<"vcvtsi2sd{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               (VCVTSI642SDrr_Int VR128:$dst, VR128:$src1, GR64:$src2), 0, "att">;

// ... and unsuffixed mnemonics with a memory operand map to the 32-bit forms.
def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
              (VCVTSI2SSrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;
def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
              (VCVTSI2SDrm_Int VR128:$dst, VR128:$src1, i32mem:$src), 0, "att">;

def : InstAlias<"cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
               (CVTSI2SSrr_Int VR128:$dst, GR32:$src), 0, "att">;
def : InstAlias<"cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
               (CVTSI642SSrr_Int VR128:$dst, GR64:$src), 0, "att">;
def : InstAlias<"cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
               (CVTSI2SDrr_Int VR128:$dst, GR32:$src), 0, "att">;
def : InstAlias<"cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
               (CVTSI642SDrr_Int VR128:$dst, GR64:$src), 0, "att">;

def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
                (CVTSI2SSrm_Int VR128:$dst, i32mem:$src), 0, "att">;
def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
                (CVTSI2SDrm_Int VR128:$dst, i32mem:$src), 0, "att">;

/// SSE 1 Only

// Aliases for intrinsics
let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
                                ssmem, sse_load_f32, "cvttss2si",
                                WriteCvtSS2I, SSEPackedSingle>, TB, XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
                               X86cvtts2Int, ssmem, sse_load_f32,
                               "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
                               TB, XS, VEX, VEX_LIG, REX_W;
// The double-precision sources are scheduled as WriteCvtSD2I, in line with
// the non-VEX CVTTSD2SI intrinsic forms below and the codegen-only VCVTTSD2SI
// forms (these two previously used WriteCvtSS2I).
defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
                                sdmem, sse_load_f64, "cvttsd2si",
                                WriteCvtSD2I, SSEPackedDouble>, TB, XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
                              X86cvtts2Int, sdmem, sse_load_f64,
                              "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>,
                              TB, XD, VEX, VEX_LIG, REX_W;
}
let Uses = [MXCSR], mayRaiseFPException = 1 in {
defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
                                    ssmem, sse_load_f32, "cvttss2si",
                                    WriteCvtSS2I, SSEPackedSingle>, TB, XS;
defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
                                   X86cvtts2Int, ssmem, sse_load_f32,
                                   "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
                                   TB, XS, REX_W;
defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
                                    sdmem, sse_load_f64, "cvttsd2si",
                                    WriteCvtSD2I, SSEPackedDouble>, TB, XD;
defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
                                  X86cvtts2Int, sdmem, sse_load_f64,
                                  "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>,
                                  TB, XD, REX_W;
}

// AT&T-only suffixed aliases for the truncating conversions (VEX forms).
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;

// AT&T-only suffixed aliases for the truncating conversions (legacy forms).
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SIrm_Int GR32:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SIrm_Int GR32:$dst, f64mem:$src), 0, "att">;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSS2SI64rm_Int GR64:$dst, f32mem:$src), 0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;

let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  WriteCvtSS2I, SSEPackedSingle>, TB, XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
                                  ssmem, sse_load_f32, "cvtss2si",
                                  WriteCvtSS2I, SSEPackedSingle>, TB, XS, VEX, REX_W, VEX_LIG;
}
let Uses = [MXCSR], mayRaiseFPException = 1 in {
defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
                               ssmem, sse_load_f32, "cvtss2si",
                               WriteCvtSS2I, SSEPackedSingle>, TB, XS;
defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
                                 ssmem, sse_load_f32, "cvtss2si",
                                 WriteCvtSS2I, SSEPackedSingle>, TB, XS, REX_W;

// Packed doubleword -> packed single. The VEX forms read memory through
// `load`, the legacy form through `memop`.
defm VCVTDQ2PS   : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load,
                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                               SSEPackedSingle, WriteCvtI2PS>,
                               TB, VEX, Requires<[HasAVX, NoVLX]>, WIG;
defm VCVTDQ2PSY  : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load,
                               "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                               SSEPackedSingle, WriteCvtI2PSY>,
                               TB, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, WIG;

defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop,
                            "cvtdq2ps\t{$src, $dst|$dst, $src}",
                            SSEPackedSingle, WriteCvtI2PS>,
                            TB, Requires<[UseSSE2]>;
}

// AVX aliases
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;

// SSE aliases
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
                (CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
                (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">;

/// SSE 2 Only

// Convert scalar double to scalar single
let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX],
    ExeDomain = SSEPackedSingle in {
def VCVTSD2SSrr  : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
                        (ins FR32:$src1, FR64:$src2),
                        "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                        VEX, VVVV, VEX_LIG, WIG,
                        Sched<[WriteCvtSD2SS]>, SIMD_EXC;
let mayLoad = 1 in
def VCVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst),
                     (ins FR32:$src1, f64mem:$src2),
                     "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                     TB, XD, VEX, VVVV, VEX_LIG, WIG,
                     Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC;
}

// The pass-through $src1 operand of the VEX form is left undefined.
def : Pat<(f32 (any_fpround FR64:$src)),
            (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
          Requires<[UseAVX]>;

let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in {
def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (any_fpround FR64:$src))]>,
                      Sched<[WriteCvtSD2SS]>, SIMD_EXC;
// The load-folding form is only selected under OptForSize.
def CVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
                    "cvtsd2ss\t{$src, $dst|$dst, $src}",
                    [(set FR32:$dst, (any_fpround (loadf64 addr:$src)))]>,
                    TB, XD, Requires<[UseSSE2, OptForSize]>,
                    Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC;
}

// Intrinsic (VR128) forms of cvtsd2ss, matched from X86frounds.
let Uses = [MXCSR], mayRaiseFPException = 1, ExeDomain = SSEPackedSingle in {
def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
                       TB, XD, VEX, VVVV, VEX_LIG, WIG, Requires<[UseAVX]>,
                       Sched<[WriteCvtSD2SS]>;
def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
                       TB, XD, VEX, VVVV, VEX_LIG, WIG, Requires<[UseAVX]>,
                       Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
let Constraints = "$src1 = $dst" in {
def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>,
                       TB, XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>;
def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                       [(set VR128:$dst,
                         (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
                       TB, XD, Requires<[UseSSE2]>,
                       Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
}
}

// Convert scalar single to scalar double
// SSE2 instructions with XS prefix
let isCodeGenOnly = 1, hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
                    (ins FR64:$src1, FR32:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                    TB, XS, VEX, VVVV, VEX_LIG, WIG,
                    Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>, SIMD_EXC;
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
                    (ins FR64:$src1, f32mem:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
                    TB, XS, VEX, VVVV, VEX_LIG, WIG,
                    Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>,
                    Requires<[UseAVX, OptForSize]>, SIMD_EXC;
} // isCodeGenOnly = 1, hasSideEffects = 0

// The load-folding pattern is only enabled under OptForSize.
def : Pat<(f64 (any_fpextend FR32:$src)),
    (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
def : Pat<(any_fpextend (loadf32 addr:$src)),
    (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;

let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in {
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (any_fpextend FR32:$src))]>,
                   TB, XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>, SIMD_EXC;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
                   "cvtss2sd\t{$src, $dst|$dst, $src}",
                   [(set FR64:$dst, (any_fpextend (loadf32 addr:$src)))]>,
                   TB, XS, Requires<[UseSSE2, OptForSize]>,
                   Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>, SIMD_EXC;
} // isCodeGenOnly = 1

// Intrinsic (VR128) forms of cvtss2sd. They carry no patterns of their own;
// the register forms are selected by the Movsd folding patterns further down.
let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1,
    ExeDomain = SSEPackedSingle in {
def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    []>, TB, XS, VEX, VVVV, VEX_LIG, WIG,
                    Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>;
let mayLoad = 1 in
def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                    "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    []>, TB, XS, VEX, VVVV, VEX_LIG, WIG, Requires<[HasAVX]>,
                    Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                    []>, TB, XS, Requires<[UseSSE2]>,
                    Sched<[WriteCvtSS2SD]>;
let mayLoad = 1 in
def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
                      (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
                    []>, TB, XS, Requires<[UseSSE2]>,
                    Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>;
}
} // hasSideEffects = 0

// Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and
// (v)cvtss2sd intrinsic sequences from clang which produce unnecessary
// vmovs{s,d} instructions
let Predicates = [UseAVX] in {
def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector
                     (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
          (VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector
                     (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
          (VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
          (VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
          (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
          (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
          (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
          (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
          (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
          (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
          (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseAVX]

let Predicates = [UseSSE2] in {
def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector
                     (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
          (CVTSD2SSrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector
                     (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
          (CVTSS2SDrr_Int VR128:$dst, VR128:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
          (CVTSI642SDrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
          (CVTSI642SDrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
          (CVTSI2SDrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v2f64 (X86Movsd
                   (v2f64 VR128:$dst),
                   (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
          (CVTSI2SDrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseSSE2]

let Predicates = [UseSSE1] in {
def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
          (CVTSI642SSrr_Int VR128:$dst, GR64:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
          (CVTSI642SSrm_Int VR128:$dst, addr:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
          (CVTSI2SSrr_Int VR128:$dst, GR32:$src)>;

def : Pat<(v4f32 (X86Movss
                   (v4f32 VR128:$dst),
                   (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
          (CVTSI2SSrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseSSE1]

let Predicates = [HasAVX, NoVLX] in {
// Convert packed single/double fp to doubleword
def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
                       VEX, Sched<[WriteCvtPS2I]>, WIG, SIMD_EXC;
def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "cvtps2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>,
                       VEX, Sched<[WriteCvtPS2ILd]>, WIG, SIMD_EXC;
def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtPS2IY]>, WIG, SIMD_EXC;
def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst,
                          (v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>,
                        VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, WIG, SIMD_EXC;
}
def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
                     Sched<[WriteCvtPS2I]>, SIMD_EXC;
def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                     "cvtps2dq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>,
                     Sched<[WriteCvtPS2ILd]>, SIMD_EXC;


// Convert Packed Double FP to Packed DW Integers
let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
def VCVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
                       VEX, Sched<[WriteCvtPD2I]>, WIG;

// XMM only
def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                      "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
                      Sched<[WriteCvtPD2ILd]>, WIG;

// YMM only
def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                       "vcvtpd2dq\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
                       VEX, VEX_L, Sched<[WriteCvtPD2IY]>, WIG;
def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                       "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
                       VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, WIG;
}

def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
                (VCVTPD2DQrr VR128:$dst, VR128:$src), 0, "att">;
def : InstAlias<"vcvtpd2dqy\t{$src, 
$dst|$dst, $src}", 1592 (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">; 1593 1594def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1595 "cvtpd2dq\t{$src, $dst|$dst, $src}", 1596 [(set VR128:$dst, 1597 (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>, 1598 Sched<[WriteCvtPD2ILd]>, SIMD_EXC; 1599def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1600 "cvtpd2dq\t{$src, $dst|$dst, $src}", 1601 [(set VR128:$dst, 1602 (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>, 1603 Sched<[WriteCvtPD2I]>, SIMD_EXC; 1604 1605// Convert with truncation packed single/double fp to doubleword 1606// SSE2 packed instructions with XS prefix 1607let Uses = [MXCSR], mayRaiseFPException = 1 in { 1608let Predicates = [HasAVX, NoVLX] in { 1609def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1610 "cvttps2dq\t{$src, $dst|$dst, $src}", 1611 [(set VR128:$dst, 1612 (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>, 1613 VEX, Sched<[WriteCvtPS2I]>, WIG; 1614def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1615 "cvttps2dq\t{$src, $dst|$dst, $src}", 1616 [(set VR128:$dst, 1617 (v4i32 (X86any_cvttp2si (loadv4f32 addr:$src))))]>, 1618 VEX, Sched<[WriteCvtPS2ILd]>, WIG; 1619def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), 1620 "cvttps2dq\t{$src, $dst|$dst, $src}", 1621 [(set VR256:$dst, 1622 (v8i32 (X86any_cvttp2si (v8f32 VR256:$src))))]>, 1623 VEX, VEX_L, Sched<[WriteCvtPS2IY]>, WIG; 1624def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), 1625 "cvttps2dq\t{$src, $dst|$dst, $src}", 1626 [(set VR256:$dst, 1627 (v8i32 (X86any_cvttp2si (loadv8f32 addr:$src))))]>, 1628 VEX, VEX_L, 1629 Sched<[WriteCvtPS2IYLd]>, WIG; 1630} 1631 1632def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1633 "cvttps2dq\t{$src, $dst|$dst, $src}", 1634 [(set VR128:$dst, 1635 (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>, 1636 
Sched<[WriteCvtPS2I]>; 1637def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1638 "cvttps2dq\t{$src, $dst|$dst, $src}", 1639 [(set VR128:$dst, 1640 (v4i32 (X86any_cvttp2si (memopv4f32 addr:$src))))]>, 1641 Sched<[WriteCvtPS2ILd]>; 1642} 1643 1644// The assembler can recognize rr 256-bit instructions by seeing a ymm 1645// register, but the same isn't true when using memory operands instead. 1646// Provide other assembly rr and rm forms to address this explicitly. 1647let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in { 1648// XMM only 1649def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1650 "cvttpd2dq\t{$src, $dst|$dst, $src}", 1651 [(set VR128:$dst, 1652 (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>, 1653 VEX, Sched<[WriteCvtPD2I]>, WIG; 1654def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1655 "cvttpd2dq{x}\t{$src, $dst|$dst, $src}", 1656 [(set VR128:$dst, 1657 (v4i32 (X86any_cvttp2si (loadv2f64 addr:$src))))]>, 1658 VEX, Sched<[WriteCvtPD2ILd]>, WIG; 1659 1660// YMM only 1661def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), 1662 "cvttpd2dq\t{$src, $dst|$dst, $src}", 1663 [(set VR128:$dst, 1664 (v4i32 (X86any_cvttp2si (v4f64 VR256:$src))))]>, 1665 VEX, VEX_L, Sched<[WriteCvtPD2IY]>, WIG; 1666def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), 1667 "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", 1668 [(set VR128:$dst, 1669 (v4i32 (X86any_cvttp2si (loadv4f64 addr:$src))))]>, 1670 VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, WIG; 1671} // Predicates = [HasAVX, NoVLX] 1672 1673def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}", 1674 (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0, "att">; 1675def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}", 1676 (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">; 1677 1678let Predicates = [HasAVX, NoVLX] in { 1679 def : Pat<(v4i32 (any_fp_to_sint (v4f64 VR256:$src))), 
1680 (VCVTTPD2DQYrr VR256:$src)>; 1681 def : Pat<(v4i32 (any_fp_to_sint (loadv4f64 addr:$src))), 1682 (VCVTTPD2DQYrm addr:$src)>; 1683} 1684 1685def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1686 "cvttpd2dq\t{$src, $dst|$dst, $src}", 1687 [(set VR128:$dst, 1688 (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>, 1689 Sched<[WriteCvtPD2I]>, SIMD_EXC; 1690def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src), 1691 "cvttpd2dq\t{$src, $dst|$dst, $src}", 1692 [(set VR128:$dst, 1693 (v4i32 (X86any_cvttp2si (memopv2f64 addr:$src))))]>, 1694 Sched<[WriteCvtPD2ILd]>, SIMD_EXC; 1695 1696// Convert packed single to packed double 1697let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in { 1698 // SSE2 instructions without OpSize prefix 1699def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1700 "vcvtps2pd\t{$src, $dst|$dst, $src}", 1701 [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>, 1702 TB, VEX, Sched<[WriteCvtPS2PD]>, WIG; 1703def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), 1704 "vcvtps2pd\t{$src, $dst|$dst, $src}", 1705 [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>, 1706 TB, VEX, Sched<[WriteCvtPS2PD.Folded]>, WIG; 1707def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), 1708 "vcvtps2pd\t{$src, $dst|$dst, $src}", 1709 [(set VR256:$dst, (v4f64 (any_fpextend (v4f32 VR128:$src))))]>, 1710 TB, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, WIG; 1711def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), 1712 "vcvtps2pd\t{$src, $dst|$dst, $src}", 1713 [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>, 1714 TB, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, WIG; 1715} 1716 1717let Predicates = [UseSSE2], Uses = [MXCSR], mayRaiseFPException = 1 in { 1718def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1719 "cvtps2pd\t{$src, $dst|$dst, $src}", 1720 [(set VR128:$dst, (v2f64 
(X86any_vfpext (v4f32 VR128:$src))))]>, 1721 TB, Sched<[WriteCvtPS2PD]>; 1722def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), 1723 "cvtps2pd\t{$src, $dst|$dst, $src}", 1724 [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>, 1725 TB, Sched<[WriteCvtPS2PD.Folded]>; 1726} 1727 1728// Convert Packed DW Integers to Packed Double FP 1729let Predicates = [HasAVX, NoVLX] in { 1730let hasSideEffects = 0, mayLoad = 1 in 1731def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 1732 "vcvtdq2pd\t{$src, $dst|$dst, $src}", 1733 [(set VR128:$dst, 1734 (v2f64 (X86any_VSintToFP 1735 (bc_v4i32 1736 (v2i64 (scalar_to_vector 1737 (loadi64 addr:$src)))))))]>, 1738 VEX, Sched<[WriteCvtI2PDLd]>, WIG; 1739def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1740 "vcvtdq2pd\t{$src, $dst|$dst, $src}", 1741 [(set VR128:$dst, 1742 (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>, 1743 VEX, Sched<[WriteCvtI2PD]>, WIG; 1744def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src), 1745 "vcvtdq2pd\t{$src, $dst|$dst, $src}", 1746 [(set VR256:$dst, 1747 (v4f64 (any_sint_to_fp (loadv4i32 addr:$src))))]>, 1748 VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>, 1749 WIG; 1750def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), 1751 "vcvtdq2pd\t{$src, $dst|$dst, $src}", 1752 [(set VR256:$dst, 1753 (v4f64 (any_sint_to_fp (v4i32 VR128:$src))))]>, 1754 VEX, VEX_L, Sched<[WriteCvtI2PDY]>, WIG; 1755} 1756 1757let hasSideEffects = 0, mayLoad = 1 in 1758def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), 1759 "cvtdq2pd\t{$src, $dst|$dst, $src}", 1760 [(set VR128:$dst, 1761 (v2f64 (X86any_VSintToFP 1762 (bc_v4i32 1763 (v2i64 (scalar_to_vector 1764 (loadi64 addr:$src)))))))]>, 1765 Sched<[WriteCvtI2PDLd]>; 1766def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1767 "cvtdq2pd\t{$src, $dst|$dst, $src}", 1768 [(set VR128:$dst, 1769 (v2f64 (X86any_VSintToFP 
(v4i32 VR128:$src))))]>, 1770 Sched<[WriteCvtI2PD]>; 1771 1772// AVX register conversion intrinsics 1773let Predicates = [HasAVX, NoVLX] in { 1774 def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))), 1775 (VCVTDQ2PDrm addr:$src)>; 1776} // Predicates = [HasAVX, NoVLX] 1777 1778// SSE2 register conversion intrinsics 1779let Predicates = [UseSSE2] in { 1780 def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))), 1781 (CVTDQ2PDrm addr:$src)>; 1782} // Predicates = [UseSSE2] 1783 1784// Convert packed double to packed single 1785// The assembler can recognize rr 256-bit instructions by seeing a ymm 1786// register, but the same isn't true when using memory operands instead. 1787// Provide other assembly rr and rm forms to address this explicitly. 1788let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in { 1789// XMM only 1790def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1791 "cvtpd2ps\t{$src, $dst|$dst, $src}", 1792 [(set VR128:$dst, (v4f32 (X86any_vfpround (v2f64 VR128:$src))))]>, 1793 VEX, Sched<[WriteCvtPD2PS]>, WIG; 1794def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1795 "cvtpd2ps{x}\t{$src, $dst|$dst, $src}", 1796 [(set VR128:$dst, (v4f32 (X86any_vfpround (loadv2f64 addr:$src))))]>, 1797 VEX, Sched<[WriteCvtPD2PS.Folded]>, WIG; 1798 1799def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), 1800 "cvtpd2ps\t{$src, $dst|$dst, $src}", 1801 [(set VR128:$dst, (v4f32 (X86any_vfpround (v4f64 VR256:$src))))]>, 1802 VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, WIG; 1803def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), 1804 "cvtpd2ps{y}\t{$src, $dst|$dst, $src}", 1805 [(set VR128:$dst, (v4f32 (X86any_vfpround (loadv4f64 addr:$src))))]>, 1806 VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, WIG; 1807} // Predicates = [HasAVX, NoVLX] 1808 1809def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}", 
1810 (VCVTPD2PSrr VR128:$dst, VR128:$src), 0, "att">; 1811def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}", 1812 (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0, "att">; 1813 1814def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 1815 "cvtpd2ps\t{$src, $dst|$dst, $src}", 1816 [(set VR128:$dst, (v4f32 (X86any_vfpround (v2f64 VR128:$src))))]>, 1817 Sched<[WriteCvtPD2PS]>, SIMD_EXC; 1818def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 1819 "cvtpd2ps\t{$src, $dst|$dst, $src}", 1820 [(set VR128:$dst, (v4f32 (X86any_vfpround (memopv2f64 addr:$src))))]>, 1821 Sched<[WriteCvtPD2PS.Folded]>, SIMD_EXC; 1822 1823//===----------------------------------------------------------------------===// 1824// SSE 1 & 2 - Compare Instructions 1825//===----------------------------------------------------------------------===// 1826 1827// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions 1828multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, 1829 Operand memop, SDNode OpNode, ValueType VT, 1830 PatFrag ld_frag, string asm, 1831 X86FoldableSchedWrite sched, 1832 PatFrags mem_frags> { 1833 def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst), 1834 (ins VR128:$src1, VR128:$src2, u8imm:$cc), asm, 1835 [(set VR128:$dst, (OpNode (VT VR128:$src1), 1836 VR128:$src2, timm:$cc))]>, 1837 Sched<[sched]>, SIMD_EXC; 1838 let mayLoad = 1 in 1839 def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst), 1840 (ins VR128:$src1, memop:$src2, u8imm:$cc), asm, 1841 [(set VR128:$dst, (OpNode (VT VR128:$src1), 1842 (mem_frags addr:$src2), timm:$cc))]>, 1843 Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; 1844 1845 let isCodeGenOnly = 1 in { 1846 let isCommutable = 1 in 1847 def rr : SIi8<0xC2, MRMSrcReg, 1848 (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm, 1849 [(set RC:$dst, (OpNode RC:$src1, RC:$src2, timm:$cc))]>, 1850 Sched<[sched]>, SIMD_EXC; 1851 def rm : SIi8<0xC2, MRMSrcMem, 1852 (outs RC:$dst), (ins RC:$src1, 
x86memop:$src2, u8imm:$cc), asm, 1853 [(set RC:$dst, (OpNode RC:$src1, 1854 (ld_frag addr:$src2), timm:$cc))]>, 1855 Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; 1856 } 1857} 1858 1859let ExeDomain = SSEPackedSingle in 1860defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32, 1861 "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 1862 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, 1863 TB, XS, VEX, VVVV, VEX_LIG, WIG; 1864let ExeDomain = SSEPackedDouble in 1865defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64, 1866 "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 1867 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, 1868 TB, XD, VEX, VVVV, VEX_LIG, WIG; 1869 1870let Constraints = "$src1 = $dst" in { 1871 let ExeDomain = SSEPackedSingle in 1872 defm CMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32, 1873 "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", 1874 SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, TB, XS; 1875 let ExeDomain = SSEPackedDouble in 1876 defm CMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64, 1877 "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}", 1878 SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, TB, XD; 1879} 1880 1881// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS 1882multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDPatternOperator OpNode, 1883 ValueType vt, X86MemOperand x86memop, 1884 PatFrag ld_frag, string OpcodeStr, Domain d, 1885 X86FoldableSchedWrite sched = WriteFComX> { 1886 let ExeDomain = d in { 1887 def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), 1888 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), 1889 [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>, 1890 Sched<[sched]>, SIMD_EXC; 1891 let mayLoad = 1 in 1892 def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2), 1893 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), 1894 [(set EFLAGS, (OpNode (vt RC:$src1), 1895 
(ld_frag addr:$src2)))]>, 1896 Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; 1897} 1898} 1899 1900// sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp 1901multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode, 1902 ValueType vt, Operand memop, 1903 PatFrags mem_frags, string OpcodeStr, 1904 Domain d, 1905 X86FoldableSchedWrite sched = WriteFComX> { 1906let ExeDomain = d in { 1907 def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), 1908 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), 1909 [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>, 1910 Sched<[sched]>, SIMD_EXC; 1911let mayLoad = 1 in 1912 def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2), 1913 !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), 1914 [(set EFLAGS, (OpNode (vt RC:$src1), 1915 (mem_frags addr:$src2)))]>, 1916 Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; 1917} 1918} 1919 1920let Defs = [EFLAGS] in { 1921 defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32, 1922 "ucomiss", SSEPackedSingle>, TB, VEX, VEX_LIG, WIG; 1923 defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64, 1924 "ucomisd", SSEPackedDouble>, TB, PD, VEX, VEX_LIG, WIG; 1925 defm VCOMISS : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32, 1926 "comiss", SSEPackedSingle>, TB, VEX, VEX_LIG, WIG; 1927 defm VCOMISD : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64, 1928 "comisd", SSEPackedDouble>, TB, PD, VEX, VEX_LIG, WIG; 1929 1930 let isCodeGenOnly = 1 in { 1931 defm VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem, 1932 sse_load_f32, "ucomiss", SSEPackedSingle>, TB, VEX, VEX_LIG, WIG; 1933 defm VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem, 1934 sse_load_f64, "ucomisd", SSEPackedDouble>, TB, PD, VEX, VEX_LIG, WIG; 1935 1936 defm VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem, 1937 sse_load_f32, "comiss", SSEPackedSingle>, TB, 
VEX, VEX_LIG, WIG; 1938 defm VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem, 1939 sse_load_f64, "comisd", SSEPackedDouble>, TB, PD, VEX, VEX_LIG, WIG; 1940 } 1941 defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32, 1942 "ucomiss", SSEPackedSingle>, TB; 1943 defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64, 1944 "ucomisd", SSEPackedDouble>, TB, PD; 1945 defm COMISS : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32, 1946 "comiss", SSEPackedSingle>, TB; 1947 defm COMISD : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64, 1948 "comisd", SSEPackedDouble>, TB, PD; 1949 1950 let isCodeGenOnly = 1 in { 1951 defm UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem, 1952 sse_load_f32, "ucomiss", SSEPackedSingle>, TB; 1953 defm UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem, 1954 sse_load_f64, "ucomisd", SSEPackedDouble>, TB, PD; 1955 1956 defm COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem, 1957 sse_load_f32, "comiss", SSEPackedSingle>, TB; 1958 defm COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem, 1959 sse_load_f64, "comisd", SSEPackedDouble>, TB, PD; 1960 } 1961} // Defs = [EFLAGS] 1962 1963// sse12_cmp_packed - sse 1 & 2 compare packed instructions 1964multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, 1965 ValueType VT, string asm, 1966 X86FoldableSchedWrite sched, 1967 Domain d, PatFrag ld_frag> { 1968 let isCommutable = 1 in 1969 def rri : PIi8<0xC2, MRMSrcReg, 1970 (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm, 1971 [(set RC:$dst, (VT (X86any_cmpp RC:$src1, RC:$src2, timm:$cc)))], d>, 1972 Sched<[sched]>, SIMD_EXC; 1973 def rmi : PIi8<0xC2, MRMSrcMem, 1974 (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm, 1975 [(set RC:$dst, 1976 (VT (X86any_cmpp RC:$src1, (ld_frag addr:$src2), timm:$cc)))], d>, 1977 Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; 1978} 1979 1980defm 
VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32, 1981 "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 1982 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, TB, VEX, VVVV, WIG; 1983defm VCMPPD : sse12_cmp_packed<VR128, f128mem, v2f64, 1984 "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 1985 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, TB, PD, VEX, VVVV, WIG; 1986defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, v8f32, 1987 "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 1988 SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, TB, VEX, VVVV, VEX_L, WIG; 1989defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, v4f64, 1990 "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", 1991 SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, TB, PD, VEX, VVVV, VEX_L, WIG; 1992let Constraints = "$src1 = $dst" in { 1993 defm CMPPS : sse12_cmp_packed<VR128, f128mem, v4f32, 1994 "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}", 1995 SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, TB; 1996 defm CMPPD : sse12_cmp_packed<VR128, f128mem, v2f64, 1997 "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}", 1998 SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, TB, PD; 1999} 2000 2001def CommutableCMPCC : PatLeaf<(timm), [{ 2002 uint64_t Imm = N->getZExtValue() & 0x7; 2003 return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07); 2004}]>; 2005 2006// Patterns to select compares with loads in first operand. 
// Fold a load that shows up as the *first* operand of an FP compare.
//
// Every pattern below matches (cmp (load addr), reg, cc) and selects the
// memory form of the compare with the two sources swapped: the register is
// passed as $src1 and the address as $src2. The immediate is matched through
// CommutableCMPCC, so only condition codes accepted by that leaf are folded
// this way; the immediate itself is forwarded unchanged.
//
// Packed AVX patterns use the loadv* fragments, packed legacy-SSE patterns use
// the memopv* fragments, and the scalar patterns use loadf32/loadf64.

// VEX-encoded forms.
let Predicates = [HasAVX] in {
  // 256-bit packed double.
  def : Pat<(v4f64 (X86any_cmpp (loadv4f64 addr:$src2),
                                VR256:$src1,
                                CommutableCMPCC:$cc)),
            (VCMPPDYrmi VR256:$src1, addr:$src2, timm:$cc)>;

  // 256-bit packed single.
  def : Pat<(v8f32 (X86any_cmpp (loadv8f32 addr:$src2),
                                VR256:$src1,
                                CommutableCMPCC:$cc)),
            (VCMPPSYrmi VR256:$src1, addr:$src2, timm:$cc)>;

  // 128-bit packed double.
  def : Pat<(v2f64 (X86any_cmpp (loadv2f64 addr:$src2),
                                VR128:$src1,
                                CommutableCMPCC:$cc)),
            (VCMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;

  // 128-bit packed single.
  def : Pat<(v4f32 (X86any_cmpp (loadv4f32 addr:$src2),
                                VR128:$src1,
                                CommutableCMPCC:$cc)),
            (VCMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;

  // Scalar double.
  def : Pat<(f64 (X86cmps (loadf64 addr:$src2),
                          FR64:$src1,
                          CommutableCMPCC:$cc)),
            (VCMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;

  // Scalar single.
  def : Pat<(f32 (X86cmps (loadf32 addr:$src2),
                          FR32:$src1,
                          CommutableCMPCC:$cc)),
            (VCMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
}

// Legacy-encoded double-precision forms.
let Predicates = [UseSSE2] in {
  // 128-bit packed double.
  def : Pat<(v2f64 (X86any_cmpp (memopv2f64 addr:$src2),
                                VR128:$src1,
                                CommutableCMPCC:$cc)),
            (CMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;

  // Scalar double.
  def : Pat<(f64 (X86cmps (loadf64 addr:$src2),
                          FR64:$src1,
                          CommutableCMPCC:$cc)),
            (CMPSDrm FR64:$src1, addr:$src2, timm:$cc)>;
}

// Legacy-encoded single-precision forms.
let Predicates = [UseSSE1] in {
  // 128-bit packed single.
  def : Pat<(v4f32 (X86any_cmpp (memopv4f32 addr:$src2),
                                VR128:$src1,
                                CommutableCMPCC:$cc)),
            (CMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;

  // Scalar single.
  def : Pat<(f32 (X86cmps (loadf32 addr:$src2),
                          FR32:$src1,
                          CommutableCMPCC:$cc)),
            (CMPSSrm FR32:$src1, addr:$src2, timm:$cc)>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Shuffle Instructions
//===----------------------------------------------------------------------===//

/// sse12_shuffle - SSE 1 & 2 floating-point shuffle (opcode 0xC6).
///
/// Emits two records per instantiation, both selecting X86Shufp with an i8
/// target immediate in $src3:
///   rmi - $src2 comes from memory through \p mem_frag; scheduled with
///         sched.Folded / sched.ReadAfterFold.
///   rri - $src2 is a register; scheduled with \p sched. This is the only
///         record whose isCommutable bit is driven by \p IsCommutable.
///
/// \p RC           register class of $dst, $src1 and (in rri) $src2.
/// \p x86memop     memory operand type of $src2 in rmi.
/// \p vt           value type of the shuffle result.
/// \p asm          complete assembly string, used verbatim by both records.
/// \p mem_frag     load fragment wrapped around addr:$src2 in rmi.
/// \p sched        foldable scheduling class shared by both records.
/// \p d            execution domain forwarded to PIi8.
/// \p IsCommutable value for isCommutable on rri (defaults to 0).
multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
                         ValueType vt, string asm, PatFrag mem_frag,
                         X86FoldableSchedWrite sched, Domain d,
                         bit IsCommutable = 0> {
  // Register/memory form.
  def rmi : PIi8<0xC6, MRMSrcMem,
                 (outs RC:$dst),
                 (ins RC:$src1, x86memop:$src2, u8imm:$src3),
                 asm,
                 [(set RC:$dst,
                       (vt (X86Shufp RC:$src1,
                                     (mem_frag addr:$src2),
                                     (i8 timm:$src3))))],
                 d>,
            Sched<[sched.Folded, sched.ReadAfterFold]>;

  // Register/register form.
  let isCommutable = IsCommutable in {
    def rri : PIi8<0xC6, MRMSrcReg,
                   (outs RC:$dst),
                   (ins RC:$src1, RC:$src2, u8imm:$src3),
                   asm,
                   [(set RC:$dst,
                         (vt (X86Shufp RC:$src1,
                                       RC:$src2,
                                       (i8 timm:$src3))))],
                   d>,
              Sched<[sched]>;
  }
}

// VEX-encoded, three-operand shuffles (128-bit and 256-bit). None of these
// override IsCommutable, so rri keeps the default of 0.
let Predicates = [HasAVX, NoVLX] in {
  defm VSHUFPS  : sse12_shuffle<VR128, f128mem, v4f32,
      "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
      loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>,
      TB, VEX, VVVV, WIG;

  defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
      "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
      loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>,
      TB, VEX, VVVV, VEX_L, WIG;

  defm VSHUFPD  : sse12_shuffle<VR128, f128mem, v2f64,
      "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
      loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>,
      TB, PD, VEX, VVVV, WIG;

  defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
      "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
      loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>,
      TB, PD, VEX, VVVV, VEX_L, WIG;
}

// Legacy-encoded, two-operand shuffles: $src1 is tied to $dst. SHUFPD is the
// only instantiation that passes IsCommutable = 1.
let Constraints = "$src1 = $dst" in {
  defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
      "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
      memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>,
      TB;

  defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
      "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
      memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble, 1>,
      TB, PD;
}
2102//===----------------------------------------------------------------------===// 2103// SSE 1 & 2 - Unpack FP Instructions 2104//===----------------------------------------------------------------------===// 2105 2106/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave 2107multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt, 2108 PatFrag mem_frag, RegisterClass RC, 2109 X86MemOperand x86memop, string asm, 2110 X86FoldableSchedWrite sched, Domain d, 2111 bit IsCommutable = 0> { 2112 let isCommutable = IsCommutable in 2113 def rr : PI<opc, MRMSrcReg, 2114 (outs RC:$dst), (ins RC:$src1, RC:$src2), 2115 asm, [(set RC:$dst, 2116 (vt (OpNode RC:$src1, RC:$src2)))], d>, 2117 Sched<[sched]>; 2118 def rm : PI<opc, MRMSrcMem, 2119 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 2120 asm, [(set RC:$dst, 2121 (vt (OpNode RC:$src1, 2122 (mem_frag addr:$src2))))], d>, 2123 Sched<[sched.Folded, sched.ReadAfterFold]>; 2124} 2125 2126let Predicates = [HasAVX, NoVLX] in { 2127defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load, 2128 VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2129 SchedWriteFShuffle.XMM, SSEPackedSingle>, TB, VEX, VVVV, WIG; 2130defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load, 2131 VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2132 SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, TB, PD, VEX, VVVV, WIG; 2133defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load, 2134 VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2135 SchedWriteFShuffle.XMM, SSEPackedSingle>, TB, VEX, VVVV, WIG; 2136defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load, 2137 VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2138 SchedWriteFShuffle.XMM, SSEPackedDouble>, TB, PD, VEX, VVVV, WIG; 2139 2140defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load, 2141 VR256, f256mem, "unpckhps\t{$src2, 
$src1, $dst|$dst, $src1, $src2}", 2142 SchedWriteFShuffle.YMM, SSEPackedSingle>, TB, VEX, VVVV, VEX_L, WIG; 2143defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load, 2144 VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2145 SchedWriteFShuffle.YMM, SSEPackedDouble>, TB, PD, VEX, VVVV, VEX_L, WIG; 2146defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load, 2147 VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2148 SchedWriteFShuffle.YMM, SSEPackedSingle>, TB, VEX, VVVV, VEX_L, WIG; 2149defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load, 2150 VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", 2151 SchedWriteFShuffle.YMM, SSEPackedDouble>, TB, PD, VEX, VVVV, VEX_L, WIG; 2152}// Predicates = [HasAVX, NoVLX] 2153 2154let Constraints = "$src1 = $dst" in { 2155 defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memop, 2156 VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}", 2157 SchedWriteFShuffle.XMM, SSEPackedSingle>, TB; 2158 defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memop, 2159 VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}", 2160 SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, TB, PD; 2161 defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop, 2162 VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}", 2163 SchedWriteFShuffle.XMM, SSEPackedSingle>, TB; 2164 defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop, 2165 VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}", 2166 SchedWriteFShuffle.XMM, SSEPackedDouble>, TB, PD; 2167} // Constraints = "$src1 = $dst" 2168 2169let Predicates = [HasAVX1Only] in { 2170 def : Pat<(v8i32 (X86Unpckl VR256:$src1, (loadv8i32 addr:$src2))), 2171 (VUNPCKLPSYrm VR256:$src1, addr:$src2)>; 2172 def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)), 2173 (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>; 2174 def : Pat<(v8i32 (X86Unpckh VR256:$src1, (loadv8i32 
addr:$src2))), 2175 (VUNPCKHPSYrm VR256:$src1, addr:$src2)>; 2176 def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)), 2177 (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>; 2178 2179 def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))), 2180 (VUNPCKLPDYrm VR256:$src1, addr:$src2)>; 2181 def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)), 2182 (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>; 2183 def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))), 2184 (VUNPCKHPDYrm VR256:$src1, addr:$src2)>; 2185 def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)), 2186 (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>; 2187} 2188 2189let Predicates = [UseSSE2] in { 2190 // Use MOVHPD if the load isn't aligned enough for UNPCKLPD. 2191 def : Pat<(v2f64 (X86Unpckl VR128:$src1, 2192 (v2f64 (simple_load addr:$src2)))), 2193 (MOVHPDrm VR128:$src1, addr:$src2)>; 2194} 2195 2196//===----------------------------------------------------------------------===// 2197// SSE 1 & 2 - Extract Floating-Point Sign mask 2198//===----------------------------------------------------------------------===// 2199 2200/// sse12_extr_sign_mask - sse 1 & 2 extract floating-point sign mask 2201multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt, 2202 string asm, Domain d> { 2203 def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src), 2204 !strconcat(asm, "\t{$src, $dst|$dst, $src}"), 2205 [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], d>, 2206 Sched<[WriteFMOVMSK]>; 2207} 2208 2209let Predicates = [HasAVX] in { 2210 defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps", 2211 SSEPackedSingle>, TB, VEX, WIG; 2212 defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd", 2213 SSEPackedDouble>, TB, PD, VEX, WIG; 2214 defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps", 2215 SSEPackedSingle>, TB, VEX, VEX_L, WIG; 2216 defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd", 2217 SSEPackedDouble>, TB, PD, VEX, VEX_L, WIG; 2218 2219 // Also support 
integer VTs to avoid a int->fp bitcast in the DAG. 2220 def : Pat<(X86movmsk (v4i32 VR128:$src)), 2221 (VMOVMSKPSrr VR128:$src)>; 2222 def : Pat<(X86movmsk (v2i64 VR128:$src)), 2223 (VMOVMSKPDrr VR128:$src)>; 2224 def : Pat<(X86movmsk (v8i32 VR256:$src)), 2225 (VMOVMSKPSYrr VR256:$src)>; 2226 def : Pat<(X86movmsk (v4i64 VR256:$src)), 2227 (VMOVMSKPDYrr VR256:$src)>; 2228} 2229 2230defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps", 2231 SSEPackedSingle>, TB; 2232defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd", 2233 SSEPackedDouble>, TB, PD; 2234 2235let Predicates = [UseSSE2] in { 2236 // Also support integer VTs to avoid a int->fp bitcast in the DAG. 2237 def : Pat<(X86movmsk (v4i32 VR128:$src)), 2238 (MOVMSKPSrr VR128:$src)>; 2239 def : Pat<(X86movmsk (v2i64 VR128:$src)), 2240 (MOVMSKPDrr VR128:$src)>; 2241} 2242 2243//===---------------------------------------------------------------------===// 2244// SSE2 - Packed Integer Logical Instructions 2245//===---------------------------------------------------------------------===// 2246 2247let ExeDomain = SSEPackedInt in { // SSE integer instructions 2248 2249/// PDI_binop_rm - Simple SSE2 binary operator. 
/// PDI_binop_rm defines a register-register form (rr) and a register-memory
/// form (rm) of one binary operator.
///   - Is2Addr selects the two-operand assembly string; otherwise the
///     three-operand string is used.
///   - IsCommutable is applied to the rr form only.
///   - The rm form reads its second operand through `memop_frag` and uses the
///     folded scheduling classes of `sched`.
multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                        ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                        X86MemOperand x86memop, X86FoldableSchedWrite sched,
                        bit IsCommutable, bit Is2Addr> {
  let isCommutable = IsCommutable in
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
       Sched<[sched]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}
} // ExeDomain = SSEPackedInt

/// PDI_binop_all - Instantiate PDI_binop_rm three times for one operator:
///   V#NAME   - VEX 128-bit, three-operand, `load` fragment, [HasAVX, prd]
///   NAME     - non-VEX 128-bit, $src1 tied to $dst, `memop` fragment
///   V#NAME#Y - VEX 256-bit, three-operand, `load` fragment, [HasAVX2, prd]
multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
                         ValueType OpVT128, ValueType OpVT256,
                         X86SchedWriteWidths sched, bit IsCommutable,
                         Predicate prd> {
let Predicates = [HasAVX, prd] in
  defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
                             VR128, load, i128mem, sched.XMM,
                             IsCommutable, 0>, VEX, VVVV, WIG;

let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
                           memop, i128mem, sched.XMM, IsCommutable, 1>;

let Predicates = [HasAVX2, prd] in
  defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
                               OpVT256, VR256, load, i256mem, sched.YMM,
                               IsCommutable, 0>, VEX, VVVV, VEX_L, WIG;
}

// These are ordered here for pattern ordering requirements with the fp versions

// Only v2i64 / v4i64 are passed as the pattern types here. The other integer
// element types are covered by the explicit patterns further down.
defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
                          SchedWriteVecLogic, 1, NoVLX>;
defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
                         SchedWriteVecLogic, 1, NoVLX>;
defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
                          SchedWriteVecLogic, 1, NoVLX>;
// PANDN is the only one instantiated with IsCommutable = 0.
defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
                           SchedWriteVecLogic, 0, NoVLX>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Logical Instructions
//===----------------------------------------------------------------------===//

/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
///
/// There are no patterns here because isel prefers integer versions for SSE2
/// and later. There are SSE1 v4f32 patterns later.
///
/// Six variants are instantiated per operator: the VEX PS/PD forms at 256 and
/// 128 bits under [HasAVX, NoVLX], and the non-VEX PS/PD forms with $src1
/// tied to $dst.
multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
                                   X86SchedWriteWidths sched> {
  let Predicates = [HasAVX, NoVLX] in {
  defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
        !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM,
        [], [], 0>, TB, VEX, VVVV, VEX_L, WIG;

  defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
        !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM,
        [], [], 0>, TB, PD, VEX, VVVV, VEX_L, WIG;

  defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
       !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
       [], [], 0>, TB, VEX, VVVV, WIG;

  defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
       !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
       [], [], 0>, TB, PD, VEX, VVVV, WIG;
  }

  let Constraints = "$src1 = $dst" in {
    defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
         !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM,
         [], []>, TB;

    defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
         !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM,
         [], []>, TB, PD;
  }
}

defm AND : sse12_fp_packed_logical<0x54, "and", SchedWriteFLogic>;
defm OR : sse12_fp_packed_logical<0x56, "or", SchedWriteFLogic>;
defm XOR : sse12_fp_packed_logical<0x57, "xor", SchedWriteFLogic>;
let isCommutable = 0 in
  defm ANDN : sse12_fp_packed_logical<0x55, "andn", SchedWriteFLogic>;

// 256-bit integer logic for the element types other than i64 (v32i8, v16i16,
// v8i32), selected to the VP*Y instructions. Register forms come first, then
// the forms with a folded load.
let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
            (VPANDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
            (VPANDYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
            (VPANDYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
            (VPORYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
            (VPORYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
            (VPORYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
            (VPXORYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
            (VPXORYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
            (VPXORYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
            (VPANDNYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
            (VPANDNYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
            (VPANDNYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
            (VPANDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
            (VPANDYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
            (VPANDYrm VR256:$src1, addr:$src2)>;

  def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
            (VPORYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
            (VPORYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
            (VPORYrm VR256:$src1, addr:$src2)>;

  def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
            (VPXORYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
            (VPXORYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
            (VPXORYrm VR256:$src1, addr:$src2)>;

  def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
            (VPANDNYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
            (VPANDNYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
            (VPANDNYrm VR256:$src1, addr:$src2)>;
}

// If only AVX1 is supported, we need to handle integer operations with
// floating point instructions since the integer versions aren't available.
// All four 256-bit integer types (including v4i64) map to the PS forms.
let Predicates = [HasAVX1Only] in {
  def : Pat<(v32i8 (and VR256:$src1, VR256:$src2)),
            (VANDPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (and VR256:$src1, VR256:$src2)),
            (VANDPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (and VR256:$src1, VR256:$src2)),
            (VANDPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)),
            (VANDPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (or VR256:$src1, VR256:$src2)),
            (VORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (or VR256:$src1, VR256:$src2)),
            (VORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (or VR256:$src1, VR256:$src2)),
            (VORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)),
            (VORPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (xor VR256:$src1, VR256:$src2)),
            (VXORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (xor VR256:$src1, VR256:$src2)),
            (VXORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (xor VR256:$src1, VR256:$src2)),
            (VXORPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)),
            (VXORPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(v32i8 (X86andnp VR256:$src1, VR256:$src2)),
            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v16i16 (X86andnp VR256:$src1, VR256:$src2)),
            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v8i32 (X86andnp VR256:$src1, VR256:$src2)),
            (VANDNPSYrr VR256:$src1, VR256:$src2)>;
  def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)),
            (VANDNPSYrr VR256:$src1, VR256:$src2)>;

  def : Pat<(and VR256:$src1, (loadv32i8 addr:$src2)),
            (VANDPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv16i16 addr:$src2)),
            (VANDPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv8i32 addr:$src2)),
            (VANDPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)),
            (VANDPSYrm VR256:$src1, addr:$src2)>;

  def : Pat<(or VR256:$src1, (loadv32i8 addr:$src2)),
            (VORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv16i16 addr:$src2)),
            (VORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv8i32 addr:$src2)),
            (VORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)),
            (VORPSYrm VR256:$src1, addr:$src2)>;

  def : Pat<(xor VR256:$src1, (loadv32i8 addr:$src2)),
            (VXORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv16i16 addr:$src2)),
            (VXORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv8i32 addr:$src2)),
            (VXORPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)),
            (VXORPSYrm VR256:$src1, addr:$src2)>;

  def : Pat<(X86andnp VR256:$src1, (loadv32i8 addr:$src2)),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv16i16 addr:$src2)),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv8i32 addr:$src2)),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)),
            (VANDNPSYrm VR256:$src1, addr:$src2)>;
}

// 128-bit VEX integer logic for v16i8 / v8i16 / v4i32, selected to the VP*
// instructions. Loads are matched with the loadv* fragments.
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
            (VPANDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
            (VPANDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
            (VPANDrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
            (VPORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
            (VPORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
            (VPORrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
            (VPXORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
            (VPXORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
            (VPXORrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
            (VPANDNrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
            (VPANDNrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
            (VPANDNrr VR128:$src1, VR128:$src2)>;

  def : Pat<(and VR128:$src1, (loadv16i8 addr:$src2)),
            (VPANDrm VR128:$src1, addr:$src2)>;
  def : Pat<(and VR128:$src1, (loadv8i16 addr:$src2)),
            (VPANDrm VR128:$src1, addr:$src2)>;
  def : Pat<(and VR128:$src1, (loadv4i32 addr:$src2)),
            (VPANDrm VR128:$src1, addr:$src2)>;

  def : Pat<(or VR128:$src1, (loadv16i8 addr:$src2)),
            (VPORrm VR128:$src1, addr:$src2)>;
  def : Pat<(or VR128:$src1, (loadv8i16 addr:$src2)),
            (VPORrm VR128:$src1, addr:$src2)>;
  def : Pat<(or VR128:$src1, (loadv4i32 addr:$src2)),
            (VPORrm VR128:$src1, addr:$src2)>;

  def : Pat<(xor VR128:$src1, (loadv16i8 addr:$src2)),
            (VPXORrm VR128:$src1, addr:$src2)>;
  def : Pat<(xor VR128:$src1, (loadv8i16 addr:$src2)),
            (VPXORrm VR128:$src1, addr:$src2)>;
  def : Pat<(xor VR128:$src1, (loadv4i32 addr:$src2)),
            (VPXORrm VR128:$src1, addr:$src2)>;

  def : Pat<(X86andnp VR128:$src1, (loadv16i8 addr:$src2)),
            (VPANDNrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR128:$src1, (loadv8i16 addr:$src2)),
            (VPANDNrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR128:$src1, (loadv4i32 addr:$src2)),
            (VPANDNrm VR128:$src1, addr:$src2)>;
}

// Non-VEX 128-bit integer logic for v16i8 / v8i16 / v4i32. Loads are matched
// with the memopv* fragments rather than loadv*.
let Predicates = [UseSSE2] in {
  def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
            (PANDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (and VR128:$src1, VR128:$src2)),
            (PANDrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (and VR128:$src1, VR128:$src2)),
            (PANDrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (or VR128:$src1, VR128:$src2)),
            (PORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (or VR128:$src1, VR128:$src2)),
            (PORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (or VR128:$src1, VR128:$src2)),
            (PORrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (xor VR128:$src1, VR128:$src2)),
            (PXORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (xor VR128:$src1, VR128:$src2)),
            (PXORrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (xor VR128:$src1, VR128:$src2)),
            (PXORrr VR128:$src1, VR128:$src2)>;

  def : Pat<(v16i8 (X86andnp VR128:$src1, VR128:$src2)),
            (PANDNrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v8i16 (X86andnp VR128:$src1, VR128:$src2)),
            (PANDNrr VR128:$src1, VR128:$src2)>;
  def : Pat<(v4i32 (X86andnp VR128:$src1, VR128:$src2)),
            (PANDNrr VR128:$src1, VR128:$src2)>;

  def : Pat<(and VR128:$src1, (memopv16i8 addr:$src2)),
            (PANDrm VR128:$src1, addr:$src2)>;
  def : Pat<(and VR128:$src1, (memopv8i16 addr:$src2)),
            (PANDrm VR128:$src1, addr:$src2)>;
  def : Pat<(and VR128:$src1, (memopv4i32 addr:$src2)),
            (PANDrm VR128:$src1, addr:$src2)>;

  def : Pat<(or VR128:$src1, (memopv16i8 addr:$src2)),
            (PORrm VR128:$src1, addr:$src2)>;
  def : Pat<(or VR128:$src1, (memopv8i16 addr:$src2)),
            (PORrm VR128:$src1, addr:$src2)>;
  def : Pat<(or VR128:$src1, (memopv4i32 addr:$src2)),
            (PORrm VR128:$src1, addr:$src2)>;

  def : Pat<(xor VR128:$src1, (memopv16i8 addr:$src2)),
            (PXORrm VR128:$src1, addr:$src2)>;
  def : Pat<(xor VR128:$src1, (memopv8i16 addr:$src2)),
            (PXORrm VR128:$src1, addr:$src2)>;
  def : Pat<(xor VR128:$src1, (memopv4i32 addr:$src2)),
            (PXORrm VR128:$src1, addr:$src2)>;

  def : Pat<(X86andnp VR128:$src1, (memopv16i8 addr:$src2)),
            (PANDNrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR128:$src1, (memopv8i16 addr:$src2)),
            (PANDNrm VR128:$src1, addr:$src2)>;
  def : Pat<(X86andnp VR128:$src1, (memopv4i32 addr:$src2)),
            (PANDNrm VR128:$src1, addr:$src2)>;
}

// Patterns for packed operations when we don't have integer type available.
// The v4f32 X86fand / X86for / X86fxor / X86fandn nodes are selected to the
// non-VEX ANDPS / ORPS / XORPS / ANDNPS instructions, register forms first
// and then the forms folding a memopv4f32 load.
// NOTE(review): these patterns are not wrapped in a Predicates block here;
// confirm that is intentional before adding one.
def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
          (ANDPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)),
          (ORPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)),
          (XORPSrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)),
          (ANDNPSrr VR128:$src1, VR128:$src2)>;

def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)),
          (ANDPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)),
          (ORPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)),
          (XORPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
          (ANDNPSrm VR128:$src1, addr:$src2)>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Arithmetic Instructions
//===----------------------------------------------------------------------===//

/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
/// vector forms.
///
/// In addition, we also have a special variant of the scalar form here to
/// represent the associated intrinsic operation. This form is unlike the
/// plain scalar form, in that it takes an entire vector (instead of a scalar)
/// and leaves the top elements unmodified (therefore these cannot be commuted).
///
/// These three forms can each be reg+reg or reg+mem.
///

/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
/// classes below

/// basic_sse12_fp_binop_p - Packed (PS/PD) forms of one FP binary operator.
///
/// Emits the VEX 128-bit and 256-bit variants under [HasAVX, NoVLX], followed
/// by the non-VEX 128-bit variants with $src1 tied to $dst. Every record is
/// marked as using MXCSR and as able to raise an FP exception.
multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
                                  SDPatternOperator OpNode,
                                  X86SchedWriteSizes sched> {
  let Uses = [MXCSR], mayRaiseFPException = 1 in {
    let Predicates = [HasAVX, NoVLX] in {
      // 128-bit VEX forms.
      defm V#NAME#PS : sse12_fp_packed<opc, OpcodeStr # "ps", OpNode,
                                       VR128, v4f32, f128mem, loadv4f32,
                                       SSEPackedSingle, sched.PS.XMM, 0>,
                       TB, VEX, VVVV, WIG;
      defm V#NAME#PD : sse12_fp_packed<opc, OpcodeStr # "pd", OpNode,
                                       VR128, v2f64, f128mem, loadv2f64,
                                       SSEPackedDouble, sched.PD.XMM, 0>,
                       TB, PD, VEX, VVVV, WIG;

      // 256-bit VEX forms.
      defm V#NAME#PSY : sse12_fp_packed<opc, OpcodeStr # "ps", OpNode,
                                        VR256, v8f32, f256mem, loadv8f32,
                                        SSEPackedSingle, sched.PS.YMM, 0>,
                        TB, VEX, VVVV, VEX_L, WIG;
      defm V#NAME#PDY : sse12_fp_packed<opc, OpcodeStr # "pd", OpNode,
                                        VR256, v4f64, f256mem, loadv4f64,
                                        SSEPackedDouble, sched.PD.YMM, 0>,
                        TB, PD, VEX, VVVV, VEX_L, WIG;
    }

    // Non-VEX two-address forms; loads go through the memop fragments.
    let Constraints = "$src1 = $dst" in {
      defm PS : sse12_fp_packed<opc, OpcodeStr # "ps", OpNode,
                                VR128, v4f32, f128mem, memopv4f32,
                                SSEPackedSingle, sched.PS.XMM>,
                TB;
      defm PD : sse12_fp_packed<opc, OpcodeStr # "pd", OpNode,
                                VR128, v2f64, f128mem, memopv2f64,
                                SSEPackedDouble, sched.PD.XMM>,
                TB, PD;
    }
  }
}

/// basic_sse12_fp_binop_s - Scalar (SS/SD) forms on FR32 / FR64.
///
/// The VEX variants come first; the non-VEX variants follow with $src1 tied
/// to $dst.
multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr,
                                  SDPatternOperator OpNode,
                                  X86SchedWriteSizes sched> {
  let Uses = [MXCSR], mayRaiseFPException = 1 in {
    defm V#NAME#SS : sse12_fp_scalar<opc, OpcodeStr # "ss", OpNode,
                                     FR32, f32mem, SSEPackedSingle,
                                     sched.PS.Scl, 0>,
                     TB, XS, VEX, VVVV, VEX_LIG, WIG;
    defm V#NAME#SD : sse12_fp_scalar<opc, OpcodeStr # "sd", OpNode,
                                     FR64, f64mem, SSEPackedDouble,
                                     sched.PD.Scl, 0>,
                     TB, XD, VEX, VVVV, VEX_LIG, WIG;

    let Constraints = "$src1 = $dst" in {
      defm SS : sse12_fp_scalar<opc, OpcodeStr # "ss", OpNode,
                                FR32, f32mem, SSEPackedSingle,
                                sched.PS.Scl>,
                TB, XS;
      defm SD : sse12_fp_scalar<opc, OpcodeStr # "sd", OpNode,
                                FR64, f64mem, SSEPackedDouble,
                                sched.PD.Scl>,
                TB, XD;
    }
  }
}

/// basic_sse12_fp_binop_s_int - Scalar forms that operate on whole VR128
/// vectors, with memory operands matched through sse_load_f32 / sse_load_f64.
multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
                                      SDPatternOperator OpNode,
                                      X86SchedWriteSizes sched> {
  let Uses = [MXCSR], mayRaiseFPException = 1 in {
    defm V#NAME#SS : sse12_fp_scalar_int<opc, OpNode, VR128, v4f32,
                                         OpcodeStr # "ss", ssmem, sse_load_f32,
                                         SSEPackedSingle, sched.PS.Scl, 0>,
                     TB, XS, VEX, VVVV, VEX_LIG, WIG;
    defm V#NAME#SD : sse12_fp_scalar_int<opc, OpNode, VR128, v2f64,
                                         OpcodeStr # "sd", sdmem, sse_load_f64,
                                         SSEPackedDouble, sched.PD.Scl, 0>,
                     TB, XD, VEX, VVVV, VEX_LIG, WIG;

    let Constraints = "$src1 = $dst" in {
      defm SS : sse12_fp_scalar_int<opc, OpNode, VR128, v4f32,
                                    OpcodeStr # "ss", ssmem, sse_load_f32,
                                    SSEPackedSingle, sched.PS.Scl>,
                TB, XS;
      defm SD : sse12_fp_scalar_int<opc, OpNode, VR128, v2f64,
                                    OpcodeStr # "sd", sdmem, sse_load_f64,
                                    SSEPackedDouble, sched.PD.Scl>,
                TB, XD;
    }
  }
}

// Binary Arithmetic instructions
// Each operator pulls in the packed, scalar and scalar-intrinsic multiclasses.
// ADD/MUL/SUB/DIV pass null_frag to the intrinsic forms, while MAX/MIN pass
// X86fmaxs / X86fmins.
defm ADD : basic_sse12_fp_binop_p<0x58, "add", any_fadd, SchedWriteFAddSizes>,
           basic_sse12_fp_binop_s<0x58, "add", any_fadd, SchedWriteFAddSizes>,
           basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SchedWriteFAddSizes>;
defm MUL : basic_sse12_fp_binop_p<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
           basic_sse12_fp_binop_s<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
           basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SchedWriteFMulSizes>;
let isCommutable = 0 in {
  defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
             basic_sse12_fp_binop_s<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
             basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SchedWriteFAddSizes>;
  defm DIV : basic_sse12_fp_binop_p<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
             basic_sse12_fp_binop_s<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
             basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>;
  defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>;
  defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>;
}

// Codegen-only duplicates of MAX/MIN that select X86fmaxc / X86fminc. They
// reuse the same opcodes and mnemonics and have no scalar-intrinsic forms.
let isCodeGenOnly = 1 in {
  defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>;
  defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SchedWriteFCmpSizes>,
             basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SchedWriteFCmpSizes>;
}

// Patterns used to select SSE scalar fp arithmetic instructions from
// either:
//
// (1) a scalar fp operation followed by a blend
//
// The effect is that the backend no longer emits unnecessary vector
// insert instructions immediately after SSE scalar fp instructions
// like addss or mulss.
//
// For example, given the following code:
//   __m128 foo(__m128 A, __m128 B) {
//     A[0] += B[0];
//     return A;
//   }
//
// Previously we generated:
//   addss %xmm0, %xmm1
//   movss %xmm1, %xmm0
//
// We now generate:
//   addss %xmm1, %xmm0
//
// (2) a vector packed single/double fp operation followed by a vector insert
//
// The effect is that the backend converts the packed fp instruction
// followed by a vector insert into a single SSE scalar fp instruction.
//
// For example, given the following code:
//   __m128 foo(__m128 A, __m128 B) {
//     __m128 C = A + B;
//     return (__m128) {C[0], A[1], A[2], A[3]};
//   }
//
// Previously we generated:
//   addps %xmm0, %xmm1
//   movss %xmm1, %xmm0
//
// We now generate:
//   addss %xmm1, %xmm0

// TODO: Some canonicalization in lowering would simplify the number of
// patterns we have to try to match.

/// scalar_math_patterns - Select "Move($dst, scalar_to_vector(Op(elt0($dst),
/// src)))" to the _Int form of a scalar instruction.
///
/// `OpcPrefix` is the instruction name without the "V" prefix or the
/// rr_Int / rm_Int suffix. The non-VEX patterns are guarded by
/// `BasePredicate`, and the "V"-prefixed ones by UseAVX. For the register
/// form, the scalar source in `RC` is copied to VR128 first. The memory form
/// folds a `ld_frag` load.
multiclass scalar_math_patterns<SDPatternOperator Op, string OpcPrefix, SDNode Move,
                                ValueType VT, ValueType EltTy,
                                RegisterClass RC, PatFrag ld_frag,
                                Predicate BasePredicate> {
  let Predicates = [BasePredicate] in {
    // extracted scalar math op with insert via movss/movsd
    def : Pat<(VT (Move (VT VR128:$dst),
                        (VT (scalar_to_vector
                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
                                 RC:$src))))),
              (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst,
               (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
    def : Pat<(VT (Move (VT VR128:$dst),
                        (VT (scalar_to_vector
                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
                                 (ld_frag addr:$src)))))),
              (!cast<Instruction>(OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
  }

  // Repeat for AVX versions of the instructions.
  let Predicates = [UseAVX] in {
    // extracted scalar math op with insert via movss/movsd
    def : Pat<(VT (Move (VT VR128:$dst),
                        (VT (scalar_to_vector
                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
                                 RC:$src))))),
              (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst,
               (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
    def : Pat<(VT (Move (VT VR128:$dst),
                        (VT (scalar_to_vector
                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
                                 (ld_frag addr:$src)))))),
              (!cast<Instruction>("V"#OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
  }
}

// Single-precision: merge through X86Movss on v4f32, base predicate UseSSE1.
defm : scalar_math_patterns<any_fadd, "ADDSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
defm : scalar_math_patterns<any_fsub, "SUBSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
defm : scalar_math_patterns<any_fmul, "MULSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
defm : scalar_math_patterns<any_fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;

// Double-precision: merge through X86Movsd on v2f64, base predicate UseSSE2.
defm : scalar_math_patterns<any_fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
defm : scalar_math_patterns<any_fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
defm : scalar_math_patterns<any_fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
defm : scalar_math_patterns<any_fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;

/// Unop Arithmetic
/// In addition, we also have a special variant of the scalar form here to
/// represent the associated intrinsic operation. This form is unlike the
/// plain scalar form, in that it takes an entire vector (instead of a
/// scalar) and leaves the top elements undefined.
///
/// And, we have a special variant form for a full-vector intrinsic form.

/// sse_fp_unop_s - SSE1 unops in scalar form
/// For the non-AVX defs, we need $src1 to be tied to $dst because
/// the HW instructions are 2 operand / destructive.
/// sse_fp_unop_s emits four records per operator:
///   r / m         - codegen-only scalar forms on `RC` carrying the OpNode
///                   pattern. The load-folding `m` form additionally
///                   requires OptForSize.
///   r_Int / m_Int - pattern-less VR128 forms with $src1 tied to $dst.
multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
                         X86MemOperand x86memop, Operand intmemop,
                         SDPatternOperator OpNode, Domain d,
                         X86FoldableSchedWrite sched, Predicate target> {
  let isCodeGenOnly = 1, hasSideEffects = 0 in {
    def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
              OpcodeStr # "\t{$src1, $dst|$dst, $src1}",
              [(set RC:$dst, (OpNode RC:$src1))], d>,
            Sched<[sched]>,
            Requires<[target]>;
    let mayLoad = 1 in
    def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1),
              OpcodeStr # "\t{$src1, $dst|$dst, $src1}",
              [(set RC:$dst, (OpNode (load addr:$src1)))], d>,
            Sched<[sched.Folded]>,
            Requires<[target, OptForSize]>;
  }

  let hasSideEffects = 0, Constraints = "$src1 = $dst", ExeDomain = d in {
    def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
                  (ins VR128:$src1, VR128:$src2),
                  OpcodeStr # "\t{$src2, $dst|$dst, $src2}", []>,
                Sched<[sched]>;
    let mayLoad = 1 in
    def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
                  (ins VR128:$src1, intmemop:$src2),
                  OpcodeStr # "\t{$src2, $dst|$dst, $src2}", []>,
                Sched<[sched.Folded, sched.ReadAfterFold]>;
  }

}

/// sse_fp_unop_s_intr - Intrinsic selection patterns for the non-VEX
/// NAME#r_Int / NAME#m_Int records.
multiclass sse_fp_unop_s_intr<ValueType vt, PatFrags mem_frags,
                              Intrinsic Intr, Predicate target> {
  let Predicates = [target] in {
    // These are unary operations, but they are modeled as having 2 source
    // operands because the high elements of the destination are unchanged in
    // SSE.
    def : Pat<(Intr VR128:$src),
              (!cast<Instruction>(NAME#r_Int) VR128:$src, VR128:$src)>;
  }
  // We don't want to fold scalar loads into these instructions unless
  // optimizing for size. This is because the folded instruction will have a
  // partial register update, while the unfolded sequence will not, e.g.
  // movss mem, %xmm0
  // rcpss %xmm0, %xmm0
  // which has a clobber before the rcp, vs.
  // rcpss mem, %xmm0
  let Predicates = [target, OptForSize] in {
    def : Pat<(Intr (mem_frags addr:$src2)),
              (!cast<Instruction>(NAME#m_Int)
                   (vt (IMPLICIT_DEF)), addr:$src2)>;
  }
}

/// avx_fp_unop_s_intr - Intrinsic selection patterns for the VEX
/// NAME#r_Int / NAME#m_Int records. The load-folding pattern requires
/// OptForSize and passes IMPLICIT_DEF as the first source.
multiclass avx_fp_unop_s_intr<ValueType vt, PatFrags mem_frags,
                              Intrinsic Intr, Predicate target> {
  let Predicates = [target] in {
    def : Pat<(Intr VR128:$src),
              (!cast<Instruction>(NAME#r_Int) VR128:$src,
                                              VR128:$src)>;
  }
  let Predicates = [target, OptForSize] in {
    def : Pat<(Intr (mem_frags addr:$src2)),
              (!cast<Instruction>(NAME#m_Int)
                   (vt (IMPLICIT_DEF)), addr:$src2)>;
  }
}

/// avx_fp_unop_s - VEX scalar unops. All four records take two sources and
/// carry no inline pattern; selection is done by the Pat records at the end
/// of the multiclass.
multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
                         ValueType ScalarVT, X86MemOperand x86memop,
                         Operand intmemop, SDPatternOperator OpNode, Domain d,
                         X86FoldableSchedWrite sched, Predicate target> {
  let isCodeGenOnly = 1, hasSideEffects = 0 in {
    def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
              OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
              [], d>,
            Sched<[sched]>;
    let mayLoad = 1 in
    def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
              OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
              [], d>,
            Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
  let hasSideEffects = 0, ExeDomain = d in {
    def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
                  (ins VR128:$src1, VR128:$src2),
                  OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  []>,
                Sched<[sched]>;
    let mayLoad = 1 in
    def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
                  (ins VR128:$src1, intmemop:$src2),
                  OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  []>,
                Sched<[sched.Folded, sched.ReadAfterFold]>;
  }

  // We don't want to fold scalar loads into these instructions unless
  // optimizing for size. This is because the folded instruction will have a
  // partial register update, while the unfolded sequence will not, e.g.
  // vmovss mem, %xmm0
  // vrcpss %xmm0, %xmm0, %xmm0
  // which has a clobber before the rcp, vs.
  // vrcpss mem, %xmm0, %xmm0
  // TODO: In theory, we could fold the load, and avoid the stall caused by
  // the partial register store, either in BreakFalseDeps or with smarter RA.
  let Predicates = [target] in {
    def : Pat<(OpNode RC:$src),
              (!cast<Instruction>(NAME#r) (ScalarVT (IMPLICIT_DEF)), RC:$src)>;
  }
  let Predicates = [target, OptForSize] in {
    def : Pat<(ScalarVT (OpNode (load addr:$src))),
              (!cast<Instruction>(NAME#m) (ScalarVT (IMPLICIT_DEF)),
                                          addr:$src)>;
  }
}

/// sse1_fp_unop_p - SSE1 unops in packed form.
/// The VEX 128/256-bit forms are guarded by `prds`. The non-VEX 128-bit forms
/// follow and use memopv4f32 for the load.
multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
                          X86SchedWriteWidths sched, list<Predicate> prds> {
let Predicates = prds in {
  def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "v" # OpcodeStr # "ps\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
                   VEX, Sched<[sched.XMM]>, WIG;
  def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "v" # OpcodeStr # "ps\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>,
                   VEX, Sched<[sched.XMM.Folded]>, WIG;
  def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        "v" # OpcodeStr # "ps\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>,
                    VEX, VEX_L, Sched<[sched.YMM]>, WIG;
  def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        "v" # OpcodeStr # "ps\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))]>,
                    VEX, VEX_L, Sched<[sched.YMM.Folded]>, WIG;
}

  def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                OpcodeStr # "ps\t{$src, $dst|$dst, $src}",
                [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>,
            Sched<[sched.XMM]>;
  def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                OpcodeStr # "ps\t{$src, $dst|$dst, $src}",
                [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>,
            Sched<[sched.XMM.Folded]>;
}

/// sse2_fp_unop_p - SSE2 unops in vector forms.
/// Same layout as the PS variant, with the VEX predicate list fixed to
/// [HasAVX, NoVLX].
multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
                          SDPatternOperator OpNode, X86SchedWriteWidths sched> {
let Predicates = [HasAVX, NoVLX] in {
  def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       "v" # OpcodeStr # "pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
                   VEX, Sched<[sched.XMM]>, WIG;
  def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "v" # OpcodeStr # "pd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>,
                   VEX, Sched<[sched.XMM.Folded]>, WIG;
  def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                        "v" # OpcodeStr # "pd\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>,
                    VEX, VEX_L, Sched<[sched.YMM]>, WIG;
  def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                        "v" # OpcodeStr # "pd\t{$src, $dst|$dst, $src}",
                        [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))]>,
                    VEX, VEX_L, Sched<[sched.YMM.Folded]>, WIG;
}

  def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                OpcodeStr # "pd\t{$src, $dst|$dst, $src}",
                [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>,
            Sched<[sched.XMM]>;
  def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                OpcodeStr # "pd\t{$src, $dst|$dst, $src}",
                [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>,
            Sched<[sched.XMM.Folded]>;
}

/// sse1_fp_unop_s_intr - Attach the int_x86_sse_<op>_ss intrinsic patterns to
/// the SS records (UseSSE1) and to the V...SS records (AVXTarget).
multiclass sse1_fp_unop_s_intr<string OpcodeStr, Predicate AVXTarget> {
  defm SS : sse_fp_unop_s_intr<v4f32, sse_load_f32,
                               !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss),
                               UseSSE1>,
            TB, XS;
  defm V#NAME#SS : avx_fp_unop_s_intr<v4f32, sse_load_f32,
                                      !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss),
                                      AVXTarget>,
                   TB, XS, VEX, VVVV, VEX_LIG, WIG;
}

/// sse1_fp_unop_s - Single-precision scalar unop: the non-VEX SS form under
/// UseSSE1 and the VEX V...SS form under AVXTarget.
multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
                          X86SchedWriteWidths sched, Predicate AVXTarget> {
  defm SS : sse_fp_unop_s<opc, OpcodeStr#ss, FR32, f32mem, ssmem,
                          OpNode, SSEPackedSingle, sched.Scl, UseSSE1>,
            TB, XS;
  defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr#ss, FR32, f32,
                                 f32mem, ssmem, OpNode, SSEPackedSingle,
                                 sched.Scl, AVXTarget>,
                   TB, XS, VEX, VVVV, VEX_LIG, WIG;
}

/// sse2_fp_unop_s - Double-precision scalar unop: the non-VEX SD form under
/// UseSSE2 and the VEX V...SD form under AVXTarget.
multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
                          X86SchedWriteWidths sched, Predicate AVXTarget> {
  defm SD : sse_fp_unop_s<opc, OpcodeStr#sd, FR64, f64mem, sdmem,
                          OpNode, SSEPackedDouble, sched.Scl, UseSSE2>,
            TB, XD;
  defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr#sd, FR64, f64,
                                 f64mem, sdmem, OpNode, SSEPackedDouble,
                                 sched.Scl, AVXTarget>,
                   TB, XD, VEX, VVVV, VEX_LIG, WIG;
}

// Square root.
// Opcode 0x51 for every form. The PS/SS forms use SchedWriteFSqrt and the
// PD/SD forms use SchedWriteFSqrt64.
defm SQRT : sse1_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, UseAVX>,
            sse1_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>,
            sse2_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64, UseAVX>,
            sse2_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64>, SIMD_EXC;

// Reciprocal approximations. Note that these typically require refinement
// in order to obtain suitable precision.
defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>,
             sse1_fp_unop_s_intr<"rsqrt", HasAVX>,
             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt,
                            [HasAVX]>;
defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
           sse1_fp_unop_s_intr<"rcp", HasAVX>,
           sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>;

// There is no f64 version of the reciprocal approximation instructions.

/// scalar_unary_math_patterns - Select <OpcPrefix>r_Int (and, under UseAVX,
/// "V"<OpcPrefix>r_Int) for OpNode applied to element 0 of $src and merged
/// into $dst through Move.
multiclass scalar_unary_math_patterns<SDPatternOperator OpNode,
                                      string OpcPrefix, SDNode Move,
                                      ValueType VT, Predicate BasePredicate> {
  let Predicates = [BasePredicate] in {
    def : Pat<(VT (Move VT:$dst,
                        (scalar_to_vector (OpNode (extractelt VT:$src, 0))))),
              (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
  }

  // Repeat for AVX versions of the instructions.
  let Predicates = [UseAVX] in {
    def : Pat<(VT (Move VT:$dst,
                        (scalar_to_vector (OpNode (extractelt VT:$src, 0))))),
              (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
  }
}

defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSS", X86Movss, v4f32,
                                  UseSSE1>;
defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSD", X86Movsd, v2f64,
                                  UseSSE2>;

/// scalar_unary_math_intr_patterns - Same shape as above, but the matched
/// operation is the intrinsic Intr applied to the whole vector $src. The
/// "V"-prefixed pattern is guarded by HasAVX.
multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix,
                                           SDNode Move, ValueType VT,
                                           Predicate BasePredicate> {
  let Predicates = [BasePredicate] in {
    def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
              (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
  }

  // Repeat for AVX versions of the instructions.
  let Predicates = [HasAVX] in {
    def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
              (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
  }
}

defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss,
                                       v4f32, UseSSE1>;
defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS",
                                       X86Movss, v4f32, UseSSE1>;


//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Non-temporal stores
//===----------------------------------------------------------------------===//

let AddedComplexity = 400 in { // Prefer non-temporal versions
let Predicates = [HasAVX, NoVLX] in {
let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
                      (ins f128mem:$dst, VR128:$src),
                      "movntps\t{$src, $dst|$dst, $src}",
                      [(alignednontemporalstore (v4f32 VR128:$src),
                                                addr:$dst)]>, VEX, WIG;
def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
                      (ins f128mem:$dst, VR128:$src),
                      "movntpd\t{$src, $dst|$dst, $src}",
                      [(alignednontemporalstore (v2f64 VR128:$src),
                                                addr:$dst)]>, VEX, WIG;
} // SchedRW

let SchedRW = [SchedWriteFMoveLSNT.YMM.MR] in {
def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
                       (ins f256mem:$dst, VR256:$src),
                       "movntps\t{$src, $dst|$dst, $src}",
                       [(alignednontemporalstore (v8f32 VR256:$src),
                                                 addr:$dst)]>,
                       VEX, VEX_L, WIG;
def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
                       (ins f256mem:$dst, VR256:$src),
                       "movntpd\t{$src, $dst|$dst, $src}",
                       [(alignednontemporalstore (v4f64 VR256:$src),
                                                 addr:$dst)]>,
                       VEX, VEX_L, WIG;
} // SchedRW

let ExeDomain = SSEPackedInt in {
def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs),
                      (ins i128mem:$dst, VR128:$src),
                      "movntdq\t{$src, $dst|$dst, $src}",
                      [(alignednontemporalstore (v2i64 VR128:$src),
                                                addr:$dst)]>, VEX, WIG,
                      Sched<[SchedWriteVecMoveLSNT.XMM.MR]>;
def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
                       (ins i256mem:$dst, VR256:$src),
                       "movntdq\t{$src, $dst|$dst, $src}",
                       [(alignednontemporalstore (v4i64 VR256:$src),
                                                 addr:$dst)]>,
                       VEX, VEX_L, WIG,
                       Sched<[SchedWriteVecMoveLSNT.YMM.MR]>;
} // ExeDomain
} // Predicates

let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in {
def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntps\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntpd\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v2f64 VR128:$src), addr:$dst)]>;
} // SchedRW

let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLSNT.XMM.MR] in
def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movntdq\t{$src, $dst|$dst, $src}",
                    [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)]>;

let SchedRW = [WriteStoreNT] in {
// There is no AVX form for instructions below this point
def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
                 "movnti{l}\t{$src, $dst|$dst, $src}",
                 [(nontemporalstore (i32 GR32:$src), addr:$dst)]>,
                 TB, Requires<[HasSSE2]>;
def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
                     "movnti{q}\t{$src, $dst|$dst, $src}",
                     [(nontemporalstore (i64 GR64:$src), addr:$dst)]>,
                     TB, Requires<[HasSSE2]>;
} // SchedRW = [WriteStoreNT]

// Remaining 256-bit and 128-bit vector types are stored through the
// VMOVNTDQ records.
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignednontemporalstore (v16f16 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
  def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
            (VMOVNTDQYmr addr:$dst, VR256:$src)>;

  def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v8f16 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
            (VMOVNTDQmr addr:$dst, VR128:$src)>;
}

// Same 128-bit vector types for the non-VEX MOVNTDQ record.
let Predicates = [UseSSE2] in {
  def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
            (MOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
            (MOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v8f16 VR128:$src), addr:$dst),
            (MOVNTDQmr addr:$dst, VR128:$src)>;
  def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
            (MOVNTDQmr addr:$dst, VR128:$src)>;
}

} // AddedComplexity

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Prefetch and memory fence
//===----------------------------------------------------------------------===//

// Prefetch intrinsic.
let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in {
def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src),
                   "prefetcht0\t$src",
                   [(prefetch addr:$src, timm, (i32 3), (i32 1))]>, TB;
def PREFETCHT1 : I<0x18, MRM2m, (outs), (ins i8mem:$src),
                   "prefetcht1\t$src",
                   [(prefetch addr:$src, timm, (i32 2), (i32 1))]>, TB;
def PREFETCHT2 : I<0x18, MRM3m, (outs), (ins i8mem:$src),
                   "prefetcht2\t$src",
                   [(prefetch addr:$src, timm, (i32 1), (i32 1))]>, TB;
def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src),
                    "prefetchnta\t$src",
                    [(prefetch addr:$src, timm, (i32 0), (i32 1))]>, TB;
}

// FIXME: How should flush instruction be modeled?
let SchedRW = [WriteLoad] in {
// Flush cache
def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
                "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
                TB, Requires<[HasCLFLUSH]>;
}

let SchedRW = [WriteNop] in {
// Pause. This "instruction" is encoded as "rep; nop", so even though it
// was introduced with SSE2, it's backward compatible.
def PAUSE : I<0x90, RawFrm, (outs), (ins),
              "pause", [(int_x86_sse2_pause)]>, XS;
}

let SchedRW = [WriteFence] in {
// Load, store, and memory fence
// TODO: As with mfence, we may want to ease the availability of sfence/lfence
// to include any 64-bit target.
def SFENCE : I<0xAE, MRM7X, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
               TB, Requires<[HasSSE1]>;
def LFENCE : I<0xAE, MRM5X, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>,
               TB, Requires<[HasSSE2]>;
def MFENCE : I<0xAE, MRM6X, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>,
               TB, Requires<[HasMFence]>;
} // SchedRW

def : Pat<(X86MFence), (MFENCE)>;

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Load/Store XCSR register
//===----------------------------------------------------------------------===//

// The load forms list MXCSR in Defs; the store forms list it in Uses.
let mayLoad = 1, hasSideEffects = 1, Defs = [MXCSR] in
def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
                    "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
                    VEX, Sched<[WriteLDMXCSR]>, WIG;
let mayStore = 1, hasSideEffects = 1, Uses = [MXCSR] in
def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
                    "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
                    VEX, Sched<[WriteSTMXCSR]>, WIG;

let mayLoad = 1, hasSideEffects = 1, Defs = [MXCSR] in
def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
                "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
                TB, Sched<[WriteLDMXCSR]>;
let mayStore = 1, hasSideEffects = 1, Uses = [MXCSR] in
def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
                "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
                TB, Sched<[WriteSTMXCSR]>;

//===---------------------------------------------------------------------===//
// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in { // SSE integer instructions

// VEX register-to-register moves (opcode 0x6F, MRMSrcReg).
let hasSideEffects = 0 in {
def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "movdqa\t{$src, $dst|$dst, $src}", []>,
                     Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, WIG;
def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "movdqu\t{$src, $dst|$dst, $src}", []>,
                     Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, WIG;
def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, WIG;
def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                      "movdqu\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, WIG;
}

// For Disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst),
                         (ins VR128:$src),
                         "movdqa\t{$src, $dst|$dst, $src}", []>,
                         Sched<[SchedWriteVecMoveLS.XMM.RR]>,
                         VEX, WIG;
def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst),
                          (ins VR256:$src),
                          "movdqa\t{$src, $dst|$dst, $src}", []>,
                          Sched<[SchedWriteVecMoveLS.YMM.RR]>,
                          VEX, VEX_L, WIG;
def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst),
                         (ins VR128:$src),
                         "movdqu\t{$src, $dst|$dst, $src}", []>,
                         Sched<[SchedWriteVecMoveLS.XMM.RR]>,
                         VEX, WIG;
def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst),
                          (ins VR256:$src),
                          "movdqu\t{$src, $dst|$dst, $src}", []>,
                          Sched<[SchedWriteVecMoveLS.YMM.RR]>,
                          VEX, VEX_L, WIG;
}

// VEX loads. Only the 128-bit forms carry a pattern here.
let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
    hasSideEffects = 0, Predicates = [HasAVX, NoVLX] in {
def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                     "movdqa\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (alignedloadv2i64 addr:$src))]>,
                     Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, WIG;
def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.YMM.RM]>,
                      VEX, VEX_L, WIG;
def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                  "vmovdqu\t{$src, $dst|$dst, $src}",
                  [(set VR128:$dst, (loadv2i64 addr:$src))]>,
                  Sched<[SchedWriteVecMoveLS.XMM.RM]>,
                  TB, XS, VEX, WIG;
def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                   "vmovdqu\t{$src, $dst|$dst, $src}", []>,
                   Sched<[SchedWriteVecMoveLS.YMM.RM]>,
                   TB, XS, VEX, VEX_L, WIG;
}

// VEX stores. Only the 128-bit forms carry a pattern here.
let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX, NoVLX] in {
def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs),
                     (ins i128mem:$dst, VR128:$src),
                     "movdqa\t{$src, $dst|$dst, $src}",
                     [(alignedstore (v2i64 VR128:$src), addr:$dst)]>,
                     Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, WIG;
def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
                      (ins i256mem:$dst, VR256:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", []>,
                      Sched<[SchedWriteVecMoveLS.YMM.MR]>, VEX, VEX_L, WIG;
def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                  "vmovdqu\t{$src, $dst|$dst, $src}",
                  [(store (v2i64 VR128:$src), addr:$dst)]>,
                  Sched<[SchedWriteVecMoveLS.XMM.MR]>, TB, XS, VEX, WIG;
def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
                   "vmovdqu\t{$src, $dst|$dst, $src}", []>,
                   Sched<[SchedWriteVecMoveLS.YMM.MR]>,
                   TB, XS, VEX, VEX_L, WIG;
}

// Non-VEX register-to-register moves.
let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in {
let hasSideEffects = 0 in {
def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                   "movdqa\t{$src, $dst|$dst, $src}", []>;

def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                 "movdqu\t{$src, $dst|$dst, $src}", []>,
                 TB, XS, Requires<[UseSSE2]>;
}

// For Disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                       "movdqa\t{$src, $dst|$dst, $src}", []>;

def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                     "movdqu\t{$src, $dst|$dst, $src}", []>,
                     TB, XS, Requires<[UseSSE2]>;
}
} // SchedRW

// Non-VEX loads; the pattern lists are left commented out.
let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
    hasSideEffects = 0, SchedRW = [SchedWriteVecMoveLS.XMM.RM] in {
def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "movdqa\t{$src, $dst|$dst, $src}",
                   [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>;
def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                 "movdqu\t{$src, $dst|$dst, $src}",
                 [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>,
                 TB, XS, Requires<[UseSSE2]>;
}

// Non-VEX stores; the pattern lists are left commented out.
let mayStore = 1, hasSideEffects = 0,
    SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {
def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                   "movdqa\t{$src, $dst|$dst, $src}",
                   [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>;
def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                 "movdqu\t{$src, $dst|$dst, $src}",
                 [/*(store (v2i64 VR128:$src), addr:$dst)*/]>,
                 TB, XS, Requires<[UseSSE2]>;
}

} // ExeDomain = SSEPackedInt

// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
                (VMOVDQArr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}",
                (VMOVDQAYrr_REV VR256:$dst, VR256:$src), 0>;
def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
                (VMOVDQUrr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}",
                (VMOVDQUYrr_REV VR256:$dst, VR256:$src), 0>;

// Reversed version with ".s" suffix for GAS compatibility.
def : InstAlias<"movdqa.s\t{$src, $dst|$dst, $src}",
                (MOVDQArr_REV VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}",
                (MOVDQUrr_REV VR128:$dst, VR128:$src), 0>;

let Predicates = [HasAVX, NoVLX] in {
  // Additional patterns for other integer sizes.
  def : Pat<(alignedloadv4i32 addr:$src), (VMOVDQArm addr:$src)>;
  def : Pat<(alignedloadv8i16 addr:$src), (VMOVDQArm addr:$src)>;
  def : Pat<(alignedloadv8f16 addr:$src), (VMOVDQArm addr:$src)>;
  def : Pat<(alignedloadv16i8 addr:$src), (VMOVDQArm addr:$src)>;
  def : Pat<(loadv4i32 addr:$src), (VMOVDQUrm addr:$src)>;
  def : Pat<(loadv8i16 addr:$src), (VMOVDQUrm addr:$src)>;
  def : Pat<(loadv8f16 addr:$src), (VMOVDQUrm addr:$src)>;
  def : Pat<(loadv16i8 addr:$src), (VMOVDQUrm addr:$src)>;

  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (VMOVDQAmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (VMOVDQAmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
            (VMOVDQAmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (VMOVDQAmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (VMOVDQUmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (VMOVDQUmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8f16 VR128:$src), addr:$dst),
            (VMOVDQUmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (VMOVDQUmr addr:$dst, VR128:$src)>;
}

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Arithmetic Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in { // SSE integer instructions

/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
/// Is2Addr picks between the two-operand and the three-operand asm string.
/// Only the rr record is marked commutable.
multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType DstVT, ValueType SrcVT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
               (ins RC:$src1, RC:$src2),
               !if(Is2Addr,
                   OpcodeStr#"\t{$src2, $dst|$dst, $src2}",
                   OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
               Sched<[sched]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
               (ins RC:$src1, x86memop:$src2),
               !if(Is2Addr,
                   OpcodeStr#"\t{$src2, $dst|$dst, $src2}",
                   OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
                                             (memop_frag addr:$src2))))]>,
               Sched<[sched.Folded, sched.ReadAfterFold]>;
}
} // ExeDomain = SSEPackedInt

defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
                           SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
                           SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
                           SchedWriteVecALU, 1, NoVLX>;
defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
                           SchedWriteVecALU, 1, NoVLX>;
defm PADDSB : PDI_binop_all<0xEC, "paddsb", saddsat, v16i8, v32i8,
                            SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDSW : PDI_binop_all<0xED, "paddsw", saddsat, v8i16, v16i16,
                            SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDUSB : PDI_binop_all<0xDC, "paddusb", uaddsat, v16i8, v32i8,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PADDUSW : PDI_binop_all<0xDD, "paddusw", uaddsat, v8i16, v16i16,
                             SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
                            SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
                             SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
                            SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>;
defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
                           SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
                           SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
                           SchedWriteVecALU, 0, NoVLX>;
defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
                           SchedWriteVecALU, 0, NoVLX>;
defm PSUBSB : PDI_binop_all<0xE8, "psubsb", ssubsat, v16i8, v32i8,
                            SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBSW : PDI_binop_all<0xE9, "psubsw", ssubsat, v8i16, v16i16,
                            SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", usubsat, v16i8, v32i8,
                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", usubsat, v8i16, v16i16,
                             SchedWriteVecALU, 0, NoVLX_Or_NoBWI>;
defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8,
                            SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16,
                            SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8,
                            SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
                            SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PAVGB : PDI_binop_all<0xE0, "pavgb", avgceilu, v16i8, v32i8,
                           SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PAVGW : PDI_binop_all<0xE3, "pavgw", avgceilu, v8i16, v16i16,
                           SchedWriteVecALU, 1, NoVLX_Or_NoBWI>;
defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64,
                             SchedWriteVecIMul, 1, NoVLX>;

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16,
                              VR128, load, i128mem, SchedWriteVecIMul.XMM, 0>,
                VEX, VVVV, WIG;

let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
                               VR256, load, i256mem, SchedWriteVecIMul.YMM,
                               0>, VEX, VVVV, VEX_L, WIG;
let Constraints = "$src1 = $dst" in
defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
                             memop, i128mem, SchedWriteVecIMul.XMM>;

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
                             load, i128mem, SchedWritePSADBW.XMM, 0>,
               VEX, VVVV, WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
                              load, i256mem, SchedWritePSADBW.YMM, 0>,
                VEX, VVVV, VEX_L, WIG;
let Constraints = "$src1 = $dst" in
defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
                            memop, i128mem, SchedWritePSADBW.XMM>;

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Logical Instructions
//===---------------------------------------------------------------------===//

/// PDI_binop_rmi - Binary operator with register (rr), memory (rm) and
/// 8-bit immediate (ri) second-operand forms. rr and rm use opc with OpNode;
/// ri uses opc2/ImmForm with OpNode2.
multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
                         string OpcodeStr, SDNode OpNode,
                         SDNode OpNode2, RegisterClass RC,
                         X86FoldableSchedWrite sched,
                         X86FoldableSchedWrite schedImm,
                         ValueType DstVT, ValueType SrcVT,
                         PatFrag ld_frag, bit Is2Addr = 1> {
  // src2 is always 128-bit
  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
               (ins RC:$src1, VR128:$src2),
               !if(Is2Addr,
                   OpcodeStr#"\t{$src2, $dst|$dst, $src2}",
                   OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set RC:$dst,
                     (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))]>,
               Sched<[sched]>;
  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
               (ins RC:$src1, i128mem:$src2),
               !if(Is2Addr,
                   OpcodeStr#"\t{$src2, $dst|$dst, $src2}",
                   OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set RC:$dst, (DstVT (OpNode RC:$src1,
                                      (SrcVT (ld_frag addr:$src2)))))]>,
               Sched<[sched.Folded, sched.ReadAfterFold]>;
  def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
                 (ins RC:$src1, u8imm:$src2),
                 !if(Is2Addr,
                     OpcodeStr#"\t{$src2, $dst|$dst, $src2}",
                     OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set RC:$dst,
                       (DstVT (OpNode2 RC:$src1, (i8 timm:$src2))))]>,
                 Sched<[schedImm]>;
}

/// PDI_binop_rmi_all - Instantiates PDI_binop_rmi three times: V#NAME
/// (VR128, HasAVX), V#NAME#Y (VR256, HasAVX2) and NAME (VR128, tied
/// $src1 = $dst).
multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
                             string OpcodeStr, SDNode OpNode,
                             SDNode OpNode2, ValueType DstVT128,
                             ValueType DstVT256, ValueType SrcVT,
                             X86SchedWriteWidths sched,
                             X86SchedWriteWidths schedImm, Predicate prd> {
  let Predicates = [HasAVX, prd] in
  defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, "v"#OpcodeStr,
                              OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM,
                              DstVT128, SrcVT, load, 0>, VEX, VVVV, WIG;
  let Predicates = [HasAVX2, prd] in
  defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, "v"#OpcodeStr,
                                OpNode, OpNode2, VR256, sched.YMM,
                                schedImm.YMM, DstVT256, SrcVT, load, 0>,
                  VEX, VVVV, VEX_L, WIG;
  let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
                            VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT,
                            memop>;
}

/// PDI_binop_ri - Binary operator whose second operand is an 8-bit
/// immediate; only the ri record is produced.
multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
                        SDNode OpNode, RegisterClass RC, ValueType VT,
                        X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
                 !if(Is2Addr,
                     OpcodeStr#"\t{$src2, $dst|$dst, $src2}",
                     OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set RC:$dst, (VT (OpNode RC:$src1, (i8 timm:$src2))))]>,
                 Sched<[sched]>;
}

/// PDI_binop_ri_all - Instantiates PDI_binop_ri for V#NAME (VR128, v16i8),
/// V#NAME#Y (VR256, v32i8) and NAME (VR128, v16i8, tied $src1 = $dst).
multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr,
                            SDNode OpNode, X86SchedWriteWidths sched> {
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
  defm V#NAME : PDI_binop_ri<opc, ImmForm, "v"#OpcodeStr, OpNode,
                             VR128, v16i8, sched.XMM, 0>, VEX, VVVV, WIG;
  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
  defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, "v"#OpcodeStr, OpNode,
                               VR256, v32i8, sched.YMM, 0>,
                  VEX, VVVV, VEX_L, WIG;
  let Constraints = "$src1 = $dst" in
  defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8,
                           sched.XMM>;
}

let ExeDomain = SSEPackedInt in {
  defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
                                 v8i16, v16i16, v8i16, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
  defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
                                 v4i32, v8i32, v4i32, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX>;
  defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
                                 v2i64, v4i64, v2i64, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX>;

  defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
                                 v8i16, v16i16, v8i16, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
  defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
                                 v4i32, v8i32, v4i32, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX>;
  defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
                                 v2i64, v4i64, v2i64, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX>;

  defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
                                 v8i16, v16i16, v8i16, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX_Or_NoBWI>;
  defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
                                 v4i32, v8i32, v4i32, SchedWriteVecShift,
                                 SchedWriteVecShiftImm, NoVLX>;

  defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq,
                                 SchedWriteShuffle>;
  defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq,
                                 SchedWriteShuffle>;
} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Comparison Instructions
//===---------------------------------------------------------------------===//

defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
                             SchedWriteVecALU, 1, TruePredicate>;
defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
                             SchedWriteVecALU, 1, TruePredicate>;
defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
                             SchedWriteVecALU, 1, TruePredicate>;
defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
                             SchedWriteVecALU, 0, TruePredicate>;
defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
                             SchedWriteVecALU, 0, TruePredicate>;
defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
                             SchedWriteVecALU, 0, TruePredicate>;

//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Shuffle Instructions
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {
/// sse2_pshuffle - Shuffle with an 8-bit immediate control (opcode 0x70).
/// Emits register (ri) and memory (mi) forms for V#NAME (HasAVX),
/// V#NAME#Y (HasAVX2) and the unprefixed name (UseSSE2).
multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
                         SDNode OpNode, X86SchedWriteWidths sched,
                         Predicate prd> {
  let Predicates = [HasAVX, prd] in {
    def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
                        (ins VR128:$src1, u8imm:$src2),
                        "v"#OpcodeStr#
                        "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                          (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
                        VEX, Sched<[sched.XMM]>, WIG;
    def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
                        (ins i128mem:$src1, u8imm:$src2),
                        "v"#OpcodeStr#
                        "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                          (vt128 (OpNode (load addr:$src1),
                                         (i8 timm:$src2))))]>, VEX,
                        Sched<[sched.XMM.Folded]>, WIG;
  }

  let Predicates = [HasAVX2, prd] in {
    def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
                         (ins VR256:$src1, u8imm:$src2),
                         "v"#OpcodeStr#
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                         [(set VR256:$dst,
                           (vt256 (OpNode VR256:$src1, (i8 timm:$src2))))]>,
                         VEX, VEX_L, Sched<[sched.YMM]>, WIG;
    def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
                         (ins i256mem:$src1, u8imm:$src2),
                         "v"#OpcodeStr#
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                         [(set VR256:$dst,
                           (vt256 (OpNode (load addr:$src1),
                                          (i8 timm:$src2))))]>, VEX, VEX_L,
                         Sched<[sched.YMM.Folded]>, WIG;
  }

  let Predicates = [UseSSE2] in {
    def ri : Ii8<0x70, MRMSrcReg,
                 (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
                 OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                 [(set VR128:$dst,
                   (vt128 (OpNode VR128:$src1, (i8 timm:$src2))))]>,
                 Sched<[sched.XMM]>;
    def mi : Ii8<0x70, MRMSrcMem,
                 (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
                 OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                 [(set VR128:$dst,
                   (vt128 (OpNode (memop addr:$src1),
                                  (i8 timm:$src2))))]>,
                 Sched<[sched.XMM.Folded]>;
  }
}
} // ExeDomain = SSEPackedInt

defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd,
                            SchedWriteShuffle, NoVLX>, TB, PD;
defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw,
                             SchedWriteShuffle, NoVLX_Or_NoBWI>, TB, XS;
defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw,
                             SchedWriteShuffle, NoVLX_Or_NoBWI>, TB, XD;

//===---------------------------------------------------------------------===//
// Packed Integer Pack Instructions (SSE & AVX)
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {
/// sse2_pack - rr/rm records (PDI class) for OpNode taking ArgVT sources and
/// producing OutVT. Is2Addr picks the two- or three-operand asm string.
multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                     ValueType ArgVT, SDNode OpNode, RegisterClass RC,
                     X86MemOperand x86memop, X86FoldableSchedWrite sched,
                     PatFrag ld_frag, bit Is2Addr = 1> {
  def rr : PDI<opc, MRMSrcReg,
               (outs RC:$dst), (ins RC:$src1, RC:$src2),
               !if(Is2Addr,
                   OpcodeStr#"\t{$src2, $dst|$dst, $src2}",
                   OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set RC:$dst,
                     (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
               Sched<[sched]>;
  def rm : PDI<opc, MRMSrcMem,
               (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
               !if(Is2Addr,
                   OpcodeStr#"\t{$src2, $dst|$dst, $src2}",
                   OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set RC:$dst,
                     (OutVT (OpNode (ArgVT RC:$src1),
                                    (ld_frag addr:$src2))))]>,
               Sched<[sched.Folded, sched.ReadAfterFold]>;
}

multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
                     ValueType ArgVT, SDNode OpNode, RegisterClass RC,
                     X86MemOperand x86memop, X86FoldableSchedWrite sched,
                     PatFrag ld_frag, bit Is2Addr = 1> {
  def rr : SS48I<opc, MRMSrcReg,
                 (outs RC:$dst), (ins RC:$src1, RC:$src2),
                 !if(Is2Addr,
                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                     !strconcat(OpcodeStr,
                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
                 [(set RC:$dst,
                       (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>,
                 Sched<[sched]>;
  def rm : SS48I<opc, MRMSrcMem,
                 (outs RC:$dst),
(ins RC:$src1, x86memop:$src2), 3811 !if(Is2Addr, 3812 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), 3813 !strconcat(OpcodeStr, 3814 "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3815 [(set RC:$dst, 3816 (OutVT (OpNode (ArgVT RC:$src1), 3817 (ld_frag addr:$src2))))]>, 3818 Sched<[sched.Folded, sched.ReadAfterFold]>; 3819} 3820 3821let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 3822 defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128, 3823 i128mem, SchedWriteShuffle.XMM, load, 0>, 3824 VEX, VVVV, WIG; 3825 defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128, 3826 i128mem, SchedWriteShuffle.XMM, load, 0>, 3827 VEX, VVVV, WIG; 3828 3829 defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128, 3830 i128mem, SchedWriteShuffle.XMM, load, 0>, 3831 VEX, VVVV, WIG; 3832 defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128, 3833 i128mem, SchedWriteShuffle.XMM, load, 0>, 3834 VEX, VVVV, WIG; 3835} 3836 3837let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 3838 defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256, 3839 i256mem, SchedWriteShuffle.YMM, load, 0>, 3840 VEX, VVVV, VEX_L, WIG; 3841 defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256, 3842 i256mem, SchedWriteShuffle.YMM, load, 0>, 3843 VEX, VVVV, VEX_L, WIG; 3844 3845 defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256, 3846 i256mem, SchedWriteShuffle.YMM, load, 0>, 3847 VEX, VVVV, VEX_L, WIG; 3848 defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256, 3849 i256mem, SchedWriteShuffle.YMM, load, 0>, 3850 VEX, VVVV, VEX_L, WIG; 3851} 3852 3853let Constraints = "$src1 = $dst" in { 3854 defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128, 3855 i128mem, SchedWriteShuffle.XMM, memop>; 3856 defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128, 3857 i128mem, 
SchedWriteShuffle.XMM, memop>; 3858 3859 defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128, 3860 i128mem, SchedWriteShuffle.XMM, memop>; 3861 3862 defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128, 3863 i128mem, SchedWriteShuffle.XMM, memop>; 3864} 3865} // ExeDomain = SSEPackedInt 3866 3867//===---------------------------------------------------------------------===// 3868// SSE2 - Packed Integer Unpack Instructions 3869//===---------------------------------------------------------------------===// 3870 3871let ExeDomain = SSEPackedInt in { 3872multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt, 3873 SDNode OpNode, RegisterClass RC, X86MemOperand x86memop, 3874 X86FoldableSchedWrite sched, PatFrag ld_frag, 3875 bit Is2Addr = 1> { 3876 def rr : PDI<opc, MRMSrcReg, 3877 (outs RC:$dst), (ins RC:$src1, RC:$src2), 3878 !if(Is2Addr, 3879 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), 3880 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3881 [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>, 3882 Sched<[sched]>; 3883 def rm : PDI<opc, MRMSrcMem, 3884 (outs RC:$dst), (ins RC:$src1, x86memop:$src2), 3885 !if(Is2Addr, 3886 !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), 3887 !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), 3888 [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>, 3889 Sched<[sched.Folded, sched.ReadAfterFold]>; 3890} 3891 3892let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { 3893 defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128, 3894 i128mem, SchedWriteShuffle.XMM, load, 0>, 3895 VEX, VVVV, WIG; 3896 defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128, 3897 i128mem, SchedWriteShuffle.XMM, load, 0>, 3898 VEX, VVVV, WIG; 3899 defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128, 3900 i128mem, SchedWriteShuffle.XMM, load, 0>, 3901 VEX, VVVV, WIG; 3902 defm VPUNPCKHWD : 
sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128, 3903 i128mem, SchedWriteShuffle.XMM, load, 0>, 3904 VEX, VVVV, WIG; 3905} 3906 3907let Predicates = [HasAVX, NoVLX] in { 3908 defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128, 3909 i128mem, SchedWriteShuffle.XMM, load, 0>, 3910 VEX, VVVV, WIG; 3911 defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128, 3912 i128mem, SchedWriteShuffle.XMM, load, 0>, 3913 VEX, VVVV, WIG; 3914 defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128, 3915 i128mem, SchedWriteShuffle.XMM, load, 0>, 3916 VEX, VVVV, WIG; 3917 defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128, 3918 i128mem, SchedWriteShuffle.XMM, load, 0>, 3919 VEX, VVVV, WIG; 3920} 3921 3922let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { 3923 defm VPUNPCKLBWY : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256, 3924 i256mem, SchedWriteShuffle.YMM, load, 0>, 3925 VEX, VVVV, VEX_L, WIG; 3926 defm VPUNPCKLWDY : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256, 3927 i256mem, SchedWriteShuffle.YMM, load, 0>, 3928 VEX, VVVV, VEX_L, WIG; 3929 defm VPUNPCKHBWY : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256, 3930 i256mem, SchedWriteShuffle.YMM, load, 0>, 3931 VEX, VVVV, VEX_L, WIG; 3932 defm VPUNPCKHWDY : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256, 3933 i256mem, SchedWriteShuffle.YMM, load, 0>, 3934 VEX, VVVV, VEX_L, WIG; 3935} 3936 3937let Predicates = [HasAVX2, NoVLX] in { 3938 defm VPUNPCKLDQY : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256, 3939 i256mem, SchedWriteShuffle.YMM, load, 0>, 3940 VEX, VVVV, VEX_L, WIG; 3941 defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256, 3942 i256mem, SchedWriteShuffle.YMM, load, 0>, 3943 VEX, VVVV, VEX_L, WIG; 3944 defm VPUNPCKHDQY : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256, 3945 i256mem, SchedWriteShuffle.YMM, load, 0>, 3946 VEX, VVVV, VEX_L, WIG; 3947 defm 
VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256, 3948 i256mem, SchedWriteShuffle.YMM, load, 0>, 3949 VEX, VVVV, VEX_L, WIG; 3950} 3951 3952let Constraints = "$src1 = $dst" in { 3953 defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128, 3954 i128mem, SchedWriteShuffle.XMM, memop>; 3955 defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128, 3956 i128mem, SchedWriteShuffle.XMM, memop>; 3957 defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128, 3958 i128mem, SchedWriteShuffle.XMM, memop>; 3959 defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128, 3960 i128mem, SchedWriteShuffle.XMM, memop>; 3961 3962 defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128, 3963 i128mem, SchedWriteShuffle.XMM, memop>; 3964 defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128, 3965 i128mem, SchedWriteShuffle.XMM, memop>; 3966 defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128, 3967 i128mem, SchedWriteShuffle.XMM, memop>; 3968 defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128, 3969 i128mem, SchedWriteShuffle.XMM, memop>; 3970} 3971} // ExeDomain = SSEPackedInt 3972 3973//===---------------------------------------------------------------------===// 3974// SSE2 - Packed Integer Extract and Insert 3975//===---------------------------------------------------------------------===// 3976 3977let ExeDomain = SSEPackedInt in { 3978multiclass sse2_pinsrw<bit Is2Addr = 1> { 3979 def rr : Ii8<0xC4, MRMSrcReg, 3980 (outs VR128:$dst), (ins VR128:$src1, 3981 GR32orGR64:$src2, u8imm:$src3), 3982 !if(Is2Addr, 3983 "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", 3984 "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 3985 [(set VR128:$dst, 3986 (X86pinsrw VR128:$src1, GR32orGR64:$src2, timm:$src3))]>, 3987 Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>; 3988 def rm : Ii8<0xC4, MRMSrcMem, 3989 (outs 
VR128:$dst), (ins VR128:$src1, 3990 i16mem:$src2, u8imm:$src3), 3991 !if(Is2Addr, 3992 "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", 3993 "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), 3994 [(set VR128:$dst, 3995 (X86pinsrw VR128:$src1, (extloadi16 addr:$src2), 3996 timm:$src3))]>, 3997 Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>; 3998} 3999 4000// Extract 4001let Predicates = [HasAVX, NoBWI] in 4002def VPEXTRWrr : Ii8<0xC5, MRMSrcReg, 4003 (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2), 4004 "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", 4005 [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), 4006 timm:$src2))]>, 4007 TB, PD, VEX, WIG, Sched<[WriteVecExtract]>; 4008def PEXTRWrr : PDIi8<0xC5, MRMSrcReg, 4009 (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2), 4010 "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", 4011 [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), 4012 timm:$src2))]>, 4013 Sched<[WriteVecExtract]>; 4014 4015// Insert 4016let Predicates = [HasAVX, NoBWI] in 4017defm VPINSRW : sse2_pinsrw<0>, TB, PD, VEX, VVVV, WIG; 4018 4019let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in 4020defm PINSRW : sse2_pinsrw, TB, PD; 4021 4022} // ExeDomain = SSEPackedInt 4023 4024// Always select FP16 instructions if available. 
let Predicates = [UseSSE2], AddedComplexity = -10 in {
  // Scalar f16 moves on plain SSE2. The 16-bit payload lives in word 0 of an
  // XMM register: PINSRW fills lane 0 of an otherwise undefined v8i16 and
  // PEXTRW reads lane 0 back out. AddedComplexity = -10 ranks these patterns
  // below competing patterns for the same node.

  // f16 load from memory.
  def : Pat<(f16 (load addr:$src)),
            (COPY_TO_REGCLASS
              (PINSRWrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0),
              FR16)>;

  // f16 store: extract word 0 into a GPR, then store its low 16 bits.
  def : Pat<(store f16:$src, addr:$dst),
            (MOV16mr addr:$dst,
              (EXTRACT_SUBREG
                (PEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0),
                sub_16bit))>;

  // f16 -> i16 bitcast.
  def : Pat<(i16 (bitconvert f16:$src)),
            (EXTRACT_SUBREG
              (PEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0),
              sub_16bit)>;

  // i16 -> f16 bitcast: widen the GR16 into an undefined wider GPR first.
  def : Pat<(f16 (bitconvert i16:$src)),
            (COPY_TO_REGCLASS
              (PINSRWrr (v8i16 (IMPLICIT_DEF)),
                        (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit),
                        0),
              FR16)>;
}

let Predicates = [HasAVX, NoBWI] in {
  // Same lowering through VPINSRW/VPEXTRW. This group defines the load and
  // the two bitcasts only; it has no f16 store pattern.
  def : Pat<(f16 (load addr:$src)),
            (COPY_TO_REGCLASS
              (VPINSRWrm (v8i16 (IMPLICIT_DEF)), addr:$src, 0),
              FR16)>;

  def : Pat<(i16 (bitconvert f16:$src)),
            (EXTRACT_SUBREG
              (VPEXTRWrr (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0),
              sub_16bit)>;

  def : Pat<(f16 (bitconvert i16:$src)),
            (COPY_TO_REGCLASS
              (VPINSRWrr (v8i16 (IMPLICIT_DEF)),
                         (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit),
                         0),
              FR16)>;
}

//===---------------------------------------------------------------------===//
// SSE2 - Packed Mask Creation
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt in {

  // 128-bit VEX form: X86movmsk of a v16i8 into a 32- or 64-bit GPR.
  def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg,
      (outs GR32orGR64:$dst),
      (ins VR128:$src),
      "pmovmskb\t{$src, $dst|$dst, $src}",
      [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
      Sched<[WriteVecMOVMSK]>, VEX, WIG;

  // 256-bit VEX form (v32i8 source), gated on AVX2.
  let Predicates = [HasAVX2] in {
    def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg,
        (outs GR32orGR64:$dst),
        (ins VR256:$src),
        "pmovmskb\t{$src, $dst|$dst, $src}",
        [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>,
        Sched<[WriteVecMOVMSKY]>, VEX, VEX_L, WIG;
  }

  // Legacy SSE2 form.
  def PMOVMSKBrr : PDI<0xD7, MRMSrcReg,
      (outs GR32orGR64:$dst),
      (ins VR128:$src),
      "pmovmskb\t{$src, $dst|$dst, $src}",
      [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>,
      Sched<[WriteVecMOVMSK]>;

} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// SSE2 - Conditional Store
//===---------------------------------------------------------------------===//

let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in {

  // As VEX does not have separate instruction contexts for address size
  // overrides, VMASKMOVDQU and VMASKMOVDQU64 would have a decode conflict.
  // Prefer VMASKMOVDQU64; VMASKMOVDQU is therefore marked isAsmParserOnly.
  //
  // The *64 records take the store address implicitly from RDI, the others
  // from EDI (see Uses and the last intrinsic operand).
  let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in {
    def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg,
        (outs),
        (ins VR128:$src, VR128:$mask),
        "maskmovdqu\t{$mask, $src|$src, $mask}",
        [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>,
        VEX, WIG;
  }
  let Uses = [EDI], Predicates = [HasAVX], isAsmParserOnly = 1 in {
    def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg,
        (outs),
        (ins VR128:$src, VR128:$mask),
        "maskmovdqu\t{$mask, $src|$src, $mask}",
        [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
        VEX, WIG;
  }

  // Legacy SSE2 encodings.
  let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in {
    def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg,
        (outs),
        (ins VR128:$src, VR128:$mask),
        "maskmovdqu\t{$mask, $src|$src, $mask}",
        [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;
  }
  let Uses = [EDI], Predicates = [UseSSE2] in {
    def MASKMOVDQU : PDI<0xF7, MRMSrcReg,
        (outs),
        (ins VR128:$src, VR128:$mask),
        "maskmovdqu\t{$mask, $src|$src, $mask}",
        [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>;
  }

} // ExeDomain = SSEPackedInt, SchedRW

//===---------------------------------------------------------------------===//
// SSE2 - Move Doubleword/Quadword
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//
// Move Int Doubleword to Packed Double Int
//
// Opcode 0x6E: a GR32/GR64 or a 32/64-bit memory operand is moved into an XMM
// register (or, for the *toSD record, into an FR64 as a plain bitcast).
let ExeDomain = SSEPackedInt in {
  // VEX-encoded forms.
  def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg,
      (outs VR128:$dst), (ins GR32:$src),
      "movd\t{$src, $dst|$dst, $src}",
      [(set VR128:$dst,
        (v4i32 (scalar_to_vector GR32:$src)))]>,
      VEX, Sched<[WriteVecMoveFromGpr]>;
  def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem,
      (outs VR128:$dst), (ins i32mem:$src),
      "movd\t{$src, $dst|$dst, $src}",
      [(set VR128:$dst,
        (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
      VEX, Sched<[WriteVecLoad]>;
  def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg,
      (outs VR128:$dst), (ins GR64:$src),
      "movq\t{$src, $dst|$dst, $src}",
      [(set VR128:$dst,
        (v2i64 (scalar_to_vector GR64:$src)))]>,
      VEX, Sched<[WriteVecMoveFromGpr]>;
  // Pattern-less memory form; the load/side-effect flags are spelled out
  // explicitly because there is no pattern to carry them.
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
      mayLoad = 1 in {
    def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem,
        (outs VR128:$dst), (ins i64mem:$src),
        "movq\t{$src, $dst|$dst, $src}", []>,
        VEX, Sched<[WriteVecLoad]>;
  }
  // GR64 -> FR64 bitcast.
  let isCodeGenOnly = 1 in {
    def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg,
        (outs FR64:$dst), (ins GR64:$src),
        "movq\t{$src, $dst|$dst, $src}",
        [(set FR64:$dst, (bitconvert GR64:$src))]>,
        VEX, Sched<[WriteVecMoveFromGpr]>;
  }

  // Legacy SSE2 forms, record for record parallel to the VEX ones above.
  def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg,
      (outs VR128:$dst), (ins GR32:$src),
      "movd\t{$src, $dst|$dst, $src}",
      [(set VR128:$dst,
        (v4i32 (scalar_to_vector GR32:$src)))]>,
      Sched<[WriteVecMoveFromGpr]>;
  def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem,
      (outs VR128:$dst), (ins i32mem:$src),
      "movd\t{$src, $dst|$dst, $src}",
      [(set VR128:$dst,
        (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
      Sched<[WriteVecLoad]>;
  def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg,
      (outs VR128:$dst), (ins GR64:$src),
      "movq\t{$src, $dst|$dst, $src}",
      [(set VR128:$dst,
        (v2i64 (scalar_to_vector GR64:$src)))]>,
      Sched<[WriteVecMoveFromGpr]>;
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
      mayLoad = 1 in {
    def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem,
        (outs VR128:$dst), (ins i64mem:$src),
        "movq\t{$src, $dst|$dst, $src}", []>,
        Sched<[WriteVecLoad]>;
  }
  let isCodeGenOnly = 1 in {
    def MOV64toSDrr : RS2I<0x6E, MRMSrcReg,
        (outs FR64:$dst), (ins GR64:$src),
        "movq\t{$src, $dst|$dst, $src}",
        [(set FR64:$dst, (bitconvert GR64:$src))]>,
        Sched<[WriteVecMoveFromGpr]>;
  }
} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// Move Int Doubleword to Single Scalar
//
// GR32 -> FR32 bitcast; code-generation-only records.
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
  def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg,
      (outs FR32:$dst), (ins GR32:$src),
      "movd\t{$src, $dst|$dst, $src}",
      [(set FR32:$dst, (bitconvert GR32:$src))]>,
      VEX, Sched<[WriteVecMoveFromGpr]>;

  def MOVDI2SSrr : S2I<0x6E, MRMSrcReg,
      (outs FR32:$dst), (ins GR32:$src),
      "movd\t{$src, $dst|$dst, $src}",
      [(set FR32:$dst, (bitconvert GR32:$src))]>,
      Sched<[WriteVecMoveFromGpr]>;

} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1

//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int to Doubleword Int
//
// Opcode 0x7E: element 0 of a v4i32 goes to a GR32 or to 32-bit memory.
let ExeDomain = SSEPackedInt in {
  def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg,
      (outs GR32:$dst), (ins VR128:$src),
      "movd\t{$src, $dst|$dst, $src}",
      [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
                                   (iPTR 0)))]>, VEX,
      Sched<[WriteVecMoveToGpr]>;
  def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem,
      (outs), (ins i32mem:$dst, VR128:$src),
      "movd\t{$src, $dst|$dst, $src}",
      [(store (i32 (extractelt (v4i32 VR128:$src),
                               (iPTR 0))), addr:$dst)]>,
      VEX, Sched<[WriteVecStore]>;
  def MOVPDI2DIrr : S2I<0x7E, MRMDestReg,
      (outs GR32:$dst), (ins VR128:$src),
      "movd\t{$src, $dst|$dst, $src}",
      [(set GR32:$dst, (extractelt (v4i32 VR128:$src),
                                   (iPTR 0)))]>,
      Sched<[WriteVecMoveToGpr]>;
  def MOVPDI2DImr : S2I<0x7E, MRMDestMem,
      (outs), (ins i32mem:$dst, VR128:$src),
      "movd\t{$src, $dst|$dst, $src}",
      [(store (i32 (extractelt (v4i32 VR128:$src),
                               (iPTR 0))), addr:$dst)]>,
      Sched<[WriteVecStore]>;
} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// Move Packed Quadword Int first element to Quadword Int
//
// Opcode 0x7E: element 0 of a v2i64 goes to a GR64, plus pattern-less
// 64-bit store forms.
let ExeDomain = SSEPackedInt in {
  let SchedRW = [WriteVecMoveToGpr] in {
    def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg,
        (outs GR64:$dst), (ins VR128:$src),
        "movq\t{$src, $dst|$dst, $src}",
        [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
                                     (iPTR 0)))]>,
        VEX;

    def MOVPQIto64rr : RS2I<0x7E, MRMDestReg,
        (outs GR64:$dst), (ins VR128:$src),
        "movq\t{$src, $dst|$dst, $src}",
        [(set GR64:$dst, (extractelt (v2i64 VR128:$src),
                                     (iPTR 0)))]>;
  } // SchedRW

  // Both store forms share the same flag set, so one scope covers them.
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
      mayStore = 1 in {
    def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem,
        (outs), (ins i64mem:$dst, VR128:$src),
        "movq\t{$src, $dst|$dst, $src}", []>,
        VEX, Sched<[WriteVecStore]>;
    def MOVPQIto64mr : RS2I<0x7E, MRMDestMem,
        (outs), (ins i64mem:$dst, VR128:$src),
        "movq\t{$src, $dst|$dst, $src}", []>,
        Sched<[WriteVecStore]>;
  }
} // ExeDomain = SSEPackedInt

//===---------------------------------------------------------------------===//
// Bitcast FR64 <-> GR64
//
// Only the FR64 -> GR64 direction is defined in this group.
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
  def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg,
      (outs GR64:$dst), (ins FR64:$src),
      "movq\t{$src, $dst|$dst, $src}",
      [(set GR64:$dst, (bitconvert FR64:$src))]>,
      VEX, Sched<[WriteVecMoveToGpr]>;

  def MOVSDto64rr : RS2I<0x7E, MRMDestReg,
      (outs GR64:$dst), (ins FR64:$src),
      "movq\t{$src, $dst|$dst, $src}",
      [(set GR64:$dst, (bitconvert FR64:$src))]>,
      Sched<[WriteVecMoveToGpr]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1

//===---------------------------------------------------------------------===//
// Move Scalar Single to Double Int
//
// FR32 -> GR32 bitcast.
let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
  def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg,
      (outs GR32:$dst), (ins FR32:$src),
      "movd\t{$src, $dst|$dst, $src}",
      [(set GR32:$dst, (bitconvert FR32:$src))]>,
      VEX, Sched<[WriteVecMoveToGpr]>;
  def MOVSS2DIrr : S2I<0x7E, MRMDestReg,
      (outs GR32:$dst), (ins FR32:$src),
      "movd\t{$src, $dst|$dst, $src}",
      [(set GR32:$dst, (bitconvert FR32:$src))]>,
      Sched<[WriteVecMoveToGpr]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1

let Predicates = [UseAVX] in {
  // An any-extended GR8 is placed in an undefined 32-bit register first.
  def : Pat<(v4i32 (scalar_to_vector (i32 (anyext GR8:$src)))),
            (VMOVDI2PDIrr
              (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                  GR8:$src, sub_8bit)))>;

  // scalar_to_vector followed by zeroing of the upper elements is a single
  // movd/movq from the GPR.
  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
            (VMOVDI2PDIrr GR32:$src)>;

  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
            (VMOV64toPQIrr GR64:$src)>;

  // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
  // These instructions also write zeros in the high part of a 256-bit register.
  def : Pat<(v4i32 (X86vzload32 addr:$src)),
            (VMOVDI2PDIrm addr:$src)>;
  def : Pat<(v8i32 (X86vzload32 addr:$src)),
            (SUBREG_TO_REG (i64 0),
                           (v4i32 (VMOVDI2PDIrm addr:$src)),
                           sub_xmm)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
            (MOVDI2PDIrr GR32:$src)>;

  def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
            (MOV64toPQIrr GR64:$src)>;
  def : Pat<(v4i32 (X86vzload32 addr:$src)),
            (MOVDI2PDIrm addr:$src)>;
}

// Before the MC layer of LLVM existed, clang emitted "movd" assembly instead of
// "movq" due to MacOS parsing limitation. In order to parse old assembly, we add
// these aliases.
def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
                (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"movd\t{$src, $dst|$dst, $src}",
                (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
                (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;

//===---------------------------------------------------------------------===//
// SSE2 - Move Quadword
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//
// Move Quadword Int to Packed Quadword Int
//
// Opcode 0x7E with the XS prefix: a 64-bit load placed in a v2i64.

let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in {
  def VMOVQI2PQIrm : I<0x7E, MRMSrcMem,
      (outs VR128:$dst), (ins i64mem:$src),
      "vmovq\t{$src, $dst|$dst, $src}",
      [(set VR128:$dst,
        (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, TB, XS,
      VEX, Requires<[UseAVX]>, WIG;
  def MOVQI2PQIrm : I<0x7E, MRMSrcMem,
      (outs VR128:$dst), (ins i64mem:$src),
      "movq\t{$src, $dst|$dst, $src}",
      [(set VR128:$dst,
        (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
      TB, XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
} // ExeDomain, SchedRW

//===---------------------------------------------------------------------===//
// Move Packed Quadword Int to Quadword Int
//
// Opcode 0xD6: element 0 of a v2i64 stored to 64-bit memory.
let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in {
  def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem,
      (outs), (ins i64mem:$dst, VR128:$src),
      "movq\t{$src, $dst|$dst, $src}",
      [(store (i64 (extractelt (v2i64 VR128:$src),
                               (iPTR 0))), addr:$dst)]>,
      VEX, WIG;
  def MOVPQI2QImr : S2I<0xD6, MRMDestMem,
      (outs), (ins i64mem:$dst, VR128:$src),
      "movq\t{$src, $dst|$dst, $src}",
      [(store (i64 (extractelt (v2i64 VR128:$src),
                               (iPTR 0))), addr:$dst)]>;
} // ExeDomain, SchedRW

// For disassembler only
// Register-to-register use of the 0xD6 store opcode; no selection pattern.
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
    SchedRW = [SchedWriteVecLogic.XMM] in {
  def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg,
      (outs VR128:$dst), (ins VR128:$src),
      "movq\t{$src, $dst|$dst, $src}", []>, VEX, WIG;
  def MOVPQI2QIrr : S2I<0xD6, MRMDestReg,
      (outs VR128:$dst), (ins VR128:$src),
      "movq\t{$src, $dst|$dst, $src}", []>;
}

// The ".s" mnemonic suffix selects the records above from assembly.
def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
                (VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
                (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;

let Predicates = [UseAVX] in {
  // Zero-extending 64-bit load; the 256-bit result reuses the 128-bit
  // instruction through SUBREG_TO_REG.
  def : Pat<(v2i64 (X86vzload64 addr:$src)),
            (VMOVQI2PQIrm addr:$src)>;
  def : Pat<(v4i64 (X86vzload64 addr:$src)),
            (SUBREG_TO_REG (i64 0),
                           (v2i64 (VMOVQI2PQIrm addr:$src)),
                           sub_xmm)>;

  def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
            (VMOVPQI2QImr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE2] in {
  def : Pat<(v2i64 (X86vzload64 addr:$src)),
            (MOVQI2PQIrm addr:$src)>;

  def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
            (MOVPQI2QImr addr:$dst, VR128:$src)>;
}

//===---------------------------------------------------------------------===//
// Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
// IA32 document. movq xmm1, xmm2 does clear the high bits.
//
let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in {
def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "vmovq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
                      TB, XS, VEX, Requires<[UseAVX]>, WIG;
def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "movq\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
                      TB, XS, Requires<[UseSSE2]>;
} // ExeDomain, SchedRW

// The v2f64 flavour of X86vzmovl selects the same register-to-register MOVQ
// instructions defined above.
let Predicates = [UseAVX] in {
  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
            (VMOVZPQILo2PQIrr VR128:$src)>;
}
let Predicates = [UseSSE2] in {
  def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
            (MOVZPQILo2PQIrr VR128:$src)>;
}

// 256-bit X86vzmovl: run VMOVQ on the low xmm subregister and wrap the result
// back into a 256-bit register with SUBREG_TO_REG.
let Predicates = [UseAVX] in {
  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v2f64 (VMOVZPQILo2PQIrr
                     (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
             sub_xmm)>;
  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (v2i64 (VMOVZPQILo2PQIrr
                     (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
             sub_xmm)>;
}

//===---------------------------------------------------------------------===//
// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
//===---------------------------------------------------------------------===//

/// sse3_replicate_sfp - Register (rr) and memory (rm) forms of a single-source
/// instruction with opcode 'op'. The rr form selects (vt (OpNode RC:$src));
/// the rm form selects OpNode applied to a 'mem_frag' load from 'x86memop'.
multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
                              ValueType vt, RegisterClass RC, PatFrag mem_frag,
                              X86MemOperand x86memop, X86FoldableSchedWrite sched> {
def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set RC:$dst, (vt (OpNode RC:$src)))]>,
                    Sched<[sched]>;
def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>,
                    Sched<[sched.Folded]>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm VMOVSHDUP  : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                       v4f32, VR128, loadv4f32, f128mem,
                                       SchedWriteFShuffle.XMM>, VEX, WIG;
  defm VMOVSLDUP  : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                       v4f32, VR128, loadv4f32, f128mem,
                                       SchedWriteFShuffle.XMM>, VEX, WIG;
  defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
                                       v8f32, VR256, loadv8f32, f256mem,
                                       SchedWriteFShuffle.YMM>, VEX, VEX_L, WIG;
  defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
                                       v8f32, VR256, loadv8f32, f256mem,
                                       SchedWriteFShuffle.YMM>, VEX, VEX_L, WIG;
}
defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
                                   memopv4f32, f128mem, SchedWriteFShuffle.XMM>;
defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
                                   memopv4f32, f128mem, SchedWriteFShuffle.XMM>;

// Integer-typed (v4i32/v8i32) uses of the same nodes select the FP
// instructions defined above.
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (VMOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (load addr:$src))),
            (VMOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (VMOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (load addr:$src))),
            (VMOVSLDUPrm addr:$src)>;
  def : Pat<(v8i32 (X86Movshdup VR256:$src)),
            (VMOVSHDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movshdup (load addr:$src))),
            (VMOVSHDUPYrm addr:$src)>;
  def : Pat<(v8i32 (X86Movsldup VR256:$src)),
            (VMOVSLDUPYrr VR256:$src)>;
  def : Pat<(v8i32 (X86Movsldup (load addr:$src))),
            (VMOVSLDUPYrm addr:$src)>;
}

let Predicates = [UseSSE3] in {
  def : Pat<(v4i32 (X86Movshdup VR128:$src)),
            (MOVSHDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movshdup (memop addr:$src))),
            (MOVSHDUPrm addr:$src)>;
  def : Pat<(v4i32 (X86Movsldup VR128:$src)),
            (MOVSLDUPrr VR128:$src)>;
  def : Pat<(v4i32 (X86Movsldup (memop addr:$src))),
            (MOVSLDUPrm addr:$src)>;
}

//===---------------------------------------------------------------------===//
// SSE3 - Replicate Double FP - MOVDDUP
//===---------------------------------------------------------------------===//

/// sse3_replicate_dfp - 128-bit MOVDDUP forms. The rm form takes an f64mem
/// operand and matches X86Movddup of a scalar_to_vector'd f64 load.
multiclass sse3_replicate_dfp<string OpcodeStr, X86SchedWriteWidths sched> {
def rr  : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))]>,
                    Sched<[sched.XMM]>;
def rm  : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst,
                      (v2f64 (X86Movddup
                              (scalar_to_vector (loadf64 addr:$src)))))]>,
                    Sched<[sched.XMM.Folded]>;
}

/// sse3_replicate_dfp_y - 256-bit MOVDDUP forms. Unlike the 128-bit rm form,
/// the rm form here takes an f256mem operand and matches a full v4f64 load.
// FIXME: Merge with above classes when there are patterns for the ymm version
multiclass sse3_replicate_dfp_y<string OpcodeStr, X86SchedWriteWidths sched> {
def rr  : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
                    Sched<[sched.YMM]>;
def rm  : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR256:$dst,
                      (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
                    Sched<[sched.YMM.Folded]>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup", SchedWriteFShuffle>,
                                      VEX, WIG;
  defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup", SchedWriteFShuffle>,
                                        VEX, VEX_L, WIG;
}

defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;


// Fold a 64-bit zero-extending vector load into the memory form. The enclosing
// 'let' is the single source of the predicate list; a per-pattern Requires<>
// would be overridden by it, so none is attached (same shape as the UseSSE3
// pattern below).
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
            (VMOVDDUPrm addr:$src)>;
}

let Predicates = [UseSSE3] in {
  def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
            (MOVDDUPrm addr:$src)>;
}

//===---------------------------------------------------------------------===//
// SSE3 - Move Unaligned Integer
//===---------------------------------------------------------------------===//

let Predicates = [HasAVX] in {
  def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                      "vlddqu\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
                      Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, WIG;
  def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                       "vlddqu\t{$src, $dst|$dst, $src}",
                       [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
                       Sched<[SchedWriteVecMoveLS.YMM.RM]>, VEX, VEX_L, WIG;
} // Predicates

def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                   "lddqu\t{$src, $dst|$dst, $src}",
                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>,
                   Sched<[SchedWriteVecMoveLS.XMM.RM]>;

//===---------------------------------------------------------------------===//
// SSE3 - Arithmetic
//===---------------------------------------------------------------------===//

/// sse3_addsub - rr/rm forms of opcode 0xD0 selecting X86Addsub on type 'vt'.
/// Is2Addr picks the two-operand assembly string (default) or the
/// three-operand one. Both forms read MXCSR and may raise FP exceptions.
multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC,
                       X86MemOperand x86memop, X86FoldableSchedWrite sched,
                       PatFrag ld_frag, bit Is2Addr = 1> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  def rr : I<0xD0, MRMSrcReg,
       (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))]>,
       Sched<[sched]>;
  def rm : I<0xD0, MRMSrcMem,
       (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}

let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem,
                                 SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>,
                                 TB, XD, VEX, VVVV, WIG;
    defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem,
                                  SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>,
                                  TB, XD, VEX, VVVV, VEX_L, WIG;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem,
                                 SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>,
                                 TB, PD, VEX, VVVV, WIG;
    defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem,
                                  SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>,
                                  TB, PD, VEX, VVVV, VEX_L, WIG;
  }
}
let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
  let ExeDomain = SSEPackedSingle in
  defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem,
                              SchedWriteFAddSizes.PS.XMM, memopv4f32>, TB, XD;
  let ExeDomain = SSEPackedDouble in
  defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem,
                              SchedWriteFAddSizes.PD.XMM, memopv2f64>, TB, PD;
}

//===---------------------------------------------------------------------===//
// SSE3 Instructions
//===---------------------------------------------------------------------===//

// Horizontal ops

/// S3D_Int - rr/rm forms of a two-source FP op built on the S3DI base class.
/// Is2Addr picks the two- or three-operand assembly string. Both forms read
/// MXCSR and may raise FP exceptions.
multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                   X86MemOperand x86memop, SDNode OpNode,
                   X86FoldableSchedWrite sched, PatFrag ld_frag,
                   bit Is2Addr = 1> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
      Sched<[sched]>;

  def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
      Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
/// S3_Int - Same shape as S3D_Int, but built on the S3I base class.
multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
                  X86MemOperand x86memop, SDNode OpNode,
                  X86FoldableSchedWrite sched, PatFrag ld_frag,
                  bit Is2Addr = 1> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>,
      Sched<[sched]>;

  def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
         !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
      Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}

let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedSingle in {
    defm VHADDPS  : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
                            X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX, VVVV, WIG;
    defm VHSUBPS  : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
                            X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX, VVVV, WIG;
    defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
                            X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX, VVVV, VEX_L, WIG;
    defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
                            X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX, VVVV, VEX_L, WIG;
  }
  let ExeDomain = SSEPackedDouble in {
    defm VHADDPD  : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem,
                           X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX, VVVV, WIG;
    defm VHSUBPD  : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem,
                           X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX, VVVV, WIG;
    defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem,
                           X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX, VVVV, VEX_L, WIG;
    defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem,
                           X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX, VVVV, VEX_L, WIG;
  }
}

let Constraints = "$src1 = $dst" in {
  let ExeDomain = SSEPackedSingle in {
    defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
                          WriteFHAdd, memopv4f32>;
    defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
                          WriteFHAdd, memopv4f32>;
  }
  let ExeDomain = SSEPackedDouble in {
    defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
                         WriteFHAdd, memopv2f64>;
    defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
                         WriteFHAdd, memopv2f64>;
  }
}

//===---------------------------------------------------------------------===//
// SSSE3 - Packed Absolute Instructions
//===---------------------------------------------------------------------===//

/// SS3I_unop_rm - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
                        SDNode OpNode, X86SchedWriteWidths sched, PatFrag ld_frag> {
  // 128-bit register source.
  def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                 OpcodeStr # "\t{$src, $dst|$dst, $src}",
                 [(set VR128:$dst, (vt (OpNode VR128:$src)))]>,
           Sched<[sched.XMM]>;

  // 128-bit memory source; 'ld_frag' is the load fragment that gets folded.
  def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                 OpcodeStr # "\t{$src, $dst|$dst, $src}",
                 [(set VR128:$dst, (vt (OpNode (ld_frag addr:$src))))]>,
           Sched<[sched.XMM.Folded]>;
}

/// SS3I_unop_rm_y - 256-bit counterpart of SS3I_unop_rm. Emits Yrr/Yrm records
/// on VR256 and always folds a plain 'load' in the memory form.
multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
                          SDNode OpNode, X86SchedWriteWidths sched> {
  def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                  OpcodeStr # "\t{$src, $dst|$dst, $src}",
                  [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
            Sched<[sched.YMM]>;

  def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                  OpcodeStr # "\t{$src, $dst|$dst, $src}",
                  [(set VR256:$dst, (vt (OpNode (load addr:$src))))]>,
            Sched<[sched.YMM.Folded]>;
}

// VEX-encoded 128-bit forms.
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU, load>,
                VEX, WIG;
  defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU, load>,
                VEX, WIG;
}
let Predicates = [HasAVX, NoVLX] in
  defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU, load>,
                VEX, WIG;

// VEX-encoded 256-bit forms (record names gain the Yrr/Yrm suffix from the
// multiclass).
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>,
                VEX, VEX_L, WIG;
  defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SchedWriteVecALU>,
                VEX, VEX_L, WIG;
}
let Predicates = [HasAVX2, NoVLX] in
  defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SchedWriteVecALU>,
                VEX, VEX_L, WIG;

// Legacy-encoded forms; the memory variants fold 'memop' instead of 'load'.
defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU, memop>;
defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU, memop>;
defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU, memop>;

//===---------------------------------------------------------------------===//
// SSSE3 - Packed Binary Operator Instructions
//===---------------------------------------------------------------------===//

/// SS3I_binop_rm - SSSE3 two-source op selected through an SDNode. The result
/// has type DstVT and the first source is typed OpVT. Is2Addr chooses between
/// the two-operand (default) and three-operand assembly strings. The rr record
/// is marked commutable inside the multiclass.
multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                         ValueType DstVT, ValueType OpVT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let isCommutable = 1 in {
    def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
                   !if(Is2Addr,
                       OpcodeStr # "\t{$src2, $dst|$dst, $src2}",
                       OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))]>,
             Sched<[sched]>;
  }
  def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst),
                 (ins RC:$src1, x86memop:$src2),
                 !if(Is2Addr,
                     OpcodeStr # "\t{$src2, $dst|$dst, $src2}",
                     OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set RC:$dst,
                   (DstVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))]>,
           Sched<[sched.Folded, sched.ReadAfterFold]>;
}

/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}.
multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
                             Intrinsic IntId128, X86FoldableSchedWrite sched,
                             PatFrag ld_frag, bit Is2Addr = 1> {
  // Register-register form, selected directly from the 128-bit intrinsic.
  let isCommutable = 1 in {
    def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src1, VR128:$src2),
                   !if(Is2Addr,
                       OpcodeStr # "\t{$src2, $dst|$dst, $src2}",
                       OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
             Sched<[sched]>;
  }
  // Register-memory form; the second operand is an 'ld_frag' load.
  def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
                 (ins VR128:$src1, i128mem:$src2),
                 !if(Is2Addr,
                     OpcodeStr # "\t{$src2, $dst|$dst, $src2}",
                     OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set VR128:$dst,
                   (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
           Sched<[sched.Folded, sched.ReadAfterFold]>;
}

/// SS3I_binop_rm_int_y - 256-bit intrinsic-based bin op. Only the
/// three-operand assembly string exists, and the memory form folds 'load'.
multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
                               Intrinsic IntId256,
                               X86FoldableSchedWrite sched> {
  let isCommutable = 1 in {
    def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
                    (ins VR256:$src1, VR256:$src2),
                    OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
              Sched<[sched]>;
  }
  def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
                  (ins VR256:$src1, i256mem:$src2),
                  OpcodeStr # "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  [(set VR256:$dst,
                    (IntId256 VR256:$src1, (load addr:$src2)))]>,
            Sched<[sched.Folded, sched.ReadAfterFold]>;
}

// VEX 128-bit SDNode-based ops. Only VPMULHRSW stays outside the
// isCommutable = 0 group.
let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  let isCommutable = 0 in {
    defm VPSHUFB    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
                                    VR128, load, i128mem,
                                    SchedWriteVarShuffle.XMM, 0>,
                      VEX, VVVV, WIG;
    defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
                                    v16i8, VR128, load, i128mem,
                                    SchedWriteVecIMul.XMM, 0>,
                      VEX, VVVV, WIG;
  }
  defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
                                 VR128, load, i128mem,
                                 SchedWriteVecIMul.XMM, 0>,
                   VEX, VVVV, WIG;
}

// VEX 128-bit horizontal add/sub and sign ops; every record in this group has
// isCommutable cleared.
let ImmT = NoImm, Predicates = [HasAVX], isCommutable = 0 in {
  defm VPHADDW  : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
                                load, i128mem,
                                SchedWritePHAdd.XMM, 0>, VEX, VVVV, WIG;
  defm VPHADDD  : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
                                load, i128mem,
                                SchedWritePHAdd.XMM, 0>, VEX, VVVV, WIG;
  defm VPHSUBW  : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
                                load, i128mem,
                                SchedWritePHAdd.XMM, 0>, VEX, VVVV, WIG;
  defm VPHSUBD  : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
                                load, i128mem,
                                SchedWritePHAdd.XMM, 0>, VEX, VVVV, WIG;
  defm VPSIGNB  : SS3I_binop_rm_int<0x08, "vpsignb",
                                    int_x86_ssse3_psign_b_128,
                                    SchedWriteVecALU.XMM, load, 0>,
                  VEX, VVVV, WIG;
  defm VPSIGNW  : SS3I_binop_rm_int<0x09, "vpsignw",
                                    int_x86_ssse3_psign_w_128,
                                    SchedWriteVecALU.XMM, load, 0>,
                  VEX, VVVV, WIG;
  defm VPSIGND  : SS3I_binop_rm_int<0x0A, "vpsignd",
                                    int_x86_ssse3_psign_d_128,
                                    SchedWriteVecALU.XMM, load, 0>,
                  VEX, VVVV, WIG;
  defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw",
                                    int_x86_ssse3_phadd_sw_128,
                                    SchedWritePHAdd.XMM, load, 0>,
                  VEX, VVVV, WIG;
  defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw",
                                    int_x86_ssse3_phsub_sw_128,
                                    SchedWritePHAdd.XMM, load, 0>,
                  VEX, VVVV, WIG;
}

// VEX 256-bit SDNode-based ops, mirroring the 128-bit group above.
let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  let isCommutable = 0 in {
    defm VPSHUFBY    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
                                     VR256, load, i256mem,
                                     SchedWriteVarShuffle.YMM, 0>,
                       VEX, VVVV, VEX_L, WIG;
    defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
                                     v32i8, VR256, load, i256mem,
                                     SchedWriteVecIMul.YMM, 0>,
                       VEX, VVVV, VEX_L, WIG;
  }
  defm VPMULHRSWY : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
                                  VR256, load, i256mem,
                                  SchedWriteVecIMul.YMM, 0>,
                    VEX, VVVV, VEX_L, WIG;
}

// VEX 256-bit horizontal add/sub and sign ops. The intrinsic-based entries
// reuse the 128-bit base names; the multiclass appends Yrr/Yrm.
let ImmT = NoImm, Predicates = [HasAVX2], isCommutable = 0 in {
  defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
                                VR256, load, i256mem,
                                SchedWritePHAdd.YMM, 0>, VEX, VVVV, VEX_L, WIG;
  defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
                                load, i256mem,
                                SchedWritePHAdd.YMM, 0>, VEX, VVVV, VEX_L, WIG;
  defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
                                VR256, load, i256mem,
                                SchedWritePHAdd.YMM, 0>, VEX, VVVV, VEX_L, WIG;
  defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
                                load, i256mem,
                                SchedWritePHAdd.YMM, 0>, VEX, VVVV, VEX_L, WIG;
  defm VPSIGNB  : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
                                      SchedWriteVecALU.YMM>,
                  VEX, VVVV, VEX_L, WIG;
  defm VPSIGNW  : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
                                      SchedWriteVecALU.YMM>,
                  VEX, VVVV, VEX_L, WIG;
  defm VPSIGND  : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
                                      SchedWriteVecALU.YMM>,
                  VEX, VVVV, VEX_L, WIG;
  defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
                                      int_x86_avx2_phadd_sw,
                                      SchedWritePHAdd.YMM>,
                  VEX, VVVV, VEX_L, WIG;
  defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
                                      int_x86_avx2_phsub_sw,
                                      SchedWritePHAdd.YMM>,
                  VEX, VVVV, VEX_L, WIG;
}

// None of these have i8 immediate fields.
let ImmT = NoImm, Constraints = "$src1 = $dst" in {
let isCommutable = 0 in {
  defm PHADDW    : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
                                 memop, i128mem, SchedWritePHAdd.XMM>;
  defm PHADDD    : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
                                 memop, i128mem, SchedWritePHAdd.XMM>;
  defm PHSUBW    : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
                                 memop, i128mem, SchedWritePHAdd.XMM>;
  defm PHSUBD    : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
                                 memop, i128mem, SchedWritePHAdd.XMM>;
  defm PSIGNB    : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
                                     SchedWriteVecALU.XMM, memop>;
  defm PSIGNW    : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
                                     SchedWriteVecALU.XMM, memop>;
  defm PSIGND    : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
                                     SchedWriteVecALU.XMM, memop>;
  defm PSHUFB    : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
                                 memop, i128mem, SchedWriteVarShuffle.XMM>;
  defm PHADDSW   : SS3I_binop_rm_int<0x03, "phaddsw",
                                     int_x86_ssse3_phadd_sw_128,
                                     SchedWritePHAdd.XMM, memop>;
  defm PHSUBSW   : SS3I_binop_rm_int<0x07, "phsubsw",
                                     int_x86_ssse3_phsub_sw_128,
                                     SchedWritePHAdd.XMM, memop>;
  defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
                                 v16i8, VR128, memop, i128mem,
                                 SchedWriteVecIMul.XMM>;
}
// PMULHRSW is the only entry left outside the isCommutable = 0 group.
defm PMULHRSW    : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
                                 VR128, memop, i128mem, SchedWriteVecIMul.XMM>;
}

//===---------------------------------------------------------------------===//
// SSSE3 - Packed Align Instruction Patterns
//===---------------------------------------------------------------------===//

/// ssse3_palignr - Register (rri) and memory (rmi) forms of opcode 0x0F taking
/// two sources plus a u8 immediate, selected from X86PAlignr on type VT.
/// Is2Addr picks the two-operand (default) or three-operand assembly string.
multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
                         PatFrag memop_frag, X86MemOperand x86memop,
                         X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  let hasSideEffects = 0 in {
  def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst),
      (ins RC:$src1, RC:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 timm:$src3))))]>,
      Sched<[sched]>;
  let mayLoad = 1 in
  def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst),
      (ins RC:$src1, x86memop:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                  "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set RC:$dst, (VT (X86PAlignr RC:$src1,
                                     (memop_frag addr:$src2),
                                     (i8 timm:$src3))))]>,
      Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}

let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
  defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem,
                                SchedWriteShuffle.XMM, 0>, VEX, VVVV, WIG;
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
  defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem,
                                 SchedWriteShuffle.YMM, 0>, VEX, VVVV, VEX_L, WIG;
let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
  defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem,
                               SchedWriteShuffle.XMM>;

//===---------------------------------------------------------------------===//
// SSE3 - Thread synchronization (MONITOR/MWAIT; gated on HasSSE3 below)
//===---------------------------------------------------------------------===//

let SchedRW = [WriteSystem] in {
// MONITOR takes no explicit operands; its inputs are modeled as implicit
// register uses. The 32- and 64-bit mode variants differ only in EAX vs RAX.
let Uses = [EAX, ECX, EDX] in
def MONITOR32rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
                   TB, Requires<[HasSSE3, Not64BitMode]>;
let Uses = [RAX, ECX, EDX] in
def MONITOR64rrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>,
                   TB, Requires<[HasSSE3, In64BitMode]>;

let Uses = [ECX, EAX] in
def MWAITrr   : I<0x01, MRM_C9, (outs), (ins), "mwait",
                  [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
} // SchedRW

// Assembler aliases that accept the implicit registers spelled out explicitly.
def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>;
def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;

def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITOR32rrr)>,
      Requires<[Not64BitMode]>;
def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITOR64rrr)>,
      Requires<[In64BitMode]>;

//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Move with Sign/Zero Extend
// NOTE: Any Extend is promoted to Zero Extend in X86ISelDAGToDAG.cpp
//===----------------------------------------------------------------------===//

/// SS41I_pmovx_rrrm - rr and rm encodings of one extend instruction. Both
/// records carry an empty pattern list; selection patterns are supplied by the
/// SS41I_pmovx_*patterns multiclasses further down.
multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
                            RegisterClass OutRC, RegisterClass InRC,
                            X86FoldableSchedWrite sched> {
  def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
                 Sched<[sched]>;

  def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
                 Sched<[sched.Folded]>;
}

/// SS41I_pmovx_rm_all - Instantiates the legacy form (NAME), the VEX 128-bit
/// form (V#NAME, "v"-prefixed mnemonic) and the VEX 256-bit form (V#NAME#Y).
/// The 256-bit form writes VR256 from a VR128 source and uses MemYOp as its
/// memory operand. 'prd' is added to the predicate list of both VEX forms.
multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
                              X86MemOperand MemOp, X86MemOperand MemYOp,
                              Predicate prd> {
  defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128,
                               SchedWriteShuffle.XMM>;
  let Predicates = [HasAVX, prd] in
    defm V#NAME   : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
                                     VR128, VR128, SchedWriteVecExtend.XMM>,
                                     VEX, WIG;
  let Predicates = [HasAVX2, prd] in
    defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
                                     VR256, VR128, SchedWriteVecExtend.YMM>,
                                     VEX, VEX_L, WIG;
}

/// SS41I_pmovx_rm - Emits the sign-extend (PMOVSX) and zero-extend (PMOVZX)
/// families for one element-size pair. The PMOVZX opcode is the PMOVSX opcode
/// plus 0x10.
multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
                          X86MemOperand MemYOp, Predicate prd> {
  defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr),
                                        MemOp, MemYOp, prd>;
  defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
                                        !strconcat("pmovzx", OpcodeStr),
                                        MemOp, MemYOp, prd>;
}

// Memory operand sizes are given as <128-bit form, 256-bit form>.
defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>;
defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>;
defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>;

defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>;
defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>;

defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>;

// AVX2 Patterns
// OpcPrefix is pasted with the size suffix and Yrr/Yrm to name the target
// instruction; ExtTy is pasted with "extloadvi8/16/32" to name the extending
// load fragment. ExtOp is used where the pattern's source and result have the
// same element count, InVecOp where only the low source elements are extended.
multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy,
                                     SDNode ExtOp, SDNode InVecOp> {
  // Register-Register patterns
  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
  }
  let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(v8i32 (InVecOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
  def : Pat<(v4i64 (InVecOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;

  def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
  def : Pat<(v4i64 (InVecOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;

  def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
            (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
  }

  // Simple Register-Memory patterns
  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;

  def : Pat<(v16i16 (ExtOp (loadv16i8 addr:$src))),
            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
  }

  let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;

  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;

  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
  }

  // AVX2 Register-Memory patterns
  let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(v8i32 (ExtOp (loadv8i16 addr:$src))),
            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;

  def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
  def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;

  def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;

  def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
  // NOTE(review): this is the only pattern here that pairs X86vzload32 with a
  // v2i64 result; every other X86vzload32 use in this section is typed v4i32
  // and every other v2i64 vzload is X86vzload64. Confirm the pairing is
  // intended.
  def : Pat<(v4i64 (InVecOp (bc_v16i8 (v2i64 (X86vzload32 addr:$src))))),
            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;

  def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
  }
}

defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", sext, sext_invec>;
defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", zext, zext_invec>;

// SSE4.1/AVX patterns.
// Same naming scheme as the AVX2 multiclass above, for the 128-bit rr/rm
// instructions. A single ExtOp is used for every pattern.
multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
                                SDNode ExtOp> {
  // Register-register patterns.
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
  def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
            (!cast<I>(OpcPrefix#BQrr) VR128:$src)>;

  def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
  def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
            (!cast<I>(OpcPrefix#WQrr) VR128:$src)>;

  def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
            (!cast<I>(OpcPrefix#DQrr) VR128:$src)>;
  }
  // Extending-load fragments map straight to the memory forms.
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;

  def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;

  def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  }
  // ExtOp applied to a bitcast scalar/zero-extending load, or to a full
  // 128-bit vector load, also folds into the memory forms.
  let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))),
            (!cast<I>(OpcPrefix#BWrm) addr:$src)>;
  }
  let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))),
            (!cast<I>(OpcPrefix#BDrm) addr:$src)>;

  def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (loadv16i8 addr:$src))),
            (!cast<I>(OpcPrefix#BQrm) addr:$src)>;

  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;
  def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))),
            (!cast<I>(OpcPrefix#WDrm) addr:$src)>;

  def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))),
            (!cast<I>(OpcPrefix#WQrm) addr:$src)>;

  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))),
            (!cast<I>(OpcPrefix#DQrm) addr:$src)>;
  }
}

defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec>;
defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec>;

// The same pattern set is instantiated again, under UseSSE41, for the
// legacy-encoded PMOVSX/PMOVZX instructions.
let Predicates = [UseSSE41] in {
  defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec>;
  defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec>;
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Extract Instructions
//===----------------------------------------------------------------------===//

/// SS41I_extract8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                 (ins VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
                                         timm:$src2))]>,
                  Sched<[WriteVecExtract]>;
  let hasSideEffects = 0, mayStore = 1 in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), timm:$src2))),
                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoBWI] in
  defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX, WIG;

defm PEXTRB      : SS41I_extract8<0x14, "pextrb">;


/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
// The register-destination record (rr_REV) has no selection pattern and is
// marked isCodeGenOnly with ForceDisassemble set; only the store form (mr)
// carries a pattern.
multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
  def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                   (ins VR128:$src1, u8imm:$src2),
                   !strconcat(OpcodeStr,
                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>,
                   Sched<[WriteVecExtract]>;

  let hasSideEffects = 0, mayStore = 1 in
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i16mem:$dst, VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), timm:$src2))),
                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoBWI] in
  defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX, WIG;

defm PEXTRW      : SS41I_extract16<0x15, "pextrw">;

// An f16 store is selected as (V)PEXTRWmr with immediate 0, after copying the
// FR16 value into VR128.
let Predicates = [UseSSE41] in
  def : Pat<(store f16:$src, addr:$dst), (PEXTRWmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>;

let Predicates = [HasAVX, NoBWI] in
  def : Pat<(store f16:$src, addr:$dst), (VPEXTRWmr addr:$dst, (v8i16 (COPY_TO_REGCLASS FR16:$src, VR128)), 0)>;


/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
                 (ins VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR32:$dst,
                  (extractelt (v4i32 VR128:$src1), imm:$src2))]>,
                  Sched<[WriteVecExtract]>;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i32mem:$dst, VR128:$src1, u8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
                          addr:$dst)]>, Sched<[WriteVecExtractSt]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;

defm PEXTRD      : SS41I_extract32<0x16, "pextrd">;

/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
multiclass
SS41I_extract64<bits<8> opc, string OpcodeStr> { 5300 def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst), 5301 (ins VR128:$src1, u8imm:$src2), 5302 !strconcat(OpcodeStr, 5303 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5304 [(set GR64:$dst, 5305 (extractelt (v2i64 VR128:$src1), imm:$src2))]>, 5306 Sched<[WriteVecExtract]>; 5307 def mr : SS4AIi8<opc, MRMDestMem, (outs), 5308 (ins i64mem:$dst, VR128:$src1, u8imm:$src2), 5309 !strconcat(OpcodeStr, 5310 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5311 [(store (extractelt (v2i64 VR128:$src1), imm:$src2), 5312 addr:$dst)]>, Sched<[WriteVecExtractSt]>; 5313} 5314 5315let Predicates = [HasAVX, NoDQI] in 5316 defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, REX_W; 5317 5318defm PEXTRQ : SS41I_extract64<0x16, "pextrq">, REX_W; 5319 5320/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory 5321/// destination 5322multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> { 5323 def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst), 5324 (ins VR128:$src1, u8imm:$src2), 5325 !strconcat(OpcodeStr, 5326 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5327 [(set GR32orGR64:$dst, 5328 (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>, 5329 Sched<[WriteVecExtract]>; 5330 def mr : SS4AIi8<opc, MRMDestMem, (outs), 5331 (ins f32mem:$dst, VR128:$src1, u8imm:$src2), 5332 !strconcat(OpcodeStr, 5333 "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), 5334 [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2), 5335 addr:$dst)]>, Sched<[WriteVecExtractSt]>; 5336} 5337 5338let ExeDomain = SSEPackedSingle in { 5339 let Predicates = [UseAVX] in 5340 defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, WIG; 5341 defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">; 5342} 5343 5344//===----------------------------------------------------------------------===// 5345// SSE4.1 - Insert Instructions 5346//===----------------------------------------------------------------------===// 5347 
// SS41I_insert8 - Insert a byte from a GPR (rr) or from memory (rm) into a
// 128-bit vector via X86pinsrb.
//   opc     - opcode byte.
//   asm     - mnemonic.
//   Is2Addr - selects the two-operand assembly string (the default) or, when
//             0, the three-operand string that also prints $src1.
multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, GR32orGR64:$src2, timm:$src3))]>,
      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), timm:$src3))]>,
                   Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoBWI] in {
  defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX, VVVV, WIG;
  // Inserting an any-extended GR8: place the byte in the low 8 bits of an
  // otherwise undefined 32-bit register and use the register form.
  def : Pat<(X86pinsrb VR128:$src1, (i32 (anyext (i8 GR8:$src2))), timm:$src3),
            (VPINSRBrr VR128:$src1, (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                       GR8:$src2, sub_8bit), timm:$src3)>;
}

// The non-VEX form ties the first source to the destination.
let Constraints = "$src1 = $dst" in
  defm PINSRB  : SS41I_insert8<0x20, "pinsrb">;

// SS41I_insert32 - Insert a 32-bit element (insertelt on v4i32) from GR32 (rr)
// or from an i32 load (rm). Parameters are as for SS41I_insert8.
multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR32:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), imm:$src3)))]>,
                   Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}

let Predicates = [HasAVX, NoDQI] in
  defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX, VVVV;
let Constraints = "$src1 = $dst" in
  defm PINSRD : SS41I_insert32<0x22, "pinsrd">;

// SS41I_insert64 - Insert a 64-bit element (insertelt on v2i64) from GR64 (rr)
// or from an i64 load (rm). Parameters are as for SS41I_insert8.
multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, GR64:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), imm:$src3)))]>,
                   Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>;
}

// Same opcode (0x22) as the 32-bit insert above; these instantiations add
// REX_W.
let Predicates = [HasAVX, NoDQI] in
  defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX, VVVV, REX_W;
let Constraints = "$src1 = $dst" in
  defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;

// insertps has a few different modes, there's the first two here below which
// are optimized inserts that won't zero arbitrary elements in the destination
// vector. The next one matches the intrinsic and could zero arbitrary elements
// in the target vector.
// SS41I_insertf32 - X86insertps with a register source (rr) or with a scalar
// f32 load moved into a v4f32 (rm).
//   opc     - opcode byte.
//   asm     - mnemonic.
//   Is2Addr - selects the two-operand assembly string (the default) or, when
//             0, the three-operand string that also prints $src1.
multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, VR128:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86insertps VR128:$src1, VR128:$src2, timm:$src3))]>,
      Sched<[SchedWriteFShuffle.XMM]>;
  def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
      (ins VR128:$src1, f32mem:$src2, u8imm:$src3),
      !if(Is2Addr,
        !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
        !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
      [(set VR128:$dst,
        (X86insertps VR128:$src1,
                   (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
                    timm:$src3))]>,
      Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>;
}

let ExeDomain = SSEPackedSingle in {
  let Predicates = [UseAVX] in
    defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>,
                     VEX, VVVV, WIG;
  let Constraints = "$src1 = $dst" in
    defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>;
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Round Instructions
//===----------------------------------------------------------------------===//

// sse41_fp_unop_p - Packed rounding: one vector source (register or memory)
// plus an immediate.
//   x86memop / mem_frag - memory operand type and load fragment of the m form.
//   RC / VT             - register class and value type of the result.
//   OpNode              - node matched by both forms.
//   sched               - scheduling class; the m form uses sched.Folded.
multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
                           X86MemOperand x86memop, RegisterClass RC,
                           ValueType VT, PatFrag mem_frag, SDPatternOperator OpNode,
                           X86FoldableSchedWrite sched> {
  // Vector intrinsic operation, reg
let Uses = [MXCSR], mayRaiseFPException = 1 in {
  def r : SS4AIi8<opc, MRMSrcReg,
                  (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
                  !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set RC:$dst, (VT (OpNode RC:$src1, timm:$src2)))]>,
                  Sched<[sched]>;

  // Vector intrinsic operation, mem
  def m : SS4AIi8<opc, MRMSrcMem,
                  (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
                  !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set RC:$dst,
                        (VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>,
                  Sched<[sched.Folded]>;
}
}

// avx_fp_unop_rm - Three-operand scalar rounding on FR32/FR64 registers.
// The records carry no selection patterns and are isCodeGenOnly; the
// VROUNDSS/VROUNDSD patterns further down select them with an IMPLICIT_DEF
// first source.
multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
                          string OpcodeStr, X86FoldableSchedWrite sched> {
let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
  def SSr : SS4AIi8<opcss, MRMSrcReg,
        (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
            "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SSm : SS4AIi8<opcss, MRMSrcMem,
        (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
             "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1

let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
  def SDr : SS4AIi8<opcsd, MRMSrcReg,
        (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
              "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SDm : SS4AIi8<opcsd, MRMSrcMem,
        (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
        !strconcat(OpcodeStr,
             "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1
}

// sse41_fp_unop_s - Two-operand scalar rounding on FR32/FR64 registers.
// The records carry no selection patterns and are isCodeGenOnly; the
// ROUNDSS/ROUNDSD patterns further down select them. All records use MXCSR
// and may raise FP exceptions.
multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
                           string OpcodeStr, X86FoldableSchedWrite sched> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
  def SSr : SS4AIi8<opcss, MRMSrcReg,
                    (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SSm : SS4AIi8<opcss, MRMSrcMem,
                    (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1

let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
  def SDr : SS4AIi8<opcsd, MRMSrcReg,
                    (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    []>, Sched<[sched]>;

  let mayLoad = 1 in
  def SDm : SS4AIi8<opcsd, MRMSrcMem,
                    (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
                    !strconcat(OpcodeStr,
                               "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1
}
}

// sse41_fp_binop_s - Scalar rounding on whole VR128 operands (the *_Int
// records), matched through OpNode with result types VT32 / VT64.
//   Is2Addr - selects the two-operand assembly string (the default) or, when
//             0, the three-operand string that also prints $src1.
multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
                            string OpcodeStr, X86FoldableSchedWrite sched,
                            ValueType VT32, ValueType VT64,
                            SDNode OpNode, bit Is2Addr = 1> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
let ExeDomain = SSEPackedSingle in {
  def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
        Sched<[sched]>;

  def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
             (OpNode VR128:$src1, (sse_load_f32 addr:$src2), timm:$src3))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle

let ExeDomain = SSEPackedDouble in {
  def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, timm:$src3)))]>,
        Sched<[sched]>;

  def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set VR128:$dst,
              (OpNode VR128:$src1, (sse_load_f64 addr:$src2), timm:$src3))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedDouble
}
}

// FP round - roundss, roundps, roundsd, roundpd
let Predicates = [HasAVX, NoVLX] in {
  let ExeDomain = SSEPackedSingle, Uses = [MXCSR], mayRaiseFPException = 1 in {
    // Intrinsic form
    defm VROUNDPS  : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32,
                                     loadv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>,
                                   VEX, WIG;
    defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32,
                                     loadv8f32, X86any_VRndScale, SchedWriteFRnd.YMM>,
                                   VEX, VEX_L, WIG;
  }

  let ExeDomain = SSEPackedDouble, Uses = [MXCSR], mayRaiseFPException = 1 in {
    defm VROUNDPD  : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64,
                                     loadv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>,
                                   VEX, WIG;
    defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64,
                                     loadv4f64, X86any_VRndScale, SchedWriteFRnd.YMM>,
                                   VEX, VEX_L, WIG;
  }
}
// Both defm's below use the VROUND prefix; the two multiclasses define
// differently suffixed records (SSr_Int/SSm_Int/SDr_Int/SDm_Int versus
// SSr/SSm/SDr/SDm).
let Predicates = [UseAVX] in {
  defm VROUND  : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
                                  v4f32, v2f64, X86RndScales, 0>,
                                  VEX, VVVV, VEX_LIG, WIG, SIMD_EXC;
  defm VROUND  : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>,
                                VEX, VVVV, VEX_LIG, WIG, SIMD_EXC;
}

// Scalar FR32/FR64 rounding: the first source of the three-operand VEX form is
// left undefined (IMPLICIT_DEF).
let Predicates = [UseAVX] in {
  def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
            (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>;
  def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
            (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>;
}

// The load-folding scalar forms are only selected under OptForSize.
let Predicates = [UseAVX, OptForSize] in {
  def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
            (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
  def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
            (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
}

// Non-VEX packed forms; these use the memop fragments where the VEX forms
// above use the load fragments.
let ExeDomain = SSEPackedSingle in
defm ROUNDPS  : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32,
                                memopv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>;
let ExeDomain = SSEPackedDouble in
defm ROUNDPD  : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64,
                                memopv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>;

defm ROUND  : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>;

let Constraints = "$src1 = $dst" in
defm ROUND  : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
                               v4f32, v2f64, X86RndScales>;

let Predicates = [UseSSE41] in {
  def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
            (ROUNDSSr FR32:$src1, timm:$src2)>;
  def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
            (ROUNDSDr FR64:$src1, timm:$src2)>;
}

// The load-folding scalar forms are only selected under OptForSize.
let Predicates = [UseSSE41, OptForSize] in {
  def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
            (ROUNDSSm addr:$src1, timm:$src2)>;
  def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
            (ROUNDSDm addr:$src1, timm:$src2)>;
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Bit Test
//===----------------------------------------------------------------------===//

// ptest instruction we'll lower to this in X86ISelLowering primarily from
// the intel intrinsic that corresponds to this.
// VEX-encoded packed bit test, 128-bit and 256-bit. The records have no
// outputs; the only result is EFLAGS, set from X86ptest.
// NOTE(review): the 128-bit memory form uses f128mem while the 256-bit memory
// form uses i256mem; this looks inconsistent -- confirm whether intentional.
let Defs = [EFLAGS], Predicates = [HasAVX] in {
def VPTESTrr  : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
                Sched<[SchedWriteVecTest.XMM]>, VEX, WIG;
def VPTESTrm  : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
                Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>,
                VEX, WIG;

def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
                Sched<[SchedWriteVecTest.YMM]>, VEX, VEX_L, WIG;
def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
                "vptest\t{$src2, $src1|$src1, $src2}",
                [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
                Sched<[SchedWriteVecTest.YMM.Folded, SchedWriteVecTest.YMM.ReadAfterFold]>,
                VEX, VEX_L, WIG;
}

// Non-VEX packed bit test; the memory form uses the memop fragment.
let Defs = [EFLAGS] in {
def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
              "ptest\t{$src2, $src1|$src1, $src2}",
              [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
              Sched<[SchedWriteVecTest.XMM]>;
def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
              "ptest\t{$src2, $src1|$src1, $src2}",
              [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
              Sched<[SchedWriteVecTest.XMM.Folded, SchedWriteVecTest.XMM.ReadAfterFold]>;
}

// The bit test instructions below are AVX only
//
// avx_bittest - X86testp on two operands of register class RC (rr) or on a
// register and a mem_frag load (rm); the only result is EFLAGS.
//   vt - value type applied to the second register operand in the rr pattern.
// Defs = [EFLAGS] is not set here; the instantiations below supply it.
multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
                       X86MemOperand x86memop, PatFrag mem_frag, ValueType vt,
                       X86FoldableSchedWrite sched> {
  def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
            [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
            Sched<[sched]>, VEX;
  def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
            [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
            Sched<[sched.Folded, sched.ReadAfterFold]>, VEX;
}

let Defs = [EFLAGS], Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32,
                            SchedWriteFTest.XMM>;
defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32,
                            SchedWriteFTest.YMM>, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64,
                            SchedWriteFTest.XMM>;
defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64,
                            SchedWriteFTest.YMM>, VEX_L;
}
}

//===----------------------------------------------------------------------===//
// SSE4.1 - Misc Instructions
//===----------------------------------------------------------------------===//

// POPCNT: ctpop on 16/32/64-bit GPRs, with register and load-folding forms.
// Every pattern also lists EFLAGS as an implicit result.
let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
  def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
                     Sched<[WritePOPCNT]>, OpSize16, TB, XS;
  def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
                     "popcnt{w}\t{$src, $dst|$dst, $src}",
                     [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
                      (implicit EFLAGS)]>,
                      Sched<[WritePOPCNT.Folded]>, OpSize16, TB, XS;

  def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>,
                     Sched<[WritePOPCNT]>, OpSize32, TB, XS;

  def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
                     "popcnt{l}\t{$src, $dst|$dst, $src}",
                     [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
                      (implicit EFLAGS)]>,
                      Sched<[WritePOPCNT.Folded]>, OpSize32, TB, XS;

  def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>,
                      Sched<[WritePOPCNT]>, TB, XS;
  def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
                      "popcnt{q}\t{$src, $dst|$dst, $src}",
                      [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
                       (implicit EFLAGS)]>,
                       Sched<[WritePOPCNT.Folded]>, TB, XS;
}

// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
//   OpNode  - node applied to the v8i16 source.
//   ld_frag - load fragment used by the rm form.
//   Sched   - scheduling class; the rm form uses Sched.Folded.
multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
                                 SDNode OpNode, PatFrag ld_frag,
                                 X86FoldableSchedWrite Sched> {
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>,
                 Sched<[Sched]>;
  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                  (ins i128mem:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR128:$dst,
                    (v8i16 (OpNode (ld_frag addr:$src))))]>,
                 Sched<[Sched.Folded]>;
}

// PHMIN has the same profile as PSAD, thus we use the same scheduling
// model, although the naming is misleading.
// NOTE(review): the scheduling class actually passed below is WritePHMINPOS,
// so the remark above looks stale -- confirm.
let Predicates = [HasAVX] in
defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw",
                                         X86phminpos, load,
                                         WritePHMINPOS>, VEX, WIG;
defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
                                         X86phminpos, memop,
                                         WritePHMINPOS>;

/// SS48I_binop_rm - Simple SSE41 binary operator.
// Binary operator with register (rr) and load-folding (rm) forms.
//   OpNode     - node matched on the two sources.
//   OpVT       - value type of the result.
//   RC         - register class of the destination and register sources.
//   memop_frag - load fragment used by the rm form.
//   x86memop   - memory operand type of the rm form.
//   sched      - scheduling class; rm uses sched.Folded/sched.ReadAfterFold.
//   Is2Addr    - selects the two-operand assembly string (the default) or,
//                when 0, the three-operand string that also prints $src1.
// Only the rr form is marked isCommutable.
multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                          X86MemOperand x86memop, X86FoldableSchedWrite sched,
                          bit Is2Addr = 1> {
  let isCommutable = 1 in
  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
       (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
       Sched<[sched]>;
  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
       (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst,
         (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
       Sched<[sched.Folded, sched.ReadAfterFold]>;
}

// 128-bit VEX min/max/multiply. The dword and qword-result operations are
// guarded by NoVLX; the byte and word operations by NoVLX_Or_NoBWI.
let Predicates = [HasAVX, NoVLX] in {
  defm VPMINSD   : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX, VVVV, WIG;
  defm VPMINUD   : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX, VVVV, WIG;
  defm VPMAXSD   : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX, VVVV, WIG;
  defm VPMAXUD   : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX, VVVV, WIG;
  defm VPMULDQ   : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128,
                                  load, i128mem, SchedWriteVecIMul.XMM, 0>,
                                  VEX, VVVV, WIG;
}
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
  defm VPMINSB   : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX, VVVV, WIG;
  defm VPMINUW   : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX, VVVV, WIG;
  defm VPMAXSB   : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX, VVVV, WIG;
  defm VPMAXUW   : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
                                  load, i128mem, SchedWriteVecALU.XMM, 0>,
                                  VEX, VVVV, WIG;
}

// 256-bit (AVX2) counterparts of the two groups above.
let Predicates = [HasAVX2, NoVLX] in {
  defm VPMINSDY  : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX, VVVV, VEX_L, WIG;
  defm VPMINUDY  : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX, VVVV, VEX_L, WIG;
  defm VPMAXSDY  : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX, VVVV, VEX_L, WIG;
  defm VPMAXUDY  : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX, VVVV, VEX_L, WIG;
  defm VPMULDQY  : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256,
                                  load, i256mem, SchedWriteVecIMul.YMM, 0>,
                                  VEX, VVVV, VEX_L, WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  defm VPMINSBY  : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX, VVVV, VEX_L, WIG;
  defm VPMINUWY  : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX, VVVV, VEX_L, WIG;
  defm VPMAXSBY  : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX, VVVV, VEX_L, WIG;
  defm VPMAXUWY  : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX, VVVV, VEX_L, WIG;
}

// Non-VEX forms: two-address ($src1 tied to $dst), using the memop fragment.
let Constraints = "$src1 = $dst" in {
  defm PMINSB   : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMINSD   : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMINUD   : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMINUW   : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXSB   : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXSD   : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXUD   : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMAXUW   : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
                                 memop, i128mem, SchedWriteVecALU.XMM, 1>;
  defm PMULDQ   : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128,
                                 memop, i128mem, SchedWriteVecIMul.XMM, 1>;
}

// Dword multiply and qword compare-equal. The compare instantiations are
// guarded only by HasAVX / HasAVX2; the multiply ones also require NoVLX.
let Predicates = [HasAVX, NoVLX] in
  defm VPMULLD  : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
                                 load, i128mem, SchedWritePMULLD.XMM, 0>,
                                 VEX, VVVV, WIG;
let Predicates = [HasAVX] in
  defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
                                 load, i128mem, SchedWriteVecALU.XMM, 0>,
                                 VEX, VVVV, WIG;

let Predicates = [HasAVX2, NoVLX] in
  defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
                                  load, i256mem, SchedWritePMULLD.YMM, 0>,
                                  VEX, VVVV, VEX_L, WIG;
let Predicates = [HasAVX2] in
  defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
                                  load, i256mem, SchedWriteVecALU.YMM, 0>,
                                  VEX, VVVV, VEX_L, WIG;

let Constraints = "$src1 = $dst" in {
  defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
                                memop, i128mem, SchedWritePMULLD.XMM, 1>;
  defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
                                memop, i128mem, SchedWriteVecALU.XMM, 1>;
}

/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
/// The patterns match the intrinsic IntId directly; Is2Addr has no default
/// here and selects the two- or three-operand assembly string. Only the rri
/// form is marked isCommutable.
multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
                 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
                 X86MemOperand x86memop, bit Is2Addr,
                 X86FoldableSchedWrite sched> {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst, (IntId RC:$src1, RC:$src2, timm:$src3))]>,
        Sched<[sched]>;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
          (IntId RC:$src1, (memop_frag addr:$src2), timm:$src3))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
}

/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
/// Same shape as SS41I_binop_rmi_int, but the patterns match the node OpNode
/// with an explicit result type OpVT instead of an intrinsic.
multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
                           X86MemOperand x86memop, bit Is2Addr,
                           X86FoldableSchedWrite sched> {
  let isCommutable = 1 in
  def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
        Sched<[sched]>;
  def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
          (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
}

// BlendCommuteImm{2,4,8} - Keep only the low 2/4/8 bits of the immediate and
// invert each of them.
def BlendCommuteImm2 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue() & 0x03;
  return getI8Imm(Imm ^ 0x03, SDLoc(N));
}]>;

def BlendCommuteImm4 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue() & 0x0f;
  return getI8Imm(Imm ^ 0x0f, SDLoc(N));
}]>;

def BlendCommuteImm8 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue() & 0xff;
  return getI8Imm(Imm ^ 0xff, SDLoc(N));
}]>;

// Turn a 4-bit blendi immediate to 8-bit for use with pblendw.
// Each set bit i (0..3) of the input becomes the bit pair 2i, 2i+1.
def BlendScaleImm4 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 4; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0x3 << (i * 2);
  }
  return getI8Imm(NewImm, SDLoc(N));
}]>;

// Turn a 2-bit blendi immediate to 8-bit for use with pblendw.
// Each set bit i (0..1) of the input becomes the nibble at bits 4i..4i+3.
def BlendScaleImm2 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 2; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0xf << (i * 4);
  }
  return getI8Imm(NewImm, SDLoc(N));
}]>;

// Turn a 2-bit blendi immediate to 4-bit for use with pblendd.
// Each set bit i (0..1) of the input becomes the bit pair 2i, 2i+1.
def BlendScaleImm2to4 : SDNodeXForm<timm, [{
  uint8_t Imm = N->getZExtValue();
  uint8_t NewImm = 0;
  for (unsigned i = 0; i != 2; ++i) {
    if (Imm & (1 << i))
      NewImm |= 0x3 << (i * 2);
  }
  return getI8Imm(NewImm, SDLoc(N));
}]>;

// Turn a 4-bit blendi immediate to 8-bit for use with pblendw and invert it.
def BlendScaleCommuteImm4 : SDNodeXForm<timm, [{
  uint8_t Src = N->getZExtValue();
  uint8_t Wide = 0;
  // Every set source bit turns into two adjacent set bits ...
  for (unsigned Lane = 0; Lane != 4; ++Lane)
    if (Src & (1 << Lane))
      Wide |= 0x3 << (Lane * 2);
  // ... and the widened 8-bit mask is then inverted.
  return getI8Imm(Wide ^ 0xff, SDLoc(N));
}]>;

// Widen a 2-bit blendi immediate to the 8-bit pblendw form and invert it.
def BlendScaleCommuteImm2 : SDNodeXForm<timm, [{
  uint8_t Src = N->getZExtValue();
  uint8_t Wide = 0;
  // Every set source bit turns into four adjacent set bits ...
  for (unsigned Lane = 0; Lane != 2; ++Lane)
    if (Src & (1 << Lane))
      Wide |= 0xf << (Lane * 4);
  // ... and the widened 8-bit mask is then inverted.
  return getI8Imm(Wide ^ 0xff, SDLoc(N));
}]>;

// Widen a 2-bit blendi immediate to the 4-bit pblendd form and invert it.
def BlendScaleCommuteImm2to4 : SDNodeXForm<timm, [{
  uint8_t Src = N->getZExtValue();
  uint8_t Wide = 0;
  // Every set source bit turns into two adjacent set bits ...
  for (unsigned Lane = 0; Lane != 2; ++Lane)
    if (Src & (1 << Lane))
      Wide |= 0x3 << (Lane * 2);
  // ... and the widened 4-bit mask is then inverted.
  return getI8Imm(Wide ^ 0xf, SDLoc(N));
}]>;

let Predicates = [HasAVX] in {
  // SS41I_binop_rmi_int marks its rri record commutable; the enclosing let
  // is intended to turn that off for MPSADBW.
  let isCommutable = 0 in {
    defm VMPSADBW : SS41I_binop_rmi_int<
        0x42, "vmpsadbw", int_x86_sse41_mpsadbw, VR128, load, i128mem, 0,
        SchedWriteMPSAD.XMM>, VEX, VVVV, WIG;
  }

  // Dot products read MXCSR and may raise FP exceptions.
  let Uses = [MXCSR], mayRaiseFPException = 1 in {
    let ExeDomain = SSEPackedSingle in {
      defm VDPPS : SS41I_binop_rmi_int<
          0x40, "vdpps", int_x86_sse41_dpps, VR128, load, f128mem, 0,
          SchedWriteDPPS.XMM>, VEX, VVVV, WIG;
    }
    let ExeDomain = SSEPackedDouble in {
      defm VDPPD : SS41I_binop_rmi_int<
          0x41, "vdppd", int_x86_sse41_dppd, VR128, load, f128mem, 0,
          SchedWriteDPPD.XMM>, VEX, VVVV, WIG;
    }
    // NOTE(review): this uses i256mem while the 128-bit dot products above
    // use f128mem - confirm whether f256mem was intended.
    let ExeDomain = SSEPackedSingle in {
      defm VDPPSY : SS41I_binop_rmi_int<
          0x40, "vdpps", int_x86_avx_dp_ps_256, VR256, load, i256mem, 0,
          SchedWriteDPPS.YMM>, VEX, VVVV, VEX_L, WIG;
    }
  }
}

let Predicates = [HasAVX2] in {
  let isCommutable = 0 in {
    defm VMPSADBWY : SS41I_binop_rmi_int<
        0x42, "vmpsadbw", int_x86_avx2_mpsadbw, VR256, load, i256mem, 0,
        SchedWriteMPSAD.YMM>, VEX, VVVV, VEX_L, WIG;
  }
}

// Non-VEX forms; $dst is tied to $src1.
let Constraints = "$src1 = $dst" in {
  let isCommutable = 0 in {
    defm MPSADBW : SS41I_binop_rmi_int<
        0x42, "mpsadbw", int_x86_sse41_mpsadbw, VR128, memop, i128mem, 1,
        SchedWriteMPSAD.XMM>;
  }

  let ExeDomain = SSEPackedSingle in {
    defm DPPS : SS41I_binop_rmi_int<
        0x40, "dpps", int_x86_sse41_dpps, VR128, memop, f128mem, 1,
        SchedWriteDPPS.XMM>, SIMD_EXC;
  }
  let ExeDomain = SSEPackedDouble in {
    defm DPPD : SS41I_binop_rmi_int<
        0x41, "dppd", int_x86_sse41_dppd, VR128, memop, f128mem, 1,
        SchedWriteDPPD.XMM>, SIMD_EXC;
  }
}

/// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate.
///
/// Defines rri and rmi records in execution domain `d`, tying $dst to $src1
/// when Is2Addr is set, plus an extra pattern that matches a load in the
/// first source by swapping the operands and rewriting the immediate with
/// commuteXForm.
multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           ValueType OpVT, RegisterClass RC,
                           PatFrag memop_frag, X86MemOperand x86memop,
                           bit Is2Addr, Domain d,
                           X86FoldableSchedWrite sched,
                           SDNodeXForm commuteXForm> {
  let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
    let isCommutable = 1 in {
      def rri : SS4AIi8<
          opc, MRMSrcReg, (outs RC:$dst),
          (ins RC:$src1, RC:$src2, u8imm:$src3),
          !if(Is2Addr,
              !strconcat(OpcodeStr,
                         "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
              !strconcat(OpcodeStr,
                         "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
          [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
          Sched<[sched]>;
    }
    def rmi : SS4AIi8<
        opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !if(Is2Addr,
            !strconcat(OpcodeStr,
                       "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
            !strconcat(OpcodeStr,
                       "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
        [(set RC:$dst,
              (OpVT (OpNode RC:$src1, (memop_frag addr:$src2), timm:$src3)))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
  }

  // Commute when the load is the first source: the register becomes $src1
  // of the rmi form and the immediate goes through commuteXForm.
  def : Pat<(OpVT (OpNode (memop_frag addr:$src2), RC:$src1, timm:$src3)),
            (!cast<Instruction>(NAME#"rmi")
                RC:$src1, addr:$src2, (commuteXForm timm:$src3))>;
}

// The commute transform passed to each defm inverts as many immediate bits
// as the value type has elements (e.g. 4 for v4f32, 2 for v2f64).
let Predicates = [HasAVX] in {
  defm VBLENDPS : SS41I_blend_rmi<
      0x0C, "vblendps", X86Blendi, v4f32, VR128, load, f128mem, 0,
      SSEPackedSingle, SchedWriteFBlend.XMM, BlendCommuteImm4>,
      VEX, VVVV, WIG;
  defm VBLENDPSY : SS41I_blend_rmi<
      0x0C, "vblendps", X86Blendi, v8f32, VR256, load, f256mem, 0,
      SSEPackedSingle, SchedWriteFBlend.YMM, BlendCommuteImm8>,
      VEX, VVVV, VEX_L, WIG;
  defm VBLENDPD : SS41I_blend_rmi<
      0x0D, "vblendpd", X86Blendi, v2f64, VR128, load, f128mem, 0,
      SSEPackedDouble, SchedWriteFBlend.XMM, BlendCommuteImm2>,
      VEX, VVVV, WIG;
  defm VBLENDPDY : SS41I_blend_rmi<
      0x0D, "vblendpd", X86Blendi, v4f64, VR256, load, f256mem, 0,
      SSEPackedDouble, SchedWriteFBlend.YMM, BlendCommuteImm4>,
      VEX, VVVV, VEX_L, WIG;
  defm VPBLENDW : SS41I_blend_rmi<
      0x0E, "vpblendw", X86Blendi, v8i16, VR128, load, i128mem, 0,
      SSEPackedInt, SchedWriteBlend.XMM, BlendCommuteImm8>,
      VEX, VVVV, WIG;
}

let Predicates = [HasAVX2] in {
  defm VPBLENDWY : SS41I_blend_rmi<
      0x0E, "vpblendw", X86Blendi, v16i16, VR256, load, i256mem, 0,
      SSEPackedInt, SchedWriteBlend.YMM, BlendCommuteImm8>,
      VEX, VVVV, VEX_L, WIG;
}

// Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw.
// ExecutionDomainFixPass will cleanup domains later on.
let Predicates = [HasAVX1Only] in {
  // v4i64: use vblendpd on the 256-bit register.
  def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
            (VBLENDPDYrri VR256:$src1, VR256:$src2, timm:$src3)>;
  def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
            (VBLENDPDYrmi VR256:$src1, addr:$src2, timm:$src3)>;
  def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
            (VBLENDPDYrmi VR256:$src1, addr:$src2,
                (BlendCommuteImm4 timm:$src3))>;

  // Use pblendw for 128-bit integer to keep it in the integer domain and
  // prevent it from becoming movsd via commuting under optsize.
  def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
            (VPBLENDWrri VR128:$src1, VR128:$src2,
                (BlendScaleImm2 timm:$src3))>;
  def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
            (VPBLENDWrmi VR128:$src1, addr:$src2,
                (BlendScaleImm2 timm:$src3))>;
  def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
            (VPBLENDWrmi VR128:$src1, addr:$src2,
                (BlendScaleCommuteImm2 timm:$src3))>;

  // v8i32: use vblendps on the 256-bit register.
  def : Pat<(X86Blendi (v8i32 VR256:$src1), (v8i32 VR256:$src2), timm:$src3),
            (VBLENDPSYrri VR256:$src1, VR256:$src2, timm:$src3)>;
  def : Pat<(X86Blendi VR256:$src1, (loadv8i32 addr:$src2), timm:$src3),
            (VBLENDPSYrmi VR256:$src1, addr:$src2, timm:$src3)>;
  def : Pat<(X86Blendi (loadv8i32 addr:$src2), VR256:$src1, timm:$src3),
            (VBLENDPSYrmi VR256:$src1, addr:$src2,
                (BlendCommuteImm8 timm:$src3))>;

  // Use pblendw for 128-bit integer to keep it in the integer domain and
  // prevent it from becoming movss via commuting under optsize.
  def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
            (VPBLENDWrri VR128:$src1, VR128:$src2,
                (BlendScaleImm4 timm:$src3))>;
  def : Pat<(X86Blendi VR128:$src1, (loadv4i32 addr:$src2), timm:$src3),
            (VPBLENDWrmi VR128:$src1, addr:$src2,
                (BlendScaleImm4 timm:$src3))>;
  def : Pat<(X86Blendi (loadv4i32 addr:$src2), VR128:$src1, timm:$src3),
            (VPBLENDWrmi VR128:$src1, addr:$src2,
                (BlendScaleCommuteImm4 timm:$src3))>;
}

// Non-VEX immediate blends; Is2Addr = 1 ties $dst to $src1 inside
// SS41I_blend_rmi.
defm BLENDPS : SS41I_blend_rmi<
    0x0C, "blendps", X86Blendi, v4f32, VR128, memop, f128mem, 1,
    SSEPackedSingle, SchedWriteFBlend.XMM, BlendCommuteImm4>;
defm BLENDPD : SS41I_blend_rmi<
    0x0D, "blendpd", X86Blendi, v2f64, VR128, memop, f128mem, 1,
    SSEPackedDouble, SchedWriteFBlend.XMM, BlendCommuteImm2>;
defm PBLENDW : SS41I_blend_rmi<
    0x0E, "pblendw", X86Blendi, v8i16, VR128, memop, i128mem, 1,
    SSEPackedInt, SchedWriteBlend.XMM, BlendCommuteImm8>;

let Predicates = [UseSSE41] in {
  // Use pblendw for 128-bit integer to keep it in the integer domain and
  // prevent it from becoming movsd/movss via commuting under optsize.
  def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
            (PBLENDWrri VR128:$src1, VR128:$src2,
                (BlendScaleImm2 timm:$src3))>;
  def : Pat<(X86Blendi VR128:$src1, (memopv2i64 addr:$src2), timm:$src3),
            (PBLENDWrmi VR128:$src1, addr:$src2,
                (BlendScaleImm2 timm:$src3))>;
  def : Pat<(X86Blendi (memopv2i64 addr:$src2), VR128:$src1, timm:$src3),
            (PBLENDWrmi VR128:$src1, addr:$src2,
                (BlendScaleCommuteImm2 timm:$src3))>;

  def : Pat<(X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2), timm:$src3),
            (PBLENDWrri VR128:$src1, VR128:$src2,
                (BlendScaleImm4 timm:$src3))>;
  def : Pat<(X86Blendi VR128:$src1, (memopv4i32 addr:$src2), timm:$src3),
            (PBLENDWrmi VR128:$src1, addr:$src2,
                (BlendScaleImm4 timm:$src3))>;
  def : Pat<(X86Blendi (memopv4i32 addr:$src2), VR128:$src1, timm:$src3),
            (PBLENDWrmi VR128:$src1, addr:$src2,
                (BlendScaleCommuteImm4 timm:$src3))>;
}

// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
let Predicates = [HasAVX] in {
  // Register source: widen $src2 to 256 bits and blend its low elements
  // into $src1.
  def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2),
                              (iPTR 0)),
            (VBLENDPDYrri
                VR256:$src1,
                (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src2, sub_xmm),
                0x3)>;
  def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2),
                              (iPTR 0)),
            (VBLENDPSYrri
                VR256:$src1,
                (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src2, sub_xmm),
                0xf)>;

  // Loaded base vector: the inserted register is the first blend source and
  // the immediate selects the upper elements from memory.
  def : Pat<(insert_subvector (loadv4f64 addr:$src2), (v2f64 VR128:$src1),
                              (iPTR 0)),
            (VBLENDPDYrmi
                (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src1, sub_xmm),
                addr:$src2, 0xc)>;
  def : Pat<(insert_subvector (loadv8f32 addr:$src2), (v4f32 VR128:$src1),
                              (iPTR 0)),
            (VBLENDPSYrmi
                (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src1, sub_xmm),
                addr:$src2, 0xf0)>;
}

/// SS41I_quaternary_avx - AVX SSE 4.1 with 4 operands.
///
/// Defines rr and rm records with three sources. The selection pattern
/// passes the operands to OpNode in the order ($src3, $src2, $src1).
multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr,
                                RegisterClass RC, X86MemOperand x86memop,
                                ValueType VT, PatFrag mem_frag,
                                SDNode OpNode,
                                X86FoldableSchedWrite sched> {
  def rr : Ii8Reg<
      opc, MRMSrcReg, (outs RC:$dst),
      (ins RC:$src1, RC:$src2, RC:$src3),
      !strconcat(OpcodeStr,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))],
      SSEPackedInt>, TA, PD, VEX, VVVV,
      Sched<[sched]>;

  def rm : Ii8Reg<
      opc, MRMSrcMem, (outs RC:$dst),
      (ins RC:$src1, x86memop:$src2, RC:$src3),
      !strconcat(OpcodeStr,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
      [(set RC:$dst,
            (OpNode RC:$src3, (mem_frag addr:$src2), RC:$src1))],
      SSEPackedInt>, TA, PD, VEX, VVVV,
      Sched<[sched.Folded, sched.ReadAfterFold,
             // x86memop:$src2
             ReadDefault, ReadDefault, ReadDefault, ReadDefault,
             ReadDefault,
             // RC:$src3
             sched.ReadAfterFold]>;
}

let Predicates = [HasAVX] in {
  let ExeDomain = SSEPackedDouble in {
    defm VBLENDVPD : SS41I_quaternary_avx<
        0x4B, "vblendvpd", VR128, f128mem, v2f64, loadv2f64, X86Blendv,
        SchedWriteFVarBlend.XMM>;
    defm VBLENDVPDY : SS41I_quaternary_avx<
        0x4B, "vblendvpd", VR256, f256mem, v4f64, loadv4f64, X86Blendv,
        SchedWriteFVarBlend.YMM>, VEX_L;
  } // ExeDomain = SSEPackedDouble
  let ExeDomain = SSEPackedSingle in {
    defm VBLENDVPS : SS41I_quaternary_avx<
        0x4A, "vblendvps", VR128, f128mem, v4f32, loadv4f32, X86Blendv,
        SchedWriteFVarBlend.XMM>;
    defm VBLENDVPSY : SS41I_quaternary_avx<
        0x4A, "vblendvps", VR256, f256mem, v8f32, loadv8f32, X86Blendv,
        SchedWriteFVarBlend.YMM>, VEX_L;
  } // ExeDomain = SSEPackedSingle
  defm VPBLENDVB : SS41I_quaternary_avx<
      0x4C, "vpblendvb", VR128, i128mem, v16i8, loadv16i8, X86Blendv,
      SchedWriteVarBlend.XMM>;
}

let Predicates = [HasAVX2] in {
  defm VPBLENDVBY : SS41I_quaternary_avx<
      0x4C, "vpblendvb", VR256, i256mem, v32i8, loadv32i8, X86Blendv,
      SchedWriteVarBlend.YMM>, VEX_L;
}

// Select 32/64-bit integer variable blends with the FP blendv instructions.
let Predicates = [HasAVX] in {
  def : Pat<(v4i32 (X86Blendv (v4i32 VR128:$mask), (v4i32 VR128:$src1),
                              (v4i32 VR128:$src2))),
            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v2i64 (X86Blendv (v2i64 VR128:$mask), (v2i64 VR128:$src1),
                              (v2i64 VR128:$src2))),
            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
  def : Pat<(v8i32 (X86Blendv (v8i32 VR256:$mask), (v8i32 VR256:$src1),
                              (v8i32 VR256:$src2))),
            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
  def : Pat<(v4i64 (X86Blendv (v4i64 VR256:$mask), (v4i64 VR256:$src1),
                              (v4i64 VR256:$src2))),
            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
}

// Prefer a movss or movsd over a blendps when optimizing for size. These were
// changed to use blends because blends have better throughput on sandybridge
// and haswell, but movs[s/d] are 1-2 byte shorter instructions.
let Predicates = [HasAVX, OptForSpeed] in {
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;

  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))),
            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)),
            (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;

  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))),
            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)),
            (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;

  // Move the low 32-bit element and clear the high bits. The blend is done
  // on the xmm subregister and the result is placed back with SUBREG_TO_REG.
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (SUBREG_TO_REG
                (i32 0),
                (v4f32 (VBLENDPSrri
                           (v4f32 (V_SET0)),
                           (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)),
                           (i8 1))),
                sub_xmm)>;
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (SUBREG_TO_REG
                (i32 0),
                (v4i32 (VPBLENDWrri
                           (v4i32 (V_SET0)),
                           (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
                           (i8 3))),
                sub_xmm)>;
}

// Prefer a movss or movsd over a blendps when optimizing for size. These were
// changed to use blends because blends have better throughput on sandybridge
// and haswell, but movs[s/d] are 1-2 byte shorter instructions.
let Predicates = [UseSSE41, OptForSpeed] in {
  // With SSE41 we can use blends for these patterns.
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;

  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))),
            (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)),
            (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>;

  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))),
            (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>;
  def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)),
            (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>;
}


/// SS41I_ternary - SSE 4.1 ternary operator.
///
/// The third operand is the implicit XMM0 register: it is listed in Uses,
/// appears as XMM0 in the selection patterns, and is printed explicitly in
/// the assembly string. $dst is tied to $src1.
let Uses = [XMM0], Constraints = "$src1 = $dst" in {
  multiclass SS41I_ternary<bits<8> opc, string OpcodeStr, ValueType VT,
                           PatFrag mem_frag, X86MemOperand x86memop,
                           SDNode OpNode, X86FoldableSchedWrite sched> {
    def rr0 : SS48I<
        opc, MRMSrcReg, (outs VR128:$dst),
        (ins VR128:$src1, VR128:$src2),
        !strconcat(OpcodeStr,
                   "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
        [(set VR128:$dst,
              (VT (OpNode XMM0, VR128:$src2, VR128:$src1)))]>,
        Sched<[sched]>;

    def rm0 : SS48I<
        opc, MRMSrcMem, (outs VR128:$dst),
        (ins VR128:$src1, x86memop:$src2),
        !strconcat(OpcodeStr,
                   "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
        [(set VR128:$dst,
              (OpNode XMM0, (mem_frag addr:$src2), VR128:$src1))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}

let ExeDomain = SSEPackedDouble in {
  defm BLENDVPD : SS41I_ternary<
      0x15, "blendvpd", v2f64, memopv2f64, f128mem, X86Blendv,
      SchedWriteFVarBlend.XMM>;
}
let ExeDomain = SSEPackedSingle in {
  defm BLENDVPS : SS41I_ternary<
      0x14, "blendvps", v4f32, memopv4f32, f128mem, X86Blendv,
      SchedWriteFVarBlend.XMM>;
}
defm PBLENDVB : SS41I_ternary<
    0x10, "pblendvb", v16i8, memopv16i8, i128mem, X86Blendv,
    SchedWriteVarBlend.XMM>;

// Two-operand spellings that leave the implicit xmm0 argument out.
def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
                (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>;
def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
                (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>;
def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
                (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>;
def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
                (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>;
def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
                (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>;
def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
                (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>;

// Select 32/64-bit integer variable blends with the FP blendv instructions.
let Predicates = [UseSSE41] in {
  def : Pat<(v4i32 (X86Blendv (v4i32 XMM0), (v4i32 VR128:$src1),
                              (v4i32 VR128:$src2))),
            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
  def : Pat<(v2i64 (X86Blendv (v2i64 XMM0), (v2i64 VR128:$src1),
                              (v2i64 VR128:$src2))),
            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
}

let AddedComplexity = 400 in { // Prefer non-temporal versions

// The instruction records carry no pattern; selection is done by the Pat
// definitions that follow.
let Predicates = [HasAVX, NoVLX] in {
  def VMOVNTDQArm : SS48I<
      0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
      "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
      Sched<[SchedWriteVecMoveLSNT.XMM.RM]>, VEX, WIG;
}
let Predicates = [HasAVX2, NoVLX] in {
  def VMOVNTDQAYrm : SS48I<
      0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
      "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
      Sched<[SchedWriteVecMoveLSNT.YMM.RM]>, VEX, VEX_L, WIG;
}
def MOVNTDQArm : SS48I<
    0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
    "movntdqa\t{$src, $dst|$dst, $src}", []>,
    Sched<[SchedWriteVecMoveLSNT.XMM.RM]>;

// 256-bit aligned non-temporal loads of every listed vector type.
let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(v8f32 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
  def : Pat<(v4f64 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
  def : Pat<(v4i64 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
  def : Pat<(v8i32 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
  def : Pat<(v16i16 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
  def : Pat<(v16f16 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
  def : Pat<(v32i8 (alignednontemporalload addr:$src)),
            (VMOVNTDQAYrm addr:$src)>;
}

// 128-bit aligned non-temporal loads, VEX encoding.
let Predicates = [HasAVX, NoVLX] in {
  def : Pat<(v4f32 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
  def : Pat<(v2f64 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
  def : Pat<(v2i64 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
  def : Pat<(v4i32 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
  def : Pat<(v8f16 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
            (VMOVNTDQArm addr:$src)>;
}

// 128-bit aligned non-temporal loads, non-VEX encoding.
let Predicates = [UseSSE41] in {
  def : Pat<(v4f32 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
  def : Pat<(v2f64 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
  def : Pat<(v2i64 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
  def : Pat<(v4i32 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
  def : Pat<(v8i16 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
  def : Pat<(v8f16 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
  def : Pat<(v16i8 (alignednontemporalload addr:$src)),
            (MOVNTDQArm addr:$src)>;
}

} // AddedComplexity

//===----------------------------------------------------------------------===//
// SSE4.2 - Compare Instructions
//===----------------------------------------------------------------------===//

/// SS42I_binop_rm - Simple SSE 4.2 binary operator.
///
/// Defines rr and rm records. Is2Addr (default 1) picks the two-operand
/// assembly string; passing 0 prints $src1 as a separate operand.
multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, RegisterClass RC,
                          PatFrag memop_frag, X86MemOperand x86memop,
                          X86FoldableSchedWrite sched, bit Is2Addr = 1> {
  def rr : SS428I<
      opc, MRMSrcReg, (outs RC:$dst),
      (ins RC:$src1, RC:$src2),
      !if(Is2Addr,
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
      Sched<[sched]>;
  def rm : SS428I<
      opc, MRMSrcMem, (outs RC:$dst),
      (ins RC:$src1, x86memop:$src2),
      !if(Is2Addr,
          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
      [(set RC:$dst,
            (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
      Sched<[sched.Folded, sched.ReadAfterFold]>;
}

let Predicates = [HasAVX] in {
  defm VPCMPGTQ : SS42I_binop_rm<
      0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128, load, i128mem,
      SchedWriteVecALU.XMM, 0>, VEX, VVVV, WIG;
}

let Predicates = [HasAVX2] in {
  defm VPCMPGTQY : SS42I_binop_rm<
      0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256, load, i256mem,
      SchedWriteVecALU.YMM, 0>, VEX, VVVV, VEX_L, WIG;
}

let Constraints = "$src1 = $dst" in {
  defm PCMPGTQ : SS42I_binop_rm<
      0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128, memop, i128mem,
      SchedWriteVecALU.XMM>;
}

//===----------------------------------------------------------------------===//
// SSE4.2 - String/text Processing Instructions
//===----------------------------------------------------------------------===//

// The four string-compare multiclasses below define pattern-less rr/rm
// records with no explicit outputs; their results are modelled through the
// Defs lists on the enclosing let blocks. The memory forms set mayLoad
// explicitly.

multiclass pcmpistrm_SS42AI<string asm> {
  def rr : SS42AI<
      0x62, MRMSrcReg, (outs),
      (ins VR128:$src1, VR128:$src2, u8imm:$src3),
      !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
      []>, Sched<[WritePCmpIStrM]>;
  let mayLoad = 1 in {
    def rm : SS42AI<
        0x62, MRMSrcMem, (outs),
        (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
        !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
        []>, Sched<[WritePCmpIStrM.Folded, WritePCmpIStrM.ReadAfterFold]>;
  }
}

let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in {
    defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX, WIG;
  }
  defm PCMPISTRM : pcmpistrm_SS42AI<"pcmpistrm">;
}

multiclass SS42AI_pcmpestrm<string asm> {
  def rr : SS42AI<
      0x60, MRMSrcReg, (outs),
      (ins VR128:$src1, VR128:$src3, u8imm:$src5),
      !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
      []>, Sched<[WritePCmpEStrM]>;
  let mayLoad = 1 in {
    def rm : SS42AI<
        0x60, MRMSrcMem, (outs),
        (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
        !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
        []>, Sched<[WritePCmpEStrM.Folded, WritePCmpEStrM.ReadAfterFold]>;
  }
}

// The explicit-length forms additionally read EAX and EDX.
let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in {
    defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX, WIG;
  }
  defm PCMPESTRM : SS42AI_pcmpestrm<"pcmpestrm">;
}

multiclass SS42AI_pcmpistri<string asm> {
  def rr : SS42AI<
      0x63, MRMSrcReg, (outs),
      (ins VR128:$src1, VR128:$src2, u8imm:$src3),
      !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
      []>, Sched<[WritePCmpIStrI]>;
  let mayLoad = 1 in {
    def rm : SS42AI<
        0x63, MRMSrcMem, (outs),
        (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
        !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
        []>, Sched<[WritePCmpIStrI.Folded, WritePCmpIStrI.ReadAfterFold]>;
  }
}

let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in {
    defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX, WIG;
  }
  defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">;
}

multiclass SS42AI_pcmpestri<string asm> {
  def rr : SS42AI<
      0x61, MRMSrcReg, (outs),
      (ins VR128:$src1, VR128:$src3, u8imm:$src5),
      !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
      []>, Sched<[WritePCmpEStrI]>;
  let mayLoad = 1 in {
    def rm : SS42AI<
        0x61, MRMSrcMem, (outs),
        (ins VR128:$src1, i128mem:$src3, u8imm:$src5),
        !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
        []>, Sched<[WritePCmpEStrI.Folded, WritePCmpEStrI.ReadAfterFold]>;
  }
}

// The explicit-length forms additionally read EAX and EDX.
let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
  let Predicates = [HasAVX] in {
    defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX, WIG;
  }
  defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">;
}

//===----------------------------------------------------------------------===//
// SSE4.2 - CRC Instructions
//===----------------------------------------------------------------------===//

// NOTE: 'HasCRC32' is used as CRC32 instructions are GPR only and not directly
// controlled by the SSE42 flag.
//
// No CRC instructions have AVX equivalents

/// Crc32r - register-source CRC32 (opcode 0xF1). `t` describes the source
/// operand, `rc` is the accumulator/destination register class and `node` is
/// the operator used in the selection pattern. The accumulator is tied to the
/// destination.
class Crc32r<X86TypeInfo t, RegisterClass rc, SDPatternOperator node>
  : ITy<0xF1, MRMSrcReg, t, (outs rc:$dst), (ins rc:$src1, t.RegClass:$src2),
        "crc32", binop_args,
        [(set rc:$dst, (node rc:$src1, t.RegClass:$src2))]>,
    Sched<[WriteCRC32]>, NoCD8 {
  let Constraints = "$src1 = $dst";
}

/// Crc32m - memory-source counterpart of Crc32r; the second operand is loaded
/// from `t.MemOperand`.
class Crc32m<X86TypeInfo t, RegisterClass rc, SDPatternOperator node>
  : ITy<0xF1, MRMSrcMem, t, (outs rc:$dst), (ins rc:$src1, t.MemOperand:$src2),
        "crc32", binop_args,
        [(set rc:$dst, (node rc:$src1, (load addr:$src2)))]>,
    Sched<[WriteCRC32.Folded, WriteCRC32.ReadAfterFold]>, NoCD8 {
  let Constraints = "$src1 = $dst";
}

// Legacy-encoded CRC32 (T8 map, XD prefix), selected when EGPR is unavailable.
let Predicates = [HasCRC32, NoEGPR], OpMap = T8, OpPrefix = XD in {
  def CRC32r32r8  : Crc32r<Xi8,  GR32, int_x86_sse42_crc32_32_8>;
  def CRC32r32m8  : Crc32m<Xi8,  GR32, int_x86_sse42_crc32_32_8>;
  def CRC32r32r16 : Crc32r<Xi16, GR32, int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32m16 : Crc32m<Xi16, GR32, int_x86_sse42_crc32_32_16>, OpSize16;
  def CRC32r32r32 : Crc32r<Xi32, GR32, int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r32m32 : Crc32m<Xi32, GR32, int_x86_sse42_crc32_32_32>, OpSize32;
  def CRC32r64r64 : Crc32r<Xi64, GR64, int_x86_sse42_crc32_64_64>;
  def CRC32r64m64 : Crc32m<Xi64, GR64, int_x86_sse42_crc32_64_64>;
  // The 64-bit-accumulator / 8-bit-source forms are instantiated with
  // null_frag rather than an intrinsic.
  def CRC32r64r8  : Crc32r<Xi8,  GR64, null_frag>, REX_W;
  let mayLoad = 1 in
  def CRC32r64m8  : Crc32m<Xi8,  GR64, null_frag>, REX_W;
}

// EVEX-encoded CRC32 (T_MAP4), selected when EGPR is available in 64-bit mode.
let Predicates = [HasCRC32, HasEGPR, In64BitMode], OpMap = T_MAP4, OpEnc = EncEVEX in {
  def CRC32r32r8_EVEX  : Crc32r<Xi8,  GR32, int_x86_sse42_crc32_32_8>;
  def CRC32r32m8_EVEX  : Crc32m<Xi8,  GR32, int_x86_sse42_crc32_32_8>;
  def CRC32r32r16_EVEX : Crc32r<Xi16, GR32, int_x86_sse42_crc32_32_16>, PD;
  def CRC32r32m16_EVEX : Crc32m<Xi16, GR32, int_x86_sse42_crc32_32_16>, PD;
  def CRC32r32r32_EVEX : Crc32r<Xi32, GR32, int_x86_sse42_crc32_32_32>;
  def CRC32r32m32_EVEX : Crc32m<Xi32, GR32, int_x86_sse42_crc32_32_32>;
  def CRC32r64r64_EVEX : Crc32r<Xi64, GR64, int_x86_sse42_crc32_64_64>;
  def CRC32r64m64_EVEX : Crc32m<Xi64, GR64, int_x86_sse42_crc32_64_64>;
  def CRC32r64r8_EVEX  : Crc32r<Xi8,  GR64, null_frag>, REX_W;
  let mayLoad = 1 in
  def CRC32r64m8_EVEX  : Crc32m<Xi8,  GR64, null_frag>, REX_W;
}

//===----------------------------------------------------------------------===//
// SHA-NI Instructions
//===----------------------------------------------------------------------===//

// FIXME: Is there a better scheduler class for SHA than WriteVecIMul?
/// SHAI_binop - two-operand SHA instruction in rr and rm forms.
///   Suffix   - appended to the "rr"/"rm" record names (e.g. "_EVEX").
///   UsesXMM0 - when set, XMM0 is printed in the assembly string and passed
///              as a third operand to the intrinsic in the pattern.
multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
                      X86FoldableSchedWrite sched, string Suffix = "", bit UsesXMM0 = 0> {
  def rr#Suffix : I<Opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2),
                    OpcodeStr #
                      !if(UsesXMM0,
                          "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                          "\t{$src2, $dst|$dst, $src2}"),
                    [!if(UsesXMM0,
                         (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
                         (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>,
                  T8, Sched<[sched]>;

  def rm#Suffix : I<Opc, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, i128mem:$src2),
                    OpcodeStr #
                      !if(UsesXMM0,
                          "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                          "\t{$src2, $dst|$dst, $src2}"),
                    [!if(UsesXMM0,
                         (set VR128:$dst, (IntId VR128:$src1,
                                           (memop addr:$src2), XMM0)),
                         (set VR128:$dst, (IntId VR128:$src1,
                                           (memop addr:$src2))))]>,
                  T8, Sched<[sched.Folded, sched.ReadAfterFold]>;
}

// Legacy-encoded SHA instructions (no EGPR). All are two-address.
let Constraints = "$src1 = $dst", Predicates = [HasSHA, NoEGPR] in {
  def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
                         (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         [(set VR128:$dst,
                           (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
                            (i8 timm:$src3)))]>,
                     TA, Sched<[SchedWriteVecIMul.XMM]>;
  def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
                         (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                         [(set VR128:$dst,
                           (int_x86_sha1rnds4 VR128:$src1,
                            (memop addr:$src2),
                            (i8 timm:$src3)))]>,
                     TA, Sched<[SchedWriteVecIMul.XMM.Folded,
                                SchedWriteVecIMul.XMM.ReadAfterFold]>;

  defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte,
                              SchedWriteVecIMul.XMM>;
  defm SHA1MSG1  : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1,
                              SchedWriteVecIMul.XMM>;
  defm SHA1MSG2  : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2,
                              SchedWriteVecIMul.XMM>;

  // XMM0 is an implicit input of SHA256RNDS2.
  let Uses=[XMM0] in
  defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2,
                                SchedWriteVecIMul.XMM, "", 1>;

  defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1,
                               SchedWriteVecIMul.XMM>;
  defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2,
                               SchedWriteVecIMul.XMM>;
}

// EVEX-encoded SHA instructions (T_MAP4), used with EGPR in 64-bit mode.
// Record names carry the "_EVEX" suffix.
let Constraints = "$src1 = $dst", Predicates = [HasSHA, HasEGPR, In64BitMode] in {
  def SHA1RNDS4rri_EVEX: Ii8<0xD4, MRMSrcReg, (outs VR128:$dst),
                             (ins VR128:$src1, VR128:$src2, u8imm:$src3),
                             "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                             [(set VR128:$dst,
                               (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
                                (i8 timm:$src3)))]>,
                         EVEX, NoCD8, T_MAP4, Sched<[SchedWriteVecIMul.XMM]>;
  def SHA1RNDS4rmi_EVEX: Ii8<0xD4, MRMSrcMem, (outs VR128:$dst),
                             (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
                             "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                             [(set VR128:$dst,
                               (int_x86_sha1rnds4 VR128:$src1,
                                (memop addr:$src2),
                                (i8 timm:$src3)))]>,
                         EVEX, NoCD8, T_MAP4,
                         Sched<[SchedWriteVecIMul.XMM.Folded,
                                SchedWriteVecIMul.XMM.ReadAfterFold]>;

  defm SHA1NEXTE : SHAI_binop<0xD8, "sha1nexte", int_x86_sha1nexte,
                              SchedWriteVecIMul.XMM, "_EVEX">,
                   EVEX, NoCD8, T_MAP4;
  defm SHA1MSG1  : SHAI_binop<0xD9, "sha1msg1", int_x86_sha1msg1,
                              SchedWriteVecIMul.XMM, "_EVEX">,
                   EVEX, NoCD8, T_MAP4;
  defm SHA1MSG2  : SHAI_binop<0xDA, "sha1msg2", int_x86_sha1msg2,
                              SchedWriteVecIMul.XMM, "_EVEX">,
                   EVEX, NoCD8, T_MAP4;

  let Uses=[XMM0] in
  defm SHA256RNDS2 : SHAI_binop<0xDB, "sha256rnds2", int_x86_sha256rnds2,
                                SchedWriteVecIMul.XMM, "_EVEX", 1>,
                     EVEX, NoCD8, T_MAP4;

  defm SHA256MSG1 : SHAI_binop<0xDC, "sha256msg1", int_x86_sha256msg1,
                               SchedWriteVecIMul.XMM, "_EVEX">,
                    EVEX, NoCD8, T_MAP4;
  defm SHA256MSG2 : SHAI_binop<0xDD, "sha256msg2", int_x86_sha256msg2,
                               SchedWriteVecIMul.XMM, "_EVEX">,
                    EVEX, NoCD8, T_MAP4;
}

//===----------------------------------------------------------------------===//
// AES-NI Instructions
//===----------------------------------------------------------------------===//

/// AESI_binop_rm_int - rr/rm forms of a binary AES round instruction.
///   ld_frag - load fragment used by the memory form's pattern.
///   Is2Addr - selects the two-operand assembly string instead of the
///             three-operand one.
///   RC/MemOp - register class and memory operand (128-bit by default).
multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
                             Intrinsic IntId, PatFrag ld_frag,
                             bit Is2Addr = 0, RegisterClass RC = VR128,
                             X86MemOperand MemOp = i128mem> {
  // The assembly string is supplied here, so both defs pass "" to AES8I.
  let AsmString = OpcodeStr#
                  !if(Is2Addr, "\t{$src2, $dst|$dst, $src2}",
                               "\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
    def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst),
                   (ins RC:$src1, RC:$src2), "",
                   [(set RC:$dst, (IntId RC:$src1, RC:$src2))]>,
             Sched<[WriteAESDecEnc]>;
    def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst),
                   (ins RC:$src1, MemOp:$src2), "",
                   [(set RC:$dst, (IntId RC:$src1, (ld_frag addr:$src2)))]>,
             Sched<[WriteAESDecEnc.Folded, WriteAESDecEnc.ReadAfterFold]>;
  }
}

// Perform One Round of an AES Encryption/Decryption Flow
// 128-bit VEX forms.
let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in {
  defm VAESENC     : AESI_binop_rm_int<0xDC, "vaesenc",
                         int_x86_aesni_aesenc, load>, VEX, VVVV, WIG;
  defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast",
                         int_x86_aesni_aesenclast, load>, VEX, VVVV, WIG;
  defm VAESDEC     : AESI_binop_rm_int<0xDE, "vaesdec",
                         int_x86_aesni_aesdec, load>, VEX, VVVV, WIG;
  defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast",
                         int_x86_aesni_aesdeclast, load>, VEX, VVVV, WIG;
}

// 256-bit VEX forms (VAES).
let Predicates = [NoVLX, HasVAES] in {
  defm VAESENCY     : AESI_binop_rm_int<0xDC, "vaesenc",
                          int_x86_aesni_aesenc_256, load, 0, VR256,
                          i256mem>, VEX, VVVV, VEX_L, WIG;
  defm VAESENCLASTY : AESI_binop_rm_int<0xDD, "vaesenclast",
                          int_x86_aesni_aesenclast_256, load, 0, VR256,
                          i256mem>, VEX, VVVV, VEX_L, WIG;
  defm VAESDECY     : AESI_binop_rm_int<0xDE, "vaesdec",
                          int_x86_aesni_aesdec_256, load, 0, VR256,
                          i256mem>, VEX, VVVV, VEX_L, WIG;
  defm VAESDECLASTY : AESI_binop_rm_int<0xDF, "vaesdeclast",
                          int_x86_aesni_aesdeclast_256, load, 0, VR256,
                          i256mem>, VEX, VVVV, VEX_L, WIG;
}

// Legacy SSE forms: two-address, memory operand matched with memop.
let Constraints = "$src1 = $dst" in {
  defm AESENC     : AESI_binop_rm_int<0xDC, "aesenc",
                        int_x86_aesni_aesenc, memop, 1>;
  defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast",
                        int_x86_aesni_aesenclast, memop, 1>;
  defm AESDEC     : AESI_binop_rm_int<0xDE, "aesdec",
                        int_x86_aesni_aesdec, memop, 1>;
  defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast",
                        int_x86_aesni_aesdeclast, memop, 1>;
}

// Perform the AES InvMixColumn Transformation
let Predicates = [HasAVX, HasAES] in {
  def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
                        (ins VR128:$src1),
                        "vaesimc\t{$src1, $dst|$dst, $src1}",
                        [(set VR128:$dst,
                          (int_x86_aesni_aesimc VR128:$src1))]>,
                  Sched<[WriteAESIMC]>, VEX, WIG;
  def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
                        (ins i128mem:$src1),
                        "vaesimc\t{$src1, $dst|$dst, $src1}",
                        [(set VR128:$dst,
                          (int_x86_aesni_aesimc (load addr:$src1)))]>,
                  Sched<[WriteAESIMC.Folded]>, VEX, WIG;
}
def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
                     (ins VR128:$src1),
                     "aesimc\t{$src1, $dst|$dst, $src1}",
                     [(set VR128:$dst,
                       (int_x86_aesni_aesimc VR128:$src1))]>,
               Sched<[WriteAESIMC]>;
def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
                     (ins i128mem:$src1),
                     "aesimc\t{$src1, $dst|$dst, $src1}",
                     [(set VR128:$dst,
                       (int_x86_aesni_aesimc (memop addr:$src1)))]>,
               Sched<[WriteAESIMC.Folded]>;

// AES Round Key Generation Assist
let Predicates = [HasAVX, HasAES] in {
  def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
      (ins VR128:$src1, u8imm:$src2),
      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set VR128:$dst,
        (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
      Sched<[WriteAESKeyGen]>, VEX, WIG;
  def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
      (ins i128mem:$src1, u8imm:$src2),
      "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
      [(set VR128:$dst,
        (int_x86_aesni_aeskeygenassist (load addr:$src1), timm:$src2))]>,
      Sched<[WriteAESKeyGen.Folded]>, VEX, WIG;
}
def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
    (ins VR128:$src1, u8imm:$src2),
    "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
    [(set VR128:$dst,
      (int_x86_aesni_aeskeygenassist VR128:$src1, timm:$src2))]>,
    Sched<[WriteAESKeyGen]>;
def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
    (ins i128mem:$src1, u8imm:$src2),
    "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
    [(set VR128:$dst,
      (int_x86_aesni_aeskeygenassist (memop addr:$src1), timm:$src2))]>,
    Sched<[WriteAESKeyGen.Folded]>;

//===----------------------------------------------------------------------===//
// PCLMUL Instructions
6927//===----------------------------------------------------------------------===// 6928 6929// Immediate transform to help with commuting. 6930def PCLMULCommuteImm : SDNodeXForm<timm, [{ 6931 uint8_t Imm = N->getZExtValue(); 6932 return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N)); 6933}]>; 6934 6935// SSE carry-less Multiplication instructions 6936let Predicates = [NoAVX, HasPCLMUL] in { 6937 let Constraints = "$src1 = $dst" in { 6938 let isCommutable = 1 in 6939 def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), 6940 (ins VR128:$src1, VR128:$src2, u8imm:$src3), 6941 "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", 6942 [(set VR128:$dst, 6943 (int_x86_pclmulqdq VR128:$src1, VR128:$src2, timm:$src3))]>, 6944 Sched<[WriteCLMul]>; 6945 6946 def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), 6947 (ins VR128:$src1, i128mem:$src2, u8imm:$src3), 6948 "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", 6949 [(set VR128:$dst, 6950 (int_x86_pclmulqdq VR128:$src1, (memop addr:$src2), 6951 timm:$src3))]>, 6952 Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>; 6953 } // Constraints = "$src1 = $dst" 6954 6955 def : Pat<(int_x86_pclmulqdq (memop addr:$src2), VR128:$src1, 6956 (i8 timm:$src3)), 6957 (PCLMULQDQrm VR128:$src1, addr:$src2, 6958 (PCLMULCommuteImm timm:$src3))>; 6959} // Predicates = [NoAVX, HasPCLMUL] 6960 6961// SSE aliases 6962foreach HI = ["hq","lq"] in 6963foreach LO = ["hq","lq"] in { 6964 def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}", 6965 (PCLMULQDQrr VR128:$dst, VR128:$src, 6966 !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>; 6967 def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}", 6968 (PCLMULQDQrm VR128:$dst, i128mem:$src, 6969 !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>; 6970} 6971 6972// AVX carry-less Multiplication instructions 6973multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp, 6974 PatFrag LdFrag, Intrinsic IntId> { 6975 let isCommutable = 1 in 6976 def 
rr : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst), 6977 (ins RC:$src1, RC:$src2, u8imm:$src3), 6978 "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 6979 [(set RC:$dst, 6980 (IntId RC:$src1, RC:$src2, timm:$src3))]>, 6981 Sched<[WriteCLMul]>; 6982 6983 def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst), 6984 (ins RC:$src1, MemOp:$src2, u8imm:$src3), 6985 "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", 6986 [(set RC:$dst, 6987 (IntId RC:$src1, (LdFrag addr:$src2), timm:$src3))]>, 6988 Sched<[WriteCLMul.Folded, WriteCLMul.ReadAfterFold]>; 6989 6990 // We can commute a load in the first operand by swapping the sources and 6991 // rotating the immediate. 6992 def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 timm:$src3)), 6993 (!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2, 6994 (PCLMULCommuteImm timm:$src3))>; 6995} 6996 6997let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in 6998defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, load, 6999 int_x86_pclmulqdq>, VEX, VVVV, WIG; 7000 7001let Predicates = [NoVLX, HasVPCLMULQDQ] in 7002defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load, 7003 int_x86_pclmulqdq_256>, VEX, VVVV, VEX_L, WIG; 7004 7005multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC, 7006 X86MemOperand MemOp, string Hi, string Lo> { 7007 def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}", 7008 (!cast<Instruction>(InstStr # "rr") RC:$dst, RC:$src1, RC:$src2, 7009 !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>; 7010 def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}", 7011 (!cast<Instruction>(InstStr # "rm") RC:$dst, RC:$src1, MemOp:$src2, 7012 !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>; 7013} 7014 7015multiclass vpclmulqdq_aliases<string InstStr, RegisterClass RC, 7016 X86MemOperand MemOp> { 7017 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "hq">; 7018 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "lq">; 7019 defm : 
vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "hq">; 7020 defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "lq">; 7021} 7022 7023// AVX aliases 7024defm : vpclmulqdq_aliases<"VPCLMULQDQ", VR128, i128mem>; 7025defm : vpclmulqdq_aliases<"VPCLMULQDQY", VR256, i256mem>; 7026 7027//===----------------------------------------------------------------------===// 7028// SSE4A Instructions 7029//===----------------------------------------------------------------------===// 7030 7031let Predicates = [HasSSE4A] in { 7032 7033let ExeDomain = SSEPackedInt in { 7034let Constraints = "$src = $dst" in { 7035def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst), 7036 (ins VR128:$src, u8imm:$len, u8imm:$idx), 7037 "extrq\t{$idx, $len, $src|$src, $len, $idx}", 7038 [(set VR128:$dst, (X86extrqi VR128:$src, timm:$len, 7039 timm:$idx))]>, 7040 TB, PD, Sched<[SchedWriteVecALU.XMM]>; 7041def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst), 7042 (ins VR128:$src, VR128:$mask), 7043 "extrq\t{$mask, $src|$src, $mask}", 7044 [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src, 7045 VR128:$mask))]>, 7046 TB, PD, Sched<[SchedWriteVecALU.XMM]>; 7047 7048def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst), 7049 (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx), 7050 "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}", 7051 [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2, 7052 timm:$len, timm:$idx))]>, 7053 TB, XD, Sched<[SchedWriteVecALU.XMM]>; 7054def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst), 7055 (ins VR128:$src, VR128:$mask), 7056 "insertq\t{$mask, $src|$src, $mask}", 7057 [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src, 7058 VR128:$mask))]>, 7059 TB, XD, Sched<[SchedWriteVecALU.XMM]>; 7060} 7061} // ExeDomain = SSEPackedInt 7062 7063// Non-temporal (unaligned) scalar stores. 
7064let AddedComplexity = 400 in { // Prefer non-temporal versions 7065let hasSideEffects = 0, mayStore = 1, SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in { 7066def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src), 7067 "movntss\t{$src, $dst|$dst, $src}", []>, TB, XS; 7068 7069def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), 7070 "movntsd\t{$src, $dst|$dst, $src}", []>, TB, XD; 7071} // SchedRW 7072 7073def : Pat<(nontemporalstore FR32:$src, addr:$dst), 7074 (MOVNTSS addr:$dst, (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>; 7075 7076def : Pat<(nontemporalstore FR64:$src, addr:$dst), 7077 (MOVNTSD addr:$dst, (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>; 7078 7079} // AddedComplexity 7080} // HasSSE4A 7081 7082//===----------------------------------------------------------------------===// 7083// AVX Instructions 7084//===----------------------------------------------------------------------===// 7085 7086//===----------------------------------------------------------------------===// 7087// VBROADCAST - Load from memory and broadcast to all elements of the 7088// destination operand 7089// 7090class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC, 7091 X86MemOperand x86memop, ValueType VT, 7092 PatFrag bcast_frag, SchedWrite Sched> : 7093 AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), 7094 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7095 [(set RC:$dst, (VT (bcast_frag addr:$src)))]>, 7096 Sched<[Sched]>, VEX; 7097 7098// AVX2 adds register forms 7099class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC, 7100 ValueType ResVT, ValueType OpVT, SchedWrite Sched> : 7101 AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), 7102 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 7103 [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>, 7104 Sched<[Sched]>, VEX; 7105 7106let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in { 7107 def VBROADCASTSSrm : 
avx_broadcast_rm<0x18, "vbroadcastss", VR128, 7108 f32mem, v4f32, X86VBroadcastld32, 7109 SchedWriteFShuffle.XMM.Folded>; 7110 def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256, 7111 f32mem, v8f32, X86VBroadcastld32, 7112 SchedWriteFShuffle.XMM.Folded>, VEX_L; 7113} 7114let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in 7115def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem, 7116 v4f64, X86VBroadcastld64, 7117 SchedWriteFShuffle.XMM.Folded>, VEX_L; 7118 7119let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in { 7120 def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128, 7121 v4f32, v4f32, SchedWriteFShuffle.XMM>; 7122 def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256, 7123 v8f32, v4f32, WriteFShuffle256>, VEX_L; 7124} 7125let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in 7126def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256, 7127 v4f64, v2f64, WriteFShuffle256>, VEX_L; 7128 7129//===----------------------------------------------------------------------===// 7130// VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both 7131// halves of a 256-bit vector. 
//
// Neither broadcast carries a selection pattern, so mayLoad/hasSideEffects
// are set explicitly; selection goes through the Pat<> records below.
let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in
def VBROADCASTI128rm : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
                             (ins i128mem:$src),
                             "vbroadcasti128\t{$src, $dst|$dst, $src}", []>,
                             Sched<[WriteShuffleLd]>, VEX, VEX_L;

let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX],
    ExeDomain = SSEPackedSingle in
def VBROADCASTF128rm : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
                             (ins f128mem:$src),
                             "vbroadcastf128\t{$src, $dst|$dst, $src}", []>,
                             Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L;

let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4f64 (X86SubVBroadcastld128 addr:$src)),
          (VBROADCASTF128rm addr:$src)>;
def : Pat<(v8f32 (X86SubVBroadcastld128 addr:$src)),
          (VBROADCASTF128rm addr:$src)>;
// NOTE: We're using FP instructions here, but execution domain fixing can
// convert to integer when profitable.
def : Pat<(v4i64 (X86SubVBroadcastld128 addr:$src)),
          (VBROADCASTF128rm addr:$src)>;
def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)),
          (VBROADCASTF128rm addr:$src)>;
def : Pat<(v16i16 (X86SubVBroadcastld128 addr:$src)),
          (VBROADCASTF128rm addr:$src)>;
def : Pat<(v16f16 (X86SubVBroadcastld128 addr:$src)),
          (VBROADCASTF128rm addr:$src)>;
def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)),
          (VBROADCASTF128rm addr:$src)>;
}

let Predicates = [HasAVXNECONVERT, NoVLX] in
  def : Pat<(v16bf16 (X86SubVBroadcastld128 addr:$src)),
            (VBROADCASTF128rm addr:$src)>;

//===----------------------------------------------------------------------===//
// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
//

let ExeDomain = SSEPackedSingle in {
let isCommutable = 1 in
def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR256:$src2, u8imm:$src3),
          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
          VEX, VVVV, VEX_L, Sched<[WriteFShuffle256]>;
def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
          "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
          VEX, VVVV, VEX_L, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>;
}

// Immediate transform to help with commuting.
// XORs the immediate with 0x22.
def Perm2XCommuteImm : SDNodeXForm<timm, [{
  return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N));
}]>;

/// vperm2x128_lowering - selection patterns mapping X86VPerm2x128 of type VT
/// onto the rr/rm forms of the instruction named by InstrStr.
multiclass vperm2x128_lowering<string InstrStr, ValueType VT, PatFrag memop_frag> {
  def : Pat<(VT (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 timm:$imm))),
            (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR256:$src2, timm:$imm)>;
  def : Pat<(VT (X86VPerm2x128 VR256:$src1, (memop_frag addr:$src2), (i8 timm:$imm))),
            (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2, timm:$imm)>;
  // Pattern with load in other operand.
  def : Pat<(VT (X86VPerm2x128 (memop_frag addr:$src2), VR256:$src1, (i8 timm:$imm))),
            (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
                                             (Perm2XCommuteImm timm:$imm))>;
}

let Predicates = [HasAVX] in {
  defm : vperm2x128_lowering<"VPERM2F128", v4f64, loadv4f64>;
  defm : vperm2x128_lowering<"VPERM2F128", v8f32, loadv8f32>;
}

let Predicates = [HasAVX1Only] in {
  defm : vperm2x128_lowering<"VPERM2F128", v4i64,  loadv4i64>;
  defm : vperm2x128_lowering<"VPERM2F128", v8i32,  loadv8i32>;
  defm : vperm2x128_lowering<"VPERM2F128", v16i16, loadv16i16>;
  defm : vperm2x128_lowering<"VPERM2F128", v16f16, loadv16f16>;
  defm : vperm2x128_lowering<"VPERM2F128", v32i8,  loadv32i8>;
}

//===----------------------------------------------------------------------===//
// VINSERTF128 - Insert packed floating-point values
//
let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR128:$src2, u8imm:$src3),
          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteFShuffle256]>, VEX, VVVV, VEX_L;
let mayLoad = 1 in
def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f128mem:$src2, u8imm:$src3),
          "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>, VEX, VVVV, VEX_L;
}

// To create a 256-bit all ones value, we should produce VCMPTRUEPS
// with YMM register containing zero.
// FIXME: Avoid producing vxorps to clear the fake inputs.
let Predicates = [HasAVX1Only] in {
def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>;
}

/// vinsert_lowering - selection patterns for inserting a 128-bit `From`
/// vector into a 256-bit `To` vector.
///   InstrStr - insert instruction used for the register and folded-"From"
///              forms.
///   PermStr  - perm2x128 instruction used when the "To" vector is the one
///              that comes from memory.
multiclass vinsert_lowering<string InstrStr, string PermStr,
                            ValueType From, ValueType To,
                            PatFrag frommemop_frag, PatFrag tomemop_frag> {
  def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2),
                                   (iPTR imm)),
            (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2,
                                       (INSERT_get_vinsert128_imm VR256:$ins))>;
  def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
                                    (From (frommemop_frag addr:$src2)),
                                    (iPTR imm)),
            (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
                                       (INSERT_get_vinsert128_imm VR256:$ins))>;
  // Folding "To" vector - convert to perm2x128 and commute inputs.
  def : Pat<(vinsert128_insert:$ins (To (tomemop_frag addr:$src1)),
                                    (From VR128:$src2),
                                    (iPTR imm)),
            (!cast<Instruction>(PermStr#rm)
              (INSERT_SUBREG (To (IMPLICIT_DEF)), VR128:$src2, sub_xmm),
              addr:$src1, (INSERT_get_vperm2x128_commutedimm VR256:$ins))>;
}

let Predicates = [HasAVX, NoVLX] in {
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v4f32, v8f32, loadv4f32, loadv8f32>;
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v2f64, v4f64, loadv2f64, loadv4f64>;
}

// One instantiation per integer/f16 element type. (A verbatim duplicate of
// the v16i8/v32i8 line, which only produced redundant identical patterns,
// has been removed.)
let Predicates = [HasAVX1Only] in {
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v2i64, v4i64,  loadv2i64, loadv4i64>;
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v4i32, v8i32,  loadv4i32, loadv8i32>;
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v8i16, v16i16, loadv8i16, loadv16i16>;
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v8f16, v16f16, loadv8f16, loadv16f16>;
  defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v16i8, v32i8,  loadv16i8, loadv32i8>;
}

//===----------------------------------------------------------------------===//
// VEXTRACTF128 - Extract packed floating-point values
//
let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
          (ins VR256:$src1, u8imm:$src2),
          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          []>, Sched<[WriteFShuffle256]>, VEX, VEX_L;
let mayStore = 1 in
def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
          (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
          "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
          []>, Sched<[WriteFStoreX]>, VEX, VEX_L;
}

/// vextract_lowering - selection patterns for extracting a 128-bit `To`
/// vector from a 256-bit `From` vector, either into a register (rr) or
/// directly into a store (mr).
multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
  def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
            (To (!cast<Instruction>(InstrStr#rr)
                                    (From VR256:$src1),
                                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
  def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1),
                                                 (iPTR imm))), addr:$dst),
            (!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1,
             (EXTRACT_get_vextract128_imm VR128:$ext))>;
}

// AVX1 patterns
let Predicates = [HasAVX, NoVLX] in {
  defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>;
  defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>;
}

// One instantiation per integer/f16 element type. (A verbatim duplicate of
// the v32i8/v16i8 line has been removed.)
let Predicates = [HasAVX1Only] in {
  defm : vextract_lowering<"VEXTRACTF128", v4i64,  v2i64>;
  defm : vextract_lowering<"VEXTRACTF128", v8i32,  v4i32>;
  defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
  defm : vextract_lowering<"VEXTRACTF128", v16f16, v8f16>;
  defm : vextract_lowering<"VEXTRACTF128", v32i8,  v16i8>;
}

//===----------------------------------------------------------------------===//
// VMASKMOV - Conditional SIMD Packed Loads and Stores
//
/// avx_movmask_rm - 128/256-bit masked load (rm, Yrm) and masked store
/// (mr, Ymr) forms, each selected directly from the given intrinsic.
///   opc_rm / opc_mr - opcodes of the load and store forms.
///   schedX / schedY - scheduling info for the 128-bit and 256-bit forms.
multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
                          Intrinsic IntLd, Intrinsic IntLd256,
                          Intrinsic IntSt, Intrinsic IntSt256,
                          X86SchedWriteMaskMove schedX,
                          X86SchedWriteMaskMove schedY> {
  def rm  : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, f128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
             VEX, VVVV, Sched<[schedX.RM]>;
  def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, f256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
             VEX, VVVV, VEX_L, Sched<[schedY.RM]>;
  def mr  : AVX8I<opc_mr, MRMDestMem, (outs),
             (ins f128mem:$dst, VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>,
             VEX, VVVV, Sched<[schedX.MR]>;
  def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
             (ins f256mem:$dst, VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
             VEX, VVVV, VEX_L, Sched<[schedY.MR]>;
}

let ExeDomain = SSEPackedSingle in
defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
                                 int_x86_avx_maskload_ps,
                                 int_x86_avx_maskload_ps_256,
                                 int_x86_avx_maskstore_ps,
                                 int_x86_avx_maskstore_ps_256,
                                 WriteFMaskMove32, WriteFMaskMove32Y>;
let ExeDomain = SSEPackedDouble in
defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
                                 int_x86_avx_maskload_pd,
                                 int_x86_avx_maskload_pd_256,
                                 int_x86_avx_maskstore_pd,
                                 int_x86_avx_maskstore_pd_256,
                                 WriteFMaskMove64, WriteFMaskMove64Y>;

//===----------------------------------------------------------------------===//
// AVX_VNNI
//===----------------------------------------------------------------------===//
/// avx_vnni_rm - 128/256-bit rr and rm forms of a VEX-encoded VNNI
/// instruction. $src1 is tied to $dst; IsCommutable controls isCommutable on
/// the register forms only.
let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI], Constraints = "$src1 = $dst",
    explicitOpPrefix = ExplicitVEX in
multiclass avx_vnni_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                       bit IsCommutable> {
  let isCommutable = IsCommutable in
  def rr  : AVX8I<opc, MRMSrcReg, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2, VR128:$src3),
             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
             [(set VR128:$dst, (v4i32 (OpNode VR128:$src1,
                                       VR128:$src2, VR128:$src3)))]>,
             VEX, VVVV, Sched<[SchedWriteVecIMul.XMM]>;

  def rm  : AVX8I<opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2, i128mem:$src3),
             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
             [(set VR128:$dst, (v4i32 (OpNode VR128:$src1, VR128:$src2,
                                      (loadv4i32 addr:$src3))))]>,
             VEX, VVVV, Sched<[SchedWriteVecIMul.XMM.Folded,
                               SchedWriteVecIMul.XMM.ReadAfterFold,
                               SchedWriteVecIMul.XMM.ReadAfterFold]>;

  let isCommutable = IsCommutable in
  def Yrr  : AVX8I<opc, MRMSrcReg, (outs VR256:$dst),
             (ins VR256:$src1, VR256:$src2, VR256:$src3),
             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
             [(set VR256:$dst, (v8i32 (OpNode VR256:$src1,
                                       VR256:$src2, VR256:$src3)))]>,
             VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;

  def Yrm  : AVX8I<opc, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, VR256:$src2, i256mem:$src3),
             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
             [(set VR256:$dst, (v8i32 (OpNode VR256:$src1, VR256:$src2,
                                      (loadv8i32 addr:$src3))))]>,
             VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM.Folded,
                                      SchedWriteVecIMul.YMM.ReadAfterFold,
                                      SchedWriteVecIMul.YMM.ReadAfterFold]>;
}

defm VPDPBUSD   : avx_vnni_rm<0x50, "vpdpbusd", X86Vpdpbusd, 0>;
defm VPDPBUSDS  : avx_vnni_rm<0x51, "vpdpbusds", X86Vpdpbusds, 0>;
defm VPDPWSSD   : avx_vnni_rm<0x52, "vpdpwssd",  X86Vpdpwssd, 1>;
defm VPDPWSSDS  : avx_vnni_rm<0x53, "vpdpwssds", X86Vpdpwssds, 1>;

// Select VPDPWSSD for an add whose second operand is an X86vpmaddwd_su.
let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI] in {
  def : Pat<(v8i32 (add VR256:$src1,
                        (X86vpmaddwd_su VR256:$src2, VR256:$src3))),
            (VPDPWSSDYrr VR256:$src1, VR256:$src2, VR256:$src3)>;
  def : Pat<(v8i32 (add VR256:$src1,
                        (X86vpmaddwd_su VR256:$src2, (load addr:$src3)))),
            (VPDPWSSDYrm VR256:$src1, VR256:$src2, addr:$src3)>;
  def : Pat<(v4i32 (add VR128:$src1,
                        (X86vpmaddwd_su VR128:$src2, VR128:$src3))),
            (VPDPWSSDrr VR128:$src1, VR128:$src2, VR128:$src3)>;
  def : Pat<(v4i32 (add VR128:$src1,
                        (X86vpmaddwd_su VR128:$src2, (load addr:$src3)))),
            (VPDPWSSDrm VR128:$src1, VR128:$src2, addr:$src3)>;
}

//===----------------------------------------------------------------------===//
// VPERMIL - Permute Single and Double Floating-Point Values
//

/// avx_permil - VPERMILPS/VPERMILPD definitions.
///   rr/rm - control taken from a register / loaded integer vector
///           (selects X86VPermilpv).
///   ri/mi - control taken from an 8-bit immediate (selects X86VPermilpi).
///   sched / varsched - scheduling classes for the immediate and the
///           variable-control forms respectively.
multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
                      RegisterClass RC, X86MemOperand x86memop_f,
                      X86MemOperand x86memop_i,
                      ValueType f_vt, ValueType i_vt,
                      X86FoldableSchedWrite sched,
                      X86FoldableSchedWrite varsched> {
  let Predicates = [HasAVX, NoVLX] in {
    def rr  : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
               (ins RC:$src1, RC:$src2),
               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX, VVVV,
               Sched<[varsched]>;
    def rm  : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
               (ins RC:$src1, x86memop_i:$src2),
               !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
                              (i_vt (load addr:$src2)))))]>, VEX, VVVV,
               Sched<[varsched.Folded, sched.ReadAfterFold]>;

    def ri  : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
             (ins RC:$src1, u8imm:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 timm:$src2))))]>, VEX,
             Sched<[sched]>;
    def mi  : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
             (ins x86memop_f:$src1, u8imm:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set RC:$dst,
               (f_vt (X86VPermilpi (load addr:$src1), (i8 timm:$src2))))]>, VEX,
             Sched<[sched.Folded]>;
  }// Predicates = [HasAVX, NoVLX]
}

let ExeDomain = SSEPackedSingle in {
  defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
                               v4f32, v4i32, SchedWriteFShuffle.XMM,
                               SchedWriteFVarShuffle.XMM>;
  defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
                               v8f32, v8i32, SchedWriteFShuffle.YMM,
                               SchedWriteFVarShuffle.YMM>, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
  defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
                               v2f64, v2i64, SchedWriteFShuffle.XMM,
                               SchedWriteFVarShuffle.XMM>;
  defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
                               v4f64, v4i64, SchedWriteFShuffle.YMM,
                               SchedWriteFVarShuffle.YMM>, VEX_L;
}

//===----------------------------------------------------------------------===//
// VZERO - Zero YMM registers
// Note: These instructions do not affect YMM16-YMM31.
//

// Both instructions share opcode 0x77 and differ only in VEX_L; both are
// modelled as defining YMM0-YMM15.
let SchedRW = [WriteSystem] in {
let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
            YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
  // Zero All YMM registers
  def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
                  [(int_x86_avx_vzeroall)]>, TB, VEX, VEX_L,
                  Requires<[HasAVX]>, WIG;

  // Zero Upper bits of YMM registers
  def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
                     [(int_x86_avx_vzeroupper)]>, TB, VEX,
                     Requires<[HasAVX]>, WIG;
} // Defs
} // SchedRW

//===----------------------------------------------------------------------===//
// Half precision conversion instructions
//

/// f16c_ph2ps - VCVTPH2PS register and load forms. The source register is
/// always VR128; RC is the destination class. The rm form carries no pattern
/// of its own; it is selected by the explicit Pats further down.
multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop,
                      X86FoldableSchedWrite sched> {
  def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
             "vcvtph2ps\t{$src, $dst|$dst, $src}",
             [(set RC:$dst, (X86any_cvtph2ps VR128:$src))]>,
             T8, PD, VEX, Sched<[sched]>;
  let hasSideEffects = 0, mayLoad = 1 in
  def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
             "vcvtph2ps\t{$src, $dst|$dst, $src}",
             []>, T8, PD, VEX, Sched<[sched.Folded]>;
}

/// f16c_ps2ph - VCVTPS2PH register and store forms. The destination register
/// is always VR128; RC is the source class. RR/MR are the scheduling classes
/// of the register and store forms. The mr form carries no pattern of its own.
multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
                      SchedWrite RR, SchedWrite MR> {
  def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
               (ins RC:$src1, i32u8imm:$src2),
               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
               [(set VR128:$dst, (X86any_cvtps2ph RC:$src1, timm:$src2))]>,
               TA, PD, VEX, Sched<[RR]>;
  let hasSideEffects = 0, mayStore = 1 in
  def mr : Ii8<0x1D, MRMDestMem, (outs),
               (ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
               TA, PD, VEX, Sched<[MR]>;
}

let Predicates = [HasF16C, NoVLX] in {
  defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>, SIMD_EXC;
  defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L, SIMD_EXC;
  defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH,
                               WriteCvtPS2PHSt>, SIMD_EXC;
  defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY,
                               WriteCvtPS2PHYSt>, VEX_L, SIMD_EXC;

  // Pattern match vcvtph2ps of a scalar i64 load.
  def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
            (VCVTPH2PSrm addr:$src)>;
  def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16
              (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
            (VCVTPH2PSrm addr:$src)>;
  def : Pat<(v8f32 (X86any_cvtph2ps (loadv8i16 addr:$src))),
            (VCVTPH2PSYrm addr:$src)>;

  // Fold a store of the low 64 bits (as f64 or i64) of the 128-bit result, or
  // of the whole v8i16 result of the 256-bit form, into the store form.
  def : Pat<(store (f64 (extractelt
                         (bc_v2f64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))),
                         (iPTR 0))), addr:$dst),
            (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
  def : Pat<(store (i64 (extractelt
                         (bc_v2i64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))),
                         (iPTR 0))), addr:$dst),
            (VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
  def : Pat<(store (v8i16 (X86any_cvtps2ph VR256:$src1, timm:$src2)), addr:$dst),
            (VCVTPS2PHYmr addr:$dst, VR256:$src1, timm:$src2)>;
}

//===----------------------------------------------------------------------===//
// AVX2 Instructions
//===----------------------------------------------------------------------===//

/// AVX2_blend_rmi - AVX2 blend with 8-bit immediate
/// commuteXForm rewrites the immediate when the load appears as the first
/// source and the operands have to be swapped to use the rmi form.
multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, X86FoldableSchedWrite sched,
                          RegisterClass RC,
                          X86MemOperand x86memop, SDNodeXForm commuteXForm> {
  let isCommutable = 1 in
  def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2, u8imm:$src3),
        !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>,
        Sched<[sched]>, VEX, VVVV;
  def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
        (ins RC:$src1, x86memop:$src2, u8imm:$src3),
        !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        [(set RC:$dst,
          (OpVT (OpNode RC:$src1, (load addr:$src2), timm:$src3)))]>,
        Sched<[sched.Folded, sched.ReadAfterFold]>, VEX, VVVV;

  // Pattern to commute if load is in first source.
  def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, timm:$src3)),
            (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
                                            (commuteXForm timm:$src3))>;
}

let Predicates = [HasAVX2] in {
defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
                               SchedWriteBlend.XMM, VR128, i128mem,
                               BlendCommuteImm4>;
defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
                                SchedWriteBlend.YMM, VR256, i256mem,
                                BlendCommuteImm8>, VEX_L;

// 64-bit element blends are selected as VPBLENDD with the immediate rescaled
// by the BlendScale* transforms.
def : Pat<(X86Blendi (v4i64 VR256:$src1), (v4i64 VR256:$src2), timm:$src3),
          (VPBLENDDYrri VR256:$src1, VR256:$src2, (BlendScaleImm4 timm:$src3))>;
def : Pat<(X86Blendi VR256:$src1, (loadv4i64 addr:$src2), timm:$src3),
          (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleImm4 timm:$src3))>;
def : Pat<(X86Blendi (loadv4i64 addr:$src2), VR256:$src1, timm:$src3),
          (VPBLENDDYrmi VR256:$src1, addr:$src2, (BlendScaleCommuteImm4 timm:$src3))>;

def : Pat<(X86Blendi (v2i64 VR128:$src1), (v2i64 VR128:$src2), timm:$src3),
          (VPBLENDDrri VR128:$src1, VR128:$src2, (BlendScaleImm2to4 timm:$src3))>;
def : Pat<(X86Blendi VR128:$src1, (loadv2i64 addr:$src2), timm:$src3),
          (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleImm2to4 timm:$src3))>;
def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
          (VPBLENDDrmi VR128:$src1, addr:$src2, (BlendScaleCommuteImm2to4 timm:$src3))>;
}

// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
// NOTE: We're using FP instructions here, but execution domain fixing should
// take care of using integer instructions when profitable.
let Predicates = [HasAVX] in {
// Register base vector: blend immediate 0xf.
def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v16f16 VR256:$src1), (v8f16 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
          (VBLENDPSYrri VR256:$src1,
                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src2, sub_xmm), 0xf)>;

// Loaded base vector: the operands are swapped, so the immediate is 0xf0.
def : Pat<(insert_subvector (loadv8i32 addr:$src2), (v4i32 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
def : Pat<(insert_subvector (loadv16f16 addr:$src2), (v8f16 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)),
          (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
                                       VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
}

//===----------------------------------------------------------------------===//
// VPBROADCAST - Load from memory and broadcast to all elements of the
//               destination operand
//
/// avx2_broadcast - VPBROADCAST{B,W,D,Q} in rr/rm (128-bit result) and
/// Yrr/Yrm (256-bit result) forms. The register source is always VR128;
/// bcast_frag is the broadcast-load fragment used by the memory forms and
/// prd is an extra predicate added next to HasAVX2.
multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
                          X86MemOperand x86memop, PatFrag bcast_frag,
                          ValueType OpVT128, ValueType OpVT256, Predicate prd> {
  let Predicates = [HasAVX2, prd] in {
    def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst,
                     (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
                    Sched<[SchedWriteShuffle.XMM]>, VEX;
    def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst,
                     (OpVT128 (bcast_frag addr:$src)))]>,
                    Sched<[SchedWriteShuffle.XMM.Folded]>, VEX;
    def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR256:$dst,
                      (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>,
                     Sched<[WriteShuffle256]>, VEX, VEX_L;
    // NOTE(review): Yrm reuses the XMM folded class although Yrr uses
    // WriteShuffle256 -- confirm this is intended.
    def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                     [(set VR256:$dst,
                      (OpVT256 (bcast_frag addr:$src)))]>,
                     Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L;

    // Provide aliases for broadcast from the same register class that
    // automatically does the extract.
    def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))),
              (!cast<Instruction>(NAME#"Yrr")
                  (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>;
  }
}

defm VPBROADCASTB  : avx2_broadcast<0x78, "vpbroadcastb", i8mem, X86VBroadcastld8,
                                    v16i8, v32i8, NoVLX_Or_NoBWI>;
defm VPBROADCASTW  : avx2_broadcast<0x79, "vpbroadcastw", i16mem, X86VBroadcastld16,
                                    v8i16, v16i16, NoVLX_Or_NoBWI>;
defm VPBROADCASTD  : avx2_broadcast<0x58, "vpbroadcastd", i32mem, X86VBroadcastld32,
                                    v4i32, v8i32, NoVLX>;
defm VPBROADCASTQ  : avx2_broadcast<0x59, "vpbroadcastq", i64mem, X86VBroadcastld64,
                                    v2i64, v4i64, NoVLX>;

let Predicates = [HasAVX2, NoVLX] in {
  // Provide fallback in case the load node that is used in the patterns above
  // is used by additional users, which prevents the pattern selection.
  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
            (VBROADCASTSSrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
            (VBROADCASTSSYrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>;
  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
            (VBROADCASTSDYrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
}

let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  // Broadcast from a GPR: widen to i32, move into an XMM register, then
  // broadcast.
  def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
            (VPBROADCASTBrr (VMOVDI2PDIrr
                             (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                                 GR8:$src, sub_8bit))))>;
  def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
            (VPBROADCASTBYrr (VMOVDI2PDIrr
                              (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                                  GR8:$src, sub_8bit))))>;

  def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
            (VPBROADCASTWrr (VMOVDI2PDIrr
                             (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                                 GR16:$src, sub_16bit))))>;
  def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
            (VPBROADCASTWYrr (VMOVDI2PDIrr
                              (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
                                                  GR16:$src, sub_16bit))))>;

  // f16 broadcasts are selected as VPBROADCASTW.
  def : Pat<(v8f16 (X86VBroadcastld16 addr:$src)),
            (VPBROADCASTWrm addr:$src)>;
  def : Pat<(v16f16 (X86VBroadcastld16 addr:$src)),
            (VPBROADCASTWYrm addr:$src)>;

  def : Pat<(v8f16 (X86VBroadcast (v8f16 VR128:$src))),
            (VPBROADCASTWrr VR128:$src)>;
  def : Pat<(v16f16 (X86VBroadcast (v8f16 VR128:$src))),
            (VPBROADCASTWYrr VR128:$src)>;

  def : Pat<(v8f16 (X86VBroadcast (f16 FR16:$src))),
            (VPBROADCASTWrr (COPY_TO_REGCLASS FR16:$src, VR128))>;
  def : Pat<(v16f16 (X86VBroadcast (f16 FR16:$src))),
            (VPBROADCASTWYrr (COPY_TO_REGCLASS FR16:$src, VR128))>;
}
let Predicates = [HasAVX2, NoVLX] in {
  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
            (VPBROADCASTDrr (VMOVDI2PDIrr GR32:$src))>;
  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
            (VPBROADCASTDYrr (VMOVDI2PDIrr GR32:$src))>;
  def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
            (VPBROADCASTQrr (VMOV64toPQIrr GR64:$src))>;
  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
            (VPBROADCASTQYrr (VMOV64toPQIrr GR64:$src))>;
}

// AVX1 broadcast patterns
let Predicates = [HasAVX1Only] in {
def : Pat<(v8i32 (X86VBroadcastld32 addr:$src)),
          (VBROADCASTSSYrm addr:$src)>;
def : Pat<(v4i64 (X86VBroadcastld64 addr:$src)),
          (VBROADCASTSDYrm addr:$src)>;
def : Pat<(v4i32 (X86VBroadcastld32 addr:$src)),
          (VBROADCASTSSrm addr:$src)>;
}

// Provide fallback in case the load node that is used in the patterns above
// is used by additional users, which prevents the pattern selection.
let Predicates = [HasAVX, NoVLX] in {
  // 128bit broadcasts:
  def : Pat<(v2f64 (X86VBroadcast f64:$src)),
            (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>;
  def : Pat<(v2f64 (X86VBroadcastld64 addr:$src)),
            (VMOVDDUPrm addr:$src)>;

  def : Pat<(v2f64 (X86VBroadcast v2f64:$src)),
            (VMOVDDUPrr VR128:$src)>;
}

// Without AVX2, register broadcasts are built from an in-lane splat
// (VPERMILPS / VMOVDDUP / VPSHUFD); the 256-bit results place that splat in
// both halves with VINSERTF128.
let Predicates = [HasAVX1Only] in {
  def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
            (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>;
  def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
              (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm),
              (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>;
  def : Pat<(v8f32 (X86VBroadcast v4f32:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
              (v4f32 (VPERMILPSri VR128:$src, 0)), sub_xmm),
              (v4f32 (VPERMILPSri VR128:$src, 0)), 1)>;
  def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
              (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm),
              (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>;
  def : Pat<(v4f64 (X86VBroadcast v2f64:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
              (v2f64 (VMOVDDUPrr VR128:$src)), sub_xmm),
              (v2f64 (VMOVDDUPrr VR128:$src)), 1)>;

  def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
            (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)>;
  def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
              (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), sub_xmm),
              (v4i32 (VPSHUFDri (VMOVDI2PDIrr GR32:$src), 0)), 1)>;
  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
            (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
              (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), sub_xmm),
              (v4i32 (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)), 1)>;

  def : Pat<(v2i64 (X86VBroadcast i64:$src)),
            (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)>;
  def : Pat<(v2i64 (X86VBroadcastld64 addr:$src)),
            (VMOVDDUPrm addr:$src)>;
}

//===----------------------------------------------------------------------===//
// VPERM - Permute instructions
//

/// avx2_perm - 256-bit variable permutes (register and load forms) selecting
/// X86VPermv with $src1 and $src2 in instruction operand order.
multiclass avx2_perm<bits<8> opc, string OpcodeStr,
                     ValueType OpVT, X86FoldableSchedWrite Sched,
                     X86MemOperand memOp> {
  let Predicates = [HasAVX2, NoVLX] in {
    def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
                     (ins VR256:$src1, VR256:$src2),
                     !strconcat(OpcodeStr,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR256:$dst,
                       (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
                     Sched<[Sched]>, VEX, VVVV, VEX_L;
    def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
                     (ins VR256:$src1, memOp:$src2),
                     !strconcat(OpcodeStr,
                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set VR256:$dst,
                       (OpVT (X86VPermv VR256:$src1,
                              (load addr:$src2))))]>,
                     Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VVVV, VEX_L;
  }
}

defm VPERMD : avx2_perm<0x36, "vpermd", v8i32, WriteVarShuffle256, i256mem>;
let ExeDomain = SSEPackedSingle in
defm VPERMPS : avx2_perm<0x16, "vpermps", v8f32, WriteFVarShuffle256, f256mem>;

/// avx2_perm_imm - 256-bit immediate-controlled permutes (register and load
/// forms) selecting X86VPermi; mem_frag is the load fragment of the Ymi form.
multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                         ValueType OpVT, X86FoldableSchedWrite Sched,
                         X86MemOperand memOp> {
  let Predicates = [HasAVX2, NoVLX] in {
    def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
                       (ins VR256:$src1, u8imm:$src2),
                       !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
                         (OpVT (X86VPermi VR256:$src1, (i8 timm:$src2))))]>,
                       Sched<[Sched]>, VEX, VEX_L;
    def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
                       (ins memOp:$src1, u8imm:$src2),
                       !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
                         (OpVT (X86VPermi (mem_frag addr:$src1),
                                (i8 timm:$src2))))]>,
                       Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VEX_L;
  }
}

defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
                            WriteShuffle256, i256mem>, REX_W;
let ExeDomain = SSEPackedDouble in
defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
                             WriteFShuffle256, f256mem>, REX_W;

//===----------------------------------------------------------------------===//
// VPERM2I128 - Permute Integer vector Values in 128-bit chunks
//
// Neither form carries a pattern; selection goes through vperm2x128_lowering.
let isCommutable = 1 in
def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR256:$src2, u8imm:$src3),
          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
          Sched<[WriteShuffle256]>, VEX, VVVV, VEX_L;
def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, f256mem:$src2, u8imm:$src3),
          "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>,
          Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX, VVVV, VEX_L;

// One instantiation per 256-bit integer/f16 vector type.
let Predicates = [HasAVX2] in {
  defm : vperm2x128_lowering<"VPERM2I128", v4i64,  loadv4i64>;
  defm : vperm2x128_lowering<"VPERM2I128", v8i32,  loadv8i32>;
  defm : vperm2x128_lowering<"VPERM2I128", v16i16, loadv16i16>;
  defm : vperm2x128_lowering<"VPERM2I128", v16f16, loadv16f16>;
  defm : vperm2x128_lowering<"VPERM2I128", v32i8,  loadv32i8>;
}

//===----------------------------------------------------------------------===//
// VINSERTI128 - Insert packed integer values
//
// Neither form carries a pattern; selection goes through vinsert_lowering.
let hasSideEffects = 0 in {
def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
          (ins VR256:$src1, VR128:$src2, u8imm:$src3),
          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteShuffle256]>, VEX, VVVV, VEX_L;
let mayLoad = 1 in
def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
          (ins VR256:$src1, i128mem:$src2, u8imm:$src3),
          "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
          []>, Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX, VVVV, VEX_L;
}

// One instantiation per 128-bit/256-bit integer/f16 vector type pair.
let Predicates = [HasAVX2, NoVLX] in {
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v2i64, v4i64,  loadv2i64, loadv4i64>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v4i32, v8i32,  loadv4i32, loadv8i32>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8i16, v16i16, loadv8i16, loadv16i16>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8f16, v16f16, loadv8f16, loadv16f16>;
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v16i8, v32i8,  loadv16i8, loadv32i8>;
}

let Predicates = [HasAVXNECONVERT, NoVLX] in
  defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8bf16, v16bf16, loadv8bf16, loadv16bf16>;

//===----------------------------------------------------------------------===//
// VEXTRACTI128 - Extract packed integer values
//
// Neither form carries a pattern; selection goes through vextract_lowering.
def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
          (ins VR256:$src1, u8imm:$src2),
          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
          Sched<[WriteShuffle256]>, VEX, VEX_L;
let hasSideEffects = 0, mayStore = 1 in
def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
          (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
          "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
          Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_L;

// One instantiation per 256-bit/128-bit integer/f16 vector type pair.
let Predicates = [HasAVX2, NoVLX] in {
  defm : vextract_lowering<"VEXTRACTI128", v4i64,  v2i64>;
  defm : vextract_lowering<"VEXTRACTI128", v8i32,  v4i32>;
  defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
  defm : vextract_lowering<"VEXTRACTI128", v16f16, v8f16>;
  defm : vextract_lowering<"VEXTRACTI128", v32i8,  v16i8>;
}

let Predicates = [HasAVXNECONVERT, NoVLX] in
  defm : vextract_lowering<"VEXTRACTI128", v16bf16, v8bf16>;

//===----------------------------------------------------------------------===//
// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
//
/// avx2_pmovmask - VPMASKMOV{D,Q} masked loads (rm/Yrm, opcode 0x8c) and
/// masked stores (mr/Ymr, opcode 0x8e), selected through the given
/// intrinsics. In the load forms $src1 is the register operand passed to the
/// intrinsic after the address.
multiclass avx2_pmovmask<string OpcodeStr,
                         Intrinsic IntLd128, Intrinsic IntLd256,
                         Intrinsic IntSt128, Intrinsic IntSt256,
                         X86SchedWriteMaskMove schedX,
                         X86SchedWriteMaskMove schedY> {
  def rm  : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>,
             VEX, VVVV, Sched<[schedX.RM]>;
  def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, i256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
             VEX, VVVV, VEX_L, Sched<[schedY.RM]>;
  def mr  : AVX28I<0x8e, MRMDestMem, (outs),
             (ins i128mem:$dst, VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>,
             VEX, VVVV, Sched<[schedX.MR]>;
  def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
             (ins i256mem:$dst, VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
             VEX, VVVV, VEX_L, Sched<[schedY.MR]>;
}

defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
                                int_x86_avx2_maskload_d,
                                int_x86_avx2_maskload_d_256,
                                int_x86_avx2_maskstore_d,
                                int_x86_avx2_maskstore_d_256,
                                WriteVecMaskMove32, WriteVecMaskMove32Y>;
defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
                                int_x86_avx2_maskload_q,
                                int_x86_avx2_maskload_q_256,
                                int_x86_avx2_maskstore_q,
                                int_x86_avx2_maskstore_q_256,
                                WriteVecMaskMove64, WriteVecMaskMove64Y>, REX_W;

/// maskmov_lowering - Select generic masked_store / masked_load nodes of type
/// VT (mask type MaskVT) as the "mr" / "rm" forms of instruction InstrStr.
/// Only loads whose pass-through value is undef or all-zeros are matched.
multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
                            ValueType MaskVT> {
    // masked store
    def: Pat<(masked_store (VT RC:$src), addr:$ptr, (MaskVT RC:$mask)),
             (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
    // masked load
    def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask), undef)),
             (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
    def: Pat<(VT (masked_load addr:$ptr, (MaskVT RC:$mask),
                              (VT immAllZerosV))),
             (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
}
let Predicates = [HasAVX] in {
  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64>;
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64>;
}
let Predicates = [HasAVX1Only] in {
  // i32/i64 masked load/store is not supported with AVX1 only; use the
  // ps/pd versions instead.
  defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32>;
  defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64>;
  defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32>;
  defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64>;
}
let Predicates = [HasAVX2] in {
  defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32>;
  defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64>;
  defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32>;
  defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64>;
}

//===----------------------------------------------------------------------===//
// Variable Bit Shifts
//
/// avx2_var_shift - Per-element variable shifts in 128-bit (rr/rm) and
/// 256-bit (Yrr/Yrm) forms; $src2 supplies the shift amounts.
multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType vt128, ValueType vt256> {
  def rr  : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst,
               (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
             VEX, VVVV, Sched<[SchedWriteVarVecShift.XMM]>;
  def rm  : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, i128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst,
               (vt128 (OpNode VR128:$src1,
                       (vt128 (load addr:$src2)))))]>,
             VEX, VVVV, Sched<[SchedWriteVarVecShift.XMM.Folded,
                               SchedWriteVarVecShift.XMM.ReadAfterFold]>;
  def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
             (ins VR256:$src1, VR256:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst,
               (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
             VEX, VVVV, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>;
  def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, i256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst,
               (vt256 (OpNode VR256:$src1,
                       (vt256 (load addr:$src2)))))]>,
             VEX, VVVV, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded,
                                      SchedWriteVarVecShift.YMM.ReadAfterFold]>;
}

let Predicates = [HasAVX2, NoVLX] in {
  defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", X86vshlv, v4i32, v8i32>;
  defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", X86vshlv, v2i64, v4i64>, REX_W;
  defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", X86vsrlv, v4i32, v8i32>;
  defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", X86vsrlv, v2i64, v4i64>, REX_W;
  defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", X86vsrav, v4i32, v8i32>;
}

//===----------------------------------------------------------------------===//
// VGATHER - GATHER Operations

// FIXME: Improve scheduling of gather instructions.
/// avx2_gather - Gather definitions without patterns. Each form has a second
/// output, $mask_wb, alongside $dst; the tie and early-clobber constraints
/// are supplied by the enclosing `let` at the instantiation site.
multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
                       X86MemOperand memop128, X86MemOperand memop256> {
let mayLoad = 1, hasSideEffects = 0 in {
  def rm  : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
            (ins VR128:$src1, memop128:$src2, VR128:$mask),
            !strconcat(OpcodeStr,
              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
            []>, VEX, Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
  def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
            (ins RC256:$src1, memop256:$src2, RC256:$mask),
            !strconcat(OpcodeStr,
              "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
            []>, VEX, VEX_L, Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
}
}

let Predicates = [HasAVX2] in {
  // $dst and $mask_wb are early-clobber and tied to $src1 and $mask.
  let mayLoad = 1, hasSideEffects = 0, Constraints
    = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
    in {
    defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq",
                                  VR256, vx128mem, vx256mem>, REX_W;
    defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq",
                                  VR256, vx128mem, vy256mem>, REX_W;
    defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd",
                                  VR256, vx128mem, vy256mem>;
    defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd",
                                  VR128, vx64mem, vy128mem>;

    let ExeDomain = SSEPackedDouble in {
      defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd",
                                    VR256, vx128mem, vx256mem>, REX_W;
      defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd",
                                    VR256, vx128mem, vy256mem>, REX_W;
    }

    let ExeDomain = SSEPackedSingle in {
      defm VGATHERDPS : avx2_gather<0x92, "vgatherdps",
                                    VR256, vx128mem, vy256mem>;
      defm VGATHERQPS : avx2_gather<0x93, "vgatherqps",
                                    VR128, vx64mem, vy128mem>;
    }
  }
}

//===----------------------------------------------------------------------===//
// GFNI instructions
//===----------------------------------------------------------------------===//

/// GF2P8MULB_rm - GF2P8MULB register and load forms. The assembly string is
/// installed through AsmString: Is2Addr selects the two-operand syntax,
/// otherwise the three-operand syntax is used.
multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
                        RegisterClass RC, PatFrag MemOpFrag,
                        X86MemOperand X86MemOp, X86FoldableSchedWrite sched,
                        bit Is2Addr = 0> {
  let ExeDomain = SSEPackedInt,
      AsmString = !if(Is2Addr,
        OpcodeStr#"\t{$src2, $dst|$dst, $src2}",
        OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
    let isCommutable = 1 in
    def rr : PDI<0xCF, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
                 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
             Sched<[sched]>, T8;

    def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
                 [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1,
                                 (MemOpFrag addr:$src2))))]>,
             Sched<[sched.Folded, sched.ReadAfterFold]>, T8;
  }
}

/// GF2P8AFFINE_rmi - GF2P8AFFINE*QB register and load forms with an 8-bit
/// immediate. Is2Addr selects the two-address assembly syntax.
multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
                           SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag,
                           X86MemOperand X86MemOp, X86FoldableSchedWrite sched,
                           bit Is2Addr = 0> {
  let AsmString = !if(Is2Addr,
      OpStr#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
      OpStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in {
  def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst),
              (ins RC:$src1, RC:$src2, u8imm:$src3), "",
              [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))],
              SSEPackedInt>, Sched<[sched]>;
  def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst),
              (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
              [(set RC:$dst, (OpVT (OpNode RC:$src1,
                                    (MemOpFrag addr:$src2),
                              timm:$src3)))], SSEPackedInt>,
              Sched<[sched.Folded, sched.ReadAfterFold]>;
  }
}

/// GF2P8AFFINE_common - Instantiate the SSE (two-address, NAME), AVX 128-bit
/// (V#NAME) and AVX 256-bit (V#NAME#Y) variants of one affine instruction.
multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
  let Constraints = "$src1 = $dst",
      Predicates  = [HasGFNI, UseSSE2] in
  defm NAME         : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
                                      VR128, load, i128mem, SchedWriteVecIMul.XMM, 1>;
  let Predicates  = [HasGFNI, HasAVX, NoVLX] in {
    defm V#NAME    : GF2P8AFFINE_rmi<Op, "v"#OpStr, v16i8, OpNode, VR128,
                                     load, i128mem, SchedWriteVecIMul.XMM>,
                                     VEX, VVVV, REX_W;
    defm V#NAME#Y : GF2P8AFFINE_rmi<Op, "v"#OpStr, v32i8, OpNode, VR256,
                                     load, i256mem, SchedWriteVecIMul.YMM>,
                                     VEX, VVVV, VEX_L, REX_W;
  }
}

// GF2P8MULB
let Constraints = "$src1 = $dst",
    Predicates  = [HasGFNI, UseSSE2] in
defm GF2P8MULB      : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop,
                                    i128mem, SchedWriteVecALU.XMM, 1>;
let Predicates  = [HasGFNI, HasAVX, NoVLX] in {
  defm VGF2P8MULB   : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load,
                                   i128mem, SchedWriteVecALU.XMM>, VEX, VVVV;
  defm VGF2P8MULBY  : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load,
                                   i256mem, SchedWriteVecALU.YMM>, VEX, VVVV, VEX_L;
}
// GF2P8AFFINEINVQB, GF2P8AFFINEQB
let isCommutable = 0 in {
  defm GF2P8AFFINEINVQB : GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb",
                                             X86GF2P8affineinvqb>, TA, PD;
  defm GF2P8AFFINEQB    : GF2P8AFFINE_common<0xCE, "gf2p8affineqb",
                                             X86GF2P8affineqb>, TA, PD;
}

// AVX-IFMA
/// avx_ifma_rm - VEX-encoded VPMADD52{L,H}UQ in 128-bit (rr/rm) and 256-bit
/// (Yrr/Yrm) forms. $src1 is tied to $dst and is therefore not printed.
let Predicates = [HasAVXIFMA, NoVLX_Or_NoIFMA], Constraints = "$src1 = $dst" in
multiclass avx_ifma_rm<bits<8> opc, string OpcodeStr, SDNode OpNode> {
  // NOTE: The SDNodes have the multiply operands first with the add last.
  // This enables commuted load patterns to be autogenerated by tablegen.
  let isCommutable = 1 in {
    def rr  : AVX8I<opc, MRMSrcReg, (outs VR128:$dst),
               (ins VR128:$src1, VR128:$src2, VR128:$src3),
               !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
               [(set VR128:$dst, (v2i64 (OpNode VR128:$src2,
                                         VR128:$src3, VR128:$src1)))]>,
               VEX, VVVV, Sched<[SchedWriteVecIMul.XMM]>;
  }
  // Load-folding form: use the folded write class and one ReadAfterFold per
  // register source, matching avx_vnni_rm, which has the same operand layout.
  def rm  : AVX8I<opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2, i128mem:$src3),
             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
             [(set VR128:$dst, (v2i64 (OpNode VR128:$src2,
                                       (loadv2i64 addr:$src3), VR128:$src1)))]>,
             VEX, VVVV, Sched<[SchedWriteVecIMul.XMM.Folded,
                               SchedWriteVecIMul.XMM.ReadAfterFold,
                               SchedWriteVecIMul.XMM.ReadAfterFold]>;
  let isCommutable = 1 in {
    def Yrr  : AVX8I<opc, MRMSrcReg, (outs VR256:$dst),
               (ins VR256:$src1, VR256:$src2, VR256:$src3),
               !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
               [(set VR256:$dst, (v4i64 (OpNode VR256:$src2,
                                         VR256:$src3, VR256:$src1)))]>,
               VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;
  }
  // 256-bit load-folding form; scheduling as for rm above.
  def Yrm  : AVX8I<opc, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, VR256:$src2, i256mem:$src3),
             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
             [(set VR256:$dst, (v4i64 (OpNode VR256:$src2,
                                       (loadv4i64 addr:$src3), VR256:$src1)))]>,
             VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM.Folded,
                                      SchedWriteVecIMul.YMM.ReadAfterFold,
                                      SchedWriteVecIMul.YMM.ReadAfterFold]>;
}

defm VPMADD52HUQ : avx_ifma_rm<0xb5, "vpmadd52huq", x86vpmadd52h>, REX_W, ExplicitVEXPrefix;
defm VPMADD52LUQ : avx_ifma_rm<0xb4, "vpmadd52luq", x86vpmadd52l>, REX_W, ExplicitVEXPrefix;

// AVX-VNNI-INT8
let Constraints = "$src1 = $dst" in
multiclass avx_dotprod_rm<bits<8> Opc, string OpcodeStr, ValueType OpVT,
                          RegisterClass RC, PatFrag MemOpFrag,
                          X86MemOperand X86memop, SDNode OpNode,
X86FoldableSchedWrite Sched, 8231 bit IsCommutable> { 8232 let isCommutable = IsCommutable in 8233 def rr : I<Opc, MRMSrcReg, (outs RC:$dst), 8234 (ins RC:$src1, RC:$src2, RC:$src3), 8235 !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 8236 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, 8237 VEX, VVVV, Sched<[Sched]>; 8238 def rm : I<Opc, MRMSrcMem, (outs RC:$dst), 8239 (ins RC:$src1, RC:$src2, X86memop:$src3), 8240 !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 8241 [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, 8242 (MemOpFrag addr:$src3))))]>, 8243 VEX, VVVV, Sched<[Sched.Folded, Sched.ReadAfterFold]>; 8244} 8245 8246let Predicates = [HasAVXVNNIINT8] in { 8247 defm VPDPBSSD : avx_dotprod_rm<0x50,"vpdpbssd", v4i32, VR128, loadv4i32, 8248 i128mem, X86vpdpbssd, SchedWriteVecIMul.XMM, 8249 1>, T8, XD; 8250 defm VPDPBSSDY : avx_dotprod_rm<0x50,"vpdpbssd", v8i32, VR256, loadv8i32, 8251 i256mem, X86vpdpbssd, SchedWriteVecIMul.YMM, 8252 1>, VEX_L, T8, XD; 8253 defm VPDPBUUD : avx_dotprod_rm<0x50,"vpdpbuud", v4i32, VR128, loadv4i32, 8254 i128mem, X86vpdpbuud, SchedWriteVecIMul.XMM, 8255 1>, T8; 8256 defm VPDPBUUDY : avx_dotprod_rm<0x50,"vpdpbuud", v8i32, VR256, loadv8i32, 8257 i256mem, X86vpdpbuud, SchedWriteVecIMul.YMM, 8258 1>, VEX_L, T8; 8259 defm VPDPBSSDS : avx_dotprod_rm<0x51,"vpdpbssds", v4i32, VR128, loadv4i32, 8260 i128mem, X86vpdpbssds, SchedWriteVecIMul.XMM, 8261 1>, T8, XD; 8262 defm VPDPBSSDSY : avx_dotprod_rm<0x51,"vpdpbssds", v8i32, VR256, loadv8i32, 8263 i256mem, X86vpdpbssds, SchedWriteVecIMul.YMM, 8264 1>, VEX_L, T8, XD; 8265 defm VPDPBUUDS : avx_dotprod_rm<0x51,"vpdpbuuds", v4i32, VR128, loadv4i32, 8266 i128mem, X86vpdpbuuds, SchedWriteVecIMul.XMM, 8267 1>, T8; 8268 defm VPDPBUUDSY : avx_dotprod_rm<0x51,"vpdpbuuds", v8i32, VR256, loadv8i32, 8269 i256mem, X86vpdpbuuds, SchedWriteVecIMul.YMM, 8270 1>, VEX_L, T8; 8271 defm VPDPBSUD : avx_dotprod_rm<0x50,"vpdpbsud", v4i32, VR128, loadv4i32, 8272 
i128mem, X86vpdpbsud, SchedWriteVecIMul.XMM, 8273 0>, T8, XS; 8274 defm VPDPBSUDY : avx_dotprod_rm<0x50,"vpdpbsud", v8i32, VR256, loadv8i32, 8275 i256mem, X86vpdpbsud, SchedWriteVecIMul.YMM, 8276 0>, VEX_L, T8, XS; 8277 defm VPDPBSUDS : avx_dotprod_rm<0x51,"vpdpbsuds", v4i32, VR128, loadv4i32, 8278 i128mem, X86vpdpbsuds, SchedWriteVecIMul.XMM, 8279 0>, T8, XS; 8280 defm VPDPBSUDSY : avx_dotprod_rm<0x51,"vpdpbsuds", v8i32, VR256, loadv8i32, 8281 i256mem, X86vpdpbsuds, SchedWriteVecIMul.YMM, 8282 0>, VEX_L, T8, XS; 8283} 8284 8285// AVX-NE-CONVERT 8286multiclass AVX_NE_CONVERT_BASE<bits<8> Opcode, string OpcodeStr, 8287 X86MemOperand MemOp128, X86MemOperand MemOp256> { 8288 def rm : I<Opcode, MRMSrcMem, (outs VR128:$dst), (ins MemOp128:$src), 8289 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 8290 [(set VR128:$dst, 8291 (!cast<Intrinsic>("int_x86_"#OpcodeStr#"128") addr:$src))]>, 8292 Sched<[WriteCvtPH2PS]>, VEX; 8293 def Yrm : I<Opcode, MRMSrcMem, (outs VR256:$dst), (ins MemOp256:$src), 8294 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), 8295 [(set VR256:$dst, 8296 (!cast<Intrinsic>("int_x86_"#OpcodeStr#"256") addr:$src))]>, 8297 Sched<[WriteCvtPH2PSY]>, VEX, VEX_L; 8298} 8299 8300multiclass VCVTNEPS2BF16_BASE { 8301 def rr : I<0x72, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), 8302 "vcvtneps2bf16\t{$src, $dst|$dst, $src}", 8303 [(set VR128:$dst, (int_x86_vcvtneps2bf16128 VR128:$src))]>, 8304 Sched<[WriteCvtPH2PS]>; 8305 def rm : I<0x72, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), 8306 "vcvtneps2bf16{x}\t{$src, $dst|$dst, $src}", 8307 [(set VR128:$dst, (int_x86_vcvtneps2bf16128 (loadv4f32 addr:$src)))]>, 8308 Sched<[WriteCvtPH2PS]>; 8309 def Yrr : I<0x72, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), 8310 "vcvtneps2bf16\t{$src, $dst|$dst, $src}", 8311 [(set VR128:$dst, (int_x86_vcvtneps2bf16256 VR256:$src))]>, 8312 Sched<[WriteCvtPH2PSY]>, VEX_L; 8313 def Yrm : I<0x72, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), 8314 
"vcvtneps2bf16{y}\t{$src, $dst|$dst, $src}", 8315 [(set VR128:$dst, (int_x86_vcvtneps2bf16256 (loadv8f32 addr:$src)))]>, 8316 Sched<[WriteCvtPH2PSY]>, VEX_L; 8317} 8318 8319let Predicates = [HasAVXNECONVERT] in { 8320 defm VBCSTNEBF162PS : AVX_NE_CONVERT_BASE<0xb1, "vbcstnebf162ps", f16mem, 8321 f16mem>, T8, XS; 8322 defm VBCSTNESH2PS : AVX_NE_CONVERT_BASE<0xb1, "vbcstnesh2ps", f16mem, f16mem>, 8323 T8, PD; 8324 defm VCVTNEEBF162PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneebf162ps", f128mem, 8325 f256mem>, T8, XS; 8326 defm VCVTNEEPH2PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneeph2ps", f128mem, 8327 f256mem>, T8, PD; 8328 defm VCVTNEOBF162PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneobf162ps", f128mem, 8329 f256mem>, T8, XD; 8330 defm VCVTNEOPH2PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneoph2ps", f128mem, 8331 f256mem>, T8; 8332 defm VCVTNEPS2BF16 : VCVTNEPS2BF16_BASE, VEX, T8, XS, ExplicitVEXPrefix; 8333 8334 def : Pat<(v8bf16 (X86vfpround (v8f32 VR256:$src))), 8335 (VCVTNEPS2BF16Yrr VR256:$src)>; 8336 def : Pat<(v8bf16 (X86vfpround (loadv8f32 addr:$src))), 8337 (VCVTNEPS2BF16Yrm addr:$src)>; 8338} 8339 8340def : InstAlias<"vcvtneps2bf16x\t{$src, $dst|$dst, $src}", 8341 (VCVTNEPS2BF16rr VR128:$dst, VR128:$src), 0, "att">; 8342def : InstAlias<"vcvtneps2bf16y\t{$src, $dst|$dst, $src}", 8343 (VCVTNEPS2BF16Yrr VR128:$dst, VR256:$src), 0, "att">; 8344 8345// FIXME: Is there a better scheduler class for SHA512 than WriteVecIMul? 
// SHA512 instructions. All three are register-only forms operating on a
// VR256 destination that is tied to $src1; each is selected through its
// int_x86_vsha512* intrinsic and scheduled as WriteVecIMul.
let Predicates = [HasSHA512], Constraints = "$src1 = $dst" in {

// vsha512msg1: second source is a VR128 register.
def VSHA512MSG1rr : I<0xcc, MRMSrcReg,
                      (outs VR256:$dst),
                      (ins VR256:$src1, VR128:$src2),
                      "vsha512msg1\t{$src2, $dst|$dst, $src2}",
                      [(set VR256:$dst,
                        (int_x86_vsha512msg1 VR256:$src1, VR128:$src2))]>,
                    VEX_L, VEX, T8, XD, Sched<[WriteVecIMul]>;

// vsha512msg2: second source is a VR256 register.
def VSHA512MSG2rr : I<0xcd, MRMSrcReg,
                      (outs VR256:$dst),
                      (ins VR256:$src1, VR256:$src2),
                      "vsha512msg2\t{$src2, $dst|$dst, $src2}",
                      [(set VR256:$dst,
                        (int_x86_vsha512msg2 VR256:$src1, VR256:$src2))]>,
                    VEX_L, VEX, T8, XD, Sched<[WriteVecIMul]>;

// vsha512rnds2: three sources (VR256, VR256, VR128); uses VVVV for $src2.
def VSHA512RNDS2rr : I<0xcb, MRMSrcReg,
                       (outs VR256:$dst),
                       (ins VR256:$src1, VR256:$src2, VR128:$src3),
                       "vsha512rnds2\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                       [(set VR256:$dst,
                         (int_x86_vsha512rnds2 VR256:$src1, VR256:$src2,
                                               VR128:$src3))]>,
                     VEX_L, VEX, VVVV, T8, XD, Sched<[WriteVecIMul]>;
}

// FIXME: Is there a better scheduler class for SM3 than WriteVecIMul?
let Predicates = [HasSM3], Constraints = "$src1 = $dst" in {
  /// SM3_Base - Register (rr) and memory (rm) forms of a three-source SM3
  /// message instruction, opcode 0xda. The pattern uses the intrinsic named
  /// "int_x86_" # OpStr, looked up with !cast.
  multiclass SM3_Base<string OpStr> {
    def rr : I<0xda, MRMSrcReg, (outs VR128:$dst),
               (ins VR128:$src1, VR128:$src2, VR128:$src3),
               !strconcat(OpStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
               [(set VR128:$dst,
                (!cast<Intrinsic>("int_x86_"#OpStr) VR128:$src1,
                VR128:$src2, VR128:$src3))]>,
               Sched<[WriteVecIMul]>, VEX, VVVV;
    // The memory form uses the folded-load variant of the scheduling class
    // so that the load is part of the modelled cost.
    def rm : I<0xda, MRMSrcMem, (outs VR128:$dst),
               (ins VR128:$src1, VR128:$src2, i128mem:$src3),
               !strconcat(OpStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
               [(set VR128:$dst,
                (!cast<Intrinsic>("int_x86_"#OpStr) VR128:$src1,
                VR128:$src2, (loadv4i32 addr:$src3)))]>,
               Sched<[WriteVecIMul.Folded, WriteVecIMul.ReadAfterFold]>,
               VEX, VVVV;
  }

  /// VSM3RNDS2_Base - Register (rr) and memory (rm) forms of vsm3rnds2,
  /// opcode 0xde, which takes an additional 8-bit immediate ($src4).
  multiclass VSM3RNDS2_Base {
    def rr : Ii8<0xde, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src1, VR128:$src2, VR128:$src3, i32u8imm:$src4),
                 "vsm3rnds2\t{$src4, $src3, $src2, $dst|$dst, $src2, $src3, $src4}",
                 [(set VR128:$dst,
                  (int_x86_vsm3rnds2 VR128:$src1,
                   VR128:$src2, VR128:$src3, timm:$src4))]>,
                 Sched<[WriteVecIMul]>;
    // Folded-load scheduling class for the memory form, as in SM3_Base.
    def rm : Ii8<0xde, MRMSrcMem, (outs VR128:$dst),
                 (ins VR128:$src1, VR128:$src2, i128mem:$src3, i32u8imm:$src4),
                 "vsm3rnds2\t{$src4, $src3, $src2, $dst|$dst, $src2, $src3, $src4}",
                 [(set VR128:$dst,
                  (int_x86_vsm3rnds2 VR128:$src1,
                   VR128:$src2, (loadv4i32 addr:$src3), timm:$src4))]>,
                 Sched<[WriteVecIMul.Folded, WriteVecIMul.ReadAfterFold]>;
  }
}

defm VSM3MSG1 : SM3_Base<"vsm3msg1">, T8;
defm VSM3MSG2 : SM3_Base<"vsm3msg2">, T8, PD;
defm VSM3RNDS2 : VSM3RNDS2_Base, VEX, VVVV, TA, PD;

// FIXME: Is there a better scheduler class for SM4 than WriteVecIMul?
let Predicates = [HasSM4] in {
  /// SM4_Base - Register (rr) and memory (rm) forms of a two-source SM4
  /// instruction, opcode 0xda. `VL` is appended to "int_x86_" # OpStr to form
  /// the intrinsic name; `LD` is the load fragment used by the rm pattern.
  multiclass SM4_Base<string OpStr, RegisterClass RC, string VL,
                      PatFrag LD, X86MemOperand MemOp> {
    def rr : I<0xda, MRMSrcReg, (outs RC:$dst),
               (ins RC:$src1, RC:$src2),
               !strconcat(OpStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set RC:$dst, (!cast<Intrinsic>("int_x86_"#OpStr#VL) RC:$src1,
                               RC:$src2))]>,
               Sched<[WriteVecIMul]>;
    // The memory form uses the folded-load variant of the scheduling class
    // so that the load is part of the modelled cost.
    def rm : I<0xda, MRMSrcMem, (outs RC:$dst),
               (ins RC:$src1, MemOp:$src2),
               !strconcat(OpStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
               [(set RC:$dst, (!cast<Intrinsic>("int_x86_"#OpStr#VL) RC:$src1,
                               (LD addr:$src2)))]>,
               Sched<[WriteVecIMul.Folded, WriteVecIMul.ReadAfterFold]>;
  }
}

defm VSM4KEY4   : SM4_Base<"vsm4key4", VR128, "128", loadv4i32, i128mem>, T8, XS, VEX, VVVV;
defm VSM4KEY4Y  : SM4_Base<"vsm4key4", VR256, "256", loadv8i32, i256mem>, T8, XS, VEX_L, VEX, VVVV;
defm VSM4RNDS4  : SM4_Base<"vsm4rnds4", VR128, "128", loadv4i32, i128mem>, T8, XD, VEX, VVVV;
defm VSM4RNDS4Y : SM4_Base<"vsm4rnds4", VR256, "256", loadv8i32, i256mem>, T8, XD, VEX_L, VEX, VVVV;

/// avx_vnni_int16 - 128-bit (rr/rm) and 256-bit (Yrr/Yrm) forms of an
/// AVX-VNNI-INT16 instruction. $src1 is tied to $dst. The patterns use the
/// intrinsics named "int_x86_avx2_" # OpcodeStr # "_128" / "_256", looked up
/// with !cast. `IsCommutable` sets isCommutable on the register forms only.
let Predicates = [HasAVXVNNIINT16], Constraints = "$src1 = $dst" in
multiclass avx_vnni_int16<bits<8> opc, string OpcodeStr, bit IsCommutable> {
  let isCommutable = IsCommutable in
  def rr  : I<opc, MRMSrcReg, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2, VR128:$src3),
             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
             [(set VR128:$dst,
               (v4i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_128")
                       VR128:$src1, VR128:$src2, VR128:$src3)))]>,
             VEX, VVVV, Sched<[SchedWriteVecIMul.XMM]>;

  // The memory forms use the folded-load scheduling class so that the load
  // is part of the modelled cost.
  def rm  : I<opc, MRMSrcMem, (outs VR128:$dst),
             (ins VR128:$src1, VR128:$src2, i128mem:$src3),
             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
             [(set VR128:$dst,
               (v4i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_128")
                       VR128:$src1, VR128:$src2, (loadv4i32 addr:$src3))))]>,
             VEX, VVVV, Sched<[SchedWriteVecIMul.XMM.Folded,
                               SchedWriteVecIMul.XMM.ReadAfterFold]>;

  let isCommutable = IsCommutable in
  def Yrr  : I<opc, MRMSrcReg, (outs VR256:$dst),
             (ins VR256:$src1, VR256:$src2, VR256:$src3),
             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
             [(set VR256:$dst,
               (v8i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_256")
                       VR256:$src1, VR256:$src2, VR256:$src3)))]>,
             VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM]>;

  def Yrm  : I<opc, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, VR256:$src2, i256mem:$src3),
             !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
             [(set VR256:$dst,
               (v8i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_256")
                       VR256:$src1, VR256:$src2, (loadv8i32 addr:$src3))))]>,
             VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM.Folded,
                                      SchedWriteVecIMul.YMM.ReadAfterFold]>;
}

defm VPDPWSUD   : avx_vnni_int16<0xd2, "vpdpwsud", 0>, T8, XS;
defm VPDPWSUDS  : avx_vnni_int16<0xd3, "vpdpwsuds", 0>, T8, XS;
defm VPDPWUSD   : avx_vnni_int16<0xd2, "vpdpwusd", 0>, T8, PD;
defm VPDPWUSDS  : avx_vnni_int16<0xd3, "vpdpwusds", 0>, T8, PD;
defm VPDPWUUD   : avx_vnni_int16<0xd2, "vpdpwuud", 1>, T8;
defm VPDPWUUDS  : avx_vnni_int16<0xd3, "vpdpwuuds", 1>, T8;