//=- X86ScheduleBtVer2.td - X86 BtVer2 (Jaguar) Scheduling ---*- tablegen -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the machine model for AMD btver2 (Jaguar) to support
// instruction scheduling and other instruction cost heuristics. Based on the
// AMD Software Optimization Guide for AMD Family 16h Processors and its
// Instruction Latency appendix.
//
//===----------------------------------------------------------------------===//

def BtVer2Model : SchedMachineModel {
  // All x86 instructions are modeled as a single micro-op, and btver2 can
  // decode 2 instructions per cycle.
  let IssueWidth = 2;
  let MicroOpBufferSize = 64; // Retire Control Unit
  let LoadLatency = 5; // FPU latency (worst case; cf. Integer 3 cycle latency)
  let HighLatency = 25;
  let MispredictPenalty = 14; // Minimum branch misdirection penalty
  let PostRAScheduler = 1;

  // FIXME: SSE4/AVX is unimplemented. This flag is set to allow
  // the scheduler to assign a default model to unrecognized opcodes.
  let CompleteModel = 0;
}

let SchedModel = BtVer2Model in {

// Jaguar can issue up to 6 micro-ops in one cycle
def JALU0 : ProcResource<1>; // Integer Pipe0: integer ALU0 (also handle FP->INT jam)
def JALU1 : ProcResource<1>; // Integer Pipe1: integer ALU1/MUL/DIV
def JLAGU : ProcResource<1>; // Integer Pipe2: LAGU
def JSAGU : ProcResource<1>; // Integer Pipe3: SAGU (also handles 3-operand LEA)
def JFPU0 : ProcResource<1>; // Vector/FPU Pipe0: VALU0/VIMUL/FPA
def JFPU1 : ProcResource<1>; // Vector/FPU Pipe1: VALU1/STC/FPM

// The Integer PRF for Jaguar is 64 entries, and it holds the architectural and
// speculative version of the 64-bit integer registers.
// Reference: www.realworldtech.com/jaguar/4/
//
// The processor always keeps the different parts of an integer register
// together. An instruction that writes to a part of a register will therefore
// have a false dependence on any previous write to the same register or any
// part of it.
// Reference: Section 21.10 "AMD Bobcat and Jaguar pipeline: Partial register
// access" - Agner Fog's "microarchitecture.pdf".
def JIntegerPRF : RegisterFile<64, [GR64, CCR], [1, 1], [1, 0],
                               0,  // Max moves that can be eliminated per cycle.
                               1>; // Restrict move elimination to zero regs.

// The Jaguar FP Retire Queue renames SIMD and FP uOps onto a pool of 72 SSE
// registers. Operations on 256-bit data types are cracked into two COPs.
// Reference: www.realworldtech.com/jaguar/4/

// The PRF in the floating point unit can eliminate a move from an MMX or SSE
// register that is known to be zero (i.e. it has been zeroed using a zero-idiom
// dependency breaking instruction, or via VZEROALL).
// Reference: Section 21.8 "AMD Bobcat and Jaguar pipeline: Dependency-breaking
// instructions" - Agner Fog's "microarchitecture.pdf"
def JFpuPRF: RegisterFile<72, [VR64, VR128, VR256], [1, 1, 2], [1, 1, 0],
                          0,  // Max moves that can be eliminated per cycle.
                          1>; // Restrict move elimination to zero regs.

// The retire control unit (RCU) can track up to 64 macro-ops in-flight. It can
// retire up to two macro-ops per cycle.
// Reference: "Software Optimization Guide for AMD Family 16h Processors"
def JRCU : RetireControlUnit<64, 2>;

// Integer Pipe Scheduler
def JALU01 : ProcResGroup<[JALU0, JALU1]> {
  let BufferSize=20;
}

// AGU Pipe Scheduler
def JLSAGU : ProcResGroup<[JLAGU, JSAGU]> {
  let BufferSize=12;
}

// Fpu Pipe Scheduler
def JFPU01 : ProcResGroup<[JFPU0, JFPU1]> {
  let BufferSize=18;
}

// Functional units
def JDiv    : ProcResource<1>; // integer division
def JMul    : ProcResource<1>; // integer multiplication
def JVALU0  : ProcResource<1>; // vector integer
def JVALU1  : ProcResource<1>; // vector integer
def JVIMUL  : ProcResource<1>; // vector integer multiplication
def JSTC    : ProcResource<1>; // vector store/convert
def JFPM    : ProcResource<1>; // FP multiplication
def JFPA    : ProcResource<1>; // FP addition

// Functional unit groups
def JFPX  : ProcResGroup<[JFPA, JFPM]>;
def JVALU : ProcResGroup<[JVALU0, JVALU1]>;

// Integer loads are 3 cycles, so ReadAfterLd registers needn't be available until 3
// cycles after the memory operand.
def : ReadAdvance<ReadAfterLd, 3>;

// Vector loads are 5 cycles, so ReadAfterVec*Ld registers needn't be available until 5
// cycles after the memory operand.
def : ReadAdvance<ReadAfterVecLd, 5>;
def : ReadAdvance<ReadAfterVecXLd, 5>;
def : ReadAdvance<ReadAfterVecYLd, 5>;

// "Additional 6 cycle transfer operation which moves a floating point
// operation input value from the integer unit to the floating point unit."
// Reference: AMDfam16h SOG (Appendix A "Instruction Latencies", Section A.2).
def : ReadAdvance<ReadInt2Fpu, -6>;

// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops when dispatched by the schedulers.
// This multiclass defines the resource usage for variants with and without
// folded loads.
//
// Parameters (shared by the three JWriteRes*Pair multiclasses below):
//   SchedRW  - the foldable write; SchedRW.Folded is its folded-load variant.
//   ExePorts - resources consumed by the register variant.
//   Lat      - latency of the register variant.
//   Res      - ResourceCycles for ExePorts.
//   UOps     - NumMicroOps of the register variant.
//   LoadUOps - extra micro-ops added to UOps for the folded-load variant.
multiclass JWriteResIntPair<X86FoldableSchedWrite SchedRW,
                            list<ProcResourceKind> ExePorts,
                            int Lat, list<int> Res = [], int UOps = 1,
                            int LoadUOps = 0> {
  // Register variant: uses ExePorts with the ResourceCycles given by Res.
  def : WriteRes<SchedRW, ExePorts> {
    let Latency = Lat;
    let ResourceCycles = Res;
    let NumMicroOps = UOps;
  }

  // Memory variant also uses a cycle on JLAGU and adds 3 cycles to the
  // latency. An empty Res is passed through as an empty list rather than
  // being prefixed with the JLAGU cycle.
  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
    let Latency = !add(Lat, 3);
    let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
    let NumMicroOps = !add(UOps, LoadUOps);
  }
}

// Same as JWriteResIntPair, but the folded-load variant adds 5 cycles of
// latency instead of 3.
multiclass JWriteResFpuPair<X86FoldableSchedWrite SchedRW,
                            list<ProcResourceKind> ExePorts,
                            int Lat, list<int> Res = [], int UOps = 1,
                            int LoadUOps = 0> {
  // Register variant: uses ExePorts with the ResourceCycles given by Res.
  def : WriteRes<SchedRW, ExePorts> {
    let Latency = Lat;
    let ResourceCycles = Res;
    let NumMicroOps = UOps;
  }

  // Memory variant also uses a cycle on JLAGU and adds 5 cycles to the
  // latency. An empty Res is passed through as an empty list rather than
  // being prefixed with the JLAGU cycle.
  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
    let Latency = !add(Lat, 5);
    let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
    let NumMicroOps = !add(UOps, LoadUOps);
  }
}

// YMM flavour of the pair: Res defaults to [2] and UOps defaults to 2, and the
// folded-load variant always prefixes Res with 2 cycles on JLAGU.
multiclass JWriteResYMMPair<X86FoldableSchedWrite SchedRW,
                            list<ProcResourceKind> ExePorts,
                            int Lat, list<int> Res = [2], int UOps = 2,
                            int LoadUOps = 0> {
  // Register variant: uses ExePorts with the ResourceCycles given by Res
  // (two cycles by default).
  def : WriteRes<SchedRW, ExePorts> {
    let Latency = Lat;
    let ResourceCycles = Res;
    let NumMicroOps = UOps;
  }

  // Memory variant also uses 2 cycles on JLAGU and adds 5 cycles to the
  // latency.
  def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> {
    let Latency = !add(Lat, 5);
    let ResourceCycles = !listconcat([2], Res);
    let NumMicroOps = !add(UOps, LoadUOps);
  }
}

// Instructions that have local forwarding disabled have an extra +1cy latency.

// A folded store needs a cycle on the SAGU for the store data,
// most RMW instructions don't need an extra uop.
defm : X86WriteRes<WriteRMW, [JSAGU], 1, [1], 0>;

////////////////////////////////////////////////////////////////////////////////
// Arithmetic.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResIntPair<WriteALU,    [JALU01], 1>;
defm : JWriteResIntPair<WriteADC,    [JALU01], 1, [2]>;

defm : X86WriteRes<WriteBSWAP32,     [JALU01], 1, [1], 1>;
defm : X86WriteRes<WriteBSWAP64,     [JALU01], 1, [1], 1>;
defm : X86WriteRes<WriteCMPXCHG,     [JALU01], 1, [1], 1>;
defm : X86WriteRes<WriteCMPXCHGRMW,  [JALU01, JSAGU, JLAGU], 4, [1, 1, 1], 2>;
defm : X86WriteRes<WriteXCHG,        [JALU01], 1, [1], 1>;

defm : JWriteResIntPair<WriteIMul8,     [JALU1, JMul], 3, [1, 1], 2>;
defm : JWriteResIntPair<WriteIMul16,    [JALU1, JMul], 3, [1, 1], 2>;
defm : JWriteResIntPair<WriteIMul16Imm, [JALU1, JMul], 3, [1, 1], 2>;
defm : JWriteResIntPair<WriteIMul16Reg, [JALU1, JMul], 3, [1, 1], 2>;
defm : JWriteResIntPair<WriteIMul32,    [JALU1, JMul], 3, [1, 1], 2>;
defm : JWriteResIntPair<WriteIMul32Imm, [JALU1, JMul], 3, [1, 1], 2>;
defm : JWriteResIntPair<WriteIMul32Reg, [JALU1, JMul], 3, [1, 1], 2>;
defm : JWriteResIntPair<WriteIMul64,    [JALU1, JMul], 6, [1, 4], 2>;
defm : JWriteResIntPair<WriteIMul64Imm, [JALU1, JMul], 6, [1, 4], 2>;
defm : JWriteResIntPair<WriteIMul64Reg, [JALU1, JMul], 6, [1, 4], 2>;
defm : X86WriteRes<WriteIMulH,          [JALU1], 6, [4], 1>;

defm : JWriteResIntPair<WriteDiv8,   [JALU1, JDiv], 12, [1, 12], 1>;
defm : JWriteResIntPair<WriteDiv16,  [JALU1, JDiv], 17, [1, 17], 2>;
defm : JWriteResIntPair<WriteDiv32,  [JALU1, JDiv], 25, [1, 25], 2>;
defm : JWriteResIntPair<WriteDiv64,  [JALU1, JDiv], 41, [1, 41], 2>;
defm : JWriteResIntPair<WriteIDiv8,  [JALU1, JDiv], 12, [1, 12], 1>;
defm : JWriteResIntPair<WriteIDiv16, [JALU1, JDiv], 17, [1, 17], 2>;
defm : JWriteResIntPair<WriteIDiv32, [JALU1, JDiv], 25, [1, 25], 2>;
defm : JWriteResIntPair<WriteIDiv64, [JALU1, JDiv], 41, [1, 41], 2>;

defm : JWriteResIntPair<WriteCRC32,  [JALU01], 3, [4], 3>;

defm : JWriteResIntPair<WriteCMOV,   [JALU01], 1>; // Conditional move.
defm : X86WriteRes<WriteFCMOV, [JFPU0, JFPA], 3, [1,1], 1>; // x87 conditional move.
def  : WriteRes<WriteSETCC, [JALU01]>; // Setcc.
def  : WriteRes<WriteSETCCStore, [JALU01,JSAGU]>;
def  : WriteRes<WriteLAHFSAHF, [JALU01]>;

defm : X86WriteRes<WriteBitTest,         [JALU01], 1, [1], 1>;
defm : X86WriteRes<WriteBitTestImmLd,    [JALU01,JLAGU], 4, [1,1], 1>;
defm : X86WriteRes<WriteBitTestRegLd,    [JALU01,JLAGU], 4, [1,1], 5>;
defm : X86WriteRes<WriteBitTestSet,      [JALU01], 1, [1], 2>;
defm : X86WriteRes<WriteBitTestSetImmLd, [JALU01,JLAGU], 4, [1,1], 4>;
defm : X86WriteRes<WriteBitTestSetRegLd, [JALU01,JLAGU], 4, [1,1], 8>;

// This is for simple LEAs with one or two input operands.
def : WriteRes<WriteLEA, [JALU01]>;

// Bit counts.
defm : JWriteResIntPair<WriteBSF,    [JALU01], 4, [8], 7>;
defm : JWriteResIntPair<WriteBSR,    [JALU01], 5, [8], 8>;
defm : JWriteResIntPair<WritePOPCNT, [JALU01], 1>;
defm : JWriteResIntPair<WriteLZCNT,  [JALU01], 1>;
defm : JWriteResIntPair<WriteTZCNT,  [JALU01], 2, [2], 2>;

// BMI1 BEXTR/BLS, BMI2 BZHI
defm : JWriteResIntPair<WriteBEXTR, [JALU01], 1>;
defm : JWriteResIntPair<WriteBLS,   [JALU01], 2, [2], 2>;
defm : X86WriteResPairUnsupported<WriteBZHI>;

////////////////////////////////////////////////////////////////////////////////
// Integer shifts and rotates.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResIntPair<WriteShift,    [JALU01], 1>;
defm : JWriteResIntPair<WriteShiftCL,  [JALU01], 1>;
defm : JWriteResIntPair<WriteRotate,   [JALU01], 1>;
defm : JWriteResIntPair<WriteRotateCL, [JALU01], 1>;

// SHLD/SHRD.
defm : X86WriteRes<WriteSHDrri,  [JALU01], 3, [6], 6>;
defm : X86WriteRes<WriteSHDrrcl, [JALU01], 4, [8], 7>;
defm : X86WriteRes<WriteSHDmri,  [JLAGU, JALU01], 9, [1, 22], 8>;
defm : X86WriteRes<WriteSHDmrcl, [JLAGU, JALU01], 9, [1, 22], 8>;

////////////////////////////////////////////////////////////////////////////////
// Loads, stores, and moves, not folded with other operations.
////////////////////////////////////////////////////////////////////////////////

def : WriteRes<WriteLoad,    [JLAGU]> { let Latency = 3; }
def : WriteRes<WriteStore,   [JSAGU]>;
def : WriteRes<WriteStoreNT, [JSAGU]>;
def : WriteRes<WriteMove,    [JALU01]>;

// Load/store MXCSR.
def : WriteRes<WriteLDMXCSR, [JLAGU]> { let Latency = 3; }
def : WriteRes<WriteSTMXCSR, [JSAGU]>;

// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;

////////////////////////////////////////////////////////////////////////////////
// Idioms that clear a register, like xorps %xmm0, %xmm0.
// These can often bypass execution ports completely.
////////////////////////////////////////////////////////////////////////////////

def : WriteRes<WriteZero, []>;

////////////////////////////////////////////////////////////////////////////////
// Branches don't produce values, so they have no latency, but they still
// consume resources. Indirect branches can fold loads.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResIntPair<WriteJump, [JALU01], 1>;

////////////////////////////////////////////////////////////////////////////////
// Special case scheduling classes.
////////////////////////////////////////////////////////////////////////////////

def : WriteRes<WriteSystem,     [JALU01]> { let Latency = 100; }
def : WriteRes<WriteMicrocoded, [JALU01]> { let Latency = 100; }
def : WriteRes<WriteFence,      [JSAGU]>;

// Nops don't have dependencies, so there's no actual latency, but we set this
// to '1' to tell the scheduler that the nop uses an ALU slot for a cycle.
def : WriteRes<WriteNop, [JALU01]> { let Latency = 1; }

////////////////////////////////////////////////////////////////////////////////
// Floating point. This covers both scalar and vector operations.
////////////////////////////////////////////////////////////////////////////////

defm : X86WriteRes<WriteFLD0,          [JFPU1, JSTC], 3, [1,1], 1>;
defm : X86WriteRes<WriteFLD1,          [JFPU1, JSTC], 3, [1,1], 1>;
defm : X86WriteRes<WriteFLDC,          [JFPU1, JSTC], 3, [1,1], 1>;
defm : X86WriteRes<WriteFLoad,         [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFLoadX,        [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFLoadY,        [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFMaskedLoad,   [JLAGU, JFPU01, JFPX], 6, [1, 2, 2], 1>;
defm : X86WriteRes<WriteFMaskedLoadY,  [JLAGU, JFPU01, JFPX], 6, [2, 4, 4], 2>;

defm : X86WriteRes<WriteFStore,        [JSAGU, JFPU1,  JSTC], 2, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreX,       [JSAGU, JFPU1,  JSTC], 1, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreY,       [JSAGU, JFPU1,  JSTC], 1, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreNT,      [JSAGU, JFPU1,  JSTC], 3, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreNTX,     [JSAGU, JFPU1,  JSTC], 3, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreNTY,     [JSAGU, JFPU1,  JSTC], 3, [2, 2, 2], 1>;
defm : X86WriteRes<WriteFMaskedStore,  [JSAGU, JFPU01, JFPX], 6, [1, 1, 4], 1>;
defm : X86WriteRes<WriteFMaskedStoreY, [JSAGU, JFPU01, JFPX], 6, [2, 2, 4], 2>;

defm : X86WriteRes<WriteFMove,         [JFPU01, JFPX], 1, [1, 1], 1>;
defm : X86WriteRes<WriteFMoveX,        [JFPU01, JFPX], 1, [1, 1], 1>;
defm : X86WriteRes<WriteFMoveY,        [JFPU01, JFPX], 1, [2, 2], 2>;

defm : X86WriteRes<WriteEMMS,          [JFPU01, JFPX], 2, [1, 1], 1>;

defm : JWriteResFpuPair<WriteFAdd,         [JFPU0, JFPA], 3>;
defm : JWriteResFpuPair<WriteFAddX,        [JFPU0, JFPA], 3>;
defm : JWriteResYMMPair<WriteFAddY,        [JFPU0, JFPA], 3, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteFAddZ>;
defm : JWriteResFpuPair<WriteFAdd64,       [JFPU0, JFPA], 3>;
defm : JWriteResFpuPair<WriteFAdd64X,      [JFPU0, JFPA], 3>;
defm : JWriteResYMMPair<WriteFAdd64Y,      [JFPU0, JFPA], 3, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
defm : JWriteResFpuPair<WriteFCmp,         [JFPU0, JFPA], 2>;
defm : JWriteResFpuPair<WriteFCmpX,        [JFPU0, JFPA], 2>;
defm : JWriteResYMMPair<WriteFCmpY,        [JFPU0, JFPA], 2, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteFCmpZ>;
defm : JWriteResFpuPair<WriteFCmp64,       [JFPU0, JFPA], 2>;
defm : JWriteResFpuPair<WriteFCmp64X,      [JFPU0, JFPA], 2>;
defm : JWriteResYMMPair<WriteFCmp64Y,      [JFPU0, JFPA], 2, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
defm : JWriteResFpuPair<WriteFCom,         [JFPU0, JFPA, JALU0], 3>;
defm : JWriteResFpuPair<WriteFMul,         [JFPU1, JFPM], 2>;
defm : JWriteResFpuPair<WriteFMulX,        [JFPU1, JFPM], 2>;
defm : JWriteResYMMPair<WriteFMulY,        [JFPU1, JFPM], 2, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteFMulZ>;
defm : JWriteResFpuPair<WriteFMul64,       [JFPU1, JFPM], 4, [1,2]>;
defm : JWriteResFpuPair<WriteFMul64X,      [JFPU1, JFPM], 4, [1,2]>;
defm : JWriteResYMMPair<WriteFMul64Y,      [JFPU1, JFPM], 4, [2,4], 2>;
defm : X86WriteResPairUnsupported<WriteFMul64Z>;
defm : X86WriteResPairUnsupported<WriteFMA>;
defm : X86WriteResPairUnsupported<WriteFMAX>;
defm : X86WriteResPairUnsupported<WriteFMAY>;
defm : X86WriteResPairUnsupported<WriteFMAZ>;
defm : JWriteResFpuPair<WriteDPPD,         [JFPU1, JFPM, JFPA], 9, [1, 3, 3], 3>;
defm : JWriteResFpuPair<WriteDPPS,         [JFPU1, JFPM, JFPA], 11, [1, 3, 3], 5>;
defm : JWriteResYMMPair<WriteDPPSY,        [JFPU1, JFPM, JFPA], 12, [2, 6, 6], 10>;
defm : X86WriteResPairUnsupported<WriteDPPSZ>;
defm : JWriteResFpuPair<WriteFRcp,         [JFPU1, JFPM], 2>;
defm : JWriteResFpuPair<WriteFRcpX,        [JFPU1, JFPM], 2>;
defm : JWriteResYMMPair<WriteFRcpY,        [JFPU1, JFPM], 2, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteFRcpZ>;
defm : JWriteResFpuPair<WriteFRsqrt,       [JFPU1, JFPM], 2>;
defm : JWriteResFpuPair<WriteFRsqrtX,      [JFPU1, JFPM], 2>;
defm : JWriteResYMMPair<WriteFRsqrtY,      [JFPU1, JFPM], 2, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
defm : JWriteResFpuPair<WriteFDiv,         [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResFpuPair<WriteFDivX,        [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResYMMPair<WriteFDivY,        [JFPU1, JFPM], 38, [2, 38], 2>;
defm : X86WriteResPairUnsupported<WriteFDivZ>;
defm : JWriteResFpuPair<WriteFDiv64,       [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResFpuPair<WriteFDiv64X,      [JFPU1, JFPM], 19, [1, 19]>;
defm : JWriteResYMMPair<WriteFDiv64Y,      [JFPU1, JFPM], 38, [2, 38], 2>;
defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
defm : JWriteResFpuPair<WriteFSqrt,        [JFPU1, JFPM], 21, [1, 21]>;
defm : JWriteResFpuPair<WriteFSqrtX,       [JFPU1, JFPM], 21, [1, 21]>;
defm : JWriteResYMMPair<WriteFSqrtY,       [JFPU1, JFPM], 42, [2, 42], 2>;
defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
defm : JWriteResFpuPair<WriteFSqrt64,      [JFPU1, JFPM], 27, [1, 27]>;
defm : JWriteResFpuPair<WriteFSqrt64X,     [JFPU1, JFPM], 27, [1, 27]>;
defm : JWriteResYMMPair<WriteFSqrt64Y,     [JFPU1, JFPM], 54, [2, 54], 2>;
defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
defm : JWriteResFpuPair<WriteFSqrt80,      [JFPU1, JFPM], 35, [1, 35]>;
defm : JWriteResFpuPair<WriteFSign,        [JFPU1, JFPM], 2>;
defm : JWriteResFpuPair<WriteFRnd,         [JFPU1, JSTC], 3>;
defm : JWriteResYMMPair<WriteFRndY,        [JFPU1, JSTC], 3, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteFRndZ>;
defm : JWriteResFpuPair<WriteFLogic,       [JFPU01, JFPX], 1>;
defm : JWriteResYMMPair<WriteFLogicY,      [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFLogicZ>;
defm : JWriteResFpuPair<WriteFTest,        [JFPU0, JFPA, JALU0], 3>;
defm : JWriteResYMMPair<WriteFTestY ,      [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
defm : X86WriteResPairUnsupported<WriteFTestZ>;
defm : JWriteResFpuPair<WriteFShuffle,     [JFPU01, JFPX], 1>;
defm : JWriteResYMMPair<WriteFShuffleY,    [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
defm : JWriteResFpuPair<WriteFVarShuffle,  [JFPU01, JFPX], 3, [1, 4], 3>; // +1cy latency.
defm : JWriteResYMMPair<WriteFVarShuffleY, [JFPU01, JFPX], 4, [2, 6], 6>; // +1cy latency.
defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
defm : JWriteResFpuPair<WriteFBlend,       [JFPU01, JFPX], 1>;
defm : JWriteResYMMPair<WriteFBlendY,      [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFBlendZ>;
defm : JWriteResFpuPair<WriteFVarBlend,    [JFPU01, JFPX], 2, [4, 4], 3>;
defm : JWriteResYMMPair<WriteFVarBlendY,   [JFPU01, JFPX], 3, [6, 6], 6>;
defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
defm : JWriteResFpuPair<WriteFShuffle256,  [JFPU01, JFPX], 1, [2, 2], 2>;
defm : X86WriteResPairUnsupported<WriteFVarShuffle256>;

////////////////////////////////////////////////////////////////////////////////
// Conversions.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResFpuPair<WriteCvtSS2I,      [JFPU1, JSTC, JFPU0, JFPA, JALU0], 7, [1,1,1,1,1], 2>;
defm : JWriteResFpuPair<WriteCvtPS2I,      [JFPU1, JSTC], 3, [1,1], 1>;
defm : JWriteResYMMPair<WriteCvtPS2IY,     [JFPU1, JSTC], 3, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
defm : JWriteResFpuPair<WriteCvtSD2I,      [JFPU1, JSTC, JFPU0, JFPA, JALU0], 7, [1,1,1,1,1], 2>;
defm : JWriteResFpuPair<WriteCvtPD2I,      [JFPU1, JSTC], 3, [1,1], 1>;
defm : JWriteResYMMPair<WriteCvtPD2IY,     [JFPU1, JSTC, JFPX], 6, [2,2,4], 3>;
defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;

defm : X86WriteRes<WriteCvtI2SS,           [JFPU1, JSTC], 4, [1,1], 2>;
defm : X86WriteRes<WriteCvtI2SSLd,         [JLAGU, JFPU1, JSTC], 9, [1,1,1], 1>;
defm : JWriteResFpuPair<WriteCvtI2PS,      [JFPU1, JSTC], 3, [1,1], 1>;
defm : JWriteResYMMPair<WriteCvtI2PSY,     [JFPU1, JSTC], 3, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
defm : X86WriteRes<WriteCvtI2SD,           [JFPU1, JSTC], 4, [1,1], 2>;
defm : X86WriteRes<WriteCvtI2SDLd,         [JLAGU, JFPU1, JSTC], 9, [1,1,1], 1>;
defm : JWriteResFpuPair<WriteCvtI2PD,      [JFPU1, JSTC], 3, [1,1], 1>;
defm : JWriteResYMMPair<WriteCvtI2PDY,     [JFPU1, JSTC], 3, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;

defm : JWriteResFpuPair<WriteCvtSS2SD,     [JFPU1, JSTC], 7, [1,2], 2>;
defm : JWriteResFpuPair<WriteCvtPS2PD,     [JFPU1, JSTC], 2, [1,1], 1>;
defm : JWriteResYMMPair<WriteCvtPS2PDY,    [JFPU1, JSTC], 2, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>;

defm : JWriteResFpuPair<WriteCvtSD2SS,     [JFPU1, JSTC], 7, [1,2], 2>;
defm : JWriteResFpuPair<WriteCvtPD2PS,     [JFPU1, JSTC], 3, [1,1], 1>;
defm : JWriteResYMMPair<WriteCvtPD2PSY,    [JFPU1, JSTC, JFPX], 6, [2,2,4], 3>;
defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>;

defm : JWriteResFpuPair<WriteCvtPH2PS,     [JFPU1, JSTC], 3, [1,1], 1>;
defm : JWriteResYMMPair<WriteCvtPH2PSY,    [JFPU1, JSTC], 3, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>;

defm : X86WriteRes<WriteCvtPS2PH,          [JFPU1, JSTC], 3, [1,1], 1>;
defm : X86WriteRes<WriteCvtPS2PHY,         [JFPU1, JSTC, JFPX], 6, [2,2,2], 3>;
defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
defm : X86WriteRes<WriteCvtPS2PHSt,        [JFPU1, JSTC, JSAGU], 4, [1,1,1], 1>;
defm : X86WriteRes<WriteCvtPS2PHYSt,       [JFPU1, JSTC, JFPX, JSAGU], 7, [2,2,2,1], 3>;
defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;

////////////////////////////////////////////////////////////////////////////////
// Vector integer operations.
////////////////////////////////////////////////////////////////////////////////

defm : X86WriteRes<WriteVecLoad,          [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecLoadX,         [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecLoadY,         [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecLoadNT,        [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecLoadNTY,       [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecMaskedLoad,    [JLAGU, JFPU01, JVALU], 6, [1, 2, 2], 1>;
defm : X86WriteRes<WriteVecMaskedLoadY,   [JLAGU, JFPU01, JVALU], 6, [2, 4, 4], 2>;

defm : X86WriteRes<WriteVecStore,         [JSAGU, JFPU1,  JSTC], 2, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecStoreX,        [JSAGU, JFPU1,  JSTC], 1, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecStoreY,        [JSAGU, JFPU1,  JSTC], 1, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecStoreNT,       [JSAGU, JFPU1,  JSTC], 2, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecStoreNTY,      [JSAGU, JFPU1,  JSTC], 2, [2, 2, 2], 1>;
defm : X86WriteRes<WriteVecMaskedStore,   [JSAGU, JFPU01, JVALU], 6, [1, 1, 4], 1>;
defm : X86WriteRes<WriteVecMaskedStoreY,  [JSAGU, JFPU01, JVALU], 6, [2, 2, 4], 2>;

defm : X86WriteRes<WriteVecMove,          [JFPU01, JVALU], 1, [1, 1], 1>;
defm : X86WriteRes<WriteVecMoveX,         [JFPU01, JVALU], 1, [1, 1], 1>;
defm : X86WriteRes<WriteVecMoveY,         [JFPU01, JVALU], 1, [2, 2], 2>;
defm : X86WriteRes<WriteVecMoveToGpr,     [JFPU0, JFPA, JALU0], 4, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecMoveFromGpr,   [JFPU01, JFPX], 8, [1, 1], 2>;

defm : JWriteResFpuPair<WriteVecALU,      [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecALUX,     [JFPU01, JVALU], 1>;
defm : X86WriteResPairUnsupported<WriteVecALUY>;
defm : X86WriteResPairUnsupported<WriteVecALUZ>;
defm : JWriteResFpuPair<WriteVecShift,    [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecShiftX,   [JFPU01, JVALU], 2>; // +1cy latency.
defm : X86WriteResPairUnsupported<WriteVecShiftY>;
defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
defm : JWriteResFpuPair<WriteVecShiftImm, [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecShiftImmX,[JFPU01, JVALU], 2>; // +1cy latency.
defm : X86WriteResPairUnsupported<WriteVecShiftImmY>;
defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
defm : X86WriteResPairUnsupported<WriteVarVecShift>;
defm : X86WriteResPairUnsupported<WriteVarVecShiftY>;
defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
defm : JWriteResFpuPair<WriteVecIMul,     [JFPU0, JVIMUL], 2>;
defm : JWriteResFpuPair<WriteVecIMulX,    [JFPU0, JVIMUL], 2>;
defm : X86WriteResPairUnsupported<WriteVecIMulY>;
defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
defm : JWriteResFpuPair<WritePMULLD,      [JFPU0, JFPU01, JVIMUL, JVALU], 4, [2, 1, 2, 1], 3>;
defm : X86WriteResPairUnsupported<WritePMULLDY>;
defm : X86WriteResPairUnsupported<WritePMULLDZ>;
defm : JWriteResFpuPair<WriteMPSAD,       [JFPU0, JVIMUL], 3, [1, 2], 3>;
defm : X86WriteResPairUnsupported<WriteMPSADY>;
defm : X86WriteResPairUnsupported<WriteMPSADZ>;
defm : JWriteResFpuPair<WritePSADBW,      [JFPU01, JVALU], 2>;
defm : JWriteResFpuPair<WritePSADBWX,     [JFPU01, JVALU], 2>;
defm : X86WriteResPairUnsupported<WritePSADBWY>;
defm : X86WriteResPairUnsupported<WritePSADBWZ>;
defm : JWriteResFpuPair<WritePHMINPOS,    [JFPU01, JVALU], 2>;
defm : JWriteResFpuPair<WriteShuffle,     [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteShuffleX,    [JFPU01, JVALU], 1>;
defm : X86WriteResPairUnsupported<WriteShuffleY>;
defm : X86WriteResPairUnsupported<WriteShuffleZ>;
defm : JWriteResFpuPair<WriteVarShuffle,  [JFPU01, JVALU], 2, [1, 1], 1>;
defm : JWriteResFpuPair<WriteVarShuffleX, [JFPU01, JVALU], 2, [1, 4], 3>;
defm : X86WriteResPairUnsupported<WriteVarShuffleY>;
defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
defm : JWriteResFpuPair<WriteBlend,       [JFPU01, JVALU], 1>;
defm : X86WriteResPairUnsupported<WriteBlendY>;
defm : X86WriteResPairUnsupported<WriteBlendZ>;
defm : JWriteResFpuPair<WriteVarBlend,    [JFPU01, JVALU], 2, [4, 4], 3>;
defm : X86WriteResPairUnsupported<WriteVarBlendY>;
defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
defm : JWriteResFpuPair<WriteVecLogic,    [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecLogicX,   [JFPU01, JVALU], 1>;
defm : X86WriteResPairUnsupported<WriteVecLogicY>;
defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
defm : JWriteResFpuPair<WriteVecTest,     [JFPU0, JFPA, JALU0], 3>;
defm : JWriteResYMMPair<WriteVecTestY,    [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>;
defm : X86WriteResPairUnsupported<WriteVecTestZ>;
defm : X86WriteResPairUnsupported<WriteShuffle256>;
defm : X86WriteResPairUnsupported<WriteVarShuffle256>;

////////////////////////////////////////////////////////////////////////////////
// Vector insert/extract operations.
////////////////////////////////////////////////////////////////////////////////

defm : X86WriteRes<WriteVecInsert,    [JFPU01, JVALU], 1, [1,1], 2>;
defm : X86WriteRes<WriteVecInsertLd,  [JFPU01, JVALU, JLAGU], 4, [1,1,1], 1>;
defm : X86WriteRes<WriteVecExtract,   [JFPU0, JFPA, JALU0], 3, [1,1,1], 1>;
defm : X86WriteRes<WriteVecExtractSt, [JFPU1, JSTC, JSAGU], 3, [1,1,1], 1>;

////////////////////////////////////////////////////////////////////////////////
// SSE42 String instructions.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResFpuPair<WritePCmpIStrI, [JFPU1, JVALU1, JFPU0, JFPA, JALU0], 7, [2, 2, 1, 1, 1], 3>;
defm : JWriteResFpuPair<WritePCmpIStrM, [JFPU1, JVALU1, JFPU0, JFPA, JALU0], 8, [2, 2, 1, 1, 1], 3>;
defm : JWriteResFpuPair<WritePCmpEStrI, [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0], 14, [1, 2, 2, 6, 4, 1, 1], 9>;
defm : JWriteResFpuPair<WritePCmpEStrM, [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0], 14, [1, 2, 2, 6, 4, 1, 1], 9>;

////////////////////////////////////////////////////////////////////////////////
// MOVMSK Instructions.
////////////////////////////////////////////////////////////////////////////////

def  : WriteRes<WriteFMOVMSK,    [JFPU0, JFPA, JALU0]> { let Latency = 3; }
def  : WriteRes<WriteVecMOVMSK,  [JFPU0, JFPA, JALU0]> { let Latency = 3; }
defm : X86WriteResUnsupported<WriteVecMOVMSKY>;
def  : WriteRes<WriteMMXMOVMSK,  [JFPU0, JFPA, JALU0]> { let Latency = 3; }

////////////////////////////////////////////////////////////////////////////////
// AES Instructions.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResFpuPair<WriteAESIMC,    [JFPU0, JVIMUL], 2>;
defm : JWriteResFpuPair<WriteAESKeyGen, [JFPU0, JVIMUL], 2>;
defm : JWriteResFpuPair<WriteAESDecEnc, [JFPU01, JVALU, JFPU0, JVIMUL], 3, [1,1,1,1], 2>;

////////////////////////////////////////////////////////////////////////////////
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResFpuPair<WriteFHAdd,  [JFPU0, JFPA], 4>;            // +1cy latency.
defm : JWriteResYMMPair<WriteFHAddY, [JFPU0, JFPA], 4, [2,2], 2>;  // +1cy latency.
defm : JWriteResFpuPair<WritePHAdd,  [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WritePHAddX, [JFPU01, JVALU], 2>;          // +1cy latency.
defm : X86WriteResPairUnsupported<WritePHAddY>;

////////////////////////////////////////////////////////////////////////////////
// Carry-less multiplication instructions.
////////////////////////////////////////////////////////////////////////////////

defm : JWriteResFpuPair<WriteCLMul, [JFPU0, JVIMUL], 2>;

////////////////////////////////////////////////////////////////////////////////
// SSE4A instructions.
////////////////////////////////////////////////////////////////////////////////

def JWriteINSERTQ: SchedWriteRes<[JFPU01, JVALU]> {
  let Latency = 2;
  let ResourceCycles = [1, 4];
}
def : InstRW<[JWriteINSERTQ], (instrs INSERTQ, INSERTQI)>;

////////////////////////////////////////////////////////////////////////////////
// AVX instructions.
////////////////////////////////////////////////////////////////////////////////

def JWriteVecExtractF128: SchedWriteRes<[JFPU01, JFPX]>;
def : InstRW<[JWriteVecExtractF128], (instrs VEXTRACTF128rr)>;

def JWriteVBROADCASTYLd: SchedWriteRes<[JLAGU, JFPU01, JFPX]> {
  let Latency = 6;
  let ResourceCycles = [1, 2, 4];
  let NumMicroOps = 2;
}
def : InstRW<[JWriteVBROADCASTYLd], (instrs VBROADCASTSDYrm,
                                            VBROADCASTSSYrm,
                                            VBROADCASTF128)>;

def JWriteJVZEROALL: SchedWriteRes<[]> {
  let Latency = 90;
  let NumMicroOps = 73;
}
def : InstRW<[JWriteJVZEROALL], (instrs VZEROALL)>;

def JWriteJVZEROUPPER: SchedWriteRes<[]> {
  let Latency = 46;
  let NumMicroOps = 37;
}
def : InstRW<[JWriteJVZEROUPPER], (instrs VZEROUPPER)>;

///////////////////////////////////////////////////////////////////////////////
//  SchedWriteVariant definitions.
///////////////////////////////////////////////////////////////////////////////

// Write that consumes no resources and has zero latency.
def JWriteZeroLatency : SchedWriteRes<[]> {
  let Latency = 0;
}

// Write for a YMM zero-idiom: two micro-ops on JFPU01/JFPX.
def JWriteZeroIdiomYmm : SchedWriteRes<[JFPU01, JFPX]> {
  let NumMicroOps = 2;
}

// Certain instructions that use the same register for both source
// operands do not have a real dependency on the previous contents of the
// register, and thus, do not have to wait before completing. They can be
// optimized out at register renaming stage.
// Reference: Section 10.8 of the "Software Optimization Guide for AMD Family
// 15h Processors".
// Reference: Agner Fog's "The microarchitecture of Intel, AMD and VIA CPUs",
// Section 21.8 [Dependency-breaking instructions].
// Each variant below picks the first SchedVar whose predicate matches: the
// zero-idiom form of the instruction gets the cheap write, any other form
// falls back to the regular write class for that instruction group.

// GPR SUB/XOR.
def JWriteZeroIdiom : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
  SchedVar<NoSchedPred,                          [WriteALU]>
]>;
def : InstRW<[JWriteZeroIdiom], (instrs XOR32rr, XOR64rr,
                                        SUB32rr, SUB64rr)>;

// 128-bit floating point XOR/ANDN (SSE and AVX encodings).
def JWriteFZeroIdiom : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
  SchedVar<NoSchedPred,                          [WriteFLogic]>
]>;
def : InstRW<[JWriteFZeroIdiom], (instrs
  // SSE encodings.
  XORPSrr, XORPDrr, ANDNPSrr, ANDNPDrr,
  // AVX encodings.
  VXORPSrr, VXORPDrr, VANDNPSrr, VANDNPDrr)>;

// 256-bit floating point XOR/ANDN. The zero-idiom form is not zero latency
// here; it uses JWriteZeroIdiomYmm instead.
def JWriteFZeroIdiomY : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroIdiomYmm]>,
  SchedVar<NoSchedPred,                          [WriteFLogicY]>
]>;
def : InstRW<[JWriteFZeroIdiomY], (instrs VXORPSYrr, VANDNPSYrr,
                                          VXORPDYrr, VANDNPDYrr)>;

// MMX integer logic.
def JWriteVZeroIdiomLogic : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
  SchedVar<NoSchedPred,                          [WriteVecLogic]>
]>;
def : InstRW<[JWriteVZeroIdiomLogic], (instrs MMX_PXORirr,
                                              MMX_PANDNirr)>;

// 128-bit integer logic (SSE and AVX encodings).
def JWriteVZeroIdiomLogicX : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
  SchedVar<NoSchedPred,                          [WriteVecLogicX]>
]>;
def : InstRW<[JWriteVZeroIdiomLogicX], (instrs PXORrr,  PANDNrr,
                                               VPXORrr, VPANDNrr)>;

// MMX integer subtract / compare-greater-than.
def JWriteVZeroIdiomALU : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
  SchedVar<NoSchedPred,                          [WriteVecALU]>
]>;
def : InstRW<[JWriteVZeroIdiomALU], (instrs
  // Subtract.
  MMX_PSUBBirr, MMX_PSUBWirr, MMX_PSUBDirr, MMX_PSUBQirr,
  // Saturating subtract.
  MMX_PSUBSBirr, MMX_PSUBSWirr, MMX_PSUBUSBirr, MMX_PSUBUSWirr,
  // Compare greater-than.
  MMX_PCMPGTBirr, MMX_PCMPGTWirr, MMX_PCMPGTDirr)>;

// 128-bit integer subtract / compare-greater-than (SSE and AVX encodings).
def JWriteVZeroIdiomALUX : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>,
  SchedVar<NoSchedPred,                          [WriteVecALUX]>
]>;
def : InstRW<[JWriteVZeroIdiomALUX], (instrs
  // SSE encodings.
  PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
  PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr,
  PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr,
  // AVX encodings.
  VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
  VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr,
  VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr)>;

// VPERM2F128 has its own zero-idiom predicate; when it matches, the
// instruction is modelled with JWriteZeroIdiomYmm.
def JWriteVPERM2F128 : SchedWriteVariant<[
  SchedVar<MCSchedPredicate<ZeroIdiomVPERMPredicate>, [JWriteZeroIdiomYmm]>,
  SchedVar<NoSchedPred,                               [WriteFShuffle256]>
]>;
def : InstRW<[JWriteVPERM2F128], (instrs VPERM2F128rr)>;

// Write used for slow LEA instructions: 2 cycles of latency on JALU1 and
// JSAGU.
def JWrite3OpsLEA : SchedWriteRes<[JALU1, JSAGU]> {
  let Latency = 2;
}

// On Jaguar, a slow LEA is either a 3Ops LEA (base, index, offset), or an LEA
// with a `Scale` value different than 1.
def JSlowLEAPredicate : MCSchedPredicate<
  CheckAny<[
    // Case 1: a 3-operand LEA (base, index, offset).
    IsThreeOperandsLEAFn,

    // Case 2: operand 2 (the "Scale") is an immediate other than 1.
    CheckAll<[
      CheckIsImmOperand<2>,
      CheckNot<CheckImmOperand<2, 1>>
    ]>
  ]>
>;

// 32/64-bit LEA: slow forms use JWrite3OpsLEA, all others the generic
// WriteLEA.
def JWriteLEA : SchedWriteVariant<[
  SchedVar<JSlowLEAPredicate, [JWrite3OpsLEA]>,
  SchedVar<NoSchedPred,       [WriteLEA]>
]>;

def : InstRW<[JWriteLEA], (instrs LEA32r,
                                  LEA64r,
                                  LEA64_32r)>;

// 16-bit LEA is always modelled as slow: 3 cycles of latency with JALU01
// consumed for 4 cycles.
def JSlowLEA16r : SchedWriteRes<[JALU01]> {
  let ResourceCycles = [4];
  let Latency = 3;
}

def : InstRW<[JSlowLEA16r], (instrs LEA16r)>;

///////////////////////////////////////////////////////////////////////////////
// Dependency breaking instructions.
///////////////////////////////////////////////////////////////////////////////

// Opcodes treated as zero-idioms, each class paired with the predicate that
// must hold for the idiom to be recognized.
def : IsZeroIdiomFunction<[
  // GPR Zero-idioms.
  DepBreakingClass<[
    SUB32rr, SUB64rr,
    XOR32rr, XOR64rr
  ], ZeroIdiomPredicate>,

  // MMX Zero-idioms.
  DepBreakingClass<[
    MMX_PXORirr, MMX_PANDNirr,
    MMX_PSUBBirr, MMX_PSUBWirr, MMX_PSUBDirr, MMX_PSUBQirr,
    MMX_PSUBSBirr, MMX_PSUBSWirr, MMX_PSUBUSBirr, MMX_PSUBUSWirr,
    MMX_PCMPGTBirr, MMX_PCMPGTWirr, MMX_PCMPGTDirr
  ], ZeroIdiomPredicate>,

  // SSE Zero-idioms.
  DepBreakingClass<[
    // fp variants.
    XORPSrr, XORPDrr,
    ANDNPSrr, ANDNPDrr,

    // int variants.
    PXORrr, PANDNrr,
    PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
    PSUBSBrr, PSUBSWrr, PSUBUSBrr, PSUBUSWrr,
    PCMPGTBrr, PCMPGTWrr, PCMPGTDrr, PCMPGTQrr
  ], ZeroIdiomPredicate>,

  // AVX Zero-idioms.
  DepBreakingClass<[
    // xmm fp variants.
    VXORPSrr, VXORPDrr,
    VANDNPSrr, VANDNPDrr,

    // xmm int variants.
    VPXORrr, VPANDNrr,
    VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
    VPSUBSBrr, VPSUBSWrr, VPSUBUSBrr, VPSUBUSWrr,
    VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,

    // ymm variants.
    VXORPSYrr, VXORPDYrr,
    VANDNPSYrr, VANDNPDYrr
  ], ZeroIdiomPredicate>,

  // VPERM2F128 uses a dedicated predicate.
  DepBreakingClass<[ VPERM2F128rr ], ZeroIdiomVPERMPredicate>
]>;

// Opcodes registered as dependency-breaking through IsDepBreakingFunction,
// separately from the zero-idiom list above.
def : IsDepBreakingFunction<[
  // GPR
  DepBreakingClass<[ SBB32rr, SBB64rr ], ZeroIdiomPredicate>,
  DepBreakingClass<[ CMP32rr, CMP64rr ], CheckSameRegOperand<0, 1> >,

  // MMX
  DepBreakingClass<[
    MMX_PCMPEQBirr, MMX_PCMPEQWirr, MMX_PCMPEQDirr
  ], ZeroIdiomPredicate>,

  // SSE
  DepBreakingClass<[
    PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
  ], ZeroIdiomPredicate>,

  // AVX
  DepBreakingClass<[
    VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
  ], ZeroIdiomPredicate>
]>;

// Register-to-register moves registered as optimizable; TruePred places no
// additional condition on them.
def : IsOptimizableRegisterMove<[
  InstructionEquivalenceClass<[
    // GPR variants.
    MOV32rr, MOV64rr,

    // MMX variants.
    MMX_MOVQ64rr,

    // SSE variants.
    MOVAPSrr, MOVAPDrr,
    MOVUPSrr, MOVUPDrr,
    MOVDQArr, MOVDQUrr,

    // AVX variants.
    VMOVAPSrr, VMOVAPDrr,
    VMOVUPSrr, VMOVUPDrr,
    VMOVDQArr, VMOVDQUrr
  ], TruePred >
]>;

} // SchedModel