//=- ARMScheduleM7.td - ARM Cortex-M7 Scheduling Definitions -*- tablegen -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the SchedRead/Write data for the ARM Cortex-M7 processor.
//
//===----------------------------------------------------------------------===//

// Top-level machine model that every definition below is attached to.
def CortexM7Model : SchedMachineModel {
  // Most instructions can be dual-issued.
  let IssueWidth = 2;

  // The Cortex-M7 is an in-order core.
  let MicroOpBufferSize = 0;

  // Best case for a load followed by a use of the loaded value.
  let LoadLatency = 2;

  // The mispredict cost for forward branches is 6, but 4 works better.
  let MispredictPenalty = 4;

  let CompleteModel = 0;
}

let SchedModel = CortexM7Model in {

//===--------------------------------------------------------------------===//
// The Cortex-M7 has two ALU, two LOAD, a STORE, a MAC, a BRANCH and a VFP
// pipe. The stages relevant to scheduling are as follows:
//
//   EX1: address generation  shifts
//   EX2: fast load data      ALUs                 FP operation
//   EX3: slow load data      integer writeback    FP operation
//   EX4: store data                               FP writeback
//
// Both EX1 and EX2 contain shifters, and some instructions can be flexibly
// allocated between them. EX2 is used as the "zero" point for scheduling,
// so a simple ALU operation executing in EX2 has ReadAdvance<0> (the
// default) for its source operands and Latency = 1.

// Processor resources. M7UnitALU keeps the default BufferSize; every other
// resource and resource group below sets BufferSize = 0.
let BufferSize = 0 in {
  def M7UnitLoadL : ProcResource<1>;
  def M7UnitLoadH : ProcResource<1>;
  def M7UnitLoad  : ProcResGroup<[M7UnitLoadL, M7UnitLoadH]>;
  def M7UnitStore : ProcResource<1>;
}
def M7UnitALU : ProcResource<2>;
let BufferSize = 0 in {
  def M7UnitShift1 : ProcResource<1>;
  def M7UnitShift2 : ProcResource<1>;
  def M7UnitMAC    : ProcResource<1>;
  def M7UnitBranch : ProcResource<1>;
  def M7UnitVFP    : ProcResource<1>;
  def M7UnitVPortL : ProcResource<1>;
  def M7UnitVPortH : ProcResource<1>;
  def M7UnitVPort  : ProcResGroup<[M7UnitVPortL, M7UnitVPortH]>;
  def M7UnitSIMD   : ProcResource<1>;
}

//===---------------------------------------------------------------------===//
// Subtarget-specific SchedWrite types, which map ProcResources and set the
// latency.

let Latency = 1 in {
  // Plain ALU operation.
  def : WriteRes<WriteALU, [M7UnitALU]>;

  // Basic ALU with shifts: these also occupy the first shifter.
  def : WriteRes<WriteALUsi,  [M7UnitALU, M7UnitShift1]>;
  def : WriteRes<WriteALUsr,  [M7UnitALU, M7UnitShift1]>;
  def : WriteRes<WriteALUSsr, [M7UnitALU, M7UnitShift1]>;

  // Compare without a shifted operand.
  def : WriteRes<WriteCMP, [M7UnitALU]>;
}

let Latency = 2 in {
  // Compares with a shifted operand.
  def : WriteRes<WriteCMPsi, [M7UnitALU, M7UnitShift1]>;
  def : WriteRes<WriteCMPsr, [M7UnitALU, M7UnitShift1]>;

  // Multiplies. The high half of a 64-bit result consumes no resource and
  // no micro-op of its own.
  def : WriteRes<WriteMUL16,   [M7UnitMAC]>;
  def : WriteRes<WriteMUL32,   [M7UnitMAC]>;
  def : WriteRes<WriteMUL64Lo, [M7UnitMAC]>;
  def : WriteRes<WriteMUL64Hi, []> { let NumMicroOps = 0; }
}

// Multiply-accumulates.
let Latency = 2 in {
  def : WriteRes<WriteMAC16,   [M7UnitMAC]>;
  def : WriteRes<WriteMAC32,   [M7UnitMAC]>;
  def : WriteRes<WriteMAC64Lo, [M7UnitMAC]>;
  // The high half of a 64-bit accumulate consumes no resource and no
  // micro-op of its own.
  def : WriteRes<WriteMAC64Hi, []> { let NumMicroOps = 0; }
}

// Divisions.
// These cannot be dual-issued with any instructions.
let Latency = 7, SingleIssue = 1 in
def : WriteRes<WriteDIV, [M7UnitALU]>;

// Loads/Stores.
let Latency = 1 in
def : WriteRes<WriteLd, [M7UnitLoad]>;
let Latency = 2 in {
  def : WriteRes<WritePreLd, [M7UnitLoad]>;
  def : WriteRes<WriteST,    [M7UnitStore]>;
}

// Branches.
let Latency = 2 in {
  def : WriteRes<WriteBr,    [M7UnitBranch]>;
  def : WriteRes<WriteBrL,   [M7UnitBranch]>;
  def : WriteRes<WriteBrTbl, [M7UnitBranch]>;
}

// Noop.
let Latency = 0 in
def : WriteRes<WriteNoop, []>;

//===---------------------------------------------------------------------===//
// Sched definitions for floating-point instructions
//
// Floating point conversions and moves.
let Latency = 3 in {
  def : WriteRes<WriteFPCVT, [M7UnitVFP, M7UnitVPort]>;
  def : WriteRes<WriteFPMOV, [M7UnitVPort]>;
  // 64-bit move: takes both halves of the VFP port.
  def M7WriteFPMOV64 : SchedWriteRes<[M7UnitVPortL, M7UnitVPortH]>;
}

// The FP pipeline has a latency of 3 cycles.
// ALU operations (32/64-bit). These go down the FP pipeline.
let Latency = 3 in
def : WriteRes<WriteFPALU32, [M7UnitVFP, M7UnitVPort]>;
let Latency = 4, BeginGroup = 1 in
def : WriteRes<WriteFPALU64, [M7UnitVFP, M7UnitVPortL, M7UnitVPortH]>;

// Multiplication
let Latency = 3 in
def : WriteRes<WriteFPMUL32, [M7UnitVFP, M7UnitVPort]>;
let Latency = 7, BeginGroup = 1 in
def : WriteRes<WriteFPMUL64, [M7UnitVFP, M7UnitVPortL, M7UnitVPortH]>;

// Multiply-accumulate. FPMAC goes down the FP Pipeline.
let Latency = 6 in
def : WriteRes<WriteFPMAC32, [M7UnitVFP, M7UnitVPort]>;
let Latency = 11, BeginGroup = 1 in
def : WriteRes<WriteFPMAC64, [M7UnitVFP, M7UnitVPortL, M7UnitVPortH]>;

// Division. Effective scheduling latency is 3, though real latency is larger
let Latency = 16 in
def : WriteRes<WriteFPDIV32, [M7UnitVFP, M7UnitVPort]>;
let Latency = 30, BeginGroup = 1 in
def : WriteRes<WriteFPDIV64, [M7UnitVFP, M7UnitVPortL, M7UnitVPortH]>;

// Square-root. Effective scheduling latency is 3; real latency is larger
let Latency = 16 in
def : WriteRes<WriteFPSQRT32, [M7UnitVFP, M7UnitVPort]>;
let Latency = 30, BeginGroup = 1 in
def : WriteRes<WriteFPSQRT64, [M7UnitVFP, M7UnitVPortL, M7UnitVPortH]>;

// ALU operation that occupies the second shifter.
def M7WriteShift2 : SchedWriteRes<[M7UnitALU, M7UnitShift2]>;

// Not used for M7, but needing definitions anyway
def : WriteRes<WriteVLD1, []>;
def : WriteRes<WriteVLD2, []>;
def : WriteRes<WriteVLD3, []>;
def : WriteRes<WriteVLD4, []>;
def : WriteRes<WriteVST1, []>;
def : WriteRes<WriteVST2, []>;
def : WriteRes<WriteVST3, []>;
def : WriteRes<WriteVST4, []>;

// Resource-free, zero-micro-op writes that only carry an issue constraint.
let NumMicroOps = 0 in {
  def M7SingleIssue : SchedWriteRes<[]> { let SingleIssue = 1; }
  def M7Slot0Only   : SchedWriteRes<[]> { let BeginGroup = 1; }
}

// What pipeline stage operands need to be ready for depending on
// where they come from.
def : ReadAdvance<ReadALUsr, 0>;
def : ReadAdvance<ReadMUL,   0>;
def : ReadAdvance<ReadMAC,   1>;
def : ReadAdvance<ReadALU,   0>;
def : ReadAdvance<ReadFPMUL, 0>;
def : ReadAdvance<ReadFPMAC, 3>;

// operands needed at EX1
def M7Read_ISS : SchedReadAdvance<-1>;
// operands needed at EX3
def M7Read_EX2 : SchedReadAdvance<1>;
// operands needed at EX4
def M7Read_EX3 : SchedReadAdvance<2>;

// Non general purpose instructions may not be dual issued. These
// use both issue units.
def M7NonGeneralPurpose : SchedWriteRes<[]> {
  // Modelled as going down the main ALU pipeline, although in reality many
  // of these look likely to stall the whole pipeline.
  let Latency = 3;
  let SingleIssue = 1;
}

// List the non general purpose instructions.
def : InstRW<[M7NonGeneralPurpose],
      (instregex "t2MRS", "tSVC", "tBKPT",
                 "t2MSR", "t2DMB", "t2DSB", "t2ISB",
                 "t2HVC", "t2SMC", "t2UDF", "ERET",
                 "tHINT", "t2HINT", "t2CLREX", "BUNDLE")>;

//===---------------------------------------------------------------------===//
// Sched definitions for load/store
//
// Mark whether the loads/stores must be single-issue
// Address operands are needed earlier
// Data operands are needed later

// Resource-free, zero-micro-op helper writes.
let NumMicroOps = 0 in {
  // Base-register update; it is bypassable out of EX1.
  def M7BaseUpdate   : SchedWriteRes<[]> { let Latency = 0; }
  def M7LoadLatency1 : SchedWriteRes<[]> { let Latency = 1; }
}

// Load with one extra cycle of latency compared with WriteLd.
let Latency = 2 in
def M7SlowLoad : SchedWriteRes<[M7UnitLoad]>;

// Byte and half-word loads should have greater latency than other loads.
// So should load exclusive.

def : InstRW<[M7SlowLoad],
      (instregex "t2LDR(B|H|SB|SH)pc")>;
def : InstRW<[M7SlowLoad, M7Read_ISS],
      (instregex "t2LDR(B|H|SB|SH)T",
                 "t2LDR(B|H|SB|SH)i",
                 "tLDR(B|H)i")>;
def : InstRW<[M7SlowLoad, M7Read_ISS, M7Read_ISS],
      (instregex "t2LDR(B|H|SB|SH)s",
                 "tLDR(B|H)r",
                 "tLDR(SB|SH)")>;
def : InstRW<[M7SlowLoad, M7BaseUpdate, M7Read_ISS],
      (instregex "t2LDR(B|H|SB|SH)_(POST|PRE)")>;

// Exclusive loads/stores cannot be dual-issued
def : InstRW<[WriteLd, M7Slot0Only, M7Read_ISS],
      (instregex "t2LDREX$")>;
def : InstRW<[M7SlowLoad, M7Slot0Only, M7Read_ISS],
      (instregex "t2LDREX(B|H)")>;
def : InstRW<[WriteST, M7SingleIssue, M7Read_EX2, M7Read_ISS],
      (instregex "t2STREX(B|H)?$")>;

// Load/store multiples cannot be dual-issued.
// Note that default scheduling for load/store multiples occurs around the
// read/write times of individual registers in the list; read time for STM
// cannot be overridden because it is a variadic source operand.

def : InstRW<[WriteLd, M7SingleIssue, M7Read_ISS],
      (instregex "(t|t2)LDM(DB|IA)$")>;
def : InstRW<[WriteST, M7SingleIssue, M7Read_ISS],
      (instregex "(t|t2)STM(DB|IA)$")>;
def : InstRW<[M7BaseUpdate, WriteLd, M7SingleIssue, M7Read_ISS],
      (instregex "(t|t2)LDM(DB|IA)_UPD$", "tPOP")>;
def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue, M7Read_ISS],
      (instregex "(t|t2)STM(DB|IA)_UPD$", "tPUSH")>;

// Load/store doubles cannot be dual-issued.

def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue,
              M7Read_EX2, M7Read_EX2, M7Read_ISS],
      (instregex "t2STRD_(PRE|POST)")>;
def : InstRW<[WriteST, M7SingleIssue, M7Read_EX2, M7Read_EX2, M7Read_ISS],
      (instregex "t2STRDi")>;
def : InstRW<[WriteLd, M7LoadLatency1, M7SingleIssue, M7BaseUpdate, M7Read_ISS],
      (instregex "t2LDRD_(PRE|POST)")>;
def : InstRW<[WriteLd, M7LoadLatency1, M7SingleIssue, M7Read_ISS],
      (instregex "t2LDRDi")>;

// Word load / preload
def : InstRW<[WriteLd],
      (instregex "t2LDRpc", "t2PL[DI]pci", "tLDRpci")>;
def : InstRW<[WriteLd, M7Read_ISS],
      (instregex "t2LDR(i|T)", "t2PL[DI](W)?i", "tLDRi", "tLDRspi")>;
// The optional infix must be an upper-case 'W' (instregex matching is
// case-sensitive) so that the register-offset PLDW form is covered in the
// same way as the immediate forms in the entry above.
def : InstRW<[WriteLd, M7Read_ISS, M7Read_ISS],
      (instregex "t2LDRs", "t2PL[DI](W)?s", "tLDRr")>;
def : InstRW<[WriteLd, M7BaseUpdate, M7Read_ISS],
      (instregex "t2LDR_(POST|PRE)")>;

// Stores: the first read operand uses M7Read_EX2 and the remaining (address)
// operands use M7Read_ISS.
def : InstRW<[M7BaseUpdate, WriteST, M7Read_EX2, M7Read_ISS],
      (instregex "t2STR(B|H)?_(POST|PRE)")>;
def : InstRW<[WriteST, M7Read_EX2, M7Read_ISS, M7Read_ISS],
      (instregex "t2STR(B|H)?s$", "tSTR(B|H)?r$")>;
def : InstRW<[WriteST, M7Read_EX2, M7Read_ISS],
      (instregex "t2STR(B|H)?(i|T)", "tSTR(B|H)?i$", "tSTRspi")>;

// TBB/TBH - single-issue only; takes two cycles to issue

def M7TableLoad : SchedWriteRes<[M7UnitLoad]> {
  let NumMicroOps = 2;
  let SingleIssue = 1;
}

def : InstRW<[M7TableLoad, M7Read_ISS, M7Read_ISS], (instregex "t2TB")>;

// VFP loads and stores

// Single-precision accesses take one load/store unit and one VFP port half;
// double-precision accesses take both halves and are single-issue.
def M7LoadSP : SchedWriteRes<[M7UnitLoad, M7UnitVPort]> { let Latency = 1; }
def M7LoadDP : SchedWriteRes<[M7UnitLoadL, M7UnitLoadH,
                              M7UnitVPortL, M7UnitVPortH]> {
  let Latency = 2;
  let SingleIssue = 1;
}
def M7StoreSP : SchedWriteRes<[M7UnitStore, M7UnitVPort]>;
def M7StoreDP : SchedWriteRes<[M7UnitStore, M7UnitVPortL, M7UnitVPortH]> {
  let SingleIssue = 1;
}

def : InstRW<[M7LoadSP, M7Read_ISS],              (instregex "VLDR(S|H)$")>;
def : InstRW<[M7LoadDP, M7Read_ISS],              (instregex "VLDRD$")>;
def : InstRW<[M7StoreSP, M7Read_EX3, M7Read_ISS], (instregex "VSTR(S|H)$")>;
def : InstRW<[M7StoreDP, M7Read_EX3, M7Read_ISS], (instregex "VSTRD$")>;

// Load/store multiples cannot be dual-issued.

def : InstRW<[WriteLd, M7SingleIssue, M7Read_ISS],
      (instregex "VLDM(S|D|Q)(DB|IA)$")>;
def : InstRW<[WriteST, M7SingleIssue, M7Read_ISS],
      (instregex "VSTM(S|D|Q)(DB|IA)$")>;
def : InstRW<[M7BaseUpdate, WriteLd, M7SingleIssue, M7Read_ISS],
      (instregex "VLDM(S|D|Q)(DB|IA)_UPD$")>;
def : InstRW<[M7BaseUpdate, WriteST, M7SingleIssue, M7Read_ISS],
      (instregex "VSTM(S|D|Q)(DB|IA)_UPD$")>;

//===---------------------------------------------------------------------===//
// Sched definitions for ALU
//

// Shifted ALU operands are read a cycle early.
// Read advance of -1 that applies only to values produced by WriteLd or
// M7LoadLatency1.
def M7Ex1ReadNoFastBypass
    : SchedReadAdvance<-1, [WriteLd, M7LoadLatency1]>;

def : InstRW<[WriteALUsi, M7Ex1ReadNoFastBypass, M7Read_ISS],
      (instregex
         "t2(ADC|ADDS|ADD|BIC|EOR|ORN|ORR|RSBS|RSB|SBC|SUBS)rs$",
         "t2(SUB|CMP|CMNz|TEQ|TST)rs$",
         "t2MOVsr(a|l)")>;
def : InstRW<[WriteALUsi, M7Read_ISS], (instregex "t2MVNs")>;

// Pure shift operations (except for RRX) are treated as if they used the EX1
// shifter, but are given the timing of the EX2 shifter, since they can
// usually choose the EX2 shifter when needed. This misses a few dual-issue
// cases, but the results prove to be better than trying to get them exact.

def : InstRW<[M7WriteShift2, M7Read_ISS],
      (instregex "t2RRX$")>;
def : InstRW<[WriteALUsi],
      (instregex "(t|t2)(LSL|LSR|ASR|ROR)")>;

// Instructions that use the shifter, but have normal timing.

def : InstRW<[WriteALUsi, M7Slot0Only],
      (instregex "t2(BFC|BFI)$")>;

// Instructions which are slot zero only but otherwise normal.

def : InstRW<[WriteALU, M7Slot0Only],
      (instregex "t2CLZ")>;

// MAC operations that don't have SchedRW set.

def : InstRW<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC],
      (instregex "t2SML[AS]D")>;

// Divides are special: they stall for their latency, and so look like a
// single-cycle operation as far as scheduling opportunities go. By putting
// WriteALU first, we make the operand latency 1, but keep the instruction
// latency 7.

def : InstRW<[WriteALU, WriteDIV],
      (instregex "t2(S|U)DIV")>;

// DSP extension operations

// Every DSP write type below sets BeginGroup; they differ only in latency
// and in whether the first shifter is also occupied.
let BeginGroup = 1 in {
  let Latency = 1 in
  def M7WriteSIMD1 : SchedWriteRes<[M7UnitSIMD, M7UnitALU]>;

  let Latency = 2 in
  def M7WriteSIMD2 : SchedWriteRes<[M7UnitSIMD, M7UnitALU]>;

  let Latency = 1 in
  def M7WriteShSIMD1 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]>;

  // Bypassable out of EX1
  let Latency = 0 in
  def M7WriteShSIMD0 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]>;

  let Latency = 2 in
  def M7WriteShSIMD2 : SchedWriteRes<[M7UnitSIMD, M7UnitALU, M7UnitShift1]>;
}

def : InstRW<[M7WriteShSIMD2, M7Read_ISS],
      (instregex "t2(S|U)SAT")>;
def : InstRW<[M7WriteSIMD1, ReadALU],
      (instregex "(t|t2)(S|U)XT(B|H)")>;
def : InstRW<[M7WriteSIMD1, ReadALU, ReadALU],
      (instregex "t2(S|SH|U|UH)(ADD16|ADD8|ASX|SAX|SUB16|SUB8)", "t2SEL")>;
def : InstRW<[M7WriteSIMD2, ReadALU, ReadALU],
      (instregex "t2(Q|UQ)(ADD|ASX|SAX|SUB)", "t2USAD8")>;
def : InstRW<[M7WriteShSIMD2, M7Read_ISS, M7Read_ISS],
      (instregex "t2QD(ADD|SUB)")>;
def : InstRW<[M7WriteShSIMD0, M7Read_ISS],
      (instregex "t2(RBIT|REV)", "tREV")>;
def : InstRW<[M7WriteShSIMD1, M7Read_ISS],
      (instregex "t2(SBFX|UBFX)")>;
def : InstRW<[M7WriteShSIMD1, ReadALU, M7Read_ISS],
      (instregex "t2PKH(BT|TB)", "t2(S|U)XTA")>;
def : InstRW<[M7WriteSIMD2, ReadALU, ReadALU, M7Read_EX2],
      (instregex "t2USADA8")>;

// MSR/MRS
def : InstRW<[M7NonGeneralPurpose],
      (instregex "MSR", "MRS")>;

//===---------------------------------------------------------------------===//
// Sched definitions for FP operations
//

// Effective scheduling latency is really 3 for nearly all FP operations,
// even if their true latency is higher.
let Latency = 3, NumMicroOps = 0 in {
  // Resource-free write that only supplies the 3-cycle latency.
  def M7WriteVFPLatOverride : SchedWriteRes<[]>;
  // Same latency, but additionally occupies one VFP port half.
  def M7WriteVFPExtraVPort : SchedWriteRes<[M7UnitVPort]>;
}

// Instructions which are missing default schedules.
def : InstRW<[WriteFPALU32],
      (instregex
         "V(ABS|CVT.*|NEG|FP_VMAX.*|FP_VMIN.*|RINT.*)S$")>;
def : InstRW<[M7WriteVFPLatOverride, WriteFPALU64],
      (instregex
         "V(ABS|CVT.*|NEG|FP_VMAX.*|FP_VMIN.*|RINT.*)D$")>;

// VCMP
let Latency = 0 in {
  def M7WriteVCMPS : SchedWriteRes<[M7UnitVFP, M7UnitVPort]>;
  def M7WriteVCMPD : SchedWriteRes<[M7UnitVFP, M7UnitVPort, M7UnitVPort]> {
    let BeginGroup = 1;
  }
}
def : InstRW<[M7WriteVCMPS], (instregex "VCMPS$")>;
def : InstRW<[M7WriteVCMPD], (instregex "VCMPD$")>;

// VMRS/VMSR
let SingleIssue = 1 in {
  def M7VMRS : SchedWriteRes<[M7UnitVFP, M7UnitVPort]>;
  def M7VMSR : SchedWriteRes<[M7UnitVFP, M7UnitVPort]>;
}
def : InstRW<[M7VMRS], (instregex "FMSTAT")>;
def : InstRW<[M7VMSR], (instregex "VMSR")>;

// VSEL cannot bypass in its implied $cpsr operand; model as earlier read
def : InstRW<[WriteFPALU32, M7Slot0Only,
              ReadALU, ReadALU, M7Read_ISS],
      (instregex "VSEL.*S$")>;
def : InstRW<[M7WriteVFPLatOverride, WriteFPALU64, M7Slot0Only,
              ReadALU, ReadALU, M7Read_ISS],
      (instregex "VSEL.*D$")>;

// VMOV
def : InstRW<[WriteFPMOV],
      (instregex "VMOV(H|S)$", "FCONST(H|S)")>;
// The double-precision forms take an extra VFP port half and set BeginGroup.
def : InstRW<[WriteFPMOV, M7WriteVFPExtraVPort, M7Slot0Only],
      (instregex "VMOVD$")>;
def : InstRW<[WriteFPMOV, M7WriteVFPExtraVPort, M7Slot0Only],
      (instregex "FCONSTD")>;
// The two-register transfer forms are single-issue.
def : InstRW<[WriteFPMOV, M7WriteVFPExtraVPort, M7SingleIssue],
      (instregex "VMOV(DRR|RRD|RRS|SRR)")>;

// Larger-latency overrides.

// Each entry lists M7WriteVFPLatOverride first, ahead of the generic write
// type for the operation.
def : InstRW<[M7WriteVFPLatOverride, WriteFPDIV32],
      (instregex "VDIVS")>;
def : InstRW<[M7WriteVFPLatOverride, WriteFPDIV64],
      (instregex "VDIVD")>;
def : InstRW<[M7WriteVFPLatOverride, WriteFPSQRT32],
      (instregex "VSQRTS")>;
def : InstRW<[M7WriteVFPLatOverride, WriteFPSQRT64],
      (instregex "VSQRTD")>;
def : InstRW<[M7WriteVFPLatOverride, WriteFPMUL64],
      (instregex "V(MUL|NMUL)D")>;
def : InstRW<[M7WriteVFPLatOverride, WriteFPALU64],
      (instregex "V(ADD|SUB)D")>;

// Multiply-accumulate. Chained SP timing is correct; rest need overrides
// Double-precision chained MAC stalls the pipeline behind it for 3 cycles,
// making it appear to have 3 cycle latency for scheduling.

def : InstRW<[M7WriteVFPLatOverride, WriteFPMAC64,
              ReadFPMAC, ReadFPMUL, ReadFPMUL],
      (instregex "V(N)?ML(A|S)D$")>;

// Single-precision fused MACs look like latency 5 with advance of 2.

let Latency = 5, NumMicroOps = 0 in
def M7WriteVFPLatOverride5 : SchedWriteRes<[]>;

def M7ReadFPMAC2 : SchedReadAdvance<2>;

def : InstRW<[M7WriteVFPLatOverride5, WriteFPMAC32,
              M7ReadFPMAC2, ReadFPMUL, ReadFPMUL],
      (instregex "VF(N)?M(A|S)S$")>;

// Double-precision fused MAC stalls the pipeline behind it for 2 cycles,
// making it appear to have 3 cycle latency for scheduling.

def : InstRW<[M7WriteVFPLatOverride, WriteFPMAC64,
              ReadFPMAC, ReadFPMUL, ReadFPMUL],
      (instregex "VF(N)?M(A|S)D$")>;

} // SchedModel = CortexM7Model