1 //===- Construction of pass pipelines -------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// 10 /// This file provides the implementation of the PassBuilder based on our 11 /// static pass registry as well as related functionality. It also provides 12 /// helpers to aid in analyzing, debugging, and testing passes and pass 13 /// pipelines. 14 /// 15 //===----------------------------------------------------------------------===// 16 17 #include "llvm/ADT/Statistic.h" 18 #include "llvm/Analysis/AliasAnalysis.h" 19 #include "llvm/Analysis/BasicAliasAnalysis.h" 20 #include "llvm/Analysis/CGSCCPassManager.h" 21 #include "llvm/Analysis/GlobalsModRef.h" 22 #include "llvm/Analysis/InlineAdvisor.h" 23 #include "llvm/Analysis/ProfileSummaryInfo.h" 24 #include "llvm/Analysis/ScopedNoAliasAA.h" 25 #include "llvm/Analysis/TypeBasedAliasAnalysis.h" 26 #include "llvm/IR/PassManager.h" 27 #include "llvm/Passes/OptimizationLevel.h" 28 #include "llvm/Passes/PassBuilder.h" 29 #include "llvm/Support/CommandLine.h" 30 #include "llvm/Support/ErrorHandling.h" 31 #include "llvm/Support/PGOOptions.h" 32 #include "llvm/Support/VirtualFileSystem.h" 33 #include "llvm/Target/TargetMachine.h" 34 #include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h" 35 #include "llvm/Transforms/Coroutines/CoroCleanup.h" 36 #include "llvm/Transforms/Coroutines/CoroConditionalWrapper.h" 37 #include "llvm/Transforms/Coroutines/CoroEarly.h" 38 #include "llvm/Transforms/Coroutines/CoroElide.h" 39 #include "llvm/Transforms/Coroutines/CoroSplit.h" 40 #include "llvm/Transforms/HipStdPar/HipStdPar.h" 41 #include "llvm/Transforms/IPO/AlwaysInliner.h" 42 #include "llvm/Transforms/IPO/Annotation2Metadata.h" 43 #include "llvm/Transforms/IPO/ArgumentPromotion.h" 44 #include "llvm/Transforms/IPO/Attributor.h" 45 #include "llvm/Transforms/IPO/CalledValuePropagation.h" 46 #include "llvm/Transforms/IPO/ConstantMerge.h" 47 #include "llvm/Transforms/IPO/CrossDSOCFI.h" 48 #include "llvm/Transforms/IPO/DeadArgumentElimination.h" 49 #include "llvm/Transforms/IPO/ElimAvailExtern.h" 50 #include "llvm/Transforms/IPO/EmbedBitcodePass.h" 51 #include "llvm/Transforms/IPO/ForceFunctionAttrs.h" 52 #include "llvm/Transforms/IPO/FunctionAttrs.h" 53 #include "llvm/Transforms/IPO/GlobalDCE.h" 54 #include "llvm/Transforms/IPO/GlobalOpt.h" 55 #include "llvm/Transforms/IPO/GlobalSplit.h" 56 #include "llvm/Transforms/IPO/HotColdSplitting.h" 57 #include "llvm/Transforms/IPO/IROutliner.h" 58 #include "llvm/Transforms/IPO/InferFunctionAttrs.h" 59 #include "llvm/Transforms/IPO/Inliner.h" 60 #include "llvm/Transforms/IPO/LowerTypeTests.h" 61 #include "llvm/Transforms/IPO/MemProfContextDisambiguation.h" 62 #include "llvm/Transforms/IPO/MergeFunctions.h" 63 #include "llvm/Transforms/IPO/ModuleInliner.h" 64 #include "llvm/Transforms/IPO/OpenMPOpt.h" 65 #include "llvm/Transforms/IPO/PartialInlining.h" 66 #include "llvm/Transforms/IPO/SCCP.h" 67 #include "llvm/Transforms/IPO/SampleProfile.h" 68 #include "llvm/Transforms/IPO/SampleProfileProbe.h" 69 #include "llvm/Transforms/IPO/SyntheticCountsPropagation.h" 70 #include "llvm/Transforms/IPO/WholeProgramDevirt.h" 71 #include "llvm/Transforms/InstCombine/InstCombine.h" 72 #include "llvm/Transforms/Instrumentation/CGProfile.h" 73 #include "llvm/Transforms/Instrumentation/ControlHeightReduction.h" 74 #include "llvm/Transforms/Instrumentation/InstrOrderFile.h" 75 #include "llvm/Transforms/Instrumentation/InstrProfiling.h" 76 #include "llvm/Transforms/Instrumentation/MemProfiler.h" 77 #include "llvm/Transforms/Instrumentation/PGOCtxProfLowering.h" 78 #include "llvm/Transforms/Instrumentation/PGOForceFunctionAttrs.h" 79 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" 80 #include "llvm/Transforms/Scalar/ADCE.h" 81 #include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h" 82 #include "llvm/Transforms/Scalar/AnnotationRemarks.h" 83 #include "llvm/Transforms/Scalar/BDCE.h" 84 #include "llvm/Transforms/Scalar/CallSiteSplitting.h" 85 #include "llvm/Transforms/Scalar/ConstraintElimination.h" 86 #include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h" 87 #include "llvm/Transforms/Scalar/DFAJumpThreading.h" 88 #include "llvm/Transforms/Scalar/DeadStoreElimination.h" 89 #include "llvm/Transforms/Scalar/DivRemPairs.h" 90 #include "llvm/Transforms/Scalar/EarlyCSE.h" 91 #include "llvm/Transforms/Scalar/Float2Int.h" 92 #include "llvm/Transforms/Scalar/GVN.h" 93 #include "llvm/Transforms/Scalar/IndVarSimplify.h" 94 #include "llvm/Transforms/Scalar/InferAlignment.h" 95 #include "llvm/Transforms/Scalar/InstSimplifyPass.h" 96 #include "llvm/Transforms/Scalar/JumpTableToSwitch.h" 97 #include "llvm/Transforms/Scalar/JumpThreading.h" 98 #include "llvm/Transforms/Scalar/LICM.h" 99 #include "llvm/Transforms/Scalar/LoopDeletion.h" 100 #include "llvm/Transforms/Scalar/LoopDistribute.h" 101 #include "llvm/Transforms/Scalar/LoopFlatten.h" 102 #include "llvm/Transforms/Scalar/LoopIdiomRecognize.h" 103 #include "llvm/Transforms/Scalar/LoopInstSimplify.h" 104 #include "llvm/Transforms/Scalar/LoopInterchange.h" 105 #include "llvm/Transforms/Scalar/LoopLoadElimination.h" 106 #include "llvm/Transforms/Scalar/LoopPassManager.h" 107 #include "llvm/Transforms/Scalar/LoopRotation.h" 108 #include "llvm/Transforms/Scalar/LoopSimplifyCFG.h" 109 #include "llvm/Transforms/Scalar/LoopSink.h" 110 #include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h" 111 #include "llvm/Transforms/Scalar/LoopUnrollPass.h" 112 #include "llvm/Transforms/Scalar/LoopVersioningLICM.h" 113 #include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h" 114 #include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h" 115 #include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h" 116 #include "llvm/Transforms/Scalar/MemCpyOptimizer.h" 117 #include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h" 118 #include "llvm/Transforms/Scalar/NewGVN.h" 119 #include "llvm/Transforms/Scalar/Reassociate.h" 120 #include "llvm/Transforms/Scalar/SCCP.h" 121 #include "llvm/Transforms/Scalar/SROA.h" 122 #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h" 123 #include "llvm/Transforms/Scalar/SimplifyCFG.h" 124 #include "llvm/Transforms/Scalar/SpeculativeExecution.h" 125 #include "llvm/Transforms/Scalar/TailRecursionElimination.h" 126 #include "llvm/Transforms/Scalar/WarnMissedTransforms.h" 127 #include "llvm/Transforms/Utils/AddDiscriminators.h" 128 #include "llvm/Transforms/Utils/AssumeBundleBuilder.h" 129 #include "llvm/Transforms/Utils/CanonicalizeAliases.h" 130 #include "llvm/Transforms/Utils/CountVisits.h" 131 #include "llvm/Transforms/Utils/EntryExitInstrumenter.h" 132 #include "llvm/Transforms/Utils/InjectTLIMappings.h" 133 #include "llvm/Transforms/Utils/LibCallsShrinkWrap.h" 134 #include "llvm/Transforms/Utils/Mem2Reg.h" 135 #include "llvm/Transforms/Utils/MoveAutoInit.h" 136 #include "llvm/Transforms/Utils/NameAnonGlobals.h" 137 #include "llvm/Transforms/Utils/RelLookupTableConverter.h" 138 #include "llvm/Transforms/Utils/SimplifyCFGOptions.h" 139 #include "llvm/Transforms/Vectorize/LoopVectorize.h" 140 #include "llvm/Transforms/Vectorize/SLPVectorizer.h" 141 #include "llvm/Transforms/Vectorize/VectorCombine.h" 142 143 using namespace llvm; 144 145 static cl::opt<InliningAdvisorMode> UseInlineAdvisor( 146 "enable-ml-inliner", cl::init(InliningAdvisorMode::Default), cl::Hidden, 147 cl::desc("Enable ML policy for inliner. Currently trained for -Oz only"), 148 cl::values(clEnumValN(InliningAdvisorMode::Default, "default", 149 "Heuristics-based inliner version"), 150 clEnumValN(InliningAdvisorMode::Development, "development", 151 "Use development mode (runtime-loadable model)"), 152 clEnumValN(InliningAdvisorMode::Release, "release", 153 "Use release mode (AOT-compiled model)"))); 154 155 static cl::opt<bool> EnableSyntheticCounts( 156 "enable-npm-synthetic-counts", cl::Hidden, 157 cl::desc("Run synthetic function entry count generation " 158 "pass")); 159 160 /// Flag to enable inline deferral during PGO. 161 static cl::opt<bool> 162 EnablePGOInlineDeferral("enable-npm-pgo-inline-deferral", cl::init(true), 163 cl::Hidden, 164 cl::desc("Enable inline deferral during PGO")); 165 166 static cl::opt<bool> EnableModuleInliner("enable-module-inliner", 167 cl::init(false), cl::Hidden, 168 cl::desc("Enable module inliner")); 169 170 static cl::opt<bool> PerformMandatoryInliningsFirst( 171 "mandatory-inlining-first", cl::init(false), cl::Hidden, 172 cl::desc("Perform mandatory inlinings module-wide, before performing " 173 "inlining")); 174 175 static cl::opt<bool> EnableEagerlyInvalidateAnalyses( 176 "eagerly-invalidate-analyses", cl::init(true), cl::Hidden, 177 cl::desc("Eagerly invalidate more analyses in default pipelines")); 178 179 static cl::opt<bool> EnableMergeFunctions( 180 "enable-merge-functions", cl::init(false), cl::Hidden, 181 cl::desc("Enable function merging as part of the optimization pipeline")); 182 183 static cl::opt<bool> EnablePostPGOLoopRotation( 184 "enable-post-pgo-loop-rotation", cl::init(true), cl::Hidden, 185 cl::desc("Run the loop rotation transformation after PGO instrumentation")); 186 187 static cl::opt<bool> EnableGlobalAnalyses( 188 "enable-global-analyses", cl::init(true), cl::Hidden, 189 cl::desc("Enable inter-procedural analyses")); 190 191 static cl::opt<bool> 192 RunPartialInlining("enable-partial-inlining", cl::init(false), cl::Hidden, 193 cl::desc("Run Partial inlinining pass")); 194 195 static cl::opt<bool> ExtraVectorizerPasses( 196 "extra-vectorizer-passes", cl::init(false), cl::Hidden, 197 cl::desc("Run cleanup optimization passes after vectorization")); 198 199 static cl::opt<bool> RunNewGVN("enable-newgvn", cl::init(false), cl::Hidden, 200 cl::desc("Run the NewGVN pass")); 201 202 static cl::opt<bool> EnableLoopInterchange( 203 "enable-loopinterchange", cl::init(false), cl::Hidden, 204 cl::desc("Enable the experimental LoopInterchange Pass")); 205 206 static cl::opt<bool> EnableUnrollAndJam("enable-unroll-and-jam", 207 cl::init(false), cl::Hidden, 208 cl::desc("Enable Unroll And Jam Pass")); 209 210 static cl::opt<bool> EnableLoopFlatten("enable-loop-flatten", cl::init(false), 211 cl::Hidden, 212 cl::desc("Enable the LoopFlatten Pass")); 213 214 // Experimentally allow loop header duplication. This should allow for better 215 // optimization at Oz, since loop-idiom recognition can then recognize things 216 // like memcpy. If this ends up being useful for many targets, we should drop 217 // this flag and make a code generation option that can be controlled 218 // independent of the opt level and exposed through the frontend. 219 static cl::opt<bool> EnableLoopHeaderDuplication( 220 "enable-loop-header-duplication", cl::init(false), cl::Hidden, 221 cl::desc("Enable loop header duplication at any optimization level")); 222 223 static cl::opt<bool> 224 EnableDFAJumpThreading("enable-dfa-jump-thread", 225 cl::desc("Enable DFA jump threading"), 226 cl::init(false), cl::Hidden); 227 228 // TODO: turn on and remove flag 229 static cl::opt<bool> EnablePGOForceFunctionAttrs( 230 "enable-pgo-force-function-attrs", 231 cl::desc("Enable pass to set function attributes based on PGO profiles"), 232 cl::init(false)); 233 234 static cl::opt<bool> 235 EnableHotColdSplit("hot-cold-split", 236 cl::desc("Enable hot-cold splitting pass")); 237 238 static cl::opt<bool> EnableIROutliner("ir-outliner", cl::init(false), 239 cl::Hidden, 240 cl::desc("Enable ir outliner pass")); 241 242 static cl::opt<bool> 243 DisablePreInliner("disable-preinline", cl::init(false), cl::Hidden, 244 cl::desc("Disable pre-instrumentation inliner")); 245 246 static cl::opt<int> PreInlineThreshold( 247 "preinline-threshold", cl::Hidden, cl::init(75), 248 cl::desc("Control the amount of inlining in pre-instrumentation inliner " 249 "(default = 75)")); 250 251 static cl::opt<bool> 252 EnableGVNHoist("enable-gvn-hoist", 253 cl::desc("Enable the GVN hoisting pass (default = off)")); 254 255 static cl::opt<bool> 256 EnableGVNSink("enable-gvn-sink", 257 cl::desc("Enable the GVN sinking pass (default = off)")); 258 259 static cl::opt<bool> EnableJumpTableToSwitch( 260 "enable-jump-table-to-switch", 261 cl::desc("Enable JumpTableToSwitch pass (default = off)")); 262 263 // This option is used in simplifying testing SampleFDO optimizations for 264 // profile loading. 265 static cl::opt<bool> 266 EnableCHR("enable-chr", cl::init(true), cl::Hidden, 267 cl::desc("Enable control height reduction optimization (CHR)")); 268 269 static cl::opt<bool> FlattenedProfileUsed( 270 "flattened-profile-used", cl::init(false), cl::Hidden, 271 cl::desc("Indicate the sample profile being used is flattened, i.e., " 272 "no inline hierachy exists in the profile")); 273 274 static cl::opt<bool> EnableOrderFileInstrumentation( 275 "enable-order-file-instrumentation", cl::init(false), cl::Hidden, 276 cl::desc("Enable order file instrumentation (default = off)")); 277 278 static cl::opt<bool> 279 EnableMatrix("enable-matrix", cl::init(false), cl::Hidden, 280 cl::desc("Enable lowering of the matrix intrinsics")); 281 282 static cl::opt<bool> EnableConstraintElimination( 283 "enable-constraint-elimination", cl::init(true), cl::Hidden, 284 cl::desc( 285 "Enable pass to eliminate conditions based on linear constraints")); 286 287 static cl::opt<AttributorRunOption> AttributorRun( 288 "attributor-enable", cl::Hidden, cl::init(AttributorRunOption::NONE), 289 cl::desc("Enable the attributor inter-procedural deduction pass"), 290 cl::values(clEnumValN(AttributorRunOption::ALL, "all", 291 "enable all attributor runs"), 292 clEnumValN(AttributorRunOption::MODULE, "module", 293 "enable module-wide attributor runs"), 294 clEnumValN(AttributorRunOption::CGSCC, "cgscc", 295 "enable call graph SCC attributor runs"), 296 clEnumValN(AttributorRunOption::NONE, "none", 297 "disable attributor runs"))); 298 299 static cl::opt<bool> EnableSampledInstr( 300 "enable-sampled-instrumentation", cl::init(false), cl::Hidden, 301 cl::desc("Enable profile instrumentation sampling (default = off)")); 302 static cl::opt<bool> UseLoopVersioningLICM( 303 "enable-loop-versioning-licm", cl::init(false), cl::Hidden, 304 cl::desc("Enable the experimental Loop Versioning LICM pass")); 305 306 namespace llvm { 307 extern cl::opt<bool> EnableMemProfContextDisambiguation; 308 309 extern cl::opt<bool> EnableInferAlignmentPass; 310 } // namespace llvm 311 312 PipelineTuningOptions::PipelineTuningOptions() { 313 LoopInterleaving = true; 314 LoopVectorization = true; 315 SLPVectorization = false; 316 LoopUnrolling = true; 317 ForgetAllSCEVInLoopUnroll = ForgetSCEVInLoopUnroll; 318 LicmMssaOptCap = SetLicmMssaOptCap; 319 LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap; 320 CallGraphProfile = true; 321 UnifiedLTO = false; 322 MergeFunctions = EnableMergeFunctions; 323 InlinerThreshold = -1; 324 EagerlyInvalidateAnalyses = EnableEagerlyInvalidateAnalyses; 325 } 326 327 namespace llvm { 328 extern cl::opt<unsigned> MaxDevirtIterations; 329 } // namespace llvm 330 331 void PassBuilder::invokePeepholeEPCallbacks(FunctionPassManager &FPM, 332 OptimizationLevel Level) { 333 for (auto &C : PeepholeEPCallbacks) 334 C(FPM, Level); 335 } 336 void PassBuilder::invokeLateLoopOptimizationsEPCallbacks( 337 LoopPassManager &LPM, OptimizationLevel Level) { 338 for (auto &C : LateLoopOptimizationsEPCallbacks) 339 C(LPM, Level); 340 } 341 void PassBuilder::invokeLoopOptimizerEndEPCallbacks(LoopPassManager &LPM, 342 OptimizationLevel Level) { 343 for (auto &C : LoopOptimizerEndEPCallbacks) 344 C(LPM, Level); 345 } 346 void PassBuilder::invokeScalarOptimizerLateEPCallbacks( 347 FunctionPassManager &FPM, OptimizationLevel Level) { 348 for (auto &C : ScalarOptimizerLateEPCallbacks) 349 C(FPM, Level); 350 } 351 void PassBuilder::invokeCGSCCOptimizerLateEPCallbacks(CGSCCPassManager &CGPM, 352 OptimizationLevel Level) { 353 for (auto &C : CGSCCOptimizerLateEPCallbacks) 354 C(CGPM, Level); 355 } 356 void PassBuilder::invokeVectorizerStartEPCallbacks(FunctionPassManager &FPM, 357 OptimizationLevel Level) { 358 for (auto &C : VectorizerStartEPCallbacks) 359 C(FPM, Level); 360 } 361 void PassBuilder::invokeOptimizerEarlyEPCallbacks(ModulePassManager &MPM, 362 OptimizationLevel Level) { 363 for (auto &C : OptimizerEarlyEPCallbacks) 364 C(MPM, Level); 365 } 366 void PassBuilder::invokeOptimizerLastEPCallbacks(ModulePassManager &MPM, 367 OptimizationLevel Level) { 368 for (auto &C : OptimizerLastEPCallbacks) 369 C(MPM, Level); 370 } 371 void PassBuilder::invokeFullLinkTimeOptimizationEarlyEPCallbacks( 372 ModulePassManager &MPM, OptimizationLevel Level) { 373 for (auto &C : FullLinkTimeOptimizationEarlyEPCallbacks) 374 C(MPM, Level); 375 } 376 void PassBuilder::invokeFullLinkTimeOptimizationLastEPCallbacks( 377 ModulePassManager &MPM, OptimizationLevel Level) { 378 for (auto &C : FullLinkTimeOptimizationLastEPCallbacks) 379 C(MPM, Level); 380 } 381 void PassBuilder::invokePipelineStartEPCallbacks(ModulePassManager &MPM, 382 OptimizationLevel Level) { 383 for (auto &C : PipelineStartEPCallbacks) 384 C(MPM, Level); 385 } 386 void PassBuilder::invokePipelineEarlySimplificationEPCallbacks( 387 ModulePassManager &MPM, OptimizationLevel Level) { 388 for (auto &C : PipelineEarlySimplificationEPCallbacks) 389 C(MPM, Level); 390 } 391 392 // Helper to add AnnotationRemarksPass. 393 static void addAnnotationRemarksPass(ModulePassManager &MPM) { 394 MPM.addPass(createModuleToFunctionPassAdaptor(AnnotationRemarksPass())); 395 } 396 397 // Helper to check if the current compilation phase is preparing for LTO 398 static bool isLTOPreLink(ThinOrFullLTOPhase Phase) { 399 return Phase == ThinOrFullLTOPhase::ThinLTOPreLink || 400 Phase == ThinOrFullLTOPhase::FullLTOPreLink; 401 } 402 403 // TODO: Investigate the cost/benefit of tail call elimination on debugging. 404 FunctionPassManager 405 PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, 406 ThinOrFullLTOPhase Phase) { 407 408 FunctionPassManager FPM; 409 410 if (AreStatisticsEnabled()) 411 FPM.addPass(CountVisitsPass()); 412 413 // Form SSA out of local memory accesses after breaking apart aggregates into 414 // scalars. 415 FPM.addPass(SROAPass(SROAOptions::ModifyCFG)); 416 417 // Catch trivial redundancies 418 FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */)); 419 420 // Hoisting of scalars and load expressions. 421 FPM.addPass( 422 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 423 FPM.addPass(InstCombinePass()); 424 425 FPM.addPass(LibCallsShrinkWrapPass()); 426 427 invokePeepholeEPCallbacks(FPM, Level); 428 429 FPM.addPass( 430 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 431 432 // Form canonically associated expression trees, and simplify the trees using 433 // basic mathematical properties. For example, this will form (nearly) 434 // minimal multiplication trees. 435 FPM.addPass(ReassociatePass()); 436 437 // Add the primary loop simplification pipeline. 438 // FIXME: Currently this is split into two loop pass pipelines because we run 439 // some function passes in between them. These can and should be removed 440 // and/or replaced by scheduling the loop pass equivalents in the correct 441 // positions. But those equivalent passes aren't powerful enough yet. 442 // Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still 443 // used. We have `LoopSimplifyCFGPass` which isn't yet powerful enough yet to 444 // fully replace `SimplifyCFGPass`, and the closest to the other we have is 445 // `LoopInstSimplify`. 446 LoopPassManager LPM1, LPM2; 447 448 // Simplify the loop body. We do this initially to clean up after other loop 449 // passes run, either when iterating on a loop or on inner loops with 450 // implications on the outer loop. 451 LPM1.addPass(LoopInstSimplifyPass()); 452 LPM1.addPass(LoopSimplifyCFGPass()); 453 454 // Try to remove as much code from the loop header as possible, 455 // to reduce amount of IR that will have to be duplicated. However, 456 // do not perform speculative hoisting the first time as LICM 457 // will destroy metadata that may not need to be destroyed if run 458 // after loop rotation. 459 // TODO: Investigate promotion cap for O1. 460 LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 461 /*AllowSpeculation=*/false)); 462 463 LPM1.addPass(LoopRotatePass(/* Disable header duplication */ true, 464 isLTOPreLink(Phase))); 465 // TODO: Investigate promotion cap for O1. 466 LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 467 /*AllowSpeculation=*/true)); 468 LPM1.addPass(SimpleLoopUnswitchPass()); 469 if (EnableLoopFlatten) 470 LPM1.addPass(LoopFlattenPass()); 471 472 LPM2.addPass(LoopIdiomRecognizePass()); 473 LPM2.addPass(IndVarSimplifyPass()); 474 475 invokeLateLoopOptimizationsEPCallbacks(LPM2, Level); 476 477 LPM2.addPass(LoopDeletionPass()); 478 479 if (EnableLoopInterchange) 480 LPM2.addPass(LoopInterchangePass()); 481 482 // Do not enable unrolling in PreLinkThinLTO phase during sample PGO 483 // because it changes IR to makes profile annotation in back compile 484 // inaccurate. The normal unroller doesn't pay attention to forced full unroll 485 // attributes so we need to make sure and allow the full unroll pass to pay 486 // attention to it. 487 if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt || 488 PGOOpt->Action != PGOOptions::SampleUse) 489 LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(), 490 /* OnlyWhenForced= */ !PTO.LoopUnrolling, 491 PTO.ForgetAllSCEVInLoopUnroll)); 492 493 invokeLoopOptimizerEndEPCallbacks(LPM2, Level); 494 495 FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1), 496 /*UseMemorySSA=*/true, 497 /*UseBlockFrequencyInfo=*/true)); 498 FPM.addPass( 499 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 500 FPM.addPass(InstCombinePass()); 501 // The loop passes in LPM2 (LoopFullUnrollPass) do not preserve MemorySSA. 502 // *All* loop passes must preserve it, in order to be able to use it. 503 FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2), 504 /*UseMemorySSA=*/false, 505 /*UseBlockFrequencyInfo=*/false)); 506 507 // Delete small array after loop unroll. 508 FPM.addPass(SROAPass(SROAOptions::ModifyCFG)); 509 510 // Specially optimize memory movement as it doesn't look like dataflow in SSA. 511 FPM.addPass(MemCpyOptPass()); 512 513 // Sparse conditional constant propagation. 514 // FIXME: It isn't clear why we do this *after* loop passes rather than 515 // before... 516 FPM.addPass(SCCPPass()); 517 518 // Delete dead bit computations (instcombine runs after to fold away the dead 519 // computations, and then ADCE will run later to exploit any new DCE 520 // opportunities that creates). 521 FPM.addPass(BDCEPass()); 522 523 // Run instcombine after redundancy and dead bit elimination to exploit 524 // opportunities opened up by them. 525 FPM.addPass(InstCombinePass()); 526 invokePeepholeEPCallbacks(FPM, Level); 527 528 FPM.addPass(CoroElidePass()); 529 530 invokeScalarOptimizerLateEPCallbacks(FPM, Level); 531 532 // Finally, do an expensive DCE pass to catch all the dead code exposed by 533 // the simplifications and basic cleanup after all the simplifications. 534 // TODO: Investigate if this is too expensive. 535 FPM.addPass(ADCEPass()); 536 FPM.addPass( 537 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 538 FPM.addPass(InstCombinePass()); 539 invokePeepholeEPCallbacks(FPM, Level); 540 541 return FPM; 542 } 543 544 FunctionPassManager 545 PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, 546 ThinOrFullLTOPhase Phase) { 547 assert(Level != OptimizationLevel::O0 && "Must request optimizations!"); 548 549 // The O1 pipeline has a separate pipeline creation function to simplify 550 // construction readability. 551 if (Level.getSpeedupLevel() == 1) 552 return buildO1FunctionSimplificationPipeline(Level, Phase); 553 554 FunctionPassManager FPM; 555 556 if (AreStatisticsEnabled()) 557 FPM.addPass(CountVisitsPass()); 558 559 // Form SSA out of local memory accesses after breaking apart aggregates into 560 // scalars. 561 FPM.addPass(SROAPass(SROAOptions::ModifyCFG)); 562 563 // Catch trivial redundancies 564 FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */)); 565 if (EnableKnowledgeRetention) 566 FPM.addPass(AssumeSimplifyPass()); 567 568 // Hoisting of scalars and load expressions. 569 if (EnableGVNHoist) 570 FPM.addPass(GVNHoistPass()); 571 572 // Global value numbering based sinking. 573 if (EnableGVNSink) { 574 FPM.addPass(GVNSinkPass()); 575 FPM.addPass( 576 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 577 } 578 579 // Speculative execution if the target has divergent branches; otherwise nop. 580 FPM.addPass(SpeculativeExecutionPass(/* OnlyIfDivergentTarget =*/true)); 581 582 // Optimize based on known information about branches, and cleanup afterward. 583 FPM.addPass(JumpThreadingPass()); 584 FPM.addPass(CorrelatedValuePropagationPass()); 585 586 // Jump table to switch conversion. 587 if (EnableJumpTableToSwitch) 588 FPM.addPass(JumpTableToSwitchPass()); 589 590 FPM.addPass( 591 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 592 FPM.addPass(InstCombinePass()); 593 FPM.addPass(AggressiveInstCombinePass()); 594 595 if (!Level.isOptimizingForSize()) 596 FPM.addPass(LibCallsShrinkWrapPass()); 597 598 invokePeepholeEPCallbacks(FPM, Level); 599 600 // For PGO use pipeline, try to optimize memory intrinsics such as memcpy 601 // using the size value profile. Don't perform this when optimizing for size. 602 if (PGOOpt && PGOOpt->Action == PGOOptions::IRUse && 603 !Level.isOptimizingForSize()) 604 FPM.addPass(PGOMemOPSizeOpt()); 605 606 FPM.addPass(TailCallElimPass()); 607 FPM.addPass( 608 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 609 610 // Form canonically associated expression trees, and simplify the trees using 611 // basic mathematical properties. For example, this will form (nearly) 612 // minimal multiplication trees. 613 FPM.addPass(ReassociatePass()); 614 615 if (EnableConstraintElimination) 616 FPM.addPass(ConstraintEliminationPass()); 617 618 // Add the primary loop simplification pipeline. 619 // FIXME: Currently this is split into two loop pass pipelines because we run 620 // some function passes in between them. These can and should be removed 621 // and/or replaced by scheduling the loop pass equivalents in the correct 622 // positions. But those equivalent passes aren't powerful enough yet. 623 // Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still 624 // used. We have `LoopSimplifyCFGPass` which isn't yet powerful enough yet to 625 // fully replace `SimplifyCFGPass`, and the closest to the other we have is 626 // `LoopInstSimplify`. 627 LoopPassManager LPM1, LPM2; 628 629 // Simplify the loop body. We do this initially to clean up after other loop 630 // passes run, either when iterating on a loop or on inner loops with 631 // implications on the outer loop. 632 LPM1.addPass(LoopInstSimplifyPass()); 633 LPM1.addPass(LoopSimplifyCFGPass()); 634 635 // Try to remove as much code from the loop header as possible, 636 // to reduce amount of IR that will have to be duplicated. However, 637 // do not perform speculative hoisting the first time as LICM 638 // will destroy metadata that may not need to be destroyed if run 639 // after loop rotation. 640 // TODO: Investigate promotion cap for O1. 641 LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 642 /*AllowSpeculation=*/false)); 643 644 // Disable header duplication in loop rotation at -Oz. 645 LPM1.addPass(LoopRotatePass(EnableLoopHeaderDuplication || 646 Level != OptimizationLevel::Oz, 647 isLTOPreLink(Phase))); 648 // TODO: Investigate promotion cap for O1. 649 LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 650 /*AllowSpeculation=*/true)); 651 LPM1.addPass( 652 SimpleLoopUnswitchPass(/* NonTrivial */ Level == OptimizationLevel::O3)); 653 if (EnableLoopFlatten) 654 LPM1.addPass(LoopFlattenPass()); 655 656 LPM2.addPass(LoopIdiomRecognizePass()); 657 LPM2.addPass(IndVarSimplifyPass()); 658 659 { 660 ExtraSimpleLoopUnswitchPassManager ExtraPasses; 661 ExtraPasses.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level == 662 OptimizationLevel::O3)); 663 LPM2.addPass(std::move(ExtraPasses)); 664 } 665 666 invokeLateLoopOptimizationsEPCallbacks(LPM2, Level); 667 668 LPM2.addPass(LoopDeletionPass()); 669 670 if (EnableLoopInterchange) 671 LPM2.addPass(LoopInterchangePass()); 672 673 // Do not enable unrolling in PreLinkThinLTO phase during sample PGO 674 // because it changes IR to makes profile annotation in back compile 675 // inaccurate. The normal unroller doesn't pay attention to forced full unroll 676 // attributes so we need to make sure and allow the full unroll pass to pay 677 // attention to it. 678 if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt || 679 PGOOpt->Action != PGOOptions::SampleUse) 680 LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(), 681 /* OnlyWhenForced= */ !PTO.LoopUnrolling, 682 PTO.ForgetAllSCEVInLoopUnroll)); 683 684 invokeLoopOptimizerEndEPCallbacks(LPM2, Level); 685 686 FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1), 687 /*UseMemorySSA=*/true, 688 /*UseBlockFrequencyInfo=*/true)); 689 FPM.addPass( 690 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 691 FPM.addPass(InstCombinePass()); 692 // The loop passes in LPM2 (LoopIdiomRecognizePass, IndVarSimplifyPass, 693 // LoopDeletionPass and LoopFullUnrollPass) do not preserve MemorySSA. 694 // *All* loop passes must preserve it, in order to be able to use it. 695 FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2), 696 /*UseMemorySSA=*/false, 697 /*UseBlockFrequencyInfo=*/false)); 698 699 // Delete small array after loop unroll. 700 FPM.addPass(SROAPass(SROAOptions::ModifyCFG)); 701 702 // Try vectorization/scalarization transforms that are both improvements 703 // themselves and can allow further folds with GVN and InstCombine. 704 FPM.addPass(VectorCombinePass(/*TryEarlyFoldsOnly=*/true)); 705 706 // Eliminate redundancies. 707 FPM.addPass(MergedLoadStoreMotionPass()); 708 if (RunNewGVN) 709 FPM.addPass(NewGVNPass()); 710 else 711 FPM.addPass(GVNPass()); 712 713 // Sparse conditional constant propagation. 714 // FIXME: It isn't clear why we do this *after* loop passes rather than 715 // before... 716 FPM.addPass(SCCPPass()); 717 718 // Delete dead bit computations (instcombine runs after to fold away the dead 719 // computations, and then ADCE will run later to exploit any new DCE 720 // opportunities that creates). 721 FPM.addPass(BDCEPass()); 722 723 // Run instcombine after redundancy and dead bit elimination to exploit 724 // opportunities opened up by them. 725 FPM.addPass(InstCombinePass()); 726 invokePeepholeEPCallbacks(FPM, Level); 727 728 // Re-consider control flow based optimizations after redundancy elimination, 729 // redo DCE, etc. 730 if (EnableDFAJumpThreading) 731 FPM.addPass(DFAJumpThreadingPass()); 732 733 FPM.addPass(JumpThreadingPass()); 734 FPM.addPass(CorrelatedValuePropagationPass()); 735 736 // Finally, do an expensive DCE pass to catch all the dead code exposed by 737 // the simplifications and basic cleanup after all the simplifications. 738 // TODO: Investigate if this is too expensive. 739 FPM.addPass(ADCEPass()); 740 741 // Specially optimize memory movement as it doesn't look like dataflow in SSA. 742 FPM.addPass(MemCpyOptPass()); 743 744 FPM.addPass(DSEPass()); 745 FPM.addPass(MoveAutoInitPass()); 746 747 FPM.addPass(createFunctionToLoopPassAdaptor( 748 LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 749 /*AllowSpeculation=*/true), 750 /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false)); 751 752 FPM.addPass(CoroElidePass()); 753 754 invokeScalarOptimizerLateEPCallbacks(FPM, Level); 755 756 FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions() 757 .convertSwitchRangeToICmp(true) 758 .hoistCommonInsts(true) 759 .sinkCommonInsts(true))); 760 FPM.addPass(InstCombinePass()); 761 invokePeepholeEPCallbacks(FPM, Level); 762 763 return FPM; 764 } 765 766 void PassBuilder::addRequiredLTOPreLinkPasses(ModulePassManager &MPM) { 767 MPM.addPass(CanonicalizeAliasesPass()); 768 MPM.addPass(NameAnonGlobalPass()); 769 } 770 771 void PassBuilder::addPreInlinerPasses(ModulePassManager &MPM, 772 OptimizationLevel Level, 773 ThinOrFullLTOPhase LTOPhase) { 774 assert(Level != OptimizationLevel::O0 && "Not expecting O0 here!"); 775 if (DisablePreInliner) 776 return; 777 InlineParams IP; 778 779 IP.DefaultThreshold = PreInlineThreshold; 780 781 // FIXME: The hint threshold has the same value used by the regular inliner 782 // when not optimzing for size. This should probably be lowered after 783 // performance testing. 784 // FIXME: this comment is cargo culted from the old pass manager, revisit). 785 IP.HintThreshold = Level.isOptimizingForSize() ? PreInlineThreshold : 325; 786 ModuleInlinerWrapperPass MIWP( 787 IP, /* MandatoryFirst */ true, 788 InlineContext{LTOPhase, InlinePass::EarlyInliner}); 789 CGSCCPassManager &CGPipeline = MIWP.getPM(); 790 791 FunctionPassManager FPM; 792 FPM.addPass(SROAPass(SROAOptions::ModifyCFG)); 793 FPM.addPass(EarlyCSEPass()); // Catch trivial redundancies. 794 FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp( 795 true))); // Merge & remove basic blocks. 796 FPM.addPass(InstCombinePass()); // Combine silly sequences. 797 invokePeepholeEPCallbacks(FPM, Level); 798 799 CGPipeline.addPass(createCGSCCToFunctionPassAdaptor( 800 std::move(FPM), PTO.EagerlyInvalidateAnalyses)); 801 802 MPM.addPass(std::move(MIWP)); 803 804 // Delete anything that is now dead to make sure that we don't instrument 805 // dead code. Instrumentation can end up keeping dead code around and 806 // dramatically increase code size. 807 MPM.addPass(GlobalDCEPass()); 808 } 809 810 void PassBuilder::addPostPGOLoopRotation(ModulePassManager &MPM, 811 OptimizationLevel Level) { 812 if (EnablePostPGOLoopRotation) { 813 // Disable header duplication in loop rotation at -Oz. 814 MPM.addPass(createModuleToFunctionPassAdaptor( 815 createFunctionToLoopPassAdaptor( 816 LoopRotatePass(EnableLoopHeaderDuplication || 817 Level != OptimizationLevel::Oz), 818 /*UseMemorySSA=*/false, 819 /*UseBlockFrequencyInfo=*/false), 820 PTO.EagerlyInvalidateAnalyses)); 821 } 822 } 823 824 void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, 825 OptimizationLevel Level, bool RunProfileGen, 826 bool IsCS, bool AtomicCounterUpdate, 827 std::string ProfileFile, 828 std::string ProfileRemappingFile, 829 IntrusiveRefCntPtr<vfs::FileSystem> FS) { 830 assert(Level != OptimizationLevel::O0 && "Not expecting O0 here!"); 831 832 if (!RunProfileGen) { 833 assert(!ProfileFile.empty() && "Profile use expecting a profile file!"); 834 MPM.addPass( 835 PGOInstrumentationUse(ProfileFile, ProfileRemappingFile, IsCS, FS)); 836 // Cache ProfileSummaryAnalysis once to avoid the potential need to insert 837 // RequireAnalysisPass for PSI before subsequent non-module passes. 838 MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); 839 return; 840 } 841 842 // Perform PGO instrumentation. 843 MPM.addPass(PGOInstrumentationGen(IsCS)); 844 845 addPostPGOLoopRotation(MPM, Level); 846 // Add the profile lowering pass. 847 InstrProfOptions Options; 848 if (!ProfileFile.empty()) 849 Options.InstrProfileOutput = ProfileFile; 850 // Do counter promotion at Level greater than O0. 851 Options.DoCounterPromotion = true; 852 Options.UseBFIInPromotion = IsCS; 853 if (EnableSampledInstr) { 854 Options.Sampling = true; 855 // With sampling, there is little beneifit to enable counter promotion. 856 // But note that sampling does work with counter promotion. 857 Options.DoCounterPromotion = false; 858 } 859 Options.Atomic = AtomicCounterUpdate; 860 MPM.addPass(InstrProfilingLoweringPass(Options, IsCS)); 861 } 862 863 void PassBuilder::addPGOInstrPassesForO0( 864 ModulePassManager &MPM, bool RunProfileGen, bool IsCS, 865 bool AtomicCounterUpdate, std::string ProfileFile, 866 std::string ProfileRemappingFile, IntrusiveRefCntPtr<vfs::FileSystem> FS) { 867 if (!RunProfileGen) { 868 assert(!ProfileFile.empty() && "Profile use expecting a profile file!"); 869 MPM.addPass( 870 PGOInstrumentationUse(ProfileFile, ProfileRemappingFile, IsCS, FS)); 871 // Cache ProfileSummaryAnalysis once to avoid the potential need to insert 872 // RequireAnalysisPass for PSI before subsequent non-module passes. 873 MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); 874 return; 875 } 876 877 // Perform PGO instrumentation. 878 MPM.addPass(PGOInstrumentationGen(IsCS)); 879 // Add the profile lowering pass. 880 InstrProfOptions Options; 881 if (!ProfileFile.empty()) 882 Options.InstrProfileOutput = ProfileFile; 883 // Do not do counter promotion at O0. 884 Options.DoCounterPromotion = false; 885 Options.UseBFIInPromotion = IsCS; 886 Options.Atomic = AtomicCounterUpdate; 887 MPM.addPass(InstrProfilingLoweringPass(Options, IsCS)); 888 } 889 890 static InlineParams getInlineParamsFromOptLevel(OptimizationLevel Level) { 891 return getInlineParams(Level.getSpeedupLevel(), Level.getSizeLevel()); 892 } 893 894 ModuleInlinerWrapperPass 895 PassBuilder::buildInlinerPipeline(OptimizationLevel Level, 896 ThinOrFullLTOPhase Phase) { 897 InlineParams IP; 898 if (PTO.InlinerThreshold == -1) 899 IP = getInlineParamsFromOptLevel(Level); 900 else 901 IP = getInlineParams(PTO.InlinerThreshold); 902 // For PreLinkThinLTO + SamplePGO, set hot-caller threshold to 0 to 903 // disable hot callsite inline (as much as possible [1]) because it makes 904 // profile annotation in the backend inaccurate. 905 // 906 // [1] Note the cost of a function could be below zero due to erased 907 // prologue / epilogue. 908 if (Phase == ThinOrFullLTOPhase::ThinLTOPreLink && PGOOpt && 909 PGOOpt->Action == PGOOptions::SampleUse) 910 IP.HotCallSiteThreshold = 0; 911 912 if (PGOOpt) 913 IP.EnableDeferral = EnablePGOInlineDeferral; 914 915 ModuleInlinerWrapperPass MIWP(IP, PerformMandatoryInliningsFirst, 916 InlineContext{Phase, InlinePass::CGSCCInliner}, 917 UseInlineAdvisor, MaxDevirtIterations); 918 919 // Require the GlobalsAA analysis for the module so we can query it within 920 // the CGSCC pipeline. 921 if (EnableGlobalAnalyses) { 922 MIWP.addModulePass(RequireAnalysisPass<GlobalsAA, Module>()); 923 // Invalidate AAManager so it can be recreated and pick up the newly 924 // available GlobalsAA. 925 MIWP.addModulePass( 926 createModuleToFunctionPassAdaptor(InvalidateAnalysisPass<AAManager>())); 927 } 928 929 // Require the ProfileSummaryAnalysis for the module so we can query it within 930 // the inliner pass. 931 MIWP.addModulePass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); 932 933 // Now begin the main postorder CGSCC pipeline. 934 // FIXME: The current CGSCC pipeline has its origins in the legacy pass 935 // manager and trying to emulate its precise behavior. Much of this doesn't 936 // make a lot of sense and we should revisit the core CGSCC structure. 937 CGSCCPassManager &MainCGPipeline = MIWP.getPM(); 938 939 // Note: historically, the PruneEH pass was run first to deduce nounwind and 940 // generally clean up exception handling overhead. It isn't clear this is 941 // valuable as the inliner doesn't currently care whether it is inlining an 942 // invoke or a call. 943 944 if (AttributorRun & AttributorRunOption::CGSCC) 945 MainCGPipeline.addPass(AttributorCGSCCPass()); 946 947 // Deduce function attributes. We do another run of this after the function 948 // simplification pipeline, so this only needs to run when it could affect the 949 // function simplification pipeline, which is only the case with recursive 950 // functions. 951 MainCGPipeline.addPass(PostOrderFunctionAttrsPass(/*SkipNonRecursive*/ true)); 952 953 // When at O3 add argument promotion to the pass pipeline. 954 // FIXME: It isn't at all clear why this should be limited to O3. 955 if (Level == OptimizationLevel::O3) 956 MainCGPipeline.addPass(ArgumentPromotionPass()); 957 958 // Try to perform OpenMP specific optimizations. This is a (quick!) no-op if 959 // there are no OpenMP runtime calls present in the module. 960 if (Level == OptimizationLevel::O2 || Level == OptimizationLevel::O3) 961 MainCGPipeline.addPass(OpenMPOptCGSCCPass()); 962 963 invokeCGSCCOptimizerLateEPCallbacks(MainCGPipeline, Level); 964 965 // Add the core function simplification pipeline nested inside the 966 // CGSCC walk. 967 MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor( 968 buildFunctionSimplificationPipeline(Level, Phase), 969 PTO.EagerlyInvalidateAnalyses, /*NoRerun=*/true)); 970 971 // Finally, deduce any function attributes based on the fully simplified 972 // function. 973 MainCGPipeline.addPass(PostOrderFunctionAttrsPass()); 974 975 // Mark that the function is fully simplified and that it shouldn't be 976 // simplified again if we somehow revisit it due to CGSCC mutations unless 977 // it's been modified since. 978 MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor( 979 RequireAnalysisPass<ShouldNotRunFunctionPassesAnalysis, Function>())); 980 981 MainCGPipeline.addPass(CoroSplitPass(Level != OptimizationLevel::O0)); 982 983 // Make sure we don't affect potential future NoRerun CGSCC adaptors. 984 MIWP.addLateModulePass(createModuleToFunctionPassAdaptor( 985 InvalidateAnalysisPass<ShouldNotRunFunctionPassesAnalysis>())); 986 987 return MIWP; 988 } 989 990 ModulePassManager 991 PassBuilder::buildModuleInlinerPipeline(OptimizationLevel Level, 992 ThinOrFullLTOPhase Phase) { 993 ModulePassManager MPM; 994 995 InlineParams IP = getInlineParamsFromOptLevel(Level); 996 // For PreLinkThinLTO + SamplePGO, set hot-caller threshold to 0 to 997 // disable hot callsite inline (as much as possible [1]) because it makes 998 // profile annotation in the backend inaccurate. 999 // 1000 // [1] Note the cost of a function could be below zero due to erased 1001 // prologue / epilogue. 1002 if (Phase == ThinOrFullLTOPhase::ThinLTOPreLink && PGOOpt && 1003 PGOOpt->Action == PGOOptions::SampleUse) 1004 IP.HotCallSiteThreshold = 0; 1005 1006 if (PGOOpt) 1007 IP.EnableDeferral = EnablePGOInlineDeferral; 1008 1009 // The inline deferral logic is used to avoid losing some 1010 // inlining chance in future. It is helpful in SCC inliner, in which 1011 // inlining is processed in bottom-up order. 1012 // While in module inliner, the inlining order is a priority-based order 1013 // by default. The inline deferral is unnecessary there. So we disable the 1014 // inline deferral logic in module inliner. 1015 IP.EnableDeferral = false; 1016 1017 MPM.addPass(ModuleInlinerPass(IP, UseInlineAdvisor, Phase)); 1018 1019 MPM.addPass(createModuleToFunctionPassAdaptor( 1020 buildFunctionSimplificationPipeline(Level, Phase), 1021 PTO.EagerlyInvalidateAnalyses)); 1022 1023 MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor( 1024 CoroSplitPass(Level != OptimizationLevel::O0))); 1025 1026 return MPM; 1027 } 1028 1029 ModulePassManager 1030 PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, 1031 ThinOrFullLTOPhase Phase) { 1032 assert(Level != OptimizationLevel::O0 && 1033 "Should not be used for O0 pipeline"); 1034 1035 assert(Phase != ThinOrFullLTOPhase::FullLTOPostLink && 1036 "FullLTOPostLink shouldn't call buildModuleSimplificationPipeline!"); 1037 1038 ModulePassManager MPM; 1039 1040 // Place pseudo probe instrumentation as the first pass of the pipeline to 1041 // minimize the impact of optimization changes. 1042 if (PGOOpt && PGOOpt->PseudoProbeForProfiling && 1043 Phase != ThinOrFullLTOPhase::ThinLTOPostLink) 1044 MPM.addPass(SampleProfileProbePass(TM)); 1045 1046 bool HasSampleProfile = PGOOpt && (PGOOpt->Action == PGOOptions::SampleUse); 1047 1048 // In ThinLTO mode, when flattened profile is used, all the available 1049 // profile information will be annotated in PreLink phase so there is 1050 // no need to load the profile again in PostLink. 1051 bool LoadSampleProfile = 1052 HasSampleProfile && 1053 !(FlattenedProfileUsed && Phase == ThinOrFullLTOPhase::ThinLTOPostLink); 1054 1055 // During the ThinLTO backend phase we perform early indirect call promotion 1056 // here, before globalopt. Otherwise imported available_externally functions 1057 // look unreferenced and are removed. If we are going to load the sample 1058 // profile then defer until later. 1059 // TODO: See if we can move later and consolidate with the location where 1060 // we perform ICP when we are loading a sample profile. 1061 // TODO: We pass HasSampleProfile (whether there was a sample profile file 1062 // passed to the compile) to the SamplePGO flag of ICP. This is used to 1063 // determine whether the new direct calls are annotated with prof metadata. 1064 // Ideally this should be determined from whether the IR is annotated with 1065 // sample profile, and not whether the a sample profile was provided on the 1066 // command line. E.g. for flattened profiles where we will not be reloading 1067 // the sample profile in the ThinLTO backend, we ideally shouldn't have to 1068 // provide the sample profile file. 1069 if (Phase == ThinOrFullLTOPhase::ThinLTOPostLink && !LoadSampleProfile) 1070 MPM.addPass(PGOIndirectCallPromotion(true /* InLTO */, HasSampleProfile)); 1071 1072 // Create an early function pass manager to cleanup the output of the 1073 // frontend. Not necessary with LTO post link pipelines since the pre link 1074 // pipeline already cleaned up the frontend output. 1075 if (Phase != ThinOrFullLTOPhase::ThinLTOPostLink) { 1076 // Do basic inference of function attributes from known properties of system 1077 // libraries and other oracles. 1078 MPM.addPass(InferFunctionAttrsPass()); 1079 MPM.addPass(CoroEarlyPass()); 1080 1081 FunctionPassManager EarlyFPM; 1082 EarlyFPM.addPass(EntryExitInstrumenterPass(/*PostInlining=*/false)); 1083 // Lower llvm.expect to metadata before attempting transforms. 1084 // Compare/branch metadata may alter the behavior of passes like 1085 // SimplifyCFG. 1086 EarlyFPM.addPass(LowerExpectIntrinsicPass()); 1087 EarlyFPM.addPass(SimplifyCFGPass()); 1088 EarlyFPM.addPass(SROAPass(SROAOptions::ModifyCFG)); 1089 EarlyFPM.addPass(EarlyCSEPass()); 1090 if (Level == OptimizationLevel::O3) 1091 EarlyFPM.addPass(CallSiteSplittingPass()); 1092 MPM.addPass(createModuleToFunctionPassAdaptor( 1093 std::move(EarlyFPM), PTO.EagerlyInvalidateAnalyses)); 1094 } 1095 1096 if (LoadSampleProfile) { 1097 // Annotate sample profile right after early FPM to ensure freshness of 1098 // the debug info. 1099 MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile, 1100 PGOOpt->ProfileRemappingFile, Phase)); 1101 // Cache ProfileSummaryAnalysis once to avoid the potential need to insert 1102 // RequireAnalysisPass for PSI before subsequent non-module passes. 1103 MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); 1104 // Do not invoke ICP in the LTOPrelink phase as it makes it hard 1105 // for the profile annotation to be accurate in the LTO backend. 1106 if (!isLTOPreLink(Phase)) 1107 // We perform early indirect call promotion here, before globalopt. 1108 // This is important for the ThinLTO backend phase because otherwise 1109 // imported available_externally functions look unreferenced and are 1110 // removed. 1111 MPM.addPass( 1112 PGOIndirectCallPromotion(true /* IsInLTO */, true /* SamplePGO */)); 1113 } 1114 1115 // Try to perform OpenMP specific optimizations on the module. This is a 1116 // (quick!) no-op if there are no OpenMP runtime calls present in the module. 1117 MPM.addPass(OpenMPOptPass()); 1118 1119 if (AttributorRun & AttributorRunOption::MODULE) 1120 MPM.addPass(AttributorPass()); 1121 1122 // Lower type metadata and the type.test intrinsic in the ThinLTO 1123 // post link pipeline after ICP. This is to enable usage of the type 1124 // tests in ICP sequences. 1125 if (Phase == ThinOrFullLTOPhase::ThinLTOPostLink) 1126 MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); 1127 1128 invokePipelineEarlySimplificationEPCallbacks(MPM, Level); 1129 1130 // Interprocedural constant propagation now that basic cleanup has occurred 1131 // and prior to optimizing globals. 1132 // FIXME: This position in the pipeline hasn't been carefully considered in 1133 // years, it should be re-analyzed. 1134 MPM.addPass(IPSCCPPass( 1135 IPSCCPOptions(/*AllowFuncSpec=*/ 1136 Level != OptimizationLevel::Os && 1137 Level != OptimizationLevel::Oz && 1138 !isLTOPreLink(Phase)))); 1139 1140 // Attach metadata to indirect call sites indicating the set of functions 1141 // they may target at run-time. This should follow IPSCCP. 1142 MPM.addPass(CalledValuePropagationPass()); 1143 1144 // Optimize globals to try and fold them into constants. 1145 MPM.addPass(GlobalOptPass()); 1146 1147 // Create a small function pass pipeline to cleanup after all the global 1148 // optimizations. 1149 FunctionPassManager GlobalCleanupPM; 1150 // FIXME: Should this instead by a run of SROA? 1151 GlobalCleanupPM.addPass(PromotePass()); 1152 GlobalCleanupPM.addPass(InstCombinePass()); 1153 invokePeepholeEPCallbacks(GlobalCleanupPM, Level); 1154 GlobalCleanupPM.addPass( 1155 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 1156 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(GlobalCleanupPM), 1157 PTO.EagerlyInvalidateAnalyses)); 1158 1159 // We already asserted this happens in non-FullLTOPostLink earlier. 1160 const bool IsPreLink = Phase != ThinOrFullLTOPhase::ThinLTOPostLink; 1161 const bool IsPGOPreLink = PGOOpt && IsPreLink; 1162 const bool IsPGOInstrGen = 1163 IsPGOPreLink && PGOOpt->Action == PGOOptions::IRInstr; 1164 const bool IsPGOInstrUse = 1165 IsPGOPreLink && PGOOpt->Action == PGOOptions::IRUse; 1166 const bool IsMemprofUse = IsPGOPreLink && !PGOOpt->MemoryProfile.empty(); 1167 // We don't want to mix pgo ctx gen and pgo gen; we also don't currently 1168 // enable ctx profiling from the frontend. 1169 assert( 1170 !(IsPGOInstrGen && PGOCtxProfLoweringPass::isContextualIRPGOEnabled()) && 1171 "Enabling both instrumented FDO and contextual instrumentation is not " 1172 "supported."); 1173 // Enable contextual profiling instrumentation. 1174 const bool IsCtxProfGen = !IsPGOInstrGen && IsPreLink && 1175 PGOCtxProfLoweringPass::isContextualIRPGOEnabled(); 1176 1177 if (IsPGOInstrGen || IsPGOInstrUse || IsMemprofUse || IsCtxProfGen) 1178 addPreInlinerPasses(MPM, Level, Phase); 1179 1180 // Add all the requested passes for instrumentation PGO, if requested. 1181 if (IsPGOInstrGen || IsPGOInstrUse) { 1182 addPGOInstrPasses(MPM, Level, 1183 /*RunProfileGen=*/IsPGOInstrGen, 1184 /*IsCS=*/false, PGOOpt->AtomicCounterUpdate, 1185 PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile, 1186 PGOOpt->FS); 1187 } else if (IsCtxProfGen) { 1188 MPM.addPass(PGOInstrumentationGen(false)); 1189 addPostPGOLoopRotation(MPM, Level); 1190 MPM.addPass(PGOCtxProfLoweringPass()); 1191 } 1192 1193 if (IsPGOInstrGen || IsPGOInstrUse || IsCtxProfGen) 1194 MPM.addPass(PGOIndirectCallPromotion(false, false)); 1195 1196 if (IsPGOPreLink && PGOOpt->CSAction == PGOOptions::CSIRInstr) 1197 MPM.addPass(PGOInstrumentationGenCreateVar(PGOOpt->CSProfileGenFile, 1198 EnableSampledInstr)); 1199 1200 if (IsMemprofUse) 1201 MPM.addPass(MemProfUsePass(PGOOpt->MemoryProfile, PGOOpt->FS)); 1202 1203 // Synthesize function entry counts for non-PGO compilation. 1204 if (EnableSyntheticCounts && !PGOOpt) 1205 MPM.addPass(SyntheticCountsPropagation()); 1206 1207 if (EnablePGOForceFunctionAttrs && PGOOpt) 1208 MPM.addPass(PGOForceFunctionAttrsPass(PGOOpt->ColdOptType)); 1209 1210 MPM.addPass(AlwaysInlinerPass(/*InsertLifetimeIntrinsics=*/true)); 1211 1212 if (EnableModuleInliner) 1213 MPM.addPass(buildModuleInlinerPipeline(Level, Phase)); 1214 else 1215 MPM.addPass(buildInlinerPipeline(Level, Phase)); 1216 1217 // Remove any dead arguments exposed by cleanups, constant folding globals, 1218 // and argument promotion. 1219 MPM.addPass(DeadArgumentEliminationPass()); 1220 1221 MPM.addPass(CoroCleanupPass()); 1222 1223 // Optimize globals now that functions are fully simplified. 1224 MPM.addPass(GlobalOptPass()); 1225 MPM.addPass(GlobalDCEPass()); 1226 1227 return MPM; 1228 } 1229 1230 /// TODO: Should LTO cause any differences to this set of passes? 1231 void PassBuilder::addVectorPasses(OptimizationLevel Level, 1232 FunctionPassManager &FPM, bool IsFullLTO) { 1233 FPM.addPass(LoopVectorizePass( 1234 LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization))); 1235 1236 if (EnableInferAlignmentPass) 1237 FPM.addPass(InferAlignmentPass()); 1238 if (IsFullLTO) { 1239 // The vectorizer may have significantly shortened a loop body; unroll 1240 // again. Unroll small loops to hide loop backedge latency and saturate any 1241 // parallel execution resources of an out-of-order processor. We also then 1242 // need to clean up redundancies and loop invariant code. 1243 // FIXME: It would be really good to use a loop-integrated instruction 1244 // combiner for cleanup here so that the unrolling and LICM can be pipelined 1245 // across the loop nests. 1246 // We do UnrollAndJam in a separate LPM to ensure it happens before unroll 1247 if (EnableUnrollAndJam && PTO.LoopUnrolling) 1248 FPM.addPass(createFunctionToLoopPassAdaptor( 1249 LoopUnrollAndJamPass(Level.getSpeedupLevel()))); 1250 FPM.addPass(LoopUnrollPass(LoopUnrollOptions( 1251 Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling, 1252 PTO.ForgetAllSCEVInLoopUnroll))); 1253 FPM.addPass(WarnMissedTransformationsPass()); 1254 // Now that we are done with loop unrolling, be it either by LoopVectorizer, 1255 // or LoopUnroll passes, some variable-offset GEP's into alloca's could have 1256 // become constant-offset, thus enabling SROA and alloca promotion. Do so. 1257 // NOTE: we are very late in the pipeline, and we don't have any LICM 1258 // or SimplifyCFG passes scheduled after us, that would cleanup 1259 // the CFG mess this may created if allowed to modify CFG, so forbid that. 1260 FPM.addPass(SROAPass(SROAOptions::PreserveCFG)); 1261 } 1262 1263 if (!IsFullLTO) { 1264 // Eliminate loads by forwarding stores from the previous iteration to loads 1265 // of the current iteration. 1266 FPM.addPass(LoopLoadEliminationPass()); 1267 } 1268 // Cleanup after the loop optimization passes. 1269 FPM.addPass(InstCombinePass()); 1270 1271 if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) { 1272 ExtraVectorPassManager ExtraPasses; 1273 // At higher optimization levels, try to clean up any runtime overlap and 1274 // alignment checks inserted by the vectorizer. We want to track correlated 1275 // runtime checks for two inner loops in the same outer loop, fold any 1276 // common computations, hoist loop-invariant aspects out of any outer loop, 1277 // and unswitch the runtime checks if possible. Once hoisted, we may have 1278 // dead (or speculatable) control flows or more combining opportunities. 1279 ExtraPasses.addPass(EarlyCSEPass()); 1280 ExtraPasses.addPass(CorrelatedValuePropagationPass()); 1281 ExtraPasses.addPass(InstCombinePass()); 1282 LoopPassManager LPM; 1283 LPM.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 1284 /*AllowSpeculation=*/true)); 1285 LPM.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level == 1286 OptimizationLevel::O3)); 1287 ExtraPasses.addPass( 1288 createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/true, 1289 /*UseBlockFrequencyInfo=*/true)); 1290 ExtraPasses.addPass( 1291 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 1292 ExtraPasses.addPass(InstCombinePass()); 1293 FPM.addPass(std::move(ExtraPasses)); 1294 } 1295 1296 // Now that we've formed fast to execute loop structures, we do further 1297 // optimizations. These are run afterward as they might block doing complex 1298 // analyses and transforms such as what are needed for loop vectorization. 1299 1300 // Cleanup after loop vectorization, etc. Simplification passes like CVP and 1301 // GVN, loop transforms, and others have already run, so it's now better to 1302 // convert to more optimized IR using more aggressive simplify CFG options. 1303 // The extra sinking transform can create larger basic blocks, so do this 1304 // before SLP vectorization. 1305 FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions() 1306 .forwardSwitchCondToPhi(true) 1307 .convertSwitchRangeToICmp(true) 1308 .convertSwitchToLookupTable(true) 1309 .needCanonicalLoops(false) 1310 .hoistCommonInsts(true) 1311 .sinkCommonInsts(true))); 1312 1313 if (IsFullLTO) { 1314 FPM.addPass(SCCPPass()); 1315 FPM.addPass(InstCombinePass()); 1316 FPM.addPass(BDCEPass()); 1317 } 1318 1319 // Optimize parallel scalar instruction chains into SIMD instructions. 1320 if (PTO.SLPVectorization) { 1321 FPM.addPass(SLPVectorizerPass()); 1322 if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) { 1323 FPM.addPass(EarlyCSEPass()); 1324 } 1325 } 1326 // Enhance/cleanup vector code. 1327 FPM.addPass(VectorCombinePass()); 1328 1329 if (!IsFullLTO) { 1330 FPM.addPass(InstCombinePass()); 1331 // Unroll small loops to hide loop backedge latency and saturate any 1332 // parallel execution resources of an out-of-order processor. We also then 1333 // need to clean up redundancies and loop invariant code. 1334 // FIXME: It would be really good to use a loop-integrated instruction 1335 // combiner for cleanup here so that the unrolling and LICM can be pipelined 1336 // across the loop nests. 1337 // We do UnrollAndJam in a separate LPM to ensure it happens before unroll 1338 if (EnableUnrollAndJam && PTO.LoopUnrolling) { 1339 FPM.addPass(createFunctionToLoopPassAdaptor( 1340 LoopUnrollAndJamPass(Level.getSpeedupLevel()))); 1341 } 1342 FPM.addPass(LoopUnrollPass(LoopUnrollOptions( 1343 Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling, 1344 PTO.ForgetAllSCEVInLoopUnroll))); 1345 FPM.addPass(WarnMissedTransformationsPass()); 1346 // Now that we are done with loop unrolling, be it either by LoopVectorizer, 1347 // or LoopUnroll passes, some variable-offset GEP's into alloca's could have 1348 // become constant-offset, thus enabling SROA and alloca promotion. Do so. 1349 // NOTE: we are very late in the pipeline, and we don't have any LICM 1350 // or SimplifyCFG passes scheduled after us, that would cleanup 1351 // the CFG mess this may created if allowed to modify CFG, so forbid that. 1352 FPM.addPass(SROAPass(SROAOptions::PreserveCFG)); 1353 } 1354 1355 if (EnableInferAlignmentPass) 1356 FPM.addPass(InferAlignmentPass()); 1357 FPM.addPass(InstCombinePass()); 1358 1359 // This is needed for two reasons: 1360 // 1. It works around problems that instcombine introduces, such as sinking 1361 // expensive FP divides into loops containing multiplications using the 1362 // divide result. 1363 // 2. It helps to clean up some loop-invariant code created by the loop 1364 // unroll pass when IsFullLTO=false. 1365 FPM.addPass(createFunctionToLoopPassAdaptor( 1366 LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 1367 /*AllowSpeculation=*/true), 1368 /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false)); 1369 1370 // Now that we've vectorized and unrolled loops, we may have more refined 1371 // alignment information, try to re-derive it here. 1372 FPM.addPass(AlignmentFromAssumptionsPass()); 1373 } 1374 1375 ModulePassManager 1376 PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, 1377 ThinOrFullLTOPhase LTOPhase) { 1378 const bool LTOPreLink = isLTOPreLink(LTOPhase); 1379 ModulePassManager MPM; 1380 1381 // Run partial inlining pass to partially inline functions that have 1382 // large bodies. 1383 if (RunPartialInlining) 1384 MPM.addPass(PartialInlinerPass()); 1385 1386 // Remove avail extern fns and globals definitions since we aren't compiling 1387 // an object file for later LTO. For LTO we want to preserve these so they 1388 // are eligible for inlining at link-time. Note if they are unreferenced they 1389 // will be removed by GlobalDCE later, so this only impacts referenced 1390 // available externally globals. Eventually they will be suppressed during 1391 // codegen, but eliminating here enables more opportunity for GlobalDCE as it 1392 // may make globals referenced by available external functions dead and saves 1393 // running remaining passes on the eliminated functions. These should be 1394 // preserved during prelinking for link-time inlining decisions. 1395 if (!LTOPreLink) 1396 MPM.addPass(EliminateAvailableExternallyPass()); 1397 1398 if (EnableOrderFileInstrumentation) 1399 MPM.addPass(InstrOrderFilePass()); 1400 1401 // Do RPO function attribute inference across the module to forward-propagate 1402 // attributes where applicable. 1403 // FIXME: Is this really an optimization rather than a canonicalization? 1404 MPM.addPass(ReversePostOrderFunctionAttrsPass()); 1405 1406 // Do a post inline PGO instrumentation and use pass. This is a context 1407 // sensitive PGO pass. We don't want to do this in LTOPreLink phrase as 1408 // cross-module inline has not been done yet. The context sensitive 1409 // instrumentation is after all the inlines are done. 1410 if (!LTOPreLink && PGOOpt) { 1411 if (PGOOpt->CSAction == PGOOptions::CSIRInstr) 1412 addPGOInstrPasses(MPM, Level, /*RunProfileGen=*/true, 1413 /*IsCS=*/true, PGOOpt->AtomicCounterUpdate, 1414 PGOOpt->CSProfileGenFile, PGOOpt->ProfileRemappingFile, 1415 PGOOpt->FS); 1416 else if (PGOOpt->CSAction == PGOOptions::CSIRUse) 1417 addPGOInstrPasses(MPM, Level, /*RunProfileGen=*/false, 1418 /*IsCS=*/true, PGOOpt->AtomicCounterUpdate, 1419 PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile, 1420 PGOOpt->FS); 1421 } 1422 1423 // Re-compute GlobalsAA here prior to function passes. This is particularly 1424 // useful as the above will have inlined, DCE'ed, and function-attr 1425 // propagated everything. We should at this point have a reasonably minimal 1426 // and richly annotated call graph. By computing aliasing and mod/ref 1427 // information for all local globals here, the late loop passes and notably 1428 // the vectorizer will be able to use them to help recognize vectorizable 1429 // memory operations. 1430 if (EnableGlobalAnalyses) 1431 MPM.addPass(RecomputeGlobalsAAPass()); 1432 1433 invokeOptimizerEarlyEPCallbacks(MPM, Level); 1434 1435 FunctionPassManager OptimizePM; 1436 // Scheduling LoopVersioningLICM when inlining is over, because after that 1437 // we may see more accurate aliasing. Reason to run this late is that too 1438 // early versioning may prevent further inlining due to increase of code 1439 // size. Other optimizations which runs later might get benefit of no-alias 1440 // assumption in clone loop. 1441 if (UseLoopVersioningLICM) { 1442 OptimizePM.addPass( 1443 createFunctionToLoopPassAdaptor(LoopVersioningLICMPass())); 1444 // LoopVersioningLICM pass might increase new LICM opportunities. 1445 OptimizePM.addPass(createFunctionToLoopPassAdaptor( 1446 LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 1447 /*AllowSpeculation=*/true), 1448 /*USeMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false)); 1449 } 1450 1451 OptimizePM.addPass(Float2IntPass()); 1452 OptimizePM.addPass(LowerConstantIntrinsicsPass()); 1453 1454 if (EnableMatrix) { 1455 OptimizePM.addPass(LowerMatrixIntrinsicsPass()); 1456 OptimizePM.addPass(EarlyCSEPass()); 1457 } 1458 1459 // CHR pass should only be applied with the profile information. 1460 // The check is to check the profile summary information in CHR. 1461 if (EnableCHR && Level == OptimizationLevel::O3) 1462 OptimizePM.addPass(ControlHeightReductionPass()); 1463 1464 // FIXME: We need to run some loop optimizations to re-rotate loops after 1465 // simplifycfg and others undo their rotation. 1466 1467 // Optimize the loop execution. These passes operate on entire loop nests 1468 // rather than on each loop in an inside-out manner, and so they are actually 1469 // function passes. 1470 1471 invokeVectorizerStartEPCallbacks(OptimizePM, Level); 1472 1473 LoopPassManager LPM; 1474 // First rotate loops that may have been un-rotated by prior passes. 1475 // Disable header duplication at -Oz. 1476 LPM.addPass(LoopRotatePass(EnableLoopHeaderDuplication || 1477 Level != OptimizationLevel::Oz, 1478 LTOPreLink)); 1479 // Some loops may have become dead by now. Try to delete them. 1480 // FIXME: see discussion in https://reviews.llvm.org/D112851, 1481 // this may need to be revisited once we run GVN before loop deletion 1482 // in the simplification pipeline. 1483 LPM.addPass(LoopDeletionPass()); 1484 OptimizePM.addPass(createFunctionToLoopPassAdaptor( 1485 std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false)); 1486 1487 // Distribute loops to allow partial vectorization. I.e. isolate dependences 1488 // into separate loop that would otherwise inhibit vectorization. This is 1489 // currently only performed for loops marked with the metadata 1490 // llvm.loop.distribute=true or when -enable-loop-distribute is specified. 1491 OptimizePM.addPass(LoopDistributePass()); 1492 1493 // Populates the VFABI attribute with the scalar-to-vector mappings 1494 // from the TargetLibraryInfo. 1495 OptimizePM.addPass(InjectTLIMappings()); 1496 1497 addVectorPasses(Level, OptimizePM, /* IsFullLTO */ false); 1498 1499 // LoopSink pass sinks instructions hoisted by LICM, which serves as a 1500 // canonicalization pass that enables other optimizations. As a result, 1501 // LoopSink pass needs to be a very late IR pass to avoid undoing LICM 1502 // result too early. 1503 OptimizePM.addPass(LoopSinkPass()); 1504 1505 // And finally clean up LCSSA form before generating code. 1506 OptimizePM.addPass(InstSimplifyPass()); 1507 1508 // This hoists/decomposes div/rem ops. It should run after other sink/hoist 1509 // passes to avoid re-sinking, but before SimplifyCFG because it can allow 1510 // flattening of blocks. 1511 OptimizePM.addPass(DivRemPairsPass()); 1512 1513 // Try to annotate calls that were created during optimization. 1514 OptimizePM.addPass(TailCallElimPass()); 1515 1516 // LoopSink (and other loop passes since the last simplifyCFG) might have 1517 // resulted in single-entry-single-exit or empty blocks. Clean up the CFG. 1518 OptimizePM.addPass(SimplifyCFGPass(SimplifyCFGOptions() 1519 .convertSwitchRangeToICmp(true) 1520 .speculateUnpredictables(true))); 1521 1522 // Add the core optimizing pipeline. 1523 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(OptimizePM), 1524 PTO.EagerlyInvalidateAnalyses)); 1525 1526 invokeOptimizerLastEPCallbacks(MPM, Level); 1527 1528 // Split out cold code. Splitting is done late to avoid hiding context from 1529 // other optimizations and inadvertently regressing performance. The tradeoff 1530 // is that this has a higher code size cost than splitting early. 1531 if (EnableHotColdSplit && !LTOPreLink) 1532 MPM.addPass(HotColdSplittingPass()); 1533 1534 // Search the code for similar regions of code. If enough similar regions can 1535 // be found where extracting the regions into their own function will decrease 1536 // the size of the program, we extract the regions, a deduplicate the 1537 // structurally similar regions. 1538 if (EnableIROutliner) 1539 MPM.addPass(IROutlinerPass()); 1540 1541 // Now we need to do some global optimization transforms. 1542 // FIXME: It would seem like these should come first in the optimization 1543 // pipeline and maybe be the bottom of the canonicalization pipeline? Weird 1544 // ordering here. 1545 MPM.addPass(GlobalDCEPass()); 1546 MPM.addPass(ConstantMergePass()); 1547 1548 // Merge functions if requested. It has a better chance to merge functions 1549 // after ConstantMerge folded jump tables. 1550 if (PTO.MergeFunctions) 1551 MPM.addPass(MergeFunctionsPass()); 1552 1553 if (PTO.CallGraphProfile && !LTOPreLink) 1554 MPM.addPass(CGProfilePass(LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink || 1555 LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink)); 1556 1557 // TODO: Relative look table converter pass caused an issue when full lto is 1558 // enabled. See https://reviews.llvm.org/D94355 for more details. 1559 // Until the issue fixed, disable this pass during pre-linking phase. 1560 if (!LTOPreLink) 1561 MPM.addPass(RelLookupTableConverterPass()); 1562 1563 return MPM; 1564 } 1565 1566 ModulePassManager 1567 PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level, 1568 bool LTOPreLink) { 1569 if (Level == OptimizationLevel::O0) 1570 return buildO0DefaultPipeline(Level, LTOPreLink); 1571 1572 ModulePassManager MPM; 1573 1574 // Convert @llvm.global.annotations to !annotation metadata. 1575 MPM.addPass(Annotation2MetadataPass()); 1576 1577 // Force any function attributes we want the rest of the pipeline to observe. 1578 MPM.addPass(ForceFunctionAttrsPass()); 1579 1580 if (PGOOpt && PGOOpt->DebugInfoForProfiling) 1581 MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass())); 1582 1583 // Apply module pipeline start EP callback. 1584 invokePipelineStartEPCallbacks(MPM, Level); 1585 1586 const ThinOrFullLTOPhase LTOPhase = LTOPreLink 1587 ? ThinOrFullLTOPhase::FullLTOPreLink 1588 : ThinOrFullLTOPhase::None; 1589 // Add the core simplification pipeline. 1590 MPM.addPass(buildModuleSimplificationPipeline(Level, LTOPhase)); 1591 1592 // Now add the optimization pipeline. 1593 MPM.addPass(buildModuleOptimizationPipeline(Level, LTOPhase)); 1594 1595 if (PGOOpt && PGOOpt->PseudoProbeForProfiling && 1596 PGOOpt->Action == PGOOptions::SampleUse) 1597 MPM.addPass(PseudoProbeUpdatePass()); 1598 1599 // Emit annotation remarks. 1600 addAnnotationRemarksPass(MPM); 1601 1602 if (LTOPreLink) 1603 addRequiredLTOPreLinkPasses(MPM); 1604 return MPM; 1605 } 1606 1607 ModulePassManager 1608 PassBuilder::buildFatLTODefaultPipeline(OptimizationLevel Level, bool ThinLTO, 1609 bool EmitSummary) { 1610 ModulePassManager MPM; 1611 if (ThinLTO) 1612 MPM.addPass(buildThinLTOPreLinkDefaultPipeline(Level)); 1613 else 1614 MPM.addPass(buildLTOPreLinkDefaultPipeline(Level)); 1615 MPM.addPass(EmbedBitcodePass(ThinLTO, EmitSummary)); 1616 1617 // Use the ThinLTO post-link pipeline with sample profiling 1618 if (ThinLTO && PGOOpt && PGOOpt->Action == PGOOptions::SampleUse) 1619 MPM.addPass(buildThinLTODefaultPipeline(Level, /*ImportSummary=*/nullptr)); 1620 else { 1621 // otherwise, just use module optimization 1622 MPM.addPass( 1623 buildModuleOptimizationPipeline(Level, ThinOrFullLTOPhase::None)); 1624 // Emit annotation remarks. 1625 addAnnotationRemarksPass(MPM); 1626 } 1627 return MPM; 1628 } 1629 1630 ModulePassManager 1631 PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) { 1632 if (Level == OptimizationLevel::O0) 1633 return buildO0DefaultPipeline(Level, /*LTOPreLink*/true); 1634 1635 ModulePassManager MPM; 1636 1637 // Convert @llvm.global.annotations to !annotation metadata. 1638 MPM.addPass(Annotation2MetadataPass()); 1639 1640 // Force any function attributes we want the rest of the pipeline to observe. 1641 MPM.addPass(ForceFunctionAttrsPass()); 1642 1643 if (PGOOpt && PGOOpt->DebugInfoForProfiling) 1644 MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass())); 1645 1646 // Apply module pipeline start EP callback. 1647 invokePipelineStartEPCallbacks(MPM, Level); 1648 1649 // If we are planning to perform ThinLTO later, we don't bloat the code with 1650 // unrolling/vectorization/... now. Just simplify the module as much as we 1651 // can. 1652 MPM.addPass(buildModuleSimplificationPipeline( 1653 Level, ThinOrFullLTOPhase::ThinLTOPreLink)); 1654 1655 // Run partial inlining pass to partially inline functions that have 1656 // large bodies. 1657 // FIXME: It isn't clear whether this is really the right place to run this 1658 // in ThinLTO. Because there is another canonicalization and simplification 1659 // phase that will run after the thin link, running this here ends up with 1660 // less information than will be available later and it may grow functions in 1661 // ways that aren't beneficial. 1662 if (RunPartialInlining) 1663 MPM.addPass(PartialInlinerPass()); 1664 1665 if (PGOOpt && PGOOpt->PseudoProbeForProfiling && 1666 PGOOpt->Action == PGOOptions::SampleUse) 1667 MPM.addPass(PseudoProbeUpdatePass()); 1668 1669 // Handle Optimizer{Early,Last}EPCallbacks added by clang on PreLink. Actual 1670 // optimization is going to be done in PostLink stage, but clang can't add 1671 // callbacks there in case of in-process ThinLTO called by linker. 1672 invokeOptimizerEarlyEPCallbacks(MPM, Level); 1673 invokeOptimizerLastEPCallbacks(MPM, Level); 1674 1675 // Emit annotation remarks. 1676 addAnnotationRemarksPass(MPM); 1677 1678 addRequiredLTOPreLinkPasses(MPM); 1679 1680 return MPM; 1681 } 1682 1683 ModulePassManager PassBuilder::buildThinLTODefaultPipeline( 1684 OptimizationLevel Level, const ModuleSummaryIndex *ImportSummary) { 1685 ModulePassManager MPM; 1686 1687 if (ImportSummary) { 1688 // For ThinLTO we must apply the context disambiguation decisions early, to 1689 // ensure we can correctly match the callsites to summary data. 1690 if (EnableMemProfContextDisambiguation) 1691 MPM.addPass(MemProfContextDisambiguation(ImportSummary)); 1692 1693 // These passes import type identifier resolutions for whole-program 1694 // devirtualization and CFI. They must run early because other passes may 1695 // disturb the specific instruction patterns that these passes look for, 1696 // creating dependencies on resolutions that may not appear in the summary. 1697 // 1698 // For example, GVN may transform the pattern assume(type.test) appearing in 1699 // two basic blocks into assume(phi(type.test, type.test)), which would 1700 // transform a dependency on a WPD resolution into a dependency on a type 1701 // identifier resolution for CFI. 1702 // 1703 // Also, WPD has access to more precise information than ICP and can 1704 // devirtualize more effectively, so it should operate on the IR first. 1705 // 1706 // The WPD and LowerTypeTest passes need to run at -O0 to lower type 1707 // metadata and intrinsics. 1708 MPM.addPass(WholeProgramDevirtPass(nullptr, ImportSummary)); 1709 MPM.addPass(LowerTypeTestsPass(nullptr, ImportSummary)); 1710 } 1711 1712 if (Level == OptimizationLevel::O0) { 1713 // Run a second time to clean up any type tests left behind by WPD for use 1714 // in ICP. 1715 MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); 1716 // Drop available_externally and unreferenced globals. This is necessary 1717 // with ThinLTO in order to avoid leaving undefined references to dead 1718 // globals in the object file. 1719 MPM.addPass(EliminateAvailableExternallyPass()); 1720 MPM.addPass(GlobalDCEPass()); 1721 return MPM; 1722 } 1723 1724 // Add the core simplification pipeline. 1725 MPM.addPass(buildModuleSimplificationPipeline( 1726 Level, ThinOrFullLTOPhase::ThinLTOPostLink)); 1727 1728 // Now add the optimization pipeline. 1729 MPM.addPass(buildModuleOptimizationPipeline( 1730 Level, ThinOrFullLTOPhase::ThinLTOPostLink)); 1731 1732 // Emit annotation remarks. 1733 addAnnotationRemarksPass(MPM); 1734 1735 return MPM; 1736 } 1737 1738 ModulePassManager 1739 PassBuilder::buildLTOPreLinkDefaultPipeline(OptimizationLevel Level) { 1740 // FIXME: We should use a customized pre-link pipeline! 1741 return buildPerModuleDefaultPipeline(Level, 1742 /* LTOPreLink */ true); 1743 } 1744 1745 ModulePassManager 1746 PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, 1747 ModuleSummaryIndex *ExportSummary) { 1748 ModulePassManager MPM; 1749 1750 invokeFullLinkTimeOptimizationEarlyEPCallbacks(MPM, Level); 1751 1752 // Create a function that performs CFI checks for cross-DSO calls with targets 1753 // in the current module. 1754 MPM.addPass(CrossDSOCFIPass()); 1755 1756 if (Level == OptimizationLevel::O0) { 1757 // The WPD and LowerTypeTest passes need to run at -O0 to lower type 1758 // metadata and intrinsics. 1759 MPM.addPass(WholeProgramDevirtPass(ExportSummary, nullptr)); 1760 MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr)); 1761 // Run a second time to clean up any type tests left behind by WPD for use 1762 // in ICP. 1763 MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); 1764 1765 invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level); 1766 1767 // Emit annotation remarks. 1768 addAnnotationRemarksPass(MPM); 1769 1770 return MPM; 1771 } 1772 1773 if (PGOOpt && PGOOpt->Action == PGOOptions::SampleUse) { 1774 // Load sample profile before running the LTO optimization pipeline. 1775 MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile, 1776 PGOOpt->ProfileRemappingFile, 1777 ThinOrFullLTOPhase::FullLTOPostLink)); 1778 // Cache ProfileSummaryAnalysis once to avoid the potential need to insert 1779 // RequireAnalysisPass for PSI before subsequent non-module passes. 1780 MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); 1781 } 1782 1783 // Try to run OpenMP optimizations, quick no-op if no OpenMP metadata present. 1784 MPM.addPass(OpenMPOptPass(ThinOrFullLTOPhase::FullLTOPostLink)); 1785 1786 // Remove unused virtual tables to improve the quality of code generated by 1787 // whole-program devirtualization and bitset lowering. 1788 MPM.addPass(GlobalDCEPass(/*InLTOPostLink=*/true)); 1789 1790 // Do basic inference of function attributes from known properties of system 1791 // libraries and other oracles. 1792 MPM.addPass(InferFunctionAttrsPass()); 1793 1794 if (Level.getSpeedupLevel() > 1) { 1795 MPM.addPass(createModuleToFunctionPassAdaptor( 1796 CallSiteSplittingPass(), PTO.EagerlyInvalidateAnalyses)); 1797 1798 // Indirect call promotion. This should promote all the targets that are 1799 // left by the earlier promotion pass that promotes intra-module targets. 1800 // This two-step promotion is to save the compile time. For LTO, it should 1801 // produce the same result as if we only do promotion here. 1802 MPM.addPass(PGOIndirectCallPromotion( 1803 true /* InLTO */, PGOOpt && PGOOpt->Action == PGOOptions::SampleUse)); 1804 1805 // Propagate constants at call sites into the functions they call. This 1806 // opens opportunities for globalopt (and inlining) by substituting function 1807 // pointers passed as arguments to direct uses of functions. 1808 MPM.addPass(IPSCCPPass(IPSCCPOptions(/*AllowFuncSpec=*/ 1809 Level != OptimizationLevel::Os && 1810 Level != OptimizationLevel::Oz))); 1811 1812 // Attach metadata to indirect call sites indicating the set of functions 1813 // they may target at run-time. This should follow IPSCCP. 1814 MPM.addPass(CalledValuePropagationPass()); 1815 } 1816 1817 // Now deduce any function attributes based in the current code. 1818 MPM.addPass( 1819 createModuleToPostOrderCGSCCPassAdaptor(PostOrderFunctionAttrsPass())); 1820 1821 // Do RPO function attribute inference across the module to forward-propagate 1822 // attributes where applicable. 1823 // FIXME: Is this really an optimization rather than a canonicalization? 1824 MPM.addPass(ReversePostOrderFunctionAttrsPass()); 1825 1826 // Use in-range annotations on GEP indices to split globals where beneficial. 1827 MPM.addPass(GlobalSplitPass()); 1828 1829 // Run whole program optimization of virtual call when the list of callees 1830 // is fixed. 1831 MPM.addPass(WholeProgramDevirtPass(ExportSummary, nullptr)); 1832 1833 // Stop here at -O1. 1834 if (Level == OptimizationLevel::O1) { 1835 // The LowerTypeTestsPass needs to run to lower type metadata and the 1836 // type.test intrinsics. The pass does nothing if CFI is disabled. 1837 MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr)); 1838 // Run a second time to clean up any type tests left behind by WPD for use 1839 // in ICP (which is performed earlier than this in the regular LTO 1840 // pipeline). 1841 MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); 1842 1843 invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level); 1844 1845 // Emit annotation remarks. 1846 addAnnotationRemarksPass(MPM); 1847 1848 return MPM; 1849 } 1850 1851 // Optimize globals to try and fold them into constants. 1852 MPM.addPass(GlobalOptPass()); 1853 1854 // Promote any localized globals to SSA registers. 1855 MPM.addPass(createModuleToFunctionPassAdaptor(PromotePass())); 1856 1857 // Linking modules together can lead to duplicate global constant, only 1858 // keep one copy of each constant. 1859 MPM.addPass(ConstantMergePass()); 1860 1861 // Remove unused arguments from functions. 1862 MPM.addPass(DeadArgumentEliminationPass()); 1863 1864 // Reduce the code after globalopt and ipsccp. Both can open up significant 1865 // simplification opportunities, and both can propagate functions through 1866 // function pointers. When this happens, we often have to resolve varargs 1867 // calls, etc, so let instcombine do this. 1868 FunctionPassManager PeepholeFPM; 1869 PeepholeFPM.addPass(InstCombinePass()); 1870 if (Level.getSpeedupLevel() > 1) 1871 PeepholeFPM.addPass(AggressiveInstCombinePass()); 1872 invokePeepholeEPCallbacks(PeepholeFPM, Level); 1873 1874 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(PeepholeFPM), 1875 PTO.EagerlyInvalidateAnalyses)); 1876 1877 // Note: historically, the PruneEH pass was run first to deduce nounwind and 1878 // generally clean up exception handling overhead. It isn't clear this is 1879 // valuable as the inliner doesn't currently care whether it is inlining an 1880 // invoke or a call. 1881 // Run the inliner now. 1882 if (EnableModuleInliner) { 1883 MPM.addPass(ModuleInlinerPass(getInlineParamsFromOptLevel(Level), 1884 UseInlineAdvisor, 1885 ThinOrFullLTOPhase::FullLTOPostLink)); 1886 } else { 1887 MPM.addPass(ModuleInlinerWrapperPass( 1888 getInlineParamsFromOptLevel(Level), 1889 /* MandatoryFirst */ true, 1890 InlineContext{ThinOrFullLTOPhase::FullLTOPostLink, 1891 InlinePass::CGSCCInliner})); 1892 } 1893 1894 // Perform context disambiguation after inlining, since that would reduce the 1895 // amount of additional cloning required to distinguish the allocation 1896 // contexts. 1897 if (EnableMemProfContextDisambiguation) 1898 MPM.addPass(MemProfContextDisambiguation()); 1899 1900 // Optimize globals again after we ran the inliner. 1901 MPM.addPass(GlobalOptPass()); 1902 1903 // Run the OpenMPOpt pass again after global optimizations. 1904 MPM.addPass(OpenMPOptPass(ThinOrFullLTOPhase::FullLTOPostLink)); 1905 1906 // Garbage collect dead functions. 1907 MPM.addPass(GlobalDCEPass(/*InLTOPostLink=*/true)); 1908 1909 // If we didn't decide to inline a function, check to see if we can 1910 // transform it to pass arguments by value instead of by reference. 1911 MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(ArgumentPromotionPass())); 1912 1913 FunctionPassManager FPM; 1914 // The IPO Passes may leave cruft around. Clean up after them. 1915 FPM.addPass(InstCombinePass()); 1916 invokePeepholeEPCallbacks(FPM, Level); 1917 1918 if (EnableConstraintElimination) 1919 FPM.addPass(ConstraintEliminationPass()); 1920 1921 FPM.addPass(JumpThreadingPass()); 1922 1923 // Do a post inline PGO instrumentation and use pass. This is a context 1924 // sensitive PGO pass. 1925 if (PGOOpt) { 1926 if (PGOOpt->CSAction == PGOOptions::CSIRInstr) 1927 addPGOInstrPasses(MPM, Level, /*RunProfileGen=*/true, 1928 /*IsCS=*/true, PGOOpt->AtomicCounterUpdate, 1929 PGOOpt->CSProfileGenFile, PGOOpt->ProfileRemappingFile, 1930 PGOOpt->FS); 1931 else if (PGOOpt->CSAction == PGOOptions::CSIRUse) 1932 addPGOInstrPasses(MPM, Level, /*RunProfileGen=*/false, 1933 /*IsCS=*/true, PGOOpt->AtomicCounterUpdate, 1934 PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile, 1935 PGOOpt->FS); 1936 } 1937 1938 // Break up allocas 1939 FPM.addPass(SROAPass(SROAOptions::ModifyCFG)); 1940 1941 // LTO provides additional opportunities for tailcall elimination due to 1942 // link-time inlining, and visibility of nocapture attribute. 1943 FPM.addPass(TailCallElimPass()); 1944 1945 // Run a few AA driver optimizations here and now to cleanup the code. 1946 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM), 1947 PTO.EagerlyInvalidateAnalyses)); 1948 1949 MPM.addPass( 1950 createModuleToPostOrderCGSCCPassAdaptor(PostOrderFunctionAttrsPass())); 1951 1952 // Require the GlobalsAA analysis for the module so we can query it within 1953 // MainFPM. 1954 if (EnableGlobalAnalyses) { 1955 MPM.addPass(RequireAnalysisPass<GlobalsAA, Module>()); 1956 // Invalidate AAManager so it can be recreated and pick up the newly 1957 // available GlobalsAA. 1958 MPM.addPass( 1959 createModuleToFunctionPassAdaptor(InvalidateAnalysisPass<AAManager>())); 1960 } 1961 1962 FunctionPassManager MainFPM; 1963 MainFPM.addPass(createFunctionToLoopPassAdaptor( 1964 LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 1965 /*AllowSpeculation=*/true), 1966 /*USeMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false)); 1967 1968 if (RunNewGVN) 1969 MainFPM.addPass(NewGVNPass()); 1970 else 1971 MainFPM.addPass(GVNPass()); 1972 1973 // Remove dead memcpy()'s. 1974 MainFPM.addPass(MemCpyOptPass()); 1975 1976 // Nuke dead stores. 1977 MainFPM.addPass(DSEPass()); 1978 MainFPM.addPass(MoveAutoInitPass()); 1979 MainFPM.addPass(MergedLoadStoreMotionPass()); 1980 1981 LoopPassManager LPM; 1982 if (EnableLoopFlatten && Level.getSpeedupLevel() > 1) 1983 LPM.addPass(LoopFlattenPass()); 1984 LPM.addPass(IndVarSimplifyPass()); 1985 LPM.addPass(LoopDeletionPass()); 1986 // FIXME: Add loop interchange. 1987 1988 // Unroll small loops and perform peeling. 1989 LPM.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(), 1990 /* OnlyWhenForced= */ !PTO.LoopUnrolling, 1991 PTO.ForgetAllSCEVInLoopUnroll)); 1992 // The loop passes in LPM (LoopFullUnrollPass) do not preserve MemorySSA. 1993 // *All* loop passes must preserve it, in order to be able to use it. 1994 MainFPM.addPass(createFunctionToLoopPassAdaptor( 1995 std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/true)); 1996 1997 MainFPM.addPass(LoopDistributePass()); 1998 1999 addVectorPasses(Level, MainFPM, /* IsFullLTO */ true); 2000 2001 // Run the OpenMPOpt CGSCC pass again late. 2002 MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor( 2003 OpenMPOptCGSCCPass(ThinOrFullLTOPhase::FullLTOPostLink))); 2004 2005 invokePeepholeEPCallbacks(MainFPM, Level); 2006 MainFPM.addPass(JumpThreadingPass()); 2007 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(MainFPM), 2008 PTO.EagerlyInvalidateAnalyses)); 2009 2010 // Lower type metadata and the type.test intrinsic. This pass supports 2011 // clang's control flow integrity mechanisms (-fsanitize=cfi*) and needs 2012 // to be run at link time if CFI is enabled. This pass does nothing if 2013 // CFI is disabled. 2014 MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr)); 2015 // Run a second time to clean up any type tests left behind by WPD for use 2016 // in ICP (which is performed earlier than this in the regular LTO pipeline). 2017 MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); 2018 2019 // Enable splitting late in the FullLTO post-link pipeline. 2020 if (EnableHotColdSplit) 2021 MPM.addPass(HotColdSplittingPass()); 2022 2023 // Add late LTO optimization passes. 2024 FunctionPassManager LateFPM; 2025 2026 // LoopSink pass sinks instructions hoisted by LICM, which serves as a 2027 // canonicalization pass that enables other optimizations. As a result, 2028 // LoopSink pass needs to be a very late IR pass to avoid undoing LICM 2029 // result too early. 2030 LateFPM.addPass(LoopSinkPass()); 2031 2032 // This hoists/decomposes div/rem ops. It should run after other sink/hoist 2033 // passes to avoid re-sinking, but before SimplifyCFG because it can allow 2034 // flattening of blocks. 2035 LateFPM.addPass(DivRemPairsPass()); 2036 2037 // Delete basic blocks, which optimization passes may have killed. 2038 LateFPM.addPass(SimplifyCFGPass(SimplifyCFGOptions() 2039 .convertSwitchRangeToICmp(true) 2040 .hoistCommonInsts(true) 2041 .speculateUnpredictables(true))); 2042 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(LateFPM))); 2043 2044 // Drop bodies of available eternally objects to improve GlobalDCE. 2045 MPM.addPass(EliminateAvailableExternallyPass()); 2046 2047 // Now that we have optimized the program, discard unreachable functions. 2048 MPM.addPass(GlobalDCEPass(/*InLTOPostLink=*/true)); 2049 2050 if (PTO.MergeFunctions) 2051 MPM.addPass(MergeFunctionsPass()); 2052 2053 if (PTO.CallGraphProfile) 2054 MPM.addPass(CGProfilePass(/*InLTOPostLink=*/true)); 2055 2056 invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level); 2057 2058 // Emit annotation remarks. 2059 addAnnotationRemarksPass(MPM); 2060 2061 return MPM; 2062 } 2063 2064 ModulePassManager PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level, 2065 bool LTOPreLink) { 2066 assert(Level == OptimizationLevel::O0 && 2067 "buildO0DefaultPipeline should only be used with O0"); 2068 2069 ModulePassManager MPM; 2070 2071 // Perform pseudo probe instrumentation in O0 mode. This is for the 2072 // consistency between different build modes. For example, a LTO build can be 2073 // mixed with an O0 prelink and an O2 postlink. Loading a sample profile in 2074 // the postlink will require pseudo probe instrumentation in the prelink. 2075 if (PGOOpt && PGOOpt->PseudoProbeForProfiling) 2076 MPM.addPass(SampleProfileProbePass(TM)); 2077 2078 if (PGOOpt && (PGOOpt->Action == PGOOptions::IRInstr || 2079 PGOOpt->Action == PGOOptions::IRUse)) 2080 addPGOInstrPassesForO0( 2081 MPM, 2082 /*RunProfileGen=*/(PGOOpt->Action == PGOOptions::IRInstr), 2083 /*IsCS=*/false, PGOOpt->AtomicCounterUpdate, PGOOpt->ProfileFile, 2084 PGOOpt->ProfileRemappingFile, PGOOpt->FS); 2085 2086 // Instrument function entry and exit before all inlining. 2087 MPM.addPass(createModuleToFunctionPassAdaptor( 2088 EntryExitInstrumenterPass(/*PostInlining=*/false))); 2089 2090 invokePipelineStartEPCallbacks(MPM, Level); 2091 2092 if (PGOOpt && PGOOpt->DebugInfoForProfiling) 2093 MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass())); 2094 2095 invokePipelineEarlySimplificationEPCallbacks(MPM, Level); 2096 2097 // Build a minimal pipeline based on the semantics required by LLVM, 2098 // which is just that always inlining occurs. Further, disable generating 2099 // lifetime intrinsics to avoid enabling further optimizations during 2100 // code generation. 2101 MPM.addPass(AlwaysInlinerPass( 2102 /*InsertLifetimeIntrinsics=*/false)); 2103 2104 if (PTO.MergeFunctions) 2105 MPM.addPass(MergeFunctionsPass()); 2106 2107 if (EnableMatrix) 2108 MPM.addPass( 2109 createModuleToFunctionPassAdaptor(LowerMatrixIntrinsicsPass(true))); 2110 2111 if (!CGSCCOptimizerLateEPCallbacks.empty()) { 2112 CGSCCPassManager CGPM; 2113 invokeCGSCCOptimizerLateEPCallbacks(CGPM, Level); 2114 if (!CGPM.isEmpty()) 2115 MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM))); 2116 } 2117 if (!LateLoopOptimizationsEPCallbacks.empty()) { 2118 LoopPassManager LPM; 2119 invokeLateLoopOptimizationsEPCallbacks(LPM, Level); 2120 if (!LPM.isEmpty()) { 2121 MPM.addPass(createModuleToFunctionPassAdaptor( 2122 createFunctionToLoopPassAdaptor(std::move(LPM)))); 2123 } 2124 } 2125 if (!LoopOptimizerEndEPCallbacks.empty()) { 2126 LoopPassManager LPM; 2127 invokeLoopOptimizerEndEPCallbacks(LPM, Level); 2128 if (!LPM.isEmpty()) { 2129 MPM.addPass(createModuleToFunctionPassAdaptor( 2130 createFunctionToLoopPassAdaptor(std::move(LPM)))); 2131 } 2132 } 2133 if (!ScalarOptimizerLateEPCallbacks.empty()) { 2134 FunctionPassManager FPM; 2135 invokeScalarOptimizerLateEPCallbacks(FPM, Level); 2136 if (!FPM.isEmpty()) 2137 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); 2138 } 2139 2140 invokeOptimizerEarlyEPCallbacks(MPM, Level); 2141 2142 if (!VectorizerStartEPCallbacks.empty()) { 2143 FunctionPassManager FPM; 2144 invokeVectorizerStartEPCallbacks(FPM, Level); 2145 if (!FPM.isEmpty()) 2146 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); 2147 } 2148 2149 ModulePassManager CoroPM; 2150 CoroPM.addPass(CoroEarlyPass()); 2151 CGSCCPassManager CGPM; 2152 CGPM.addPass(CoroSplitPass()); 2153 CoroPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM))); 2154 CoroPM.addPass(CoroCleanupPass()); 2155 CoroPM.addPass(GlobalDCEPass()); 2156 MPM.addPass(CoroConditionalWrapper(std::move(CoroPM))); 2157 2158 invokeOptimizerLastEPCallbacks(MPM, Level); 2159 2160 if (LTOPreLink) 2161 addRequiredLTOPreLinkPasses(MPM); 2162 2163 MPM.addPass(createModuleToFunctionPassAdaptor(AnnotationRemarksPass())); 2164 2165 return MPM; 2166 } 2167 2168 AAManager PassBuilder::buildDefaultAAPipeline() { 2169 AAManager AA; 2170 2171 // The order in which these are registered determines their priority when 2172 // being queried. 2173 2174 // First we register the basic alias analysis that provides the majority of 2175 // per-function local AA logic. This is a stateless, on-demand local set of 2176 // AA techniques. 2177 AA.registerFunctionAnalysis<BasicAA>(); 2178 2179 // Next we query fast, specialized alias analyses that wrap IR-embedded 2180 // information about aliasing. 2181 AA.registerFunctionAnalysis<ScopedNoAliasAA>(); 2182 AA.registerFunctionAnalysis<TypeBasedAA>(); 2183 2184 // Add support for querying global aliasing information when available. 2185 // Because the `AAManager` is a function analysis and `GlobalsAA` is a module 2186 // analysis, all that the `AAManager` can do is query for any *cached* 2187 // results from `GlobalsAA` through a readonly proxy. 2188 if (EnableGlobalAnalyses) 2189 AA.registerModuleAnalysis<GlobalsAA>(); 2190 2191 // Add target-specific alias analyses. 2192 if (TM) 2193 TM->registerDefaultAliasAnalyses(AA); 2194 2195 return AA; 2196 } 2197