1 //===- Construction of pass pipelines -------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// 10 /// This file provides the implementation of the PassBuilder based on our 11 /// static pass registry as well as related functionality. It also provides 12 /// helpers to aid in analyzing, debugging, and testing passes and pass 13 /// pipelines. 14 /// 15 //===----------------------------------------------------------------------===// 16 17 #include "llvm/ADT/Statistic.h" 18 #include "llvm/Analysis/AliasAnalysis.h" 19 #include "llvm/Analysis/BasicAliasAnalysis.h" 20 #include "llvm/Analysis/CGSCCPassManager.h" 21 #include "llvm/Analysis/GlobalsModRef.h" 22 #include "llvm/Analysis/InlineAdvisor.h" 23 #include "llvm/Analysis/ProfileSummaryInfo.h" 24 #include "llvm/Analysis/ScopedNoAliasAA.h" 25 #include "llvm/Analysis/TypeBasedAliasAnalysis.h" 26 #include "llvm/IR/PassManager.h" 27 #include "llvm/Passes/OptimizationLevel.h" 28 #include "llvm/Passes/PassBuilder.h" 29 #include "llvm/Support/CommandLine.h" 30 #include "llvm/Support/ErrorHandling.h" 31 #include "llvm/Support/PGOOptions.h" 32 #include "llvm/Support/VirtualFileSystem.h" 33 #include "llvm/Target/TargetMachine.h" 34 #include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h" 35 #include "llvm/Transforms/Coroutines/CoroCleanup.h" 36 #include "llvm/Transforms/Coroutines/CoroConditionalWrapper.h" 37 #include "llvm/Transforms/Coroutines/CoroEarly.h" 38 #include "llvm/Transforms/Coroutines/CoroElide.h" 39 #include "llvm/Transforms/Coroutines/CoroSplit.h" 40 #include "llvm/Transforms/HipStdPar/HipStdPar.h" 41 #include "llvm/Transforms/IPO/AlwaysInliner.h" 42 #include 
"llvm/Transforms/IPO/Annotation2Metadata.h" 43 #include "llvm/Transforms/IPO/ArgumentPromotion.h" 44 #include "llvm/Transforms/IPO/Attributor.h" 45 #include "llvm/Transforms/IPO/CalledValuePropagation.h" 46 #include "llvm/Transforms/IPO/ConstantMerge.h" 47 #include "llvm/Transforms/IPO/CrossDSOCFI.h" 48 #include "llvm/Transforms/IPO/DeadArgumentElimination.h" 49 #include "llvm/Transforms/IPO/ElimAvailExtern.h" 50 #include "llvm/Transforms/IPO/EmbedBitcodePass.h" 51 #include "llvm/Transforms/IPO/ForceFunctionAttrs.h" 52 #include "llvm/Transforms/IPO/FunctionAttrs.h" 53 #include "llvm/Transforms/IPO/GlobalDCE.h" 54 #include "llvm/Transforms/IPO/GlobalOpt.h" 55 #include "llvm/Transforms/IPO/GlobalSplit.h" 56 #include "llvm/Transforms/IPO/HotColdSplitting.h" 57 #include "llvm/Transforms/IPO/IROutliner.h" 58 #include "llvm/Transforms/IPO/InferFunctionAttrs.h" 59 #include "llvm/Transforms/IPO/Inliner.h" 60 #include "llvm/Transforms/IPO/LowerTypeTests.h" 61 #include "llvm/Transforms/IPO/MemProfContextDisambiguation.h" 62 #include "llvm/Transforms/IPO/MergeFunctions.h" 63 #include "llvm/Transforms/IPO/ModuleInliner.h" 64 #include "llvm/Transforms/IPO/OpenMPOpt.h" 65 #include "llvm/Transforms/IPO/PartialInlining.h" 66 #include "llvm/Transforms/IPO/SCCP.h" 67 #include "llvm/Transforms/IPO/SampleProfile.h" 68 #include "llvm/Transforms/IPO/SampleProfileProbe.h" 69 #include "llvm/Transforms/IPO/SyntheticCountsPropagation.h" 70 #include "llvm/Transforms/IPO/WholeProgramDevirt.h" 71 #include "llvm/Transforms/InstCombine/InstCombine.h" 72 #include "llvm/Transforms/Instrumentation/CGProfile.h" 73 #include "llvm/Transforms/Instrumentation/ControlHeightReduction.h" 74 #include "llvm/Transforms/Instrumentation/InstrOrderFile.h" 75 #include "llvm/Transforms/Instrumentation/InstrProfiling.h" 76 #include "llvm/Transforms/Instrumentation/MemProfiler.h" 77 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" 78 #include "llvm/Transforms/Scalar/ADCE.h" 79 #include 
"llvm/Transforms/Scalar/AlignmentFromAssumptions.h" 80 #include "llvm/Transforms/Scalar/AnnotationRemarks.h" 81 #include "llvm/Transforms/Scalar/BDCE.h" 82 #include "llvm/Transforms/Scalar/CallSiteSplitting.h" 83 #include "llvm/Transforms/Scalar/ConstraintElimination.h" 84 #include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h" 85 #include "llvm/Transforms/Scalar/DFAJumpThreading.h" 86 #include "llvm/Transforms/Scalar/DeadStoreElimination.h" 87 #include "llvm/Transforms/Scalar/DivRemPairs.h" 88 #include "llvm/Transforms/Scalar/EarlyCSE.h" 89 #include "llvm/Transforms/Scalar/Float2Int.h" 90 #include "llvm/Transforms/Scalar/GVN.h" 91 #include "llvm/Transforms/Scalar/IndVarSimplify.h" 92 #include "llvm/Transforms/Scalar/InferAlignment.h" 93 #include "llvm/Transforms/Scalar/InstSimplifyPass.h" 94 #include "llvm/Transforms/Scalar/JumpThreading.h" 95 #include "llvm/Transforms/Scalar/LICM.h" 96 #include "llvm/Transforms/Scalar/LoopDeletion.h" 97 #include "llvm/Transforms/Scalar/LoopDistribute.h" 98 #include "llvm/Transforms/Scalar/LoopFlatten.h" 99 #include "llvm/Transforms/Scalar/LoopIdiomRecognize.h" 100 #include "llvm/Transforms/Scalar/LoopInstSimplify.h" 101 #include "llvm/Transforms/Scalar/LoopInterchange.h" 102 #include "llvm/Transforms/Scalar/LoopLoadElimination.h" 103 #include "llvm/Transforms/Scalar/LoopPassManager.h" 104 #include "llvm/Transforms/Scalar/LoopRotation.h" 105 #include "llvm/Transforms/Scalar/LoopSimplifyCFG.h" 106 #include "llvm/Transforms/Scalar/LoopSink.h" 107 #include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h" 108 #include "llvm/Transforms/Scalar/LoopUnrollPass.h" 109 #include "llvm/Transforms/Scalar/LoopVersioningLICM.h" 110 #include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h" 111 #include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h" 112 #include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h" 113 #include "llvm/Transforms/Scalar/MemCpyOptimizer.h" 114 #include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h" 115 #include 
"llvm/Transforms/Scalar/NewGVN.h" 116 #include "llvm/Transforms/Scalar/Reassociate.h" 117 #include "llvm/Transforms/Scalar/SCCP.h" 118 #include "llvm/Transforms/Scalar/SROA.h" 119 #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h" 120 #include "llvm/Transforms/Scalar/SimplifyCFG.h" 121 #include "llvm/Transforms/Scalar/SpeculativeExecution.h" 122 #include "llvm/Transforms/Scalar/TailRecursionElimination.h" 123 #include "llvm/Transforms/Scalar/WarnMissedTransforms.h" 124 #include "llvm/Transforms/Utils/AddDiscriminators.h" 125 #include "llvm/Transforms/Utils/AssumeBundleBuilder.h" 126 #include "llvm/Transforms/Utils/CanonicalizeAliases.h" 127 #include "llvm/Transforms/Utils/CountVisits.h" 128 #include "llvm/Transforms/Utils/InjectTLIMappings.h" 129 #include "llvm/Transforms/Utils/LibCallsShrinkWrap.h" 130 #include "llvm/Transforms/Utils/Mem2Reg.h" 131 #include "llvm/Transforms/Utils/MoveAutoInit.h" 132 #include "llvm/Transforms/Utils/NameAnonGlobals.h" 133 #include "llvm/Transforms/Utils/RelLookupTableConverter.h" 134 #include "llvm/Transforms/Utils/SimplifyCFGOptions.h" 135 #include "llvm/Transforms/Vectorize/LoopVectorize.h" 136 #include "llvm/Transforms/Vectorize/SLPVectorizer.h" 137 #include "llvm/Transforms/Vectorize/VectorCombine.h" 138 139 using namespace llvm; 140 141 static cl::opt<InliningAdvisorMode> UseInlineAdvisor( 142 "enable-ml-inliner", cl::init(InliningAdvisorMode::Default), cl::Hidden, 143 cl::desc("Enable ML policy for inliner. 
Currently trained for -Oz only"), 144 cl::values(clEnumValN(InliningAdvisorMode::Default, "default", 145 "Heuristics-based inliner version"), 146 clEnumValN(InliningAdvisorMode::Development, "development", 147 "Use development mode (runtime-loadable model)"), 148 clEnumValN(InliningAdvisorMode::Release, "release", 149 "Use release mode (AOT-compiled model)"))); 150 151 static cl::opt<bool> EnableSyntheticCounts( 152 "enable-npm-synthetic-counts", cl::Hidden, 153 cl::desc("Run synthetic function entry count generation " 154 "pass")); 155 156 /// Flag to enable inline deferral during PGO. 157 static cl::opt<bool> 158 EnablePGOInlineDeferral("enable-npm-pgo-inline-deferral", cl::init(true), 159 cl::Hidden, 160 cl::desc("Enable inline deferral during PGO")); 161 162 static cl::opt<bool> EnableModuleInliner("enable-module-inliner", 163 cl::init(false), cl::Hidden, 164 cl::desc("Enable module inliner")); 165 166 static cl::opt<bool> PerformMandatoryInliningsFirst( 167 "mandatory-inlining-first", cl::init(false), cl::Hidden, 168 cl::desc("Perform mandatory inlinings module-wide, before performing " 169 "inlining")); 170 171 static cl::opt<bool> EnableEagerlyInvalidateAnalyses( 172 "eagerly-invalidate-analyses", cl::init(true), cl::Hidden, 173 cl::desc("Eagerly invalidate more analyses in default pipelines")); 174 175 static cl::opt<bool> EnableMergeFunctions( 176 "enable-merge-functions", cl::init(false), cl::Hidden, 177 cl::desc("Enable function merging as part of the optimization pipeline")); 178 179 static cl::opt<bool> EnablePostPGOLoopRotation( 180 "enable-post-pgo-loop-rotation", cl::init(true), cl::Hidden, 181 cl::desc("Run the loop rotation transformation after PGO instrumentation")); 182 183 static cl::opt<bool> EnableGlobalAnalyses( 184 "enable-global-analyses", cl::init(true), cl::Hidden, 185 cl::desc("Enable inter-procedural analyses")); 186 187 static cl::opt<bool> 188 RunPartialInlining("enable-partial-inlining", cl::init(false), cl::Hidden, 189 
cl::desc("Run Partial inlinining pass")); 190 191 static cl::opt<bool> ExtraVectorizerPasses( 192 "extra-vectorizer-passes", cl::init(false), cl::Hidden, 193 cl::desc("Run cleanup optimization passes after vectorization")); 194 195 static cl::opt<bool> RunNewGVN("enable-newgvn", cl::init(false), cl::Hidden, 196 cl::desc("Run the NewGVN pass")); 197 198 static cl::opt<bool> EnableLoopInterchange( 199 "enable-loopinterchange", cl::init(false), cl::Hidden, 200 cl::desc("Enable the experimental LoopInterchange Pass")); 201 202 static cl::opt<bool> EnableUnrollAndJam("enable-unroll-and-jam", 203 cl::init(false), cl::Hidden, 204 cl::desc("Enable Unroll And Jam Pass")); 205 206 static cl::opt<bool> EnableLoopFlatten("enable-loop-flatten", cl::init(false), 207 cl::Hidden, 208 cl::desc("Enable the LoopFlatten Pass")); 209 210 static cl::opt<bool> 211 EnableDFAJumpThreading("enable-dfa-jump-thread", 212 cl::desc("Enable DFA jump threading"), 213 cl::init(false), cl::Hidden); 214 215 static cl::opt<bool> 216 EnableHotColdSplit("hot-cold-split", 217 cl::desc("Enable hot-cold splitting pass")); 218 219 static cl::opt<bool> EnableIROutliner("ir-outliner", cl::init(false), 220 cl::Hidden, 221 cl::desc("Enable ir outliner pass")); 222 223 static cl::opt<bool> 224 DisablePreInliner("disable-preinline", cl::init(false), cl::Hidden, 225 cl::desc("Disable pre-instrumentation inliner")); 226 227 static cl::opt<int> PreInlineThreshold( 228 "preinline-threshold", cl::Hidden, cl::init(75), 229 cl::desc("Control the amount of inlining in pre-instrumentation inliner " 230 "(default = 75)")); 231 232 static cl::opt<bool> 233 EnableGVNHoist("enable-gvn-hoist", 234 cl::desc("Enable the GVN hoisting pass (default = off)")); 235 236 static cl::opt<bool> 237 EnableGVNSink("enable-gvn-sink", 238 cl::desc("Enable the GVN sinking pass (default = off)")); 239 240 // This option is used in simplifying testing SampleFDO optimizations for 241 // profile loading. 
242 static cl::opt<bool> 243 EnableCHR("enable-chr", cl::init(true), cl::Hidden, 244 cl::desc("Enable control height reduction optimization (CHR)")); 245 246 static cl::opt<bool> FlattenedProfileUsed( 247 "flattened-profile-used", cl::init(false), cl::Hidden, 248 cl::desc("Indicate the sample profile being used is flattened, i.e., " 249 "no inline hierachy exists in the profile")); 250 251 static cl::opt<bool> EnableOrderFileInstrumentation( 252 "enable-order-file-instrumentation", cl::init(false), cl::Hidden, 253 cl::desc("Enable order file instrumentation (default = off)")); 254 255 static cl::opt<bool> 256 EnableMatrix("enable-matrix", cl::init(false), cl::Hidden, 257 cl::desc("Enable lowering of the matrix intrinsics")); 258 259 static cl::opt<bool> EnableConstraintElimination( 260 "enable-constraint-elimination", cl::init(true), cl::Hidden, 261 cl::desc( 262 "Enable pass to eliminate conditions based on linear constraints")); 263 264 static cl::opt<AttributorRunOption> AttributorRun( 265 "attributor-enable", cl::Hidden, cl::init(AttributorRunOption::NONE), 266 cl::desc("Enable the attributor inter-procedural deduction pass"), 267 cl::values(clEnumValN(AttributorRunOption::ALL, "all", 268 "enable all attributor runs"), 269 clEnumValN(AttributorRunOption::MODULE, "module", 270 "enable module-wide attributor runs"), 271 clEnumValN(AttributorRunOption::CGSCC, "cgscc", 272 "enable call graph SCC attributor runs"), 273 clEnumValN(AttributorRunOption::NONE, "none", 274 "disable attributor runs"))); 275 276 static cl::opt<bool> UseLoopVersioningLICM( 277 "enable-loop-versioning-licm", cl::init(false), cl::Hidden, 278 cl::desc("Enable the experimental Loop Versioning LICM pass")); 279 280 namespace llvm { 281 cl::opt<bool> EnableMemProfContextDisambiguation( 282 "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden, 283 cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation")); 284 285 extern cl::opt<bool> EnableInferAlignmentPass; 286 } // 
namespace llvm 287 288 PipelineTuningOptions::PipelineTuningOptions() { 289 LoopInterleaving = true; 290 LoopVectorization = true; 291 SLPVectorization = false; 292 LoopUnrolling = true; 293 ForgetAllSCEVInLoopUnroll = ForgetSCEVInLoopUnroll; 294 LicmMssaOptCap = SetLicmMssaOptCap; 295 LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap; 296 CallGraphProfile = true; 297 UnifiedLTO = false; 298 MergeFunctions = EnableMergeFunctions; 299 InlinerThreshold = -1; 300 EagerlyInvalidateAnalyses = EnableEagerlyInvalidateAnalyses; 301 } 302 303 namespace llvm { 304 extern cl::opt<unsigned> MaxDevirtIterations; 305 } // namespace llvm 306 307 void PassBuilder::invokePeepholeEPCallbacks(FunctionPassManager &FPM, 308 OptimizationLevel Level) { 309 for (auto &C : PeepholeEPCallbacks) 310 C(FPM, Level); 311 } 312 void PassBuilder::invokeLateLoopOptimizationsEPCallbacks( 313 LoopPassManager &LPM, OptimizationLevel Level) { 314 for (auto &C : LateLoopOptimizationsEPCallbacks) 315 C(LPM, Level); 316 } 317 void PassBuilder::invokeLoopOptimizerEndEPCallbacks(LoopPassManager &LPM, 318 OptimizationLevel Level) { 319 for (auto &C : LoopOptimizerEndEPCallbacks) 320 C(LPM, Level); 321 } 322 void PassBuilder::invokeScalarOptimizerLateEPCallbacks( 323 FunctionPassManager &FPM, OptimizationLevel Level) { 324 for (auto &C : ScalarOptimizerLateEPCallbacks) 325 C(FPM, Level); 326 } 327 void PassBuilder::invokeCGSCCOptimizerLateEPCallbacks(CGSCCPassManager &CGPM, 328 OptimizationLevel Level) { 329 for (auto &C : CGSCCOptimizerLateEPCallbacks) 330 C(CGPM, Level); 331 } 332 void PassBuilder::invokeVectorizerStartEPCallbacks(FunctionPassManager &FPM, 333 OptimizationLevel Level) { 334 for (auto &C : VectorizerStartEPCallbacks) 335 C(FPM, Level); 336 } 337 void PassBuilder::invokeOptimizerEarlyEPCallbacks(ModulePassManager &MPM, 338 OptimizationLevel Level) { 339 for (auto &C : OptimizerEarlyEPCallbacks) 340 C(MPM, Level); 341 } 342 void 
PassBuilder::invokeOptimizerLastEPCallbacks(ModulePassManager &MPM, 343 OptimizationLevel Level) { 344 for (auto &C : OptimizerLastEPCallbacks) 345 C(MPM, Level); 346 } 347 void PassBuilder::invokeFullLinkTimeOptimizationEarlyEPCallbacks( 348 ModulePassManager &MPM, OptimizationLevel Level) { 349 for (auto &C : FullLinkTimeOptimizationEarlyEPCallbacks) 350 C(MPM, Level); 351 } 352 void PassBuilder::invokeFullLinkTimeOptimizationLastEPCallbacks( 353 ModulePassManager &MPM, OptimizationLevel Level) { 354 for (auto &C : FullLinkTimeOptimizationLastEPCallbacks) 355 C(MPM, Level); 356 } 357 void PassBuilder::invokePipelineStartEPCallbacks(ModulePassManager &MPM, 358 OptimizationLevel Level) { 359 for (auto &C : PipelineStartEPCallbacks) 360 C(MPM, Level); 361 } 362 void PassBuilder::invokePipelineEarlySimplificationEPCallbacks( 363 ModulePassManager &MPM, OptimizationLevel Level) { 364 for (auto &C : PipelineEarlySimplificationEPCallbacks) 365 C(MPM, Level); 366 } 367 368 // Helper to add AnnotationRemarksPass. 369 static void addAnnotationRemarksPass(ModulePassManager &MPM) { 370 MPM.addPass(createModuleToFunctionPassAdaptor(AnnotationRemarksPass())); 371 } 372 373 // Helper to check if the current compilation phase is preparing for LTO 374 static bool isLTOPreLink(ThinOrFullLTOPhase Phase) { 375 return Phase == ThinOrFullLTOPhase::ThinLTOPreLink || 376 Phase == ThinOrFullLTOPhase::FullLTOPreLink; 377 } 378 379 // TODO: Investigate the cost/benefit of tail call elimination on debugging. 380 FunctionPassManager 381 PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, 382 ThinOrFullLTOPhase Phase) { 383 384 FunctionPassManager FPM; 385 386 if (AreStatisticsEnabled()) 387 FPM.addPass(CountVisitsPass()); 388 389 // Form SSA out of local memory accesses after breaking apart aggregates into 390 // scalars. 391 FPM.addPass(SROAPass(SROAOptions::ModifyCFG)); 392 393 // Catch trivial redundancies 394 FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. 
*/)); 395 396 // Hoisting of scalars and load expressions. 397 FPM.addPass( 398 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 399 FPM.addPass(InstCombinePass()); 400 401 FPM.addPass(LibCallsShrinkWrapPass()); 402 403 invokePeepholeEPCallbacks(FPM, Level); 404 405 FPM.addPass( 406 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 407 408 // Form canonically associated expression trees, and simplify the trees using 409 // basic mathematical properties. For example, this will form (nearly) 410 // minimal multiplication trees. 411 FPM.addPass(ReassociatePass()); 412 413 // Add the primary loop simplification pipeline. 414 // FIXME: Currently this is split into two loop pass pipelines because we run 415 // some function passes in between them. These can and should be removed 416 // and/or replaced by scheduling the loop pass equivalents in the correct 417 // positions. But those equivalent passes aren't powerful enough yet. 418 // Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still 419 // used. We have `LoopSimplifyCFGPass` which isn't yet powerful enough yet to 420 // fully replace `SimplifyCFGPass`, and the closest to the other we have is 421 // `LoopInstSimplify`. 422 LoopPassManager LPM1, LPM2; 423 424 // Simplify the loop body. We do this initially to clean up after other loop 425 // passes run, either when iterating on a loop or on inner loops with 426 // implications on the outer loop. 427 LPM1.addPass(LoopInstSimplifyPass()); 428 LPM1.addPass(LoopSimplifyCFGPass()); 429 430 // Try to remove as much code from the loop header as possible, 431 // to reduce amount of IR that will have to be duplicated. However, 432 // do not perform speculative hoisting the first time as LICM 433 // will destroy metadata that may not need to be destroyed if run 434 // after loop rotation. 435 // TODO: Investigate promotion cap for O1. 
436 LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 437 /*AllowSpeculation=*/false)); 438 439 LPM1.addPass(LoopRotatePass(/* Disable header duplication */ true, 440 isLTOPreLink(Phase))); 441 // TODO: Investigate promotion cap for O1. 442 LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 443 /*AllowSpeculation=*/true)); 444 LPM1.addPass(SimpleLoopUnswitchPass()); 445 if (EnableLoopFlatten) 446 LPM1.addPass(LoopFlattenPass()); 447 448 LPM2.addPass(LoopIdiomRecognizePass()); 449 LPM2.addPass(IndVarSimplifyPass()); 450 451 invokeLateLoopOptimizationsEPCallbacks(LPM2, Level); 452 453 LPM2.addPass(LoopDeletionPass()); 454 455 if (EnableLoopInterchange) 456 LPM2.addPass(LoopInterchangePass()); 457 458 // Do not enable unrolling in PreLinkThinLTO phase during sample PGO 459 // because it changes IR to makes profile annotation in back compile 460 // inaccurate. The normal unroller doesn't pay attention to forced full unroll 461 // attributes so we need to make sure and allow the full unroll pass to pay 462 // attention to it. 463 if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt || 464 PGOOpt->Action != PGOOptions::SampleUse) 465 LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(), 466 /* OnlyWhenForced= */ !PTO.LoopUnrolling, 467 PTO.ForgetAllSCEVInLoopUnroll)); 468 469 invokeLoopOptimizerEndEPCallbacks(LPM2, Level); 470 471 FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1), 472 /*UseMemorySSA=*/true, 473 /*UseBlockFrequencyInfo=*/true)); 474 FPM.addPass( 475 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 476 FPM.addPass(InstCombinePass()); 477 // The loop passes in LPM2 (LoopFullUnrollPass) do not preserve MemorySSA. 478 // *All* loop passes must preserve it, in order to be able to use it. 479 FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2), 480 /*UseMemorySSA=*/false, 481 /*UseBlockFrequencyInfo=*/false)); 482 483 // Delete small array after loop unroll. 
484 FPM.addPass(SROAPass(SROAOptions::ModifyCFG)); 485 486 // Specially optimize memory movement as it doesn't look like dataflow in SSA. 487 FPM.addPass(MemCpyOptPass()); 488 489 // Sparse conditional constant propagation. 490 // FIXME: It isn't clear why we do this *after* loop passes rather than 491 // before... 492 FPM.addPass(SCCPPass()); 493 494 // Delete dead bit computations (instcombine runs after to fold away the dead 495 // computations, and then ADCE will run later to exploit any new DCE 496 // opportunities that creates). 497 FPM.addPass(BDCEPass()); 498 499 // Run instcombine after redundancy and dead bit elimination to exploit 500 // opportunities opened up by them. 501 FPM.addPass(InstCombinePass()); 502 invokePeepholeEPCallbacks(FPM, Level); 503 504 FPM.addPass(CoroElidePass()); 505 506 invokeScalarOptimizerLateEPCallbacks(FPM, Level); 507 508 // Finally, do an expensive DCE pass to catch all the dead code exposed by 509 // the simplifications and basic cleanup after all the simplifications. 510 // TODO: Investigate if this is too expensive. 511 FPM.addPass(ADCEPass()); 512 FPM.addPass( 513 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 514 FPM.addPass(InstCombinePass()); 515 invokePeepholeEPCallbacks(FPM, Level); 516 517 return FPM; 518 } 519 520 FunctionPassManager 521 PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, 522 ThinOrFullLTOPhase Phase) { 523 assert(Level != OptimizationLevel::O0 && "Must request optimizations!"); 524 525 // The O1 pipeline has a separate pipeline creation function to simplify 526 // construction readability. 527 if (Level.getSpeedupLevel() == 1) 528 return buildO1FunctionSimplificationPipeline(Level, Phase); 529 530 FunctionPassManager FPM; 531 532 if (AreStatisticsEnabled()) 533 FPM.addPass(CountVisitsPass()); 534 535 // Form SSA out of local memory accesses after breaking apart aggregates into 536 // scalars. 
537 FPM.addPass(SROAPass(SROAOptions::ModifyCFG)); 538 539 // Catch trivial redundancies 540 FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */)); 541 if (EnableKnowledgeRetention) 542 FPM.addPass(AssumeSimplifyPass()); 543 544 // Hoisting of scalars and load expressions. 545 if (EnableGVNHoist) 546 FPM.addPass(GVNHoistPass()); 547 548 // Global value numbering based sinking. 549 if (EnableGVNSink) { 550 FPM.addPass(GVNSinkPass()); 551 FPM.addPass( 552 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 553 } 554 555 // Speculative execution if the target has divergent branches; otherwise nop. 556 FPM.addPass(SpeculativeExecutionPass(/* OnlyIfDivergentTarget =*/true)); 557 558 // Optimize based on known information about branches, and cleanup afterward. 559 FPM.addPass(JumpThreadingPass()); 560 FPM.addPass(CorrelatedValuePropagationPass()); 561 562 FPM.addPass( 563 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 564 FPM.addPass(InstCombinePass()); 565 FPM.addPass(AggressiveInstCombinePass()); 566 567 if (!Level.isOptimizingForSize()) 568 FPM.addPass(LibCallsShrinkWrapPass()); 569 570 invokePeepholeEPCallbacks(FPM, Level); 571 572 // For PGO use pipeline, try to optimize memory intrinsics such as memcpy 573 // using the size value profile. Don't perform this when optimizing for size. 574 if (PGOOpt && PGOOpt->Action == PGOOptions::IRUse && 575 !Level.isOptimizingForSize()) 576 FPM.addPass(PGOMemOPSizeOpt()); 577 578 FPM.addPass(TailCallElimPass()); 579 FPM.addPass( 580 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 581 582 // Form canonically associated expression trees, and simplify the trees using 583 // basic mathematical properties. For example, this will form (nearly) 584 // minimal multiplication trees. 585 FPM.addPass(ReassociatePass()); 586 587 if (EnableConstraintElimination) 588 FPM.addPass(ConstraintEliminationPass()); 589 590 // Add the primary loop simplification pipeline. 
591 // FIXME: Currently this is split into two loop pass pipelines because we run 592 // some function passes in between them. These can and should be removed 593 // and/or replaced by scheduling the loop pass equivalents in the correct 594 // positions. But those equivalent passes aren't powerful enough yet. 595 // Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still 596 // used. We have `LoopSimplifyCFGPass` which isn't yet powerful enough yet to 597 // fully replace `SimplifyCFGPass`, and the closest to the other we have is 598 // `LoopInstSimplify`. 599 LoopPassManager LPM1, LPM2; 600 601 // Simplify the loop body. We do this initially to clean up after other loop 602 // passes run, either when iterating on a loop or on inner loops with 603 // implications on the outer loop. 604 LPM1.addPass(LoopInstSimplifyPass()); 605 LPM1.addPass(LoopSimplifyCFGPass()); 606 607 // Try to remove as much code from the loop header as possible, 608 // to reduce amount of IR that will have to be duplicated. However, 609 // do not perform speculative hoisting the first time as LICM 610 // will destroy metadata that may not need to be destroyed if run 611 // after loop rotation. 612 // TODO: Investigate promotion cap for O1. 613 LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 614 /*AllowSpeculation=*/false)); 615 616 // Disable header duplication in loop rotation at -Oz. 617 LPM1.addPass( 618 LoopRotatePass(Level != OptimizationLevel::Oz, isLTOPreLink(Phase))); 619 // TODO: Investigate promotion cap for O1. 
620 LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 621 /*AllowSpeculation=*/true)); 622 LPM1.addPass( 623 SimpleLoopUnswitchPass(/* NonTrivial */ Level == OptimizationLevel::O3)); 624 if (EnableLoopFlatten) 625 LPM1.addPass(LoopFlattenPass()); 626 627 LPM2.addPass(LoopIdiomRecognizePass()); 628 LPM2.addPass(IndVarSimplifyPass()); 629 630 invokeLateLoopOptimizationsEPCallbacks(LPM2, Level); 631 632 LPM2.addPass(LoopDeletionPass()); 633 634 if (EnableLoopInterchange) 635 LPM2.addPass(LoopInterchangePass()); 636 637 // Do not enable unrolling in PreLinkThinLTO phase during sample PGO 638 // because it changes IR to makes profile annotation in back compile 639 // inaccurate. The normal unroller doesn't pay attention to forced full unroll 640 // attributes so we need to make sure and allow the full unroll pass to pay 641 // attention to it. 642 if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt || 643 PGOOpt->Action != PGOOptions::SampleUse) 644 LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(), 645 /* OnlyWhenForced= */ !PTO.LoopUnrolling, 646 PTO.ForgetAllSCEVInLoopUnroll)); 647 648 invokeLoopOptimizerEndEPCallbacks(LPM2, Level); 649 650 FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1), 651 /*UseMemorySSA=*/true, 652 /*UseBlockFrequencyInfo=*/true)); 653 FPM.addPass( 654 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 655 FPM.addPass(InstCombinePass()); 656 // The loop passes in LPM2 (LoopIdiomRecognizePass, IndVarSimplifyPass, 657 // LoopDeletionPass and LoopFullUnrollPass) do not preserve MemorySSA. 658 // *All* loop passes must preserve it, in order to be able to use it. 659 FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2), 660 /*UseMemorySSA=*/false, 661 /*UseBlockFrequencyInfo=*/false)); 662 663 // Delete small array after loop unroll. 
664 FPM.addPass(SROAPass(SROAOptions::ModifyCFG)); 665 666 // Try vectorization/scalarization transforms that are both improvements 667 // themselves and can allow further folds with GVN and InstCombine. 668 FPM.addPass(VectorCombinePass(/*TryEarlyFoldsOnly=*/true)); 669 670 // Eliminate redundancies. 671 FPM.addPass(MergedLoadStoreMotionPass()); 672 if (RunNewGVN) 673 FPM.addPass(NewGVNPass()); 674 else 675 FPM.addPass(GVNPass()); 676 677 // Sparse conditional constant propagation. 678 // FIXME: It isn't clear why we do this *after* loop passes rather than 679 // before... 680 FPM.addPass(SCCPPass()); 681 682 // Delete dead bit computations (instcombine runs after to fold away the dead 683 // computations, and then ADCE will run later to exploit any new DCE 684 // opportunities that creates). 685 FPM.addPass(BDCEPass()); 686 687 // Run instcombine after redundancy and dead bit elimination to exploit 688 // opportunities opened up by them. 689 FPM.addPass(InstCombinePass()); 690 invokePeepholeEPCallbacks(FPM, Level); 691 692 // Re-consider control flow based optimizations after redundancy elimination, 693 // redo DCE, etc. 694 if (EnableDFAJumpThreading && Level.getSizeLevel() == 0) 695 FPM.addPass(DFAJumpThreadingPass()); 696 697 FPM.addPass(JumpThreadingPass()); 698 FPM.addPass(CorrelatedValuePropagationPass()); 699 700 // Finally, do an expensive DCE pass to catch all the dead code exposed by 701 // the simplifications and basic cleanup after all the simplifications. 702 // TODO: Investigate if this is too expensive. 703 FPM.addPass(ADCEPass()); 704 705 // Specially optimize memory movement as it doesn't look like dataflow in SSA. 
706 FPM.addPass(MemCpyOptPass()); 707 708 FPM.addPass(DSEPass()); 709 FPM.addPass(MoveAutoInitPass()); 710 711 FPM.addPass(createFunctionToLoopPassAdaptor( 712 LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 713 /*AllowSpeculation=*/true), 714 /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false)); 715 716 FPM.addPass(CoroElidePass()); 717 718 invokeScalarOptimizerLateEPCallbacks(FPM, Level); 719 720 FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions() 721 .convertSwitchRangeToICmp(true) 722 .hoistCommonInsts(true) 723 .sinkCommonInsts(true))); 724 FPM.addPass(InstCombinePass()); 725 invokePeepholeEPCallbacks(FPM, Level); 726 727 return FPM; 728 } 729 730 void PassBuilder::addRequiredLTOPreLinkPasses(ModulePassManager &MPM) { 731 MPM.addPass(CanonicalizeAliasesPass()); 732 MPM.addPass(NameAnonGlobalPass()); 733 } 734 735 void PassBuilder::addPreInlinerPasses(ModulePassManager &MPM, 736 OptimizationLevel Level, 737 ThinOrFullLTOPhase LTOPhase) { 738 assert(Level != OptimizationLevel::O0 && "Not expecting O0 here!"); 739 if (DisablePreInliner) 740 return; 741 InlineParams IP; 742 743 IP.DefaultThreshold = PreInlineThreshold; 744 745 // FIXME: The hint threshold has the same value used by the regular inliner 746 // when not optimzing for size. This should probably be lowered after 747 // performance testing. 748 // FIXME: this comment is cargo culted from the old pass manager, revisit). 749 IP.HintThreshold = Level.isOptimizingForSize() ? PreInlineThreshold : 325; 750 ModuleInlinerWrapperPass MIWP( 751 IP, /* MandatoryFirst */ true, 752 InlineContext{LTOPhase, InlinePass::EarlyInliner}); 753 CGSCCPassManager &CGPipeline = MIWP.getPM(); 754 755 FunctionPassManager FPM; 756 FPM.addPass(SROAPass(SROAOptions::ModifyCFG)); 757 FPM.addPass(EarlyCSEPass()); // Catch trivial redundancies. 758 FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp( 759 true))); // Merge & remove basic blocks. 
760 FPM.addPass(InstCombinePass()); // Combine silly sequences. 761 invokePeepholeEPCallbacks(FPM, Level); 762 763 CGPipeline.addPass(createCGSCCToFunctionPassAdaptor( 764 std::move(FPM), PTO.EagerlyInvalidateAnalyses)); 765 766 MPM.addPass(std::move(MIWP)); 767 768 // Delete anything that is now dead to make sure that we don't instrument 769 // dead code. Instrumentation can end up keeping dead code around and 770 // dramatically increase code size. 771 MPM.addPass(GlobalDCEPass()); 772 } 773 774 void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, 775 OptimizationLevel Level, bool RunProfileGen, 776 bool IsCS, bool AtomicCounterUpdate, 777 std::string ProfileFile, 778 std::string ProfileRemappingFile, 779 IntrusiveRefCntPtr<vfs::FileSystem> FS) { 780 assert(Level != OptimizationLevel::O0 && "Not expecting O0 here!"); 781 782 if (!RunProfileGen) { 783 assert(!ProfileFile.empty() && "Profile use expecting a profile file!"); 784 MPM.addPass( 785 PGOInstrumentationUse(ProfileFile, ProfileRemappingFile, IsCS, FS)); 786 // Cache ProfileSummaryAnalysis once to avoid the potential need to insert 787 // RequireAnalysisPass for PSI before subsequent non-module passes. 788 MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); 789 return; 790 } 791 792 // Perform PGO instrumentation. 793 MPM.addPass(PGOInstrumentationGen(IsCS)); 794 795 if (EnablePostPGOLoopRotation) { 796 // Disable header duplication in loop rotation at -Oz. 797 MPM.addPass(createModuleToFunctionPassAdaptor( 798 createFunctionToLoopPassAdaptor( 799 LoopRotatePass(Level != OptimizationLevel::Oz), 800 /*UseMemorySSA=*/false, 801 /*UseBlockFrequencyInfo=*/false), 802 PTO.EagerlyInvalidateAnalyses)); 803 } 804 805 // Add the profile lowering pass. 806 InstrProfOptions Options; 807 if (!ProfileFile.empty()) 808 Options.InstrProfileOutput = ProfileFile; 809 // Do counter promotion at Level greater than O0. 
810 Options.DoCounterPromotion = true; 811 Options.UseBFIInPromotion = IsCS; 812 Options.Atomic = AtomicCounterUpdate; 813 MPM.addPass(InstrProfilingLoweringPass(Options, IsCS)); 814 } 815 816 void PassBuilder::addPGOInstrPassesForO0( 817 ModulePassManager &MPM, bool RunProfileGen, bool IsCS, 818 bool AtomicCounterUpdate, std::string ProfileFile, 819 std::string ProfileRemappingFile, IntrusiveRefCntPtr<vfs::FileSystem> FS) { 820 if (!RunProfileGen) { 821 assert(!ProfileFile.empty() && "Profile use expecting a profile file!"); 822 MPM.addPass( 823 PGOInstrumentationUse(ProfileFile, ProfileRemappingFile, IsCS, FS)); 824 // Cache ProfileSummaryAnalysis once to avoid the potential need to insert 825 // RequireAnalysisPass for PSI before subsequent non-module passes. 826 MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); 827 return; 828 } 829 830 // Perform PGO instrumentation. 831 MPM.addPass(PGOInstrumentationGen(IsCS)); 832 // Add the profile lowering pass. 833 InstrProfOptions Options; 834 if (!ProfileFile.empty()) 835 Options.InstrProfileOutput = ProfileFile; 836 // Do not do counter promotion at O0. 837 Options.DoCounterPromotion = false; 838 Options.UseBFIInPromotion = IsCS; 839 Options.Atomic = AtomicCounterUpdate; 840 MPM.addPass(InstrProfilingLoweringPass(Options, IsCS)); 841 } 842 843 static InlineParams getInlineParamsFromOptLevel(OptimizationLevel Level) { 844 return getInlineParams(Level.getSpeedupLevel(), Level.getSizeLevel()); 845 } 846 847 ModuleInlinerWrapperPass 848 PassBuilder::buildInlinerPipeline(OptimizationLevel Level, 849 ThinOrFullLTOPhase Phase) { 850 InlineParams IP; 851 if (PTO.InlinerThreshold == -1) 852 IP = getInlineParamsFromOptLevel(Level); 853 else 854 IP = getInlineParams(PTO.InlinerThreshold); 855 // For PreLinkThinLTO + SamplePGO, set hot-caller threshold to 0 to 856 // disable hot callsite inline (as much as possible [1]) because it makes 857 // profile annotation in the backend inaccurate. 
858 // 859 // [1] Note the cost of a function could be below zero due to erased 860 // prologue / epilogue. 861 if (Phase == ThinOrFullLTOPhase::ThinLTOPreLink && PGOOpt && 862 PGOOpt->Action == PGOOptions::SampleUse) 863 IP.HotCallSiteThreshold = 0; 864 865 if (PGOOpt) 866 IP.EnableDeferral = EnablePGOInlineDeferral; 867 868 ModuleInlinerWrapperPass MIWP(IP, PerformMandatoryInliningsFirst, 869 InlineContext{Phase, InlinePass::CGSCCInliner}, 870 UseInlineAdvisor, MaxDevirtIterations); 871 872 // Require the GlobalsAA analysis for the module so we can query it within 873 // the CGSCC pipeline. 874 if (EnableGlobalAnalyses) { 875 MIWP.addModulePass(RequireAnalysisPass<GlobalsAA, Module>()); 876 // Invalidate AAManager so it can be recreated and pick up the newly 877 // available GlobalsAA. 878 MIWP.addModulePass( 879 createModuleToFunctionPassAdaptor(InvalidateAnalysisPass<AAManager>())); 880 } 881 882 // Require the ProfileSummaryAnalysis for the module so we can query it within 883 // the inliner pass. 884 MIWP.addModulePass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); 885 886 // Now begin the main postorder CGSCC pipeline. 887 // FIXME: The current CGSCC pipeline has its origins in the legacy pass 888 // manager and trying to emulate its precise behavior. Much of this doesn't 889 // make a lot of sense and we should revisit the core CGSCC structure. 890 CGSCCPassManager &MainCGPipeline = MIWP.getPM(); 891 892 // Note: historically, the PruneEH pass was run first to deduce nounwind and 893 // generally clean up exception handling overhead. It isn't clear this is 894 // valuable as the inliner doesn't currently care whether it is inlining an 895 // invoke or a call. 896 897 if (AttributorRun & AttributorRunOption::CGSCC) 898 MainCGPipeline.addPass(AttributorCGSCCPass()); 899 900 // Deduce function attributes. 
We do another run of this after the function 901 // simplification pipeline, so this only needs to run when it could affect the 902 // function simplification pipeline, which is only the case with recursive 903 // functions. 904 MainCGPipeline.addPass(PostOrderFunctionAttrsPass(/*SkipNonRecursive*/ true)); 905 906 // When at O3 add argument promotion to the pass pipeline. 907 // FIXME: It isn't at all clear why this should be limited to O3. 908 if (Level == OptimizationLevel::O3) 909 MainCGPipeline.addPass(ArgumentPromotionPass()); 910 911 // Try to perform OpenMP specific optimizations. This is a (quick!) no-op if 912 // there are no OpenMP runtime calls present in the module. 913 if (Level == OptimizationLevel::O2 || Level == OptimizationLevel::O3) 914 MainCGPipeline.addPass(OpenMPOptCGSCCPass()); 915 916 invokeCGSCCOptimizerLateEPCallbacks(MainCGPipeline, Level); 917 918 // Add the core function simplification pipeline nested inside the 919 // CGSCC walk. 920 MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor( 921 buildFunctionSimplificationPipeline(Level, Phase), 922 PTO.EagerlyInvalidateAnalyses, /*NoRerun=*/true)); 923 924 // Finally, deduce any function attributes based on the fully simplified 925 // function. 926 MainCGPipeline.addPass(PostOrderFunctionAttrsPass()); 927 928 // Mark that the function is fully simplified and that it shouldn't be 929 // simplified again if we somehow revisit it due to CGSCC mutations unless 930 // it's been modified since. 931 MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor( 932 RequireAnalysisPass<ShouldNotRunFunctionPassesAnalysis, Function>())); 933 934 MainCGPipeline.addPass(CoroSplitPass(Level != OptimizationLevel::O0)); 935 936 // Make sure we don't affect potential future NoRerun CGSCC adaptors. 
937 MIWP.addLateModulePass(createModuleToFunctionPassAdaptor( 938 InvalidateAnalysisPass<ShouldNotRunFunctionPassesAnalysis>())); 939 940 return MIWP; 941 } 942 943 ModulePassManager 944 PassBuilder::buildModuleInlinerPipeline(OptimizationLevel Level, 945 ThinOrFullLTOPhase Phase) { 946 ModulePassManager MPM; 947 948 InlineParams IP = getInlineParamsFromOptLevel(Level); 949 // For PreLinkThinLTO + SamplePGO, set hot-caller threshold to 0 to 950 // disable hot callsite inline (as much as possible [1]) because it makes 951 // profile annotation in the backend inaccurate. 952 // 953 // [1] Note the cost of a function could be below zero due to erased 954 // prologue / epilogue. 955 if (Phase == ThinOrFullLTOPhase::ThinLTOPreLink && PGOOpt && 956 PGOOpt->Action == PGOOptions::SampleUse) 957 IP.HotCallSiteThreshold = 0; 958 959 if (PGOOpt) 960 IP.EnableDeferral = EnablePGOInlineDeferral; 961 962 // The inline deferral logic is used to avoid losing some 963 // inlining chance in future. It is helpful in SCC inliner, in which 964 // inlining is processed in bottom-up order. 965 // While in module inliner, the inlining order is a priority-based order 966 // by default. The inline deferral is unnecessary there. So we disable the 967 // inline deferral logic in module inliner. 
968 IP.EnableDeferral = false; 969 970 MPM.addPass(ModuleInlinerPass(IP, UseInlineAdvisor, Phase)); 971 972 MPM.addPass(createModuleToFunctionPassAdaptor( 973 buildFunctionSimplificationPipeline(Level, Phase), 974 PTO.EagerlyInvalidateAnalyses)); 975 976 MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor( 977 CoroSplitPass(Level != OptimizationLevel::O0))); 978 979 return MPM; 980 } 981 982 ModulePassManager 983 PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, 984 ThinOrFullLTOPhase Phase) { 985 assert(Level != OptimizationLevel::O0 && 986 "Should not be used for O0 pipeline"); 987 988 assert(Phase != ThinOrFullLTOPhase::FullLTOPostLink && 989 "FullLTOPostLink shouldn't call buildModuleSimplificationPipeline!"); 990 991 ModulePassManager MPM; 992 993 // Place pseudo probe instrumentation as the first pass of the pipeline to 994 // minimize the impact of optimization changes. 995 if (PGOOpt && PGOOpt->PseudoProbeForProfiling && 996 Phase != ThinOrFullLTOPhase::ThinLTOPostLink) 997 MPM.addPass(SampleProfileProbePass(TM)); 998 999 bool HasSampleProfile = PGOOpt && (PGOOpt->Action == PGOOptions::SampleUse); 1000 1001 // In ThinLTO mode, when flattened profile is used, all the available 1002 // profile information will be annotated in PreLink phase so there is 1003 // no need to load the profile again in PostLink. 1004 bool LoadSampleProfile = 1005 HasSampleProfile && 1006 !(FlattenedProfileUsed && Phase == ThinOrFullLTOPhase::ThinLTOPostLink); 1007 1008 // During the ThinLTO backend phase we perform early indirect call promotion 1009 // here, before globalopt. Otherwise imported available_externally functions 1010 // look unreferenced and are removed. If we are going to load the sample 1011 // profile then defer until later. 1012 // TODO: See if we can move later and consolidate with the location where 1013 // we perform ICP when we are loading a sample profile. 
1014 // TODO: We pass HasSampleProfile (whether there was a sample profile file 1015 // passed to the compile) to the SamplePGO flag of ICP. This is used to 1016 // determine whether the new direct calls are annotated with prof metadata. 1017 // Ideally this should be determined from whether the IR is annotated with 1018 // sample profile, and not whether the a sample profile was provided on the 1019 // command line. E.g. for flattened profiles where we will not be reloading 1020 // the sample profile in the ThinLTO backend, we ideally shouldn't have to 1021 // provide the sample profile file. 1022 if (Phase == ThinOrFullLTOPhase::ThinLTOPostLink && !LoadSampleProfile) 1023 MPM.addPass(PGOIndirectCallPromotion(true /* InLTO */, HasSampleProfile)); 1024 1025 // Create an early function pass manager to cleanup the output of the 1026 // frontend. Not necessary with LTO post link pipelines since the pre link 1027 // pipeline already cleaned up the frontend output. 1028 if (Phase != ThinOrFullLTOPhase::ThinLTOPostLink) { 1029 // Do basic inference of function attributes from known properties of system 1030 // libraries and other oracles. 1031 MPM.addPass(InferFunctionAttrsPass()); 1032 MPM.addPass(CoroEarlyPass()); 1033 1034 FunctionPassManager EarlyFPM; 1035 // Lower llvm.expect to metadata before attempting transforms. 1036 // Compare/branch metadata may alter the behavior of passes like 1037 // SimplifyCFG. 1038 EarlyFPM.addPass(LowerExpectIntrinsicPass()); 1039 EarlyFPM.addPass(SimplifyCFGPass()); 1040 EarlyFPM.addPass(SROAPass(SROAOptions::ModifyCFG)); 1041 EarlyFPM.addPass(EarlyCSEPass()); 1042 if (Level == OptimizationLevel::O3) 1043 EarlyFPM.addPass(CallSiteSplittingPass()); 1044 MPM.addPass(createModuleToFunctionPassAdaptor( 1045 std::move(EarlyFPM), PTO.EagerlyInvalidateAnalyses)); 1046 } 1047 1048 if (LoadSampleProfile) { 1049 // Annotate sample profile right after early FPM to ensure freshness of 1050 // the debug info. 
1051 MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile, 1052 PGOOpt->ProfileRemappingFile, Phase)); 1053 // Cache ProfileSummaryAnalysis once to avoid the potential need to insert 1054 // RequireAnalysisPass for PSI before subsequent non-module passes. 1055 MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); 1056 // Do not invoke ICP in the LTOPrelink phase as it makes it hard 1057 // for the profile annotation to be accurate in the LTO backend. 1058 if (!isLTOPreLink(Phase)) 1059 // We perform early indirect call promotion here, before globalopt. 1060 // This is important for the ThinLTO backend phase because otherwise 1061 // imported available_externally functions look unreferenced and are 1062 // removed. 1063 MPM.addPass( 1064 PGOIndirectCallPromotion(true /* IsInLTO */, true /* SamplePGO */)); 1065 } 1066 1067 // Try to perform OpenMP specific optimizations on the module. This is a 1068 // (quick!) no-op if there are no OpenMP runtime calls present in the module. 1069 MPM.addPass(OpenMPOptPass()); 1070 1071 if (AttributorRun & AttributorRunOption::MODULE) 1072 MPM.addPass(AttributorPass()); 1073 1074 // Lower type metadata and the type.test intrinsic in the ThinLTO 1075 // post link pipeline after ICP. This is to enable usage of the type 1076 // tests in ICP sequences. 1077 if (Phase == ThinOrFullLTOPhase::ThinLTOPostLink) 1078 MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); 1079 1080 invokePipelineEarlySimplificationEPCallbacks(MPM, Level); 1081 1082 // Interprocedural constant propagation now that basic cleanup has occurred 1083 // and prior to optimizing globals. 1084 // FIXME: This position in the pipeline hasn't been carefully considered in 1085 // years, it should be re-analyzed. 
1086 MPM.addPass(IPSCCPPass( 1087 IPSCCPOptions(/*AllowFuncSpec=*/ 1088 Level != OptimizationLevel::Os && 1089 Level != OptimizationLevel::Oz && 1090 !isLTOPreLink(Phase)))); 1091 1092 // Attach metadata to indirect call sites indicating the set of functions 1093 // they may target at run-time. This should follow IPSCCP. 1094 MPM.addPass(CalledValuePropagationPass()); 1095 1096 // Optimize globals to try and fold them into constants. 1097 MPM.addPass(GlobalOptPass()); 1098 1099 // Create a small function pass pipeline to cleanup after all the global 1100 // optimizations. 1101 FunctionPassManager GlobalCleanupPM; 1102 // FIXME: Should this instead by a run of SROA? 1103 GlobalCleanupPM.addPass(PromotePass()); 1104 GlobalCleanupPM.addPass(InstCombinePass()); 1105 invokePeepholeEPCallbacks(GlobalCleanupPM, Level); 1106 GlobalCleanupPM.addPass( 1107 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 1108 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(GlobalCleanupPM), 1109 PTO.EagerlyInvalidateAnalyses)); 1110 1111 // Invoke the pre-inliner passes for instrumentation PGO or MemProf. 1112 if (PGOOpt && Phase != ThinOrFullLTOPhase::ThinLTOPostLink && 1113 (PGOOpt->Action == PGOOptions::IRInstr || 1114 PGOOpt->Action == PGOOptions::IRUse || !PGOOpt->MemoryProfile.empty())) 1115 addPreInlinerPasses(MPM, Level, Phase); 1116 1117 // Add all the requested passes for instrumentation PGO, if requested. 
1118 if (PGOOpt && Phase != ThinOrFullLTOPhase::ThinLTOPostLink && 1119 (PGOOpt->Action == PGOOptions::IRInstr || 1120 PGOOpt->Action == PGOOptions::IRUse)) { 1121 addPGOInstrPasses(MPM, Level, 1122 /*RunProfileGen=*/PGOOpt->Action == PGOOptions::IRInstr, 1123 /*IsCS=*/false, PGOOpt->AtomicCounterUpdate, 1124 PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile, 1125 PGOOpt->FS); 1126 MPM.addPass(PGOIndirectCallPromotion(false, false)); 1127 } 1128 if (PGOOpt && Phase != ThinOrFullLTOPhase::ThinLTOPostLink && 1129 PGOOpt->CSAction == PGOOptions::CSIRInstr) 1130 MPM.addPass(PGOInstrumentationGenCreateVar(PGOOpt->CSProfileGenFile)); 1131 1132 if (PGOOpt && Phase != ThinOrFullLTOPhase::ThinLTOPostLink && 1133 !PGOOpt->MemoryProfile.empty()) 1134 MPM.addPass(MemProfUsePass(PGOOpt->MemoryProfile, PGOOpt->FS)); 1135 1136 // Synthesize function entry counts for non-PGO compilation. 1137 if (EnableSyntheticCounts && !PGOOpt) 1138 MPM.addPass(SyntheticCountsPropagation()); 1139 1140 MPM.addPass(AlwaysInlinerPass(/*InsertLifetimeIntrinsics=*/true)); 1141 1142 if (EnableModuleInliner) 1143 MPM.addPass(buildModuleInlinerPipeline(Level, Phase)); 1144 else 1145 MPM.addPass(buildInlinerPipeline(Level, Phase)); 1146 1147 // Remove any dead arguments exposed by cleanups, constant folding globals, 1148 // and argument promotion. 1149 MPM.addPass(DeadArgumentEliminationPass()); 1150 1151 MPM.addPass(CoroCleanupPass()); 1152 1153 // Optimize globals now that functions are fully simplified. 1154 MPM.addPass(GlobalOptPass()); 1155 MPM.addPass(GlobalDCEPass()); 1156 1157 return MPM; 1158 } 1159 1160 /// TODO: Should LTO cause any differences to this set of passes? 
1161 void PassBuilder::addVectorPasses(OptimizationLevel Level, 1162 FunctionPassManager &FPM, bool IsFullLTO) { 1163 FPM.addPass(LoopVectorizePass( 1164 LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization))); 1165 1166 if (EnableInferAlignmentPass) 1167 FPM.addPass(InferAlignmentPass()); 1168 if (IsFullLTO) { 1169 // The vectorizer may have significantly shortened a loop body; unroll 1170 // again. Unroll small loops to hide loop backedge latency and saturate any 1171 // parallel execution resources of an out-of-order processor. We also then 1172 // need to clean up redundancies and loop invariant code. 1173 // FIXME: It would be really good to use a loop-integrated instruction 1174 // combiner for cleanup here so that the unrolling and LICM can be pipelined 1175 // across the loop nests. 1176 // We do UnrollAndJam in a separate LPM to ensure it happens before unroll 1177 if (EnableUnrollAndJam && PTO.LoopUnrolling) 1178 FPM.addPass(createFunctionToLoopPassAdaptor( 1179 LoopUnrollAndJamPass(Level.getSpeedupLevel()))); 1180 FPM.addPass(LoopUnrollPass(LoopUnrollOptions( 1181 Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling, 1182 PTO.ForgetAllSCEVInLoopUnroll))); 1183 FPM.addPass(WarnMissedTransformationsPass()); 1184 // Now that we are done with loop unrolling, be it either by LoopVectorizer, 1185 // or LoopUnroll passes, some variable-offset GEP's into alloca's could have 1186 // become constant-offset, thus enabling SROA and alloca promotion. Do so. 1187 // NOTE: we are very late in the pipeline, and we don't have any LICM 1188 // or SimplifyCFG passes scheduled after us, that would cleanup 1189 // the CFG mess this may created if allowed to modify CFG, so forbid that. 1190 FPM.addPass(SROAPass(SROAOptions::PreserveCFG)); 1191 } 1192 1193 if (!IsFullLTO) { 1194 // Eliminate loads by forwarding stores from the previous iteration to loads 1195 // of the current iteration. 
1196 FPM.addPass(LoopLoadEliminationPass()); 1197 } 1198 // Cleanup after the loop optimization passes. 1199 FPM.addPass(InstCombinePass()); 1200 1201 if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) { 1202 ExtraVectorPassManager ExtraPasses; 1203 // At higher optimization levels, try to clean up any runtime overlap and 1204 // alignment checks inserted by the vectorizer. We want to track correlated 1205 // runtime checks for two inner loops in the same outer loop, fold any 1206 // common computations, hoist loop-invariant aspects out of any outer loop, 1207 // and unswitch the runtime checks if possible. Once hoisted, we may have 1208 // dead (or speculatable) control flows or more combining opportunities. 1209 ExtraPasses.addPass(EarlyCSEPass()); 1210 ExtraPasses.addPass(CorrelatedValuePropagationPass()); 1211 ExtraPasses.addPass(InstCombinePass()); 1212 LoopPassManager LPM; 1213 LPM.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 1214 /*AllowSpeculation=*/true)); 1215 LPM.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level == 1216 OptimizationLevel::O3)); 1217 ExtraPasses.addPass( 1218 createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/true, 1219 /*UseBlockFrequencyInfo=*/true)); 1220 ExtraPasses.addPass( 1221 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 1222 ExtraPasses.addPass(InstCombinePass()); 1223 FPM.addPass(std::move(ExtraPasses)); 1224 } 1225 1226 // Now that we've formed fast to execute loop structures, we do further 1227 // optimizations. These are run afterward as they might block doing complex 1228 // analyses and transforms such as what are needed for loop vectorization. 1229 1230 // Cleanup after loop vectorization, etc. Simplification passes like CVP and 1231 // GVN, loop transforms, and others have already run, so it's now better to 1232 // convert to more optimized IR using more aggressive simplify CFG options. 
1233 // The extra sinking transform can create larger basic blocks, so do this 1234 // before SLP vectorization. 1235 FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions() 1236 .forwardSwitchCondToPhi(true) 1237 .convertSwitchRangeToICmp(true) 1238 .convertSwitchToLookupTable(true) 1239 .needCanonicalLoops(false) 1240 .hoistCommonInsts(true) 1241 .sinkCommonInsts(true))); 1242 1243 if (IsFullLTO) { 1244 FPM.addPass(SCCPPass()); 1245 FPM.addPass(InstCombinePass()); 1246 FPM.addPass(BDCEPass()); 1247 } 1248 1249 // Optimize parallel scalar instruction chains into SIMD instructions. 1250 if (PTO.SLPVectorization) { 1251 FPM.addPass(SLPVectorizerPass()); 1252 if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) { 1253 FPM.addPass(EarlyCSEPass()); 1254 } 1255 } 1256 // Enhance/cleanup vector code. 1257 FPM.addPass(VectorCombinePass()); 1258 1259 if (!IsFullLTO) { 1260 FPM.addPass(InstCombinePass()); 1261 // Unroll small loops to hide loop backedge latency and saturate any 1262 // parallel execution resources of an out-of-order processor. We also then 1263 // need to clean up redundancies and loop invariant code. 1264 // FIXME: It would be really good to use a loop-integrated instruction 1265 // combiner for cleanup here so that the unrolling and LICM can be pipelined 1266 // across the loop nests. 
1267 // We do UnrollAndJam in a separate LPM to ensure it happens before unroll 1268 if (EnableUnrollAndJam && PTO.LoopUnrolling) { 1269 FPM.addPass(createFunctionToLoopPassAdaptor( 1270 LoopUnrollAndJamPass(Level.getSpeedupLevel()))); 1271 } 1272 FPM.addPass(LoopUnrollPass(LoopUnrollOptions( 1273 Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling, 1274 PTO.ForgetAllSCEVInLoopUnroll))); 1275 FPM.addPass(WarnMissedTransformationsPass()); 1276 // Now that we are done with loop unrolling, be it either by LoopVectorizer, 1277 // or LoopUnroll passes, some variable-offset GEP's into alloca's could have 1278 // become constant-offset, thus enabling SROA and alloca promotion. Do so. 1279 // NOTE: we are very late in the pipeline, and we don't have any LICM 1280 // or SimplifyCFG passes scheduled after us, that would cleanup 1281 // the CFG mess this may created if allowed to modify CFG, so forbid that. 1282 FPM.addPass(SROAPass(SROAOptions::PreserveCFG)); 1283 } 1284 1285 if (EnableInferAlignmentPass) 1286 FPM.addPass(InferAlignmentPass()); 1287 FPM.addPass(InstCombinePass()); 1288 1289 // This is needed for two reasons: 1290 // 1. It works around problems that instcombine introduces, such as sinking 1291 // expensive FP divides into loops containing multiplications using the 1292 // divide result. 1293 // 2. It helps to clean up some loop-invariant code created by the loop 1294 // unroll pass when IsFullLTO=false. 1295 FPM.addPass(createFunctionToLoopPassAdaptor( 1296 LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 1297 /*AllowSpeculation=*/true), 1298 /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false)); 1299 1300 // Now that we've vectorized and unrolled loops, we may have more refined 1301 // alignment information, try to re-derive it here. 
1302 FPM.addPass(AlignmentFromAssumptionsPass()); 1303 } 1304 1305 ModulePassManager 1306 PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, 1307 ThinOrFullLTOPhase LTOPhase) { 1308 const bool LTOPreLink = isLTOPreLink(LTOPhase); 1309 ModulePassManager MPM; 1310 1311 // Run partial inlining pass to partially inline functions that have 1312 // large bodies. 1313 if (RunPartialInlining) 1314 MPM.addPass(PartialInlinerPass()); 1315 1316 // Remove avail extern fns and globals definitions since we aren't compiling 1317 // an object file for later LTO. For LTO we want to preserve these so they 1318 // are eligible for inlining at link-time. Note if they are unreferenced they 1319 // will be removed by GlobalDCE later, so this only impacts referenced 1320 // available externally globals. Eventually they will be suppressed during 1321 // codegen, but eliminating here enables more opportunity for GlobalDCE as it 1322 // may make globals referenced by available external functions dead and saves 1323 // running remaining passes on the eliminated functions. These should be 1324 // preserved during prelinking for link-time inlining decisions. 1325 if (!LTOPreLink) 1326 MPM.addPass(EliminateAvailableExternallyPass()); 1327 1328 if (EnableOrderFileInstrumentation) 1329 MPM.addPass(InstrOrderFilePass()); 1330 1331 // Do RPO function attribute inference across the module to forward-propagate 1332 // attributes where applicable. 1333 // FIXME: Is this really an optimization rather than a canonicalization? 1334 MPM.addPass(ReversePostOrderFunctionAttrsPass()); 1335 1336 // Do a post inline PGO instrumentation and use pass. This is a context 1337 // sensitive PGO pass. We don't want to do this in LTOPreLink phrase as 1338 // cross-module inline has not been done yet. The context sensitive 1339 // instrumentation is after all the inlines are done. 
1340 if (!LTOPreLink && PGOOpt) { 1341 if (PGOOpt->CSAction == PGOOptions::CSIRInstr) 1342 addPGOInstrPasses(MPM, Level, /*RunProfileGen=*/true, 1343 /*IsCS=*/true, PGOOpt->AtomicCounterUpdate, 1344 PGOOpt->CSProfileGenFile, PGOOpt->ProfileRemappingFile, 1345 PGOOpt->FS); 1346 else if (PGOOpt->CSAction == PGOOptions::CSIRUse) 1347 addPGOInstrPasses(MPM, Level, /*RunProfileGen=*/false, 1348 /*IsCS=*/true, PGOOpt->AtomicCounterUpdate, 1349 PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile, 1350 PGOOpt->FS); 1351 } 1352 1353 // Re-compute GlobalsAA here prior to function passes. This is particularly 1354 // useful as the above will have inlined, DCE'ed, and function-attr 1355 // propagated everything. We should at this point have a reasonably minimal 1356 // and richly annotated call graph. By computing aliasing and mod/ref 1357 // information for all local globals here, the late loop passes and notably 1358 // the vectorizer will be able to use them to help recognize vectorizable 1359 // memory operations. 1360 if (EnableGlobalAnalyses) 1361 MPM.addPass(RecomputeGlobalsAAPass()); 1362 1363 invokeOptimizerEarlyEPCallbacks(MPM, Level); 1364 1365 FunctionPassManager OptimizePM; 1366 // Scheduling LoopVersioningLICM when inlining is over, because after that 1367 // we may see more accurate aliasing. Reason to run this late is that too 1368 // early versioning may prevent further inlining due to increase of code 1369 // size. Other optimizations which runs later might get benefit of no-alias 1370 // assumption in clone loop. 1371 if (UseLoopVersioningLICM) { 1372 OptimizePM.addPass( 1373 createFunctionToLoopPassAdaptor(LoopVersioningLICMPass())); 1374 // LoopVersioningLICM pass might increase new LICM opportunities. 
1375 OptimizePM.addPass(createFunctionToLoopPassAdaptor( 1376 LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 1377 /*AllowSpeculation=*/true), 1378 /*USeMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false)); 1379 } 1380 1381 OptimizePM.addPass(Float2IntPass()); 1382 OptimizePM.addPass(LowerConstantIntrinsicsPass()); 1383 1384 if (EnableMatrix) { 1385 OptimizePM.addPass(LowerMatrixIntrinsicsPass()); 1386 OptimizePM.addPass(EarlyCSEPass()); 1387 } 1388 1389 // CHR pass should only be applied with the profile information. 1390 // The check is to check the profile summary information in CHR. 1391 if (EnableCHR && Level == OptimizationLevel::O3) 1392 OptimizePM.addPass(ControlHeightReductionPass()); 1393 1394 // FIXME: We need to run some loop optimizations to re-rotate loops after 1395 // simplifycfg and others undo their rotation. 1396 1397 // Optimize the loop execution. These passes operate on entire loop nests 1398 // rather than on each loop in an inside-out manner, and so they are actually 1399 // function passes. 1400 1401 invokeVectorizerStartEPCallbacks(OptimizePM, Level); 1402 1403 LoopPassManager LPM; 1404 // First rotate loops that may have been un-rotated by prior passes. 1405 // Disable header duplication at -Oz. 1406 LPM.addPass(LoopRotatePass(Level != OptimizationLevel::Oz, LTOPreLink)); 1407 // Some loops may have become dead by now. Try to delete them. 1408 // FIXME: see discussion in https://reviews.llvm.org/D112851, 1409 // this may need to be revisited once we run GVN before loop deletion 1410 // in the simplification pipeline. 1411 LPM.addPass(LoopDeletionPass()); 1412 OptimizePM.addPass(createFunctionToLoopPassAdaptor( 1413 std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false)); 1414 1415 // Distribute loops to allow partial vectorization. I.e. isolate dependences 1416 // into separate loop that would otherwise inhibit vectorization. 
This is 1417 // currently only performed for loops marked with the metadata 1418 // llvm.loop.distribute=true or when -enable-loop-distribute is specified. 1419 OptimizePM.addPass(LoopDistributePass()); 1420 1421 // Populates the VFABI attribute with the scalar-to-vector mappings 1422 // from the TargetLibraryInfo. 1423 OptimizePM.addPass(InjectTLIMappings()); 1424 1425 addVectorPasses(Level, OptimizePM, /* IsFullLTO */ false); 1426 1427 // LoopSink pass sinks instructions hoisted by LICM, which serves as a 1428 // canonicalization pass that enables other optimizations. As a result, 1429 // LoopSink pass needs to be a very late IR pass to avoid undoing LICM 1430 // result too early. 1431 OptimizePM.addPass(LoopSinkPass()); 1432 1433 // And finally clean up LCSSA form before generating code. 1434 OptimizePM.addPass(InstSimplifyPass()); 1435 1436 // This hoists/decomposes div/rem ops. It should run after other sink/hoist 1437 // passes to avoid re-sinking, but before SimplifyCFG because it can allow 1438 // flattening of blocks. 1439 OptimizePM.addPass(DivRemPairsPass()); 1440 1441 // Try to annotate calls that were created during optimization. 1442 OptimizePM.addPass(TailCallElimPass()); 1443 1444 // LoopSink (and other loop passes since the last simplifyCFG) might have 1445 // resulted in single-entry-single-exit or empty blocks. Clean up the CFG. 1446 OptimizePM.addPass( 1447 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 1448 1449 // Add the core optimizing pipeline. 1450 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(OptimizePM), 1451 PTO.EagerlyInvalidateAnalyses)); 1452 1453 invokeOptimizerLastEPCallbacks(MPM, Level); 1454 1455 // Split out cold code. Splitting is done late to avoid hiding context from 1456 // other optimizations and inadvertently regressing performance. The tradeoff 1457 // is that this has a higher code size cost than splitting early. 
if (EnableHotColdSplit && !LTOPreLink)
    MPM.addPass(HotColdSplittingPass());

  // Search the code for similar regions of code. If enough similar regions can
  // be found where extracting the regions into their own function will decrease
  // the size of the program, we extract the regions, a deduplicate the
  // structurally similar regions.
  if (EnableIROutliner)
    MPM.addPass(IROutlinerPass());

  // Merge functions if requested.
  if (PTO.MergeFunctions)
    MPM.addPass(MergeFunctionsPass());

  // Now we need to do some global optimization transforms.
  // FIXME: It would seem like these should come first in the optimization
  // pipeline and maybe be the bottom of the canonicalization pipeline? Weird
  // ordering here.
  MPM.addPass(GlobalDCEPass());
  MPM.addPass(ConstantMergePass());

  if (PTO.CallGraphProfile && !LTOPreLink)
    MPM.addPass(CGProfilePass(LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink ||
                              LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink));

  // TODO: Relative look table converter pass caused an issue when full lto is
  // enabled. See https://reviews.llvm.org/D94355 for more details.
  // Until the issue fixed, disable this pass during pre-linking phase.
  if (!LTOPreLink)
    MPM.addPass(RelLookupTableConverterPass());

  return MPM;
}

/// Build the default per-module pipeline for \p Level.
///
/// At O0 this forwards to buildO0DefaultPipeline. Otherwise the returned
/// pipeline consists of a short prologue (annotation conversion, forced
/// function attributes, optional discriminators, pipeline-start EP callbacks),
/// the module simplification pipeline, the module optimization pipeline, an
/// optional pseudo-probe update, and annotation remarks.
///
/// \param Level      Optimization level the pipeline is built for.
/// \param LTOPreLink When true, the simplification and optimization pipelines
///                   are built for the FullLTOPreLink phase (otherwise None)
///                   and the required LTO pre-link passes are appended last.
/// \returns The assembled module pass manager.
ModulePassManager
PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
                                           bool LTOPreLink) {
  if (Level == OptimizationLevel::O0)
    return buildO0DefaultPipeline(Level, LTOPreLink);

  ModulePassManager MPM;

  // Convert @llvm.global.annotations to !annotation metadata.
  MPM.addPass(Annotation2MetadataPass());

  // Force any function attributes we want the rest of the pipeline to observe.
  MPM.addPass(ForceFunctionAttrsPass());

  if (PGOOpt && PGOOpt->DebugInfoForProfiling)
    MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass()));

  // Apply module pipeline start EP callback.
  invokePipelineStartEPCallbacks(MPM, Level);

  // The same phase value is handed to both sub-pipelines below.
  const ThinOrFullLTOPhase LTOPhase = LTOPreLink
                                          ? ThinOrFullLTOPhase::FullLTOPreLink
                                          : ThinOrFullLTOPhase::None;
  // Add the core simplification pipeline.
  MPM.addPass(buildModuleSimplificationPipeline(Level, LTOPhase));

  // Now add the optimization pipeline.
  MPM.addPass(buildModuleOptimizationPipeline(Level, LTOPhase));

  if (PGOOpt && PGOOpt->PseudoProbeForProfiling &&
      PGOOpt->Action == PGOOptions::SampleUse)
    MPM.addPass(PseudoProbeUpdatePass());

  // Emit annotation remarks.
  addAnnotationRemarksPass(MPM);

  if (LTOPreLink)
    addRequiredLTOPreLinkPasses(MPM);
  return MPM;
}

/// Build the default FatLTO pipeline for \p Level.
///
/// The pipeline runs the ThinLTO or full-LTO pre-link pipeline, then
/// EmbedBitcodePass, and finally optimizes the module itself: with the ThinLTO
/// post-link pipeline when ThinLTO is combined with sample-use PGO, and with
/// the plain module optimization pipeline otherwise.
///
/// \param Level       Optimization level the pipeline is built for.
/// \param ThinLTO     Selects the ThinLTO pre-link pipeline instead of the
///                    full-LTO one; also forwarded to EmbedBitcodePass.
/// \param EmitSummary Forwarded to EmbedBitcodePass.
/// \returns The assembled module pass manager.
ModulePassManager
PassBuilder::buildFatLTODefaultPipeline(OptimizationLevel Level, bool ThinLTO,
                                        bool EmitSummary) {
  ModulePassManager MPM;
  if (ThinLTO)
    MPM.addPass(buildThinLTOPreLinkDefaultPipeline(Level));
  else
    MPM.addPass(buildLTOPreLinkDefaultPipeline(Level));
  MPM.addPass(EmbedBitcodePass(ThinLTO, EmitSummary));

  // Use the ThinLTO post-link pipeline with sample profiling. No import
  // summary is supplied, so the summary-based passes of that pipeline are not
  // added.
  if (ThinLTO && PGOOpt && PGOOpt->Action == PGOOptions::SampleUse)
    MPM.addPass(buildThinLTODefaultPipeline(Level, /*ImportSummary=*/nullptr));
  else {
    // Otherwise, just use module optimization.
    MPM.addPass(
        buildModuleOptimizationPipeline(Level, ThinOrFullLTOPhase::None));
    // Emit annotation remarks.
    addAnnotationRemarksPass(MPM);
  }
  return MPM;
}

/// Build the default ThinLTO pre-link pipeline for \p Level.
///
/// At O0 this forwards to buildO0DefaultPipeline with LTOPreLink set.
/// Otherwise only the module simplification pipeline is run (the module
/// optimization pipeline is not added here), followed by optional partial
/// inlining and pseudo-probe update, the optimizer-early/last EP callbacks,
/// annotation remarks, and the required LTO pre-link passes.
///
/// \param Level Optimization level the pipeline is built for.
/// \returns The assembled module pass manager.
ModulePassManager
PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) {
  if (Level == OptimizationLevel::O0)
    return buildO0DefaultPipeline(Level, /*LTOPreLink=*/true);

  ModulePassManager MPM;

  // Convert @llvm.global.annotations to !annotation metadata.
  MPM.addPass(Annotation2MetadataPass());

  // Force any function attributes we want the rest of the pipeline to observe.
  MPM.addPass(ForceFunctionAttrsPass());

  if (PGOOpt && PGOOpt->DebugInfoForProfiling)
    MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass()));

  // Apply module pipeline start EP callback.
  invokePipelineStartEPCallbacks(MPM, Level);

  // If we are planning to perform ThinLTO later, we don't bloat the code with
  // unrolling/vectorization/... now. Just simplify the module as much as we
  // can.
  MPM.addPass(buildModuleSimplificationPipeline(
      Level, ThinOrFullLTOPhase::ThinLTOPreLink));

  // Run partial inlining pass to partially inline functions that have
  // large bodies.
  // FIXME: It isn't clear whether this is really the right place to run this
  // in ThinLTO. Because there is another canonicalization and simplification
  // phase that will run after the thin link, running this here ends up with
  // less information than will be available later and it may grow functions in
  // ways that aren't beneficial.
  if (RunPartialInlining)
    MPM.addPass(PartialInlinerPass());

  if (PGOOpt && PGOOpt->PseudoProbeForProfiling &&
      PGOOpt->Action == PGOOptions::SampleUse)
    MPM.addPass(PseudoProbeUpdatePass());

  // Handle Optimizer{Early,Last}EPCallbacks added by clang on PreLink. Actual
  // optimization is going to be done in PostLink stage, but clang can't add
  // callbacks there in case of in-process ThinLTO called by linker.
  invokeOptimizerEarlyEPCallbacks(MPM, Level);
  invokeOptimizerLastEPCallbacks(MPM, Level);

  // Emit annotation remarks.
  addAnnotationRemarksPass(MPM);

  addRequiredLTOPreLinkPasses(MPM);

  return MPM;
}

/// Build the default ThinLTO post-link pipeline for \p Level.
///
/// When \p ImportSummary is non-null, the summary-based passes (optional
/// MemProf context disambiguation, whole-program devirtualization and type
/// test lowering) are added first, at every optimization level. At O0 the
/// pipeline then only cleans up type tests and dead/available_externally
/// globals; at other levels it runs the module simplification and module
/// optimization pipelines in the ThinLTOPostLink phase.
///
/// \param Level         Optimization level the pipeline is built for.
/// \param ImportSummary Summary index handed to the summary-based passes; may
///                      be null, in which case those passes are not added.
/// \returns The assembled module pass manager.
ModulePassManager PassBuilder::buildThinLTODefaultPipeline(
    OptimizationLevel Level, const ModuleSummaryIndex *ImportSummary) {
  ModulePassManager MPM;

  if (ImportSummary) {
    // For ThinLTO we must apply the context disambiguation decisions early, to
    // ensure we can correctly match the callsites to summary data.
    if (EnableMemProfContextDisambiguation)
      MPM.addPass(MemProfContextDisambiguation(ImportSummary));

    // These passes import type identifier resolutions for whole-program
    // devirtualization and CFI. They must run early because other passes may
    // disturb the specific instruction patterns that these passes look for,
    // creating dependencies on resolutions that may not appear in the summary.
    //
    // For example, GVN may transform the pattern assume(type.test) appearing in
    // two basic blocks into assume(phi(type.test, type.test)), which would
    // transform a dependency on a WPD resolution into a dependency on a type
    // identifier resolution for CFI.
    //
    // Also, WPD has access to more precise information than ICP and can
    // devirtualize more effectively, so it should operate on the IR first.
    //
    // The WPD and LowerTypeTest passes need to run at -O0 to lower type
    // metadata and intrinsics.
    MPM.addPass(WholeProgramDevirtPass(nullptr, ImportSummary));
    MPM.addPass(LowerTypeTestsPass(nullptr, ImportSummary));
  }

  if (Level == OptimizationLevel::O0) {
    // Run a second time to clean up any type tests left behind by WPD for use
    // in ICP.
    MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));
    // Drop available_externally and unreferenced globals. This is necessary
    // with ThinLTO in order to avoid leaving undefined references to dead
    // globals in the object file.
    MPM.addPass(EliminateAvailableExternallyPass());
    MPM.addPass(GlobalDCEPass());
    return MPM;
  }

  // Add the core simplification pipeline.
  MPM.addPass(buildModuleSimplificationPipeline(
      Level, ThinOrFullLTOPhase::ThinLTOPostLink));

  // Now add the optimization pipeline.
  MPM.addPass(buildModuleOptimizationPipeline(
      Level, ThinOrFullLTOPhase::ThinLTOPostLink));

  // Emit annotation remarks.
  addAnnotationRemarksPass(MPM);

  return MPM;
}

/// Build the default full-LTO pre-link pipeline for \p Level.
///
/// This currently forwards to buildPerModuleDefaultPipeline with LTOPreLink
/// enabled.
///
/// \param Level Optimization level the pipeline is built for.
/// \returns The assembled module pass manager.
ModulePassManager
PassBuilder::buildLTOPreLinkDefaultPipeline(OptimizationLevel Level) {
  // FIXME: We should use a customized pre-link pipeline!
  return buildPerModuleDefaultPipeline(Level,
                                       /*LTOPreLink=*/true);
}

/// Build the default full-LTO post-link pipeline for \p Level.
///
/// The function has three exits: O0 returns right after lowering type
/// metadata, O1 returns after whole-program devirtualization and type test
/// lowering, and all other levels run the complete IPO, scalar, loop and
/// vectorization sequence below. Every exit invokes the
/// FullLinkTimeOptimizationLast EP callbacks and adds the annotation remarks
/// pass before returning.
///
/// \param Level         Optimization level the pipeline is built for.
/// \param ExportSummary Summary index passed to WholeProgramDevirtPass and to
///                      the first LowerTypeTestsPass run.
/// \returns The assembled module pass manager.
ModulePassManager
PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
                                     ModuleSummaryIndex *ExportSummary) {
  ModulePassManager MPM;

  invokeFullLinkTimeOptimizationEarlyEPCallbacks(MPM, Level);

  // Create a function that performs CFI checks for cross-DSO calls with targets
  // in the current module.
  MPM.addPass(CrossDSOCFIPass());

  if (Level == OptimizationLevel::O0) {
    // The WPD and LowerTypeTest passes need to run at -O0 to lower type
    // metadata and intrinsics.
    MPM.addPass(WholeProgramDevirtPass(ExportSummary, nullptr));
    MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr));
    // Run a second time to clean up any type tests left behind by WPD for use
    // in ICP.
    MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));

    invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level);

    // Emit annotation remarks.
    addAnnotationRemarksPass(MPM);

    return MPM;
  }

  if (PGOOpt && PGOOpt->Action == PGOOptions::SampleUse) {
    // Load sample profile before running the LTO optimization pipeline.
    MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile,
                                        PGOOpt->ProfileRemappingFile,
                                        ThinOrFullLTOPhase::FullLTOPostLink));
    // Cache ProfileSummaryAnalysis once to avoid the potential need to insert
    // RequireAnalysisPass for PSI before subsequent non-module passes.
    MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
  }

  // Try to run OpenMP optimizations, quick no-op if no OpenMP metadata present.
  MPM.addPass(OpenMPOptPass(ThinOrFullLTOPhase::FullLTOPostLink));

  // Remove unused virtual tables to improve the quality of code generated by
  // whole-program devirtualization and bitset lowering.
  MPM.addPass(GlobalDCEPass(/*InLTOPostLink=*/true));

  // Do basic inference of function attributes from known properties of system
  // libraries and other oracles.
  MPM.addPass(InferFunctionAttrsPass());

  if (Level.getSpeedupLevel() > 1) {
    MPM.addPass(createModuleToFunctionPassAdaptor(
        CallSiteSplittingPass(), PTO.EagerlyInvalidateAnalyses));

    // Indirect call promotion. This should promote all the targets that are
    // left by the earlier promotion pass that promotes intra-module targets.
    // This two-step promotion is to save the compile time. For LTO, it should
    // produce the same result as if we only do promotion here.
    MPM.addPass(PGOIndirectCallPromotion(
        true /* InLTO */, PGOOpt && PGOOpt->Action == PGOOptions::SampleUse));

    // Propagate constants at call sites into the functions they call.  This
    // opens opportunities for globalopt (and inlining) by substituting function
    // pointers passed as arguments to direct uses of functions.
    // The option value below is false when optimizing for size (Os/Oz).
    MPM.addPass(IPSCCPPass(IPSCCPOptions(/*AllowFuncSpec=*/
                                         Level != OptimizationLevel::Os &&
                                         Level != OptimizationLevel::Oz)));

    // Attach metadata to indirect call sites indicating the set of functions
    // they may target at run-time. This should follow IPSCCP.
    MPM.addPass(CalledValuePropagationPass());
  }

  // Now deduce any function attributes based on the current code.
  MPM.addPass(
      createModuleToPostOrderCGSCCPassAdaptor(PostOrderFunctionAttrsPass()));

  // Do RPO function attribute inference across the module to forward-propagate
  // attributes where applicable.
  // FIXME: Is this really an optimization rather than a canonicalization?
  MPM.addPass(ReversePostOrderFunctionAttrsPass());

  // Use in-range annotations on GEP indices to split globals where beneficial.
  MPM.addPass(GlobalSplitPass());

  // Run whole program optimization of virtual call when the list of callees
  // is fixed.
  MPM.addPass(WholeProgramDevirtPass(ExportSummary, nullptr));

  // Stop here at -O1.
  if (Level == OptimizationLevel::O1) {
    // The LowerTypeTestsPass needs to run to lower type metadata and the
    // type.test intrinsics. The pass does nothing if CFI is disabled.
    MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr));
    // Run a second time to clean up any type tests left behind by WPD for use
    // in ICP (which is performed earlier than this in the regular LTO
    // pipeline).
    MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));

    invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level);

    // Emit annotation remarks.
    addAnnotationRemarksPass(MPM);

    return MPM;
  }

  // Optimize globals to try and fold them into constants.
  MPM.addPass(GlobalOptPass());

  // Promote any localized globals to SSA registers.
  MPM.addPass(createModuleToFunctionPassAdaptor(PromotePass()));

  // Linking modules together can lead to duplicate global constants; only
  // keep one copy of each constant.
  MPM.addPass(ConstantMergePass());

  // Remove unused arguments from functions.
  MPM.addPass(DeadArgumentEliminationPass());

  // Reduce the code after globalopt and ipsccp.  Both can open up significant
  // simplification opportunities, and both can propagate functions through
  // function pointers.  When this happens, we often have to resolve varargs
  // calls, etc, so let instcombine do this.
  FunctionPassManager PeepholeFPM;
  PeepholeFPM.addPass(InstCombinePass());
  if (Level.getSpeedupLevel() > 1)
    PeepholeFPM.addPass(AggressiveInstCombinePass());
  invokePeepholeEPCallbacks(PeepholeFPM, Level);

  MPM.addPass(createModuleToFunctionPassAdaptor(std::move(PeepholeFPM),
                                                PTO.EagerlyInvalidateAnalyses));

  // Note: historically, the PruneEH pass was run first to deduce nounwind and
  // generally clean up exception handling overhead. It isn't clear this is
  // valuable as the inliner doesn't currently care whether it is inlining an
  // invoke or a call.
  // Run the inliner now.
  if (EnableModuleInliner) {
    MPM.addPass(ModuleInlinerPass(getInlineParamsFromOptLevel(Level),
                                  UseInlineAdvisor,
                                  ThinOrFullLTOPhase::FullLTOPostLink));
  } else {
    MPM.addPass(ModuleInlinerWrapperPass(
        getInlineParamsFromOptLevel(Level),
        /* MandatoryFirst */ true,
        InlineContext{ThinOrFullLTOPhase::FullLTOPostLink,
                      InlinePass::CGSCCInliner}));
  }

  // Perform context disambiguation after inlining, since that would reduce the
  // amount of additional cloning required to distinguish the allocation
  // contexts.
  if (EnableMemProfContextDisambiguation)
    MPM.addPass(MemProfContextDisambiguation());

  // Optimize globals again after we ran the inliner.
  MPM.addPass(GlobalOptPass());

  // Run the OpenMPOpt pass again after global optimizations.
  MPM.addPass(OpenMPOptPass(ThinOrFullLTOPhase::FullLTOPostLink));

  // Garbage collect dead functions.
  MPM.addPass(GlobalDCEPass(/*InLTOPostLink=*/true));

  // If we didn't decide to inline a function, check to see if we can
  // transform it to pass arguments by value instead of by reference.
  MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(ArgumentPromotionPass()));

  FunctionPassManager FPM;
  // The IPO Passes may leave cruft around. Clean up after them.
  FPM.addPass(InstCombinePass());
  invokePeepholeEPCallbacks(FPM, Level);

  if (EnableConstraintElimination)
    FPM.addPass(ConstraintEliminationPass());

  FPM.addPass(JumpThreadingPass());

  // Do a post inline PGO instrumentation and use pass. This is a context
  // sensitive PGO pass.
  // Note that the helper is handed MPM, not FPM: FPM is still being populated
  // and is only added to MPM further down, after this call.
  if (PGOOpt) {
    if (PGOOpt->CSAction == PGOOptions::CSIRInstr)
      addPGOInstrPasses(MPM, Level, /*RunProfileGen=*/true,
                        /*IsCS=*/true, PGOOpt->AtomicCounterUpdate,
                        PGOOpt->CSProfileGenFile, PGOOpt->ProfileRemappingFile,
                        PGOOpt->FS);
    else if (PGOOpt->CSAction == PGOOptions::CSIRUse)
      addPGOInstrPasses(MPM, Level, /*RunProfileGen=*/false,
                        /*IsCS=*/true, PGOOpt->AtomicCounterUpdate,
                        PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile,
                        PGOOpt->FS);
  }

  // Break up allocas
  FPM.addPass(SROAPass(SROAOptions::ModifyCFG));

  // LTO provides additional opportunities for tailcall elimination due to
  // link-time inlining, and visibility of nocapture attribute.
  FPM.addPass(TailCallElimPass());

  // Run a few AA-driven optimizations here and now to clean up the code.
  MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM),
                                                PTO.EagerlyInvalidateAnalyses));

  MPM.addPass(
      createModuleToPostOrderCGSCCPassAdaptor(PostOrderFunctionAttrsPass()));

  // Require the GlobalsAA analysis for the module so we can query it within
  // MainFPM.
  if (EnableGlobalAnalyses) {
    MPM.addPass(RequireAnalysisPass<GlobalsAA, Module>());
    // Invalidate AAManager so it can be recreated and pick up the newly
    // available GlobalsAA.
    MPM.addPass(
        createModuleToFunctionPassAdaptor(InvalidateAnalysisPass<AAManager>()));
  }

  FunctionPassManager MainFPM;
  MainFPM.addPass(createFunctionToLoopPassAdaptor(
      LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap,
               /*AllowSpeculation=*/true),
      /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/false));

  if (RunNewGVN)
    MainFPM.addPass(NewGVNPass());
  else
    MainFPM.addPass(GVNPass());

  // Remove dead memcpy()'s.
  MainFPM.addPass(MemCpyOptPass());

  // Nuke dead stores.
  MainFPM.addPass(DSEPass());
  MainFPM.addPass(MoveAutoInitPass());
  MainFPM.addPass(MergedLoadStoreMotionPass());

  LoopPassManager LPM;
  if (EnableLoopFlatten && Level.getSpeedupLevel() > 1)
    LPM.addPass(LoopFlattenPass());
  LPM.addPass(IndVarSimplifyPass());
  LPM.addPass(LoopDeletionPass());
  // FIXME: Add loop interchange.

  // Unroll small loops and perform peeling.
  LPM.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
                                 /* OnlyWhenForced= */ !PTO.LoopUnrolling,
                                 PTO.ForgetAllSCEVInLoopUnroll));
  // The loop passes in LPM (LoopFullUnrollPass) do not preserve MemorySSA.
  // *All* loop passes must preserve it, in order to be able to use it.
  MainFPM.addPass(createFunctionToLoopPassAdaptor(
      std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/true));

  MainFPM.addPass(LoopDistributePass());

  addVectorPasses(Level, MainFPM, /* IsFullLTO */ true);

  // Run the OpenMPOpt CGSCC pass again late.
  // This is added to MPM while MainFPM is still being built, so it is
  // scheduled ahead of MainFPM, which is only added to MPM below.
  MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
      OpenMPOptCGSCCPass(ThinOrFullLTOPhase::FullLTOPostLink)));

  invokePeepholeEPCallbacks(MainFPM, Level);
  MainFPM.addPass(JumpThreadingPass());
  MPM.addPass(createModuleToFunctionPassAdaptor(std::move(MainFPM),
                                                PTO.EagerlyInvalidateAnalyses));

  // Lower type metadata and the type.test intrinsic. This pass supports
  // clang's control flow integrity mechanisms (-fsanitize=cfi*) and needs
  // to be run at link time if CFI is enabled. This pass does nothing if
  // CFI is disabled.
  MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr));
  // Run a second time to clean up any type tests left behind by WPD for use
  // in ICP (which is performed earlier than this in the regular LTO pipeline).
  MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));

  // Enable splitting late in the FullLTO post-link pipeline.
  if (EnableHotColdSplit)
    MPM.addPass(HotColdSplittingPass());

  // Add late LTO optimization passes.
  FunctionPassManager LateFPM;

  // LoopSink pass sinks instructions hoisted by LICM, which serves as a
  // canonicalization pass that enables other optimizations. As a result,
  // LoopSink pass needs to be a very late IR pass to avoid undoing LICM
  // result too early.
  LateFPM.addPass(LoopSinkPass());

  // This hoists/decomposes div/rem ops. It should run after other sink/hoist
  // passes to avoid re-sinking, but before SimplifyCFG because it can allow
  // flattening of blocks.
  LateFPM.addPass(DivRemPairsPass());

  // Delete basic blocks, which optimization passes may have killed.
  LateFPM.addPass(SimplifyCFGPass(
      SimplifyCFGOptions().convertSwitchRangeToICmp(true).hoistCommonInsts(
          true)));
  MPM.addPass(createModuleToFunctionPassAdaptor(std::move(LateFPM)));

  // Drop bodies of available externally objects to improve GlobalDCE.
  MPM.addPass(EliminateAvailableExternallyPass());

  // Now that we have optimized the program, discard unreachable functions.
  MPM.addPass(GlobalDCEPass(/*InLTOPostLink=*/true));

  if (PTO.MergeFunctions)
    MPM.addPass(MergeFunctionsPass());

  if (PTO.CallGraphProfile)
    MPM.addPass(CGProfilePass(/*InLTOPostLink=*/true));

  invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level);

  // Emit annotation remarks.
  addAnnotationRemarksPass(MPM);

  return MPM;
}

/// Build the pipeline used at O0.
///
/// Asserts that \p Level is O0. Apart from the always-inliner, the
/// coroutine passes and the annotation remarks pass, everything here is
/// conditional: PGO/pseudo-probe instrumentation, function merging, matrix
/// intrinsic lowering, and the registered extension-point callbacks. Callback
/// groups that operate on CGSCCs, loops or functions are collected in a pass
/// manager of the matching kind and only added (through the appropriate
/// adaptors) when that pass manager ended up non-empty.
///
/// \param Level      Must be OptimizationLevel::O0.
/// \param LTOPreLink When true, the required LTO pre-link passes are added
///                   near the end of the pipeline.
/// \returns The assembled module pass manager.
ModulePassManager PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level,
                                                      bool LTOPreLink) {
  assert(Level == OptimizationLevel::O0 &&
         "buildO0DefaultPipeline should only be used with O0");

  ModulePassManager MPM;

  // Perform pseudo probe instrumentation in O0 mode. This is for the
  // consistency between different build modes. For example, a LTO build can be
  // mixed with an O0 prelink and an O2 postlink. Loading a sample profile in
  // the postlink will require pseudo probe instrumentation in the prelink.
  if (PGOOpt && PGOOpt->PseudoProbeForProfiling)
    MPM.addPass(SampleProfileProbePass(TM));

  // IR-level PGO: instrument when the action is IRInstr, otherwise (IRUse)
  // RunProfileGen is false.
  if (PGOOpt && (PGOOpt->Action == PGOOptions::IRInstr ||
                 PGOOpt->Action == PGOOptions::IRUse))
    addPGOInstrPassesForO0(
        MPM,
        /*RunProfileGen=*/(PGOOpt->Action == PGOOptions::IRInstr),
        /*IsCS=*/false, PGOOpt->AtomicCounterUpdate, PGOOpt->ProfileFile,
        PGOOpt->ProfileRemappingFile, PGOOpt->FS);

  invokePipelineStartEPCallbacks(MPM, Level);

  if (PGOOpt && PGOOpt->DebugInfoForProfiling)
    MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass()));

  invokePipelineEarlySimplificationEPCallbacks(MPM, Level);

  // Build a minimal pipeline based on the semantics required by LLVM,
  // which is just that always inlining occurs. Further, disable generating
  // lifetime intrinsics to avoid enabling further optimizations during
  // code generation.
  MPM.addPass(AlwaysInlinerPass(
      /*InsertLifetimeIntrinsics=*/false));

  if (PTO.MergeFunctions)
    MPM.addPass(MergeFunctionsPass());

  if (EnableMatrix)
    MPM.addPass(
        createModuleToFunctionPassAdaptor(LowerMatrixIntrinsicsPass(true)));

  // For each of the following extension points, skip the work entirely when
  // no callback is registered, and skip the adaptor when the callbacks did not
  // add any pass.
  if (!CGSCCOptimizerLateEPCallbacks.empty()) {
    CGSCCPassManager CGPM;
    invokeCGSCCOptimizerLateEPCallbacks(CGPM, Level);
    if (!CGPM.isEmpty())
      MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM)));
  }
  if (!LateLoopOptimizationsEPCallbacks.empty()) {
    LoopPassManager LPM;
    invokeLateLoopOptimizationsEPCallbacks(LPM, Level);
    if (!LPM.isEmpty()) {
      MPM.addPass(createModuleToFunctionPassAdaptor(
          createFunctionToLoopPassAdaptor(std::move(LPM))));
    }
  }
  if (!LoopOptimizerEndEPCallbacks.empty()) {
    LoopPassManager LPM;
    invokeLoopOptimizerEndEPCallbacks(LPM, Level);
    if (!LPM.isEmpty()) {
      MPM.addPass(createModuleToFunctionPassAdaptor(
          createFunctionToLoopPassAdaptor(std::move(LPM))));
    }
  }
  if (!ScalarOptimizerLateEPCallbacks.empty()) {
    FunctionPassManager FPM;
    invokeScalarOptimizerLateEPCallbacks(FPM, Level);
    if (!FPM.isEmpty())
      MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
  }

  invokeOptimizerEarlyEPCallbacks(MPM, Level);

  if (!VectorizerStartEPCallbacks.empty()) {
    FunctionPassManager FPM;
    invokeVectorizerStartEPCallbacks(FPM, Level);
    if (!FPM.isEmpty())
      MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
  }

  // Coroutine passes are assembled in their own module pipeline and added
  // through CoroConditionalWrapper.
  // NOTE(review): presumably the wrapper skips them for modules that do not
  // use coroutines -- confirm against CoroConditionalWrapper.h.
  ModulePassManager CoroPM;
  CoroPM.addPass(CoroEarlyPass());
  CGSCCPassManager CGPM;
  CGPM.addPass(CoroSplitPass());
  CoroPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM)));
  CoroPM.addPass(CoroCleanupPass());
  CoroPM.addPass(GlobalDCEPass());
  MPM.addPass(CoroConditionalWrapper(std::move(CoroPM)));

  invokeOptimizerLastEPCallbacks(MPM, Level);

  if (LTOPreLink)
    addRequiredLTOPreLinkPasses(MPM);

  MPM.addPass(createModuleToFunctionPassAdaptor(AnnotationRemarksPass()));

  return MPM;
}

/// Build the default alias analysis pipeline.
///
/// Registers BasicAA, ScopedNoAliasAA and TypeBasedAA as function analyses,
/// GlobalsAA as a module analysis when EnableGlobalAnalyses is set, and then
/// lets the target machine (if one is present) register its own analyses.
///
/// \returns The configured AAManager.
AAManager PassBuilder::buildDefaultAAPipeline() {
  AAManager AA;

  // The order in which these are registered determines their priority when
  // being queried.

  // First we register the basic alias analysis that provides the majority of
  // per-function local AA logic. This is a stateless, on-demand local set of
  // AA techniques.
  AA.registerFunctionAnalysis<BasicAA>();

  // Next we query fast, specialized alias analyses that wrap IR-embedded
  // information about aliasing.
  AA.registerFunctionAnalysis<ScopedNoAliasAA>();
  AA.registerFunctionAnalysis<TypeBasedAA>();

  // Add support for querying global aliasing information when available.
  // Because the `AAManager` is a function analysis and `GlobalsAA` is a module
  // analysis, all that the `AAManager` can do is query for any *cached*
  // results from `GlobalsAA` through a readonly proxy.
  if (EnableGlobalAnalyses)
    AA.registerModuleAnalysis<GlobalsAA>();

  // Add target-specific alias analyses.
  if (TM)
    TM->registerDefaultAliasAnalyses(AA);

  return AA;
}