//===- Construction of pass pipelines -------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file provides the implementation of the PassBuilder based on our
/// static pass registry as well as related functionality. It also provides
/// helpers to aid in analyzing, debugging, and testing passes and pass
/// pipelines.
///
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InlineAdvisor.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScopedNoAliasAA.h"
#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/OptimizationLevel.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/PGOOptions.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
#include "llvm/Transforms/Coroutines/CoroCleanup.h"
#include "llvm/Transforms/Coroutines/CoroEarly.h"
#include "llvm/Transforms/Coroutines/CoroElide.h"
#include "llvm/Transforms/Coroutines/CoroSplit.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/Annotation2Metadata.h"
#include "llvm/Transforms/IPO/ArgumentPromotion.h"
#include "llvm/Transforms/IPO/Attributor.h"
#include "llvm/Transforms/IPO/CalledValuePropagation.h"
#include "llvm/Transforms/IPO/ConstantMerge.h"
#include "llvm/Transforms/IPO/CrossDSOCFI.h"
#include "llvm/Transforms/IPO/DeadArgumentElimination.h"
#include "llvm/Transforms/IPO/ElimAvailExtern.h"
#include "llvm/Transforms/IPO/ForceFunctionAttrs.h"
#include "llvm/Transforms/IPO/FunctionAttrs.h"
#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/IPO/GlobalOpt.h"
#include "llvm/Transforms/IPO/GlobalSplit.h"
#include "llvm/Transforms/IPO/HotColdSplitting.h"
#include "llvm/Transforms/IPO/IROutliner.h"
#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
#include "llvm/Transforms/IPO/Inliner.h"
#include "llvm/Transforms/IPO/LowerTypeTests.h"
#include "llvm/Transforms/IPO/MergeFunctions.h"
#include "llvm/Transforms/IPO/ModuleInliner.h"
#include "llvm/Transforms/IPO/OpenMPOpt.h"
#include "llvm/Transforms/IPO/PartialInlining.h"
#include "llvm/Transforms/IPO/SCCP.h"
#include "llvm/Transforms/IPO/SampleProfile.h"
#include "llvm/Transforms/IPO/SampleProfileProbe.h"
#include "llvm/Transforms/IPO/SyntheticCountsPropagation.h"
#include "llvm/Transforms/IPO/WholeProgramDevirt.h"
#include "llvm/Transforms/InstCombine/InstCombine.h"
#include "llvm/Transforms/Instrumentation/CGProfile.h"
#include "llvm/Transforms/Instrumentation/ControlHeightReduction.h"
#include "llvm/Transforms/Instrumentation/InstrOrderFile.h"
#include "llvm/Transforms/Instrumentation/InstrProfiling.h"
#include "llvm/Transforms/Instrumentation/MemProfiler.h"
#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
#include "llvm/Transforms/Scalar/ADCE.h"
#include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h"
#include "llvm/Transforms/Scalar/AnnotationRemarks.h"
#include "llvm/Transforms/Scalar/BDCE.h"
#include "llvm/Transforms/Scalar/CallSiteSplitting.h"
#include "llvm/Transforms/Scalar/ConstraintElimination.h"
#include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h"
#include "llvm/Transforms/Scalar/DFAJumpThreading.h"
#include "llvm/Transforms/Scalar/DeadStoreElimination.h"
#include "llvm/Transforms/Scalar/DivRemPairs.h"
#include "llvm/Transforms/Scalar/EarlyCSE.h"
#include "llvm/Transforms/Scalar/Float2Int.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Scalar/IndVarSimplify.h"
#include "llvm/Transforms/Scalar/InstSimplifyPass.h"
#include "llvm/Transforms/Scalar/JumpThreading.h"
#include "llvm/Transforms/Scalar/LICM.h"
#include "llvm/Transforms/Scalar/LoopDeletion.h"
#include "llvm/Transforms/Scalar/LoopDistribute.h"
#include "llvm/Transforms/Scalar/LoopFlatten.h"
#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
#include "llvm/Transforms/Scalar/LoopInstSimplify.h"
#include "llvm/Transforms/Scalar/LoopInterchange.h"
#include "llvm/Transforms/Scalar/LoopLoadElimination.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Scalar/LoopRotation.h"
#include "llvm/Transforms/Scalar/LoopSimplifyCFG.h"
#include "llvm/Transforms/Scalar/LoopSink.h"
#include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h"
#include "llvm/Transforms/Scalar/LoopUnrollPass.h"
#include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h"
#include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
#include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h"
#include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
#include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h"
#include "llvm/Transforms/Scalar/NewGVN.h"
#include "llvm/Transforms/Scalar/Reassociate.h"
#include "llvm/Transforms/Scalar/SCCP.h"
#include "llvm/Transforms/Scalar/SROA.h"
#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
#include "llvm/Transforms/Scalar/SimplifyCFG.h"
#include "llvm/Transforms/Scalar/SpeculativeExecution.h"
#include "llvm/Transforms/Scalar/TailRecursionElimination.h"
#include "llvm/Transforms/Scalar/WarnMissedTransforms.h"
#include "llvm/Transforms/Utils/AddDiscriminators.h"
#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
#include "llvm/Transforms/Utils/CanonicalizeAliases.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LibCallsShrinkWrap.h"
#include "llvm/Transforms/Utils/Mem2Reg.h"
#include "llvm/Transforms/Utils/NameAnonGlobals.h"
#include "llvm/Transforms/Utils/RelLookupTableConverter.h"
#include "llvm/Transforms/Utils/SimplifyCFGOptions.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/Transforms/Vectorize/VectorCombine.h"

using namespace llvm;

/// Hidden flag choosing the inline advisor mode (default / development /
/// release). The inliner pipelines built further down in this file forward
/// its value to the inliner passes they create.
static cl::opt<InliningAdvisorMode> UseInlineAdvisor(
    "enable-ml-inliner", cl::init(InliningAdvisorMode::Default), cl::Hidden,
    cl::desc("Enable ML policy for inliner. Currently trained for -Oz only"),
    cl::values(clEnumValN(InliningAdvisorMode::Default, "default",
                          "Heuristics-based inliner version."),
               clEnumValN(InliningAdvisorMode::Development, "development",
                          "Use development mode (runtime-loadable model)."),
               clEnumValN(InliningAdvisorMode::Release, "release",
                          "Use release mode (AOT-compiled model).")));

/// Hidden flag, off by default, requesting the synthetic function entry count
/// generation pass.
// NOTE(review): the code that consumes this flag is not in the reviewed part
// of the file.
static cl::opt<bool> EnableSyntheticCounts(
    "enable-npm-synthetic-counts", cl::init(false), cl::Hidden, cl::ZeroOrMore,
    cl::desc("Run synthetic function entry count generation "
             "pass"));

/// Flag to enable inline deferral during PGO.
static cl::opt<bool>
    EnablePGOInlineDeferral("enable-npm-pgo-inline-deferral", cl::init(true),
                            cl::Hidden,
                            cl::desc("Enable inline deferral during PGO"));

/// Hidden flag, off by default, that enables the memory profiler.
static cl::opt<bool> EnableMemProfiler("enable-mem-prof", cl::init(false),
                                       cl::Hidden, cl::ZeroOrMore,
                                       cl::desc("Enable memory profiler"));

/// Hidden flag, off by default, that enables the module inliner.
static cl::opt<bool> EnableModuleInliner("enable-module-inliner",
                                         cl::init(false), cl::Hidden,
                                         cl::desc("Enable module inliner"));

/// Hidden flag, on by default. buildInlinerPipeline() passes it to the
/// ModuleInlinerWrapperPass it constructs.
static cl::opt<bool> PerformMandatoryInliningsFirst(
    "mandatory-inlining-first", cl::init(true), cl::Hidden, cl::ZeroOrMore,
    cl::desc("Perform mandatory inlinings module-wide, before performing "
             "inlining."));

/// Hidden flag, on by default. The function simplification pipeline only
/// requests non-trivial unswitching when this is set *and* the level is O3.
static cl::opt<bool> EnableO3NonTrivialUnswitching(
    "enable-npm-O3-nontrivial-unswitch", cl::init(true), cl::Hidden,
    cl::ZeroOrMore, cl::desc("Enable non-trivial loop unswitching for -O3"));

/// Hidden flag, on by default. Seeds
/// PipelineTuningOptions::EagerlyInvalidateAnalyses in the constructor below.
static cl::opt<bool> EnableEagerlyInvalidateAnalyses(
    "eagerly-invalidate-analyses", cl::init(true), cl::Hidden,
    cl::desc("Eagerly invalidate more analyses in default pipelines"));

/// Hidden flag, off by default. Read by buildInlinerPipeline() when it nests
/// the function simplification pipeline inside the CGSCC walk.
static cl::opt<bool> EnableNoRerunSimplificationPipeline(
    "enable-no-rerun-simplification-pipeline", cl::init(false), cl::Hidden,
    cl::desc(
        "Prevent running the simplification pipeline on a function more "
        "than once in the case that SCC mutations cause a function to be "
        "visited multiple times as long as the function has not been changed"));

/// Hidden flag, off by default. Seeds PipelineTuningOptions::MergeFunctions in
/// the constructor below.
static cl::opt<bool> EnableMergeFunctions(
    "enable-merge-functions", cl::init(false), cl::Hidden,
    cl::desc("Enable function merging as part of the optimization pipeline"));

/// Populate every tuning knob with its default. Some defaults are fixed
/// constants; the others are read from command-line options at construction
/// time, so flags given on the command line become the defaults of every
/// PipelineTuningOptions object created afterwards.
PipelineTuningOptions::PipelineTuningOptions() {
  LoopInterleaving = true;
  LoopVectorization = true;
  SLPVectorization = false;
  LoopUnrolling = true;
  // The next three are taken from options that are not declared in this file.
  ForgetAllSCEVInLoopUnroll = ForgetSCEVInLoopUnroll;
  LicmMssaOptCap = SetLicmMssaOptCap;
  LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap;
  CallGraphProfile = true;
  // These two mirror the file-local flags defined above.
  MergeFunctions = EnableMergeFunctions;
  EagerlyInvalidateAnalyses = EnableEagerlyInvalidateAnalyses;
}

// Command-line options that this file reads but does not define. They are
// only declared here; each declaration has to match its definition exactly.
namespace llvm {

extern cl::opt<unsigned> MaxDevirtIterations;
extern cl::opt<bool> EnableConstraintElimination;
extern cl::opt<bool> EnableFunctionSpecialization;
extern cl::opt<bool> EnableGVNHoist;
extern cl::opt<bool> EnableGVNSink;
extern cl::opt<bool> EnableHotColdSplit;
extern cl::opt<bool> EnableIROutliner;
extern cl::opt<bool> EnableOrderFileInstrumentation;
extern cl::opt<bool> EnableCHR;
extern cl::opt<bool> EnableLoopInterchange;
extern cl::opt<bool> EnableUnrollAndJam;
extern cl::opt<bool> EnableLoopFlatten;
extern cl::opt<bool> EnableDFAJumpThreading;
extern cl::opt<bool> RunNewGVN;
extern cl::opt<bool> RunPartialInlining;
extern cl::opt<bool> ExtraVectorizerPasses;

extern cl::opt<bool> FlattenedProfileUsed;

extern cl::opt<AttributorRunOption> AttributorRun;
extern cl::opt<bool> EnableKnowledgeRetention;

extern cl::opt<bool> EnableMatrix;

extern cl::opt<bool> DisablePreInliner;
extern cl::opt<int> PreInlineThreshold;
} // namespace llvm

/// Invoke every callback stored in PeepholeEPCallbacks, in container order,
/// handing each one the function pass manager being built and the
/// optimization level.
void PassBuilder::invokePeepholeEPCallbacks(FunctionPassManager &FPM,
                                            OptimizationLevel Level) {
  for (auto &C : PeepholeEPCallbacks)
    C(FPM, Level);
}

// Helper to add AnnotationRemarksPass.
235 static void addAnnotationRemarksPass(ModulePassManager &MPM) { 236 FunctionPassManager FPM; 237 FPM.addPass(AnnotationRemarksPass()); 238 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); 239 } 240 241 // Helper to check if the current compilation phase is preparing for LTO 242 static bool isLTOPreLink(ThinOrFullLTOPhase Phase) { 243 return Phase == ThinOrFullLTOPhase::ThinLTOPreLink || 244 Phase == ThinOrFullLTOPhase::FullLTOPreLink; 245 } 246 247 // TODO: Investigate the cost/benefit of tail call elimination on debugging. 248 FunctionPassManager 249 PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, 250 ThinOrFullLTOPhase Phase) { 251 252 FunctionPassManager FPM; 253 254 // Form SSA out of local memory accesses after breaking apart aggregates into 255 // scalars. 256 FPM.addPass(SROAPass()); 257 258 // Catch trivial redundancies 259 FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */)); 260 261 // Hoisting of scalars and load expressions. 262 FPM.addPass( 263 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 264 FPM.addPass(InstCombinePass()); 265 266 FPM.addPass(LibCallsShrinkWrapPass()); 267 268 invokePeepholeEPCallbacks(FPM, Level); 269 270 FPM.addPass( 271 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 272 273 // Form canonically associated expression trees, and simplify the trees using 274 // basic mathematical properties. For example, this will form (nearly) 275 // minimal multiplication trees. 276 FPM.addPass(ReassociatePass()); 277 278 // Add the primary loop simplification pipeline. 279 // FIXME: Currently this is split into two loop pass pipelines because we run 280 // some function passes in between them. These can and should be removed 281 // and/or replaced by scheduling the loop pass equivalents in the correct 282 // positions. But those equivalent passes aren't powerful enough yet. 
283 // Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still 284 // used. We have `LoopSimplifyCFGPass` which isn't yet powerful enough yet to 285 // fully replace `SimplifyCFGPass`, and the closest to the other we have is 286 // `LoopInstSimplify`. 287 LoopPassManager LPM1, LPM2; 288 289 // Simplify the loop body. We do this initially to clean up after other loop 290 // passes run, either when iterating on a loop or on inner loops with 291 // implications on the outer loop. 292 LPM1.addPass(LoopInstSimplifyPass()); 293 LPM1.addPass(LoopSimplifyCFGPass()); 294 295 // Try to remove as much code from the loop header as possible, 296 // to reduce amount of IR that will have to be duplicated. However, 297 // do not perform speculative hoisting the first time as LICM 298 // will destroy metadata that may not need to be destroyed if run 299 // after loop rotation. 300 // TODO: Investigate promotion cap for O1. 301 LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 302 /*AllowSpeculation=*/false)); 303 304 LPM1.addPass(LoopRotatePass(/* Disable header duplication */ true, 305 isLTOPreLink(Phase))); 306 // TODO: Investigate promotion cap for O1. 307 LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 308 /*AllowSpeculation=*/true)); 309 LPM1.addPass(SimpleLoopUnswitchPass()); 310 if (EnableLoopFlatten) 311 LPM1.addPass(LoopFlattenPass()); 312 313 LPM2.addPass(LoopIdiomRecognizePass()); 314 LPM2.addPass(IndVarSimplifyPass()); 315 316 for (auto &C : LateLoopOptimizationsEPCallbacks) 317 C(LPM2, Level); 318 319 LPM2.addPass(LoopDeletionPass()); 320 321 if (EnableLoopInterchange) 322 LPM2.addPass(LoopInterchangePass()); 323 324 // Do not enable unrolling in PreLinkThinLTO phase during sample PGO 325 // because it changes IR to makes profile annotation in back compile 326 // inaccurate. 
The normal unroller doesn't pay attention to forced full unroll 327 // attributes so we need to make sure and allow the full unroll pass to pay 328 // attention to it. 329 if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt || 330 PGOOpt->Action != PGOOptions::SampleUse) 331 LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(), 332 /* OnlyWhenForced= */ !PTO.LoopUnrolling, 333 PTO.ForgetAllSCEVInLoopUnroll)); 334 335 for (auto &C : LoopOptimizerEndEPCallbacks) 336 C(LPM2, Level); 337 338 // We provide the opt remark emitter pass for LICM to use. We only need to do 339 // this once as it is immutable. 340 FPM.addPass( 341 RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>()); 342 FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1), 343 /*UseMemorySSA=*/true, 344 /*UseBlockFrequencyInfo=*/true)); 345 FPM.addPass( 346 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 347 FPM.addPass(InstCombinePass()); 348 // The loop passes in LPM2 (LoopFullUnrollPass) do not preserve MemorySSA. 349 // *All* loop passes must preserve it, in order to be able to use it. 350 FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2), 351 /*UseMemorySSA=*/false, 352 /*UseBlockFrequencyInfo=*/false)); 353 354 // Delete small array after loop unroll. 355 FPM.addPass(SROAPass()); 356 357 // Specially optimize memory movement as it doesn't look like dataflow in SSA. 358 FPM.addPass(MemCpyOptPass()); 359 360 // Sparse conditional constant propagation. 361 // FIXME: It isn't clear why we do this *after* loop passes rather than 362 // before... 363 FPM.addPass(SCCPPass()); 364 365 // Delete dead bit computations (instcombine runs after to fold away the dead 366 // computations, and then ADCE will run later to exploit any new DCE 367 // opportunities that creates). 368 FPM.addPass(BDCEPass()); 369 370 // Run instcombine after redundancy and dead bit elimination to exploit 371 // opportunities opened up by them. 
372 FPM.addPass(InstCombinePass()); 373 invokePeepholeEPCallbacks(FPM, Level); 374 375 FPM.addPass(CoroElidePass()); 376 377 for (auto &C : ScalarOptimizerLateEPCallbacks) 378 C(FPM, Level); 379 380 // Finally, do an expensive DCE pass to catch all the dead code exposed by 381 // the simplifications and basic cleanup after all the simplifications. 382 // TODO: Investigate if this is too expensive. 383 FPM.addPass(ADCEPass()); 384 FPM.addPass( 385 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 386 FPM.addPass(InstCombinePass()); 387 invokePeepholeEPCallbacks(FPM, Level); 388 389 return FPM; 390 } 391 392 FunctionPassManager 393 PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, 394 ThinOrFullLTOPhase Phase) { 395 assert(Level != OptimizationLevel::O0 && "Must request optimizations!"); 396 397 // The O1 pipeline has a separate pipeline creation function to simplify 398 // construction readability. 399 if (Level.getSpeedupLevel() == 1) 400 return buildO1FunctionSimplificationPipeline(Level, Phase); 401 402 FunctionPassManager FPM; 403 404 // Form SSA out of local memory accesses after breaking apart aggregates into 405 // scalars. 406 FPM.addPass(SROAPass()); 407 408 // Catch trivial redundancies 409 FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */)); 410 if (EnableKnowledgeRetention) 411 FPM.addPass(AssumeSimplifyPass()); 412 413 // Hoisting of scalars and load expressions. 414 if (EnableGVNHoist) 415 FPM.addPass(GVNHoistPass()); 416 417 // Global value numbering based sinking. 418 if (EnableGVNSink) { 419 FPM.addPass(GVNSinkPass()); 420 FPM.addPass( 421 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 422 } 423 424 if (EnableConstraintElimination) 425 FPM.addPass(ConstraintEliminationPass()); 426 427 // Speculative execution if the target has divergent branches; otherwise nop. 
428 FPM.addPass(SpeculativeExecutionPass(/* OnlyIfDivergentTarget =*/true)); 429 430 // Optimize based on known information about branches, and cleanup afterward. 431 FPM.addPass(JumpThreadingPass()); 432 FPM.addPass(CorrelatedValuePropagationPass()); 433 434 FPM.addPass( 435 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 436 FPM.addPass(InstCombinePass()); 437 if (Level == OptimizationLevel::O3) 438 FPM.addPass(AggressiveInstCombinePass()); 439 440 if (!Level.isOptimizingForSize()) 441 FPM.addPass(LibCallsShrinkWrapPass()); 442 443 invokePeepholeEPCallbacks(FPM, Level); 444 445 // For PGO use pipeline, try to optimize memory intrinsics such as memcpy 446 // using the size value profile. Don't perform this when optimizing for size. 447 if (PGOOpt && PGOOpt->Action == PGOOptions::IRUse && 448 !Level.isOptimizingForSize()) 449 FPM.addPass(PGOMemOPSizeOpt()); 450 451 FPM.addPass(TailCallElimPass()); 452 FPM.addPass( 453 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 454 455 // Form canonically associated expression trees, and simplify the trees using 456 // basic mathematical properties. For example, this will form (nearly) 457 // minimal multiplication trees. 458 FPM.addPass(ReassociatePass()); 459 460 // Add the primary loop simplification pipeline. 461 // FIXME: Currently this is split into two loop pass pipelines because we run 462 // some function passes in between them. These can and should be removed 463 // and/or replaced by scheduling the loop pass equivalents in the correct 464 // positions. But those equivalent passes aren't powerful enough yet. 465 // Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still 466 // used. We have `LoopSimplifyCFGPass` which isn't yet powerful enough yet to 467 // fully replace `SimplifyCFGPass`, and the closest to the other we have is 468 // `LoopInstSimplify`. 469 LoopPassManager LPM1, LPM2; 470 471 // Simplify the loop body. 
We do this initially to clean up after other loop 472 // passes run, either when iterating on a loop or on inner loops with 473 // implications on the outer loop. 474 LPM1.addPass(LoopInstSimplifyPass()); 475 LPM1.addPass(LoopSimplifyCFGPass()); 476 477 // Try to remove as much code from the loop header as possible, 478 // to reduce amount of IR that will have to be duplicated. However, 479 // do not perform speculative hoisting the first time as LICM 480 // will destroy metadata that may not need to be destroyed if run 481 // after loop rotation. 482 // TODO: Investigate promotion cap for O1. 483 LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 484 /*AllowSpeculation=*/false)); 485 486 // Disable header duplication in loop rotation at -Oz. 487 LPM1.addPass( 488 LoopRotatePass(Level != OptimizationLevel::Oz, isLTOPreLink(Phase))); 489 // TODO: Investigate promotion cap for O1. 490 LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 491 /*AllowSpeculation=*/true)); 492 LPM1.addPass( 493 SimpleLoopUnswitchPass(/* NonTrivial */ Level == OptimizationLevel::O3 && 494 EnableO3NonTrivialUnswitching)); 495 if (EnableLoopFlatten) 496 LPM1.addPass(LoopFlattenPass()); 497 498 LPM2.addPass(LoopIdiomRecognizePass()); 499 LPM2.addPass(IndVarSimplifyPass()); 500 501 for (auto &C : LateLoopOptimizationsEPCallbacks) 502 C(LPM2, Level); 503 504 LPM2.addPass(LoopDeletionPass()); 505 506 if (EnableLoopInterchange) 507 LPM2.addPass(LoopInterchangePass()); 508 509 // Do not enable unrolling in PreLinkThinLTO phase during sample PGO 510 // because it changes IR to makes profile annotation in back compile 511 // inaccurate. The normal unroller doesn't pay attention to forced full unroll 512 // attributes so we need to make sure and allow the full unroll pass to pay 513 // attention to it. 
514 if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt || 515 PGOOpt->Action != PGOOptions::SampleUse) 516 LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(), 517 /* OnlyWhenForced= */ !PTO.LoopUnrolling, 518 PTO.ForgetAllSCEVInLoopUnroll)); 519 520 for (auto &C : LoopOptimizerEndEPCallbacks) 521 C(LPM2, Level); 522 523 // We provide the opt remark emitter pass for LICM to use. We only need to do 524 // this once as it is immutable. 525 FPM.addPass( 526 RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>()); 527 FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1), 528 /*UseMemorySSA=*/true, 529 /*UseBlockFrequencyInfo=*/true)); 530 FPM.addPass( 531 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 532 FPM.addPass(InstCombinePass()); 533 // The loop passes in LPM2 (LoopIdiomRecognizePass, IndVarSimplifyPass, 534 // LoopDeletionPass and LoopFullUnrollPass) do not preserve MemorySSA. 535 // *All* loop passes must preserve it, in order to be able to use it. 536 FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2), 537 /*UseMemorySSA=*/false, 538 /*UseBlockFrequencyInfo=*/false)); 539 540 // Delete small array after loop unroll. 541 FPM.addPass(SROAPass()); 542 543 // The matrix extension can introduce large vector operations early, which can 544 // benefit from running vector-combine early on. 545 if (EnableMatrix) 546 FPM.addPass(VectorCombinePass(/*ScalarizationOnly=*/true)); 547 548 // Eliminate redundancies. 549 FPM.addPass(MergedLoadStoreMotionPass()); 550 if (RunNewGVN) 551 FPM.addPass(NewGVNPass()); 552 else 553 FPM.addPass(GVNPass()); 554 555 // Sparse conditional constant propagation. 556 // FIXME: It isn't clear why we do this *after* loop passes rather than 557 // before... 558 FPM.addPass(SCCPPass()); 559 560 // Delete dead bit computations (instcombine runs after to fold away the dead 561 // computations, and then ADCE will run later to exploit any new DCE 562 // opportunities that creates). 
563 FPM.addPass(BDCEPass()); 564 565 // Run instcombine after redundancy and dead bit elimination to exploit 566 // opportunities opened up by them. 567 FPM.addPass(InstCombinePass()); 568 invokePeepholeEPCallbacks(FPM, Level); 569 570 // Re-consider control flow based optimizations after redundancy elimination, 571 // redo DCE, etc. 572 if (EnableDFAJumpThreading && Level.getSizeLevel() == 0) 573 FPM.addPass(DFAJumpThreadingPass()); 574 575 FPM.addPass(JumpThreadingPass()); 576 FPM.addPass(CorrelatedValuePropagationPass()); 577 578 // Finally, do an expensive DCE pass to catch all the dead code exposed by 579 // the simplifications and basic cleanup after all the simplifications. 580 // TODO: Investigate if this is too expensive. 581 FPM.addPass(ADCEPass()); 582 583 // Specially optimize memory movement as it doesn't look like dataflow in SSA. 584 FPM.addPass(MemCpyOptPass()); 585 586 FPM.addPass(DSEPass()); 587 FPM.addPass(createFunctionToLoopPassAdaptor( 588 LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 589 /*AllowSpeculation=*/true), 590 /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); 591 592 FPM.addPass(CoroElidePass()); 593 594 for (auto &C : ScalarOptimizerLateEPCallbacks) 595 C(FPM, Level); 596 597 FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions() 598 .convertSwitchRangeToICmp(true) 599 .hoistCommonInsts(true) 600 .sinkCommonInsts(true))); 601 FPM.addPass(InstCombinePass()); 602 invokePeepholeEPCallbacks(FPM, Level); 603 604 if (EnableCHR && Level == OptimizationLevel::O3 && PGOOpt && 605 (PGOOpt->Action == PGOOptions::IRUse || 606 PGOOpt->Action == PGOOptions::SampleUse)) 607 FPM.addPass(ControlHeightReductionPass()); 608 609 return FPM; 610 } 611 612 void PassBuilder::addRequiredLTOPreLinkPasses(ModulePassManager &MPM) { 613 MPM.addPass(CanonicalizeAliasesPass()); 614 MPM.addPass(NameAnonGlobalPass()); 615 } 616 617 void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, 618 OptimizationLevel Level, bool RunProfileGen, 619 
bool IsCS, std::string ProfileFile, 620 std::string ProfileRemappingFile) { 621 assert(Level != OptimizationLevel::O0 && "Not expecting O0 here!"); 622 if (!IsCS && !DisablePreInliner) { 623 InlineParams IP; 624 625 IP.DefaultThreshold = PreInlineThreshold; 626 627 // FIXME: The hint threshold has the same value used by the regular inliner 628 // when not optimzing for size. This should probably be lowered after 629 // performance testing. 630 // FIXME: this comment is cargo culted from the old pass manager, revisit). 631 IP.HintThreshold = Level.isOptimizingForSize() ? PreInlineThreshold : 325; 632 ModuleInlinerWrapperPass MIWP(IP); 633 CGSCCPassManager &CGPipeline = MIWP.getPM(); 634 635 FunctionPassManager FPM; 636 FPM.addPass(SROAPass()); 637 FPM.addPass(EarlyCSEPass()); // Catch trivial redundancies. 638 FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp( 639 true))); // Merge & remove basic blocks. 640 FPM.addPass(InstCombinePass()); // Combine silly sequences. 641 invokePeepholeEPCallbacks(FPM, Level); 642 643 CGPipeline.addPass(createCGSCCToFunctionPassAdaptor( 644 std::move(FPM), PTO.EagerlyInvalidateAnalyses)); 645 646 MPM.addPass(std::move(MIWP)); 647 648 // Delete anything that is now dead to make sure that we don't instrument 649 // dead code. Instrumentation can end up keeping dead code around and 650 // dramatically increase code size. 651 MPM.addPass(GlobalDCEPass()); 652 } 653 654 if (!RunProfileGen) { 655 assert(!ProfileFile.empty() && "Profile use expecting a profile file!"); 656 MPM.addPass(PGOInstrumentationUse(ProfileFile, ProfileRemappingFile, IsCS)); 657 // Cache ProfileSummaryAnalysis once to avoid the potential need to insert 658 // RequireAnalysisPass for PSI before subsequent non-module passes. 659 MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); 660 return; 661 } 662 663 // Perform PGO instrumentation. 
664 MPM.addPass(PGOInstrumentationGen(IsCS)); 665 666 FunctionPassManager FPM; 667 // Disable header duplication in loop rotation at -Oz. 668 FPM.addPass(createFunctionToLoopPassAdaptor( 669 LoopRotatePass(Level != OptimizationLevel::Oz), /*UseMemorySSA=*/false, 670 /*UseBlockFrequencyInfo=*/false)); 671 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM), 672 PTO.EagerlyInvalidateAnalyses)); 673 674 // Add the profile lowering pass. 675 InstrProfOptions Options; 676 if (!ProfileFile.empty()) 677 Options.InstrProfileOutput = ProfileFile; 678 // Do counter promotion at Level greater than O0. 679 Options.DoCounterPromotion = true; 680 Options.UseBFIInPromotion = IsCS; 681 MPM.addPass(InstrProfiling(Options, IsCS)); 682 } 683 684 void PassBuilder::addPGOInstrPassesForO0(ModulePassManager &MPM, 685 bool RunProfileGen, bool IsCS, 686 std::string ProfileFile, 687 std::string ProfileRemappingFile) { 688 if (!RunProfileGen) { 689 assert(!ProfileFile.empty() && "Profile use expecting a profile file!"); 690 MPM.addPass(PGOInstrumentationUse(ProfileFile, ProfileRemappingFile, IsCS)); 691 // Cache ProfileSummaryAnalysis once to avoid the potential need to insert 692 // RequireAnalysisPass for PSI before subsequent non-module passes. 693 MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); 694 return; 695 } 696 697 // Perform PGO instrumentation. 698 MPM.addPass(PGOInstrumentationGen(IsCS)); 699 // Add the profile lowering pass. 700 InstrProfOptions Options; 701 if (!ProfileFile.empty()) 702 Options.InstrProfileOutput = ProfileFile; 703 // Do not do counter promotion at O0. 
704 Options.DoCounterPromotion = false; 705 Options.UseBFIInPromotion = IsCS; 706 MPM.addPass(InstrProfiling(Options, IsCS)); 707 } 708 709 static InlineParams getInlineParamsFromOptLevel(OptimizationLevel Level) { 710 return getInlineParams(Level.getSpeedupLevel(), Level.getSizeLevel()); 711 } 712 713 ModuleInlinerWrapperPass 714 PassBuilder::buildInlinerPipeline(OptimizationLevel Level, 715 ThinOrFullLTOPhase Phase) { 716 InlineParams IP = getInlineParamsFromOptLevel(Level); 717 if (Phase == ThinOrFullLTOPhase::ThinLTOPreLink && PGOOpt && 718 PGOOpt->Action == PGOOptions::SampleUse) 719 IP.HotCallSiteThreshold = 0; 720 721 if (PGOOpt) 722 IP.EnableDeferral = EnablePGOInlineDeferral; 723 724 ModuleInlinerWrapperPass MIWP(IP, PerformMandatoryInliningsFirst, 725 UseInlineAdvisor, MaxDevirtIterations); 726 727 // Require the GlobalsAA analysis for the module so we can query it within 728 // the CGSCC pipeline. 729 MIWP.addModulePass(RequireAnalysisPass<GlobalsAA, Module>()); 730 // Invalidate AAManager so it can be recreated and pick up the newly available 731 // GlobalsAA. 732 MIWP.addModulePass( 733 createModuleToFunctionPassAdaptor(InvalidateAnalysisPass<AAManager>())); 734 735 // Require the ProfileSummaryAnalysis for the module so we can query it within 736 // the inliner pass. 737 MIWP.addModulePass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); 738 739 // Now begin the main postorder CGSCC pipeline. 740 // FIXME: The current CGSCC pipeline has its origins in the legacy pass 741 // manager and trying to emulate its precise behavior. Much of this doesn't 742 // make a lot of sense and we should revisit the core CGSCC structure. 743 CGSCCPassManager &MainCGPipeline = MIWP.getPM(); 744 745 // Note: historically, the PruneEH pass was run first to deduce nounwind and 746 // generally clean up exception handling overhead. It isn't clear this is 747 // valuable as the inliner doesn't currently care whether it is inlining an 748 // invoke or a call. 
749 750 if (AttributorRun & AttributorRunOption::CGSCC) 751 MainCGPipeline.addPass(AttributorCGSCCPass()); 752 753 // Now deduce any function attributes based in the current code. 754 MainCGPipeline.addPass(PostOrderFunctionAttrsPass()); 755 756 // When at O3 add argument promotion to the pass pipeline. 757 // FIXME: It isn't at all clear why this should be limited to O3. 758 if (Level == OptimizationLevel::O3) 759 MainCGPipeline.addPass(ArgumentPromotionPass()); 760 761 // Try to perform OpenMP specific optimizations. This is a (quick!) no-op if 762 // there are no OpenMP runtime calls present in the module. 763 if (Level == OptimizationLevel::O2 || Level == OptimizationLevel::O3) 764 MainCGPipeline.addPass(OpenMPOptCGSCCPass()); 765 766 for (auto &C : CGSCCOptimizerLateEPCallbacks) 767 C(MainCGPipeline, Level); 768 769 // Lastly, add the core function simplification pipeline nested inside the 770 // CGSCC walk. 771 MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor( 772 buildFunctionSimplificationPipeline(Level, Phase), 773 PTO.EagerlyInvalidateAnalyses, EnableNoRerunSimplificationPipeline)); 774 775 MainCGPipeline.addPass(CoroSplitPass(Level != OptimizationLevel::O0)); 776 777 if (EnableNoRerunSimplificationPipeline) 778 MIWP.addLateModulePass(createModuleToFunctionPassAdaptor( 779 InvalidateAnalysisPass<ShouldNotRunFunctionPassesAnalysis>())); 780 781 return MIWP; 782 } 783 784 ModulePassManager 785 PassBuilder::buildModuleInlinerPipeline(OptimizationLevel Level, 786 ThinOrFullLTOPhase Phase) { 787 ModulePassManager MPM; 788 789 InlineParams IP = getInlineParamsFromOptLevel(Level); 790 if (Phase == ThinOrFullLTOPhase::ThinLTOPreLink && PGOOpt && 791 PGOOpt->Action == PGOOptions::SampleUse) 792 IP.HotCallSiteThreshold = 0; 793 794 if (PGOOpt) 795 IP.EnableDeferral = EnablePGOInlineDeferral; 796 797 // The inline deferral logic is used to avoid losing some 798 // inlining chance in future. 
It is helpful in SCC inliner, in which 799 // inlining is processed in bottom-up order. 800 // While in module inliner, the inlining order is a priority-based order 801 // by default. The inline deferral is unnecessary there. So we disable the 802 // inline deferral logic in module inliner. 803 IP.EnableDeferral = false; 804 805 MPM.addPass(ModuleInlinerPass(IP, UseInlineAdvisor)); 806 807 MPM.addPass(createModuleToFunctionPassAdaptor( 808 buildFunctionSimplificationPipeline(Level, Phase), 809 PTO.EagerlyInvalidateAnalyses)); 810 811 MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor( 812 CoroSplitPass(Level != OptimizationLevel::O0))); 813 814 return MPM; 815 } 816 817 ModulePassManager 818 PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, 819 ThinOrFullLTOPhase Phase) { 820 ModulePassManager MPM; 821 822 // Place pseudo probe instrumentation as the first pass of the pipeline to 823 // minimize the impact of optimization changes. 824 if (PGOOpt && PGOOpt->PseudoProbeForProfiling && 825 Phase != ThinOrFullLTOPhase::ThinLTOPostLink) 826 MPM.addPass(SampleProfileProbePass(TM)); 827 828 bool HasSampleProfile = PGOOpt && (PGOOpt->Action == PGOOptions::SampleUse); 829 830 // In ThinLTO mode, when flattened profile is used, all the available 831 // profile information will be annotated in PreLink phase so there is 832 // no need to load the profile again in PostLink. 833 bool LoadSampleProfile = 834 HasSampleProfile && 835 !(FlattenedProfileUsed && Phase == ThinOrFullLTOPhase::ThinLTOPostLink); 836 837 // During the ThinLTO backend phase we perform early indirect call promotion 838 // here, before globalopt. Otherwise imported available_externally functions 839 // look unreferenced and are removed. If we are going to load the sample 840 // profile then defer until later. 841 // TODO: See if we can move later and consolidate with the location where 842 // we perform ICP when we are loading a sample profile. 
843 // TODO: We pass HasSampleProfile (whether there was a sample profile file 844 // passed to the compile) to the SamplePGO flag of ICP. This is used to 845 // determine whether the new direct calls are annotated with prof metadata. 846 // Ideally this should be determined from whether the IR is annotated with 847 // sample profile, and not whether the a sample profile was provided on the 848 // command line. E.g. for flattened profiles where we will not be reloading 849 // the sample profile in the ThinLTO backend, we ideally shouldn't have to 850 // provide the sample profile file. 851 if (Phase == ThinOrFullLTOPhase::ThinLTOPostLink && !LoadSampleProfile) 852 MPM.addPass(PGOIndirectCallPromotion(true /* InLTO */, HasSampleProfile)); 853 854 // Do basic inference of function attributes from known properties of system 855 // libraries and other oracles. 856 MPM.addPass(InferFunctionAttrsPass()); 857 858 // Create an early function pass manager to cleanup the output of the 859 // frontend. 860 FunctionPassManager EarlyFPM; 861 // Lower llvm.expect to metadata before attempting transforms. 862 // Compare/branch metadata may alter the behavior of passes like SimplifyCFG. 863 EarlyFPM.addPass(LowerExpectIntrinsicPass()); 864 EarlyFPM.addPass(SimplifyCFGPass()); 865 EarlyFPM.addPass(SROAPass()); 866 EarlyFPM.addPass(EarlyCSEPass()); 867 EarlyFPM.addPass(CoroEarlyPass()); 868 if (Level == OptimizationLevel::O3) 869 EarlyFPM.addPass(CallSiteSplittingPass()); 870 871 // In SamplePGO ThinLTO backend, we need instcombine before profile annotation 872 // to convert bitcast to direct calls so that they can be inlined during the 873 // profile annotation prepration step. 874 // More details about SamplePGO design can be found in: 875 // https://research.google.com/pubs/pub45290.html 876 // FIXME: revisit how SampleProfileLoad/Inliner/ICP is structured. 
877 if (LoadSampleProfile) 878 EarlyFPM.addPass(InstCombinePass()); 879 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(EarlyFPM), 880 PTO.EagerlyInvalidateAnalyses)); 881 882 if (LoadSampleProfile) { 883 // Annotate sample profile right after early FPM to ensure freshness of 884 // the debug info. 885 MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile, 886 PGOOpt->ProfileRemappingFile, Phase)); 887 // Cache ProfileSummaryAnalysis once to avoid the potential need to insert 888 // RequireAnalysisPass for PSI before subsequent non-module passes. 889 MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); 890 // Do not invoke ICP in the LTOPrelink phase as it makes it hard 891 // for the profile annotation to be accurate in the LTO backend. 892 if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink && 893 Phase != ThinOrFullLTOPhase::FullLTOPreLink) 894 // We perform early indirect call promotion here, before globalopt. 895 // This is important for the ThinLTO backend phase because otherwise 896 // imported available_externally functions look unreferenced and are 897 // removed. 898 MPM.addPass( 899 PGOIndirectCallPromotion(true /* IsInLTO */, true /* SamplePGO */)); 900 } 901 902 // Try to perform OpenMP specific optimizations on the module. This is a 903 // (quick!) no-op if there are no OpenMP runtime calls present in the module. 904 if (Level != OptimizationLevel::O0) 905 MPM.addPass(OpenMPOptPass()); 906 907 if (AttributorRun & AttributorRunOption::MODULE) 908 MPM.addPass(AttributorPass()); 909 910 // Lower type metadata and the type.test intrinsic in the ThinLTO 911 // post link pipeline after ICP. This is to enable usage of the type 912 // tests in ICP sequences. 913 if (Phase == ThinOrFullLTOPhase::ThinLTOPostLink) 914 MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); 915 916 for (auto &C : PipelineEarlySimplificationEPCallbacks) 917 C(MPM, Level); 918 919 // Specialize functions with IPSCCP. 
920 if (EnableFunctionSpecialization && Level == OptimizationLevel::O3) 921 MPM.addPass(FunctionSpecializationPass()); 922 923 // Interprocedural constant propagation now that basic cleanup has occurred 924 // and prior to optimizing globals. 925 // FIXME: This position in the pipeline hasn't been carefully considered in 926 // years, it should be re-analyzed. 927 MPM.addPass(IPSCCPPass()); 928 929 // Attach metadata to indirect call sites indicating the set of functions 930 // they may target at run-time. This should follow IPSCCP. 931 MPM.addPass(CalledValuePropagationPass()); 932 933 // Optimize globals to try and fold them into constants. 934 MPM.addPass(GlobalOptPass()); 935 936 // Promote any localized globals to SSA registers. 937 // FIXME: Should this instead by a run of SROA? 938 // FIXME: We should probably run instcombine and simplifycfg afterward to 939 // delete control flows that are dead once globals have been folded to 940 // constants. 941 MPM.addPass(createModuleToFunctionPassAdaptor(PromotePass())); 942 943 // Remove any dead arguments exposed by cleanups and constant folding 944 // globals. 945 MPM.addPass(DeadArgumentEliminationPass()); 946 947 // Create a small function pass pipeline to cleanup after all the global 948 // optimizations. 949 FunctionPassManager GlobalCleanupPM; 950 GlobalCleanupPM.addPass(InstCombinePass()); 951 invokePeepholeEPCallbacks(GlobalCleanupPM, Level); 952 953 GlobalCleanupPM.addPass( 954 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 955 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(GlobalCleanupPM), 956 PTO.EagerlyInvalidateAnalyses)); 957 958 // Add all the requested passes for instrumentation PGO, if requested. 
959 if (PGOOpt && Phase != ThinOrFullLTOPhase::ThinLTOPostLink && 960 (PGOOpt->Action == PGOOptions::IRInstr || 961 PGOOpt->Action == PGOOptions::IRUse)) { 962 addPGOInstrPasses(MPM, Level, 963 /* RunProfileGen */ PGOOpt->Action == PGOOptions::IRInstr, 964 /* IsCS */ false, PGOOpt->ProfileFile, 965 PGOOpt->ProfileRemappingFile); 966 MPM.addPass(PGOIndirectCallPromotion(false, false)); 967 } 968 if (PGOOpt && Phase != ThinOrFullLTOPhase::ThinLTOPostLink && 969 PGOOpt->CSAction == PGOOptions::CSIRInstr) 970 MPM.addPass(PGOInstrumentationGenCreateVar(PGOOpt->CSProfileGenFile)); 971 972 // Synthesize function entry counts for non-PGO compilation. 973 if (EnableSyntheticCounts && !PGOOpt) 974 MPM.addPass(SyntheticCountsPropagation()); 975 976 if (EnableModuleInliner) 977 MPM.addPass(buildModuleInlinerPipeline(Level, Phase)); 978 else 979 MPM.addPass(buildInlinerPipeline(Level, Phase)); 980 981 if (EnableMemProfiler && Phase != ThinOrFullLTOPhase::ThinLTOPreLink) { 982 MPM.addPass(createModuleToFunctionPassAdaptor(MemProfilerPass())); 983 MPM.addPass(ModuleMemProfilerPass()); 984 } 985 986 return MPM; 987 } 988 989 /// TODO: Should LTO cause any differences to this set of passes? 990 void PassBuilder::addVectorPasses(OptimizationLevel Level, 991 FunctionPassManager &FPM, bool IsFullLTO) { 992 FPM.addPass(LoopVectorizePass( 993 LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization))); 994 995 if (IsFullLTO) { 996 // The vectorizer may have significantly shortened a loop body; unroll 997 // again. Unroll small loops to hide loop backedge latency and saturate any 998 // parallel execution resources of an out-of-order processor. We also then 999 // need to clean up redundancies and loop invariant code. 1000 // FIXME: It would be really good to use a loop-integrated instruction 1001 // combiner for cleanup here so that the unrolling and LICM can be pipelined 1002 // across the loop nests. 
1003 // We do UnrollAndJam in a separate LPM to ensure it happens before unroll 1004 if (EnableUnrollAndJam && PTO.LoopUnrolling) 1005 FPM.addPass(createFunctionToLoopPassAdaptor( 1006 LoopUnrollAndJamPass(Level.getSpeedupLevel()))); 1007 FPM.addPass(LoopUnrollPass(LoopUnrollOptions( 1008 Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling, 1009 PTO.ForgetAllSCEVInLoopUnroll))); 1010 FPM.addPass(WarnMissedTransformationsPass()); 1011 } 1012 1013 if (!IsFullLTO) { 1014 // Eliminate loads by forwarding stores from the previous iteration to loads 1015 // of the current iteration. 1016 FPM.addPass(LoopLoadEliminationPass()); 1017 } 1018 // Cleanup after the loop optimization passes. 1019 FPM.addPass(InstCombinePass()); 1020 1021 if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) { 1022 ExtraVectorPassManager ExtraPasses; 1023 // At higher optimization levels, try to clean up any runtime overlap and 1024 // alignment checks inserted by the vectorizer. We want to track correlated 1025 // runtime checks for two inner loops in the same outer loop, fold any 1026 // common computations, hoist loop-invariant aspects out of any outer loop, 1027 // and unswitch the runtime checks if possible. Once hoisted, we may have 1028 // dead (or speculatable) control flows or more combining opportunities. 
1029 ExtraPasses.addPass(EarlyCSEPass()); 1030 ExtraPasses.addPass(CorrelatedValuePropagationPass()); 1031 ExtraPasses.addPass(InstCombinePass()); 1032 LoopPassManager LPM; 1033 LPM.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 1034 /*AllowSpeculation=*/true)); 1035 LPM.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level == 1036 OptimizationLevel::O3)); 1037 ExtraPasses.addPass( 1038 RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>()); 1039 ExtraPasses.addPass( 1040 createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/true, 1041 /*UseBlockFrequencyInfo=*/true)); 1042 ExtraPasses.addPass( 1043 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 1044 ExtraPasses.addPass(InstCombinePass()); 1045 FPM.addPass(std::move(ExtraPasses)); 1046 } 1047 1048 // Now that we've formed fast to execute loop structures, we do further 1049 // optimizations. These are run afterward as they might block doing complex 1050 // analyses and transforms such as what are needed for loop vectorization. 1051 1052 // Cleanup after loop vectorization, etc. Simplification passes like CVP and 1053 // GVN, loop transforms, and others have already run, so it's now better to 1054 // convert to more optimized IR using more aggressive simplify CFG options. 1055 // The extra sinking transform can create larger basic blocks, so do this 1056 // before SLP vectorization. 1057 FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions() 1058 .forwardSwitchCondToPhi(true) 1059 .convertSwitchRangeToICmp(true) 1060 .convertSwitchToLookupTable(true) 1061 .needCanonicalLoops(false) 1062 .hoistCommonInsts(true) 1063 .sinkCommonInsts(true))); 1064 1065 if (IsFullLTO) { 1066 FPM.addPass(SCCPPass()); 1067 FPM.addPass(InstCombinePass()); 1068 FPM.addPass(BDCEPass()); 1069 } 1070 1071 // Optimize parallel scalar instruction chains into SIMD instructions. 
1072 if (PTO.SLPVectorization) { 1073 FPM.addPass(SLPVectorizerPass()); 1074 if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) { 1075 FPM.addPass(EarlyCSEPass()); 1076 } 1077 } 1078 // Enhance/cleanup vector code. 1079 FPM.addPass(VectorCombinePass()); 1080 1081 if (!IsFullLTO) { 1082 FPM.addPass(InstCombinePass()); 1083 // Unroll small loops to hide loop backedge latency and saturate any 1084 // parallel execution resources of an out-of-order processor. We also then 1085 // need to clean up redundancies and loop invariant code. 1086 // FIXME: It would be really good to use a loop-integrated instruction 1087 // combiner for cleanup here so that the unrolling and LICM can be pipelined 1088 // across the loop nests. 1089 // We do UnrollAndJam in a separate LPM to ensure it happens before unroll 1090 if (EnableUnrollAndJam && PTO.LoopUnrolling) { 1091 FPM.addPass(createFunctionToLoopPassAdaptor( 1092 LoopUnrollAndJamPass(Level.getSpeedupLevel()))); 1093 } 1094 FPM.addPass(LoopUnrollPass(LoopUnrollOptions( 1095 Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling, 1096 PTO.ForgetAllSCEVInLoopUnroll))); 1097 FPM.addPass(WarnMissedTransformationsPass()); 1098 FPM.addPass(InstCombinePass()); 1099 FPM.addPass( 1100 RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>()); 1101 FPM.addPass(createFunctionToLoopPassAdaptor( 1102 LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 1103 /*AllowSpeculation=*/true), 1104 /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); 1105 } 1106 1107 // Now that we've vectorized and unrolled loops, we may have more refined 1108 // alignment information, try to re-derive it here. 
1109 FPM.addPass(AlignmentFromAssumptionsPass()); 1110 1111 if (IsFullLTO) 1112 FPM.addPass(InstCombinePass()); 1113 } 1114 1115 ModulePassManager 1116 PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, 1117 bool LTOPreLink) { 1118 ModulePassManager MPM; 1119 1120 // Optimize globals now that the module is fully simplified. 1121 MPM.addPass(GlobalOptPass()); 1122 MPM.addPass(GlobalDCEPass()); 1123 1124 // Run partial inlining pass to partially inline functions that have 1125 // large bodies. 1126 if (RunPartialInlining) 1127 MPM.addPass(PartialInlinerPass()); 1128 1129 // Remove avail extern fns and globals definitions since we aren't compiling 1130 // an object file for later LTO. For LTO we want to preserve these so they 1131 // are eligible for inlining at link-time. Note if they are unreferenced they 1132 // will be removed by GlobalDCE later, so this only impacts referenced 1133 // available externally globals. Eventually they will be suppressed during 1134 // codegen, but eliminating here enables more opportunity for GlobalDCE as it 1135 // may make globals referenced by available external functions dead and saves 1136 // running remaining passes on the eliminated functions. These should be 1137 // preserved during prelinking for link-time inlining decisions. 1138 if (!LTOPreLink) 1139 MPM.addPass(EliminateAvailableExternallyPass()); 1140 1141 if (EnableOrderFileInstrumentation) 1142 MPM.addPass(InstrOrderFilePass()); 1143 1144 // Do RPO function attribute inference across the module to forward-propagate 1145 // attributes where applicable. 1146 // FIXME: Is this really an optimization rather than a canonicalization? 1147 MPM.addPass(ReversePostOrderFunctionAttrsPass()); 1148 1149 // Do a post inline PGO instrumentation and use pass. This is a context 1150 // sensitive PGO pass. We don't want to do this in LTOPreLink phrase as 1151 // cross-module inline has not been done yet. 
The context sensitive 1152 // instrumentation is after all the inlines are done. 1153 if (!LTOPreLink && PGOOpt) { 1154 if (PGOOpt->CSAction == PGOOptions::CSIRInstr) 1155 addPGOInstrPasses(MPM, Level, /* RunProfileGen */ true, 1156 /* IsCS */ true, PGOOpt->CSProfileGenFile, 1157 PGOOpt->ProfileRemappingFile); 1158 else if (PGOOpt->CSAction == PGOOptions::CSIRUse) 1159 addPGOInstrPasses(MPM, Level, /* RunProfileGen */ false, 1160 /* IsCS */ true, PGOOpt->ProfileFile, 1161 PGOOpt->ProfileRemappingFile); 1162 } 1163 1164 // Re-require GloblasAA here prior to function passes. This is particularly 1165 // useful as the above will have inlined, DCE'ed, and function-attr 1166 // propagated everything. We should at this point have a reasonably minimal 1167 // and richly annotated call graph. By computing aliasing and mod/ref 1168 // information for all local globals here, the late loop passes and notably 1169 // the vectorizer will be able to use them to help recognize vectorizable 1170 // memory operations. 1171 MPM.addPass(RequireAnalysisPass<GlobalsAA, Module>()); 1172 1173 FunctionPassManager OptimizePM; 1174 OptimizePM.addPass(Float2IntPass()); 1175 OptimizePM.addPass(LowerConstantIntrinsicsPass()); 1176 1177 if (EnableMatrix) { 1178 OptimizePM.addPass(LowerMatrixIntrinsicsPass()); 1179 OptimizePM.addPass(EarlyCSEPass()); 1180 } 1181 1182 // FIXME: We need to run some loop optimizations to re-rotate loops after 1183 // simplifycfg and others undo their rotation. 1184 1185 // Optimize the loop execution. These passes operate on entire loop nests 1186 // rather than on each loop in an inside-out manner, and so they are actually 1187 // function passes. 1188 1189 for (auto &C : VectorizerStartEPCallbacks) 1190 C(OptimizePM, Level); 1191 1192 LoopPassManager LPM; 1193 // First rotate loops that may have been un-rotated by prior passes. 1194 // Disable header duplication at -Oz. 
1195 LPM.addPass(LoopRotatePass(Level != OptimizationLevel::Oz, LTOPreLink)); 1196 // Some loops may have become dead by now. Try to delete them. 1197 // FIXME: see discussion in https://reviews.llvm.org/D112851, 1198 // this may need to be revisited once we run GVN before loop deletion 1199 // in the simplification pipeline. 1200 LPM.addPass(LoopDeletionPass()); 1201 OptimizePM.addPass(createFunctionToLoopPassAdaptor( 1202 std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false)); 1203 1204 // Distribute loops to allow partial vectorization. I.e. isolate dependences 1205 // into separate loop that would otherwise inhibit vectorization. This is 1206 // currently only performed for loops marked with the metadata 1207 // llvm.loop.distribute=true or when -enable-loop-distribute is specified. 1208 OptimizePM.addPass(LoopDistributePass()); 1209 1210 // Populates the VFABI attribute with the scalar-to-vector mappings 1211 // from the TargetLibraryInfo. 1212 OptimizePM.addPass(InjectTLIMappings()); 1213 1214 addVectorPasses(Level, OptimizePM, /* IsFullLTO */ false); 1215 1216 // LoopSink pass sinks instructions hoisted by LICM, which serves as a 1217 // canonicalization pass that enables other optimizations. As a result, 1218 // LoopSink pass needs to be a very late IR pass to avoid undoing LICM 1219 // result too early. 1220 OptimizePM.addPass(LoopSinkPass()); 1221 1222 // And finally clean up LCSSA form before generating code. 1223 OptimizePM.addPass(InstSimplifyPass()); 1224 1225 // This hoists/decomposes div/rem ops. It should run after other sink/hoist 1226 // passes to avoid re-sinking, but before SimplifyCFG because it can allow 1227 // flattening of blocks. 1228 OptimizePM.addPass(DivRemPairsPass()); 1229 1230 // LoopSink (and other loop passes since the last simplifyCFG) might have 1231 // resulted in single-entry-single-exit or empty blocks. Clean up the CFG. 
1232 OptimizePM.addPass( 1233 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 1234 1235 OptimizePM.addPass(CoroCleanupPass()); 1236 1237 // Add the core optimizing pipeline. 1238 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(OptimizePM), 1239 PTO.EagerlyInvalidateAnalyses)); 1240 1241 for (auto &C : OptimizerLastEPCallbacks) 1242 C(MPM, Level); 1243 1244 // Split out cold code. Splitting is done late to avoid hiding context from 1245 // other optimizations and inadvertently regressing performance. The tradeoff 1246 // is that this has a higher code size cost than splitting early. 1247 if (EnableHotColdSplit && !LTOPreLink) 1248 MPM.addPass(HotColdSplittingPass()); 1249 1250 // Search the code for similar regions of code. If enough similar regions can 1251 // be found where extracting the regions into their own function will decrease 1252 // the size of the program, we extract the regions, a deduplicate the 1253 // structurally similar regions. 1254 if (EnableIROutliner) 1255 MPM.addPass(IROutlinerPass()); 1256 1257 // Merge functions if requested. 1258 if (PTO.MergeFunctions) 1259 MPM.addPass(MergeFunctionsPass()); 1260 1261 if (PTO.CallGraphProfile) 1262 MPM.addPass(CGProfilePass()); 1263 1264 // Now we need to do some global optimization transforms. 1265 // FIXME: It would seem like these should come first in the optimization 1266 // pipeline and maybe be the bottom of the canonicalization pipeline? Weird 1267 // ordering here. 1268 MPM.addPass(GlobalDCEPass()); 1269 MPM.addPass(ConstantMergePass()); 1270 1271 // TODO: Relative look table converter pass caused an issue when full lto is 1272 // enabled. See https://reviews.llvm.org/D94355 for more details. 1273 // Until the issue fixed, disable this pass during pre-linking phase. 
1274 if (!LTOPreLink) 1275 MPM.addPass(RelLookupTableConverterPass()); 1276 1277 return MPM; 1278 } 1279 1280 ModulePassManager 1281 PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level, 1282 bool LTOPreLink) { 1283 assert(Level != OptimizationLevel::O0 && 1284 "Must request optimizations for the default pipeline!"); 1285 1286 ModulePassManager MPM; 1287 1288 // Convert @llvm.global.annotations to !annotation metadata. 1289 MPM.addPass(Annotation2MetadataPass()); 1290 1291 // Force any function attributes we want the rest of the pipeline to observe. 1292 MPM.addPass(ForceFunctionAttrsPass()); 1293 1294 // Apply module pipeline start EP callback. 1295 for (auto &C : PipelineStartEPCallbacks) 1296 C(MPM, Level); 1297 1298 if (PGOOpt && PGOOpt->DebugInfoForProfiling) 1299 MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass())); 1300 1301 // Add the core simplification pipeline. 1302 MPM.addPass(buildModuleSimplificationPipeline( 1303 Level, LTOPreLink ? ThinOrFullLTOPhase::FullLTOPreLink 1304 : ThinOrFullLTOPhase::None)); 1305 1306 // Now add the optimization pipeline. 1307 MPM.addPass(buildModuleOptimizationPipeline(Level, LTOPreLink)); 1308 1309 if (PGOOpt && PGOOpt->PseudoProbeForProfiling && 1310 PGOOpt->Action == PGOOptions::SampleUse) 1311 MPM.addPass(PseudoProbeUpdatePass()); 1312 1313 // Emit annotation remarks. 1314 addAnnotationRemarksPass(MPM); 1315 1316 if (LTOPreLink) 1317 addRequiredLTOPreLinkPasses(MPM); 1318 1319 return MPM; 1320 } 1321 1322 ModulePassManager 1323 PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) { 1324 assert(Level != OptimizationLevel::O0 && 1325 "Must request optimizations for the default pipeline!"); 1326 1327 ModulePassManager MPM; 1328 1329 // Convert @llvm.global.annotations to !annotation metadata. 1330 MPM.addPass(Annotation2MetadataPass()); 1331 1332 // Force any function attributes we want the rest of the pipeline to observe. 
1333 MPM.addPass(ForceFunctionAttrsPass()); 1334 1335 if (PGOOpt && PGOOpt->DebugInfoForProfiling) 1336 MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass())); 1337 1338 // Apply module pipeline start EP callback. 1339 for (auto &C : PipelineStartEPCallbacks) 1340 C(MPM, Level); 1341 1342 // If we are planning to perform ThinLTO later, we don't bloat the code with 1343 // unrolling/vectorization/... now. Just simplify the module as much as we 1344 // can. 1345 MPM.addPass(buildModuleSimplificationPipeline( 1346 Level, ThinOrFullLTOPhase::ThinLTOPreLink)); 1347 1348 // Run partial inlining pass to partially inline functions that have 1349 // large bodies. 1350 // FIXME: It isn't clear whether this is really the right place to run this 1351 // in ThinLTO. Because there is another canonicalization and simplification 1352 // phase that will run after the thin link, running this here ends up with 1353 // less information than will be available later and it may grow functions in 1354 // ways that aren't beneficial. 1355 if (RunPartialInlining) 1356 MPM.addPass(PartialInlinerPass()); 1357 1358 // Reduce the size of the IR as much as possible. 1359 MPM.addPass(GlobalOptPass()); 1360 1361 // Module simplification splits coroutines, but does not fully clean up 1362 // coroutine intrinsics. To ensure ThinLTO optimization passes don't trip up 1363 // on these, we schedule the cleanup here. 1364 MPM.addPass(createModuleToFunctionPassAdaptor(CoroCleanupPass())); 1365 1366 if (PGOOpt && PGOOpt->PseudoProbeForProfiling && 1367 PGOOpt->Action == PGOOptions::SampleUse) 1368 MPM.addPass(PseudoProbeUpdatePass()); 1369 1370 // Handle OptimizerLastEPCallbacks added by clang on PreLink. Actual 1371 // optimization is going to be done in PostLink stage, but clang can't 1372 // add callbacks there in case of in-process ThinLTO called by linker. 1373 for (auto &C : OptimizerLastEPCallbacks) 1374 C(MPM, Level); 1375 1376 // Emit annotation remarks. 
1377 addAnnotationRemarksPass(MPM); 1378 1379 addRequiredLTOPreLinkPasses(MPM); 1380 1381 return MPM; 1382 } 1383 1384 ModulePassManager PassBuilder::buildThinLTODefaultPipeline( 1385 OptimizationLevel Level, const ModuleSummaryIndex *ImportSummary) { 1386 ModulePassManager MPM; 1387 1388 // Convert @llvm.global.annotations to !annotation metadata. 1389 MPM.addPass(Annotation2MetadataPass()); 1390 1391 if (ImportSummary) { 1392 // These passes import type identifier resolutions for whole-program 1393 // devirtualization and CFI. They must run early because other passes may 1394 // disturb the specific instruction patterns that these passes look for, 1395 // creating dependencies on resolutions that may not appear in the summary. 1396 // 1397 // For example, GVN may transform the pattern assume(type.test) appearing in 1398 // two basic blocks into assume(phi(type.test, type.test)), which would 1399 // transform a dependency on a WPD resolution into a dependency on a type 1400 // identifier resolution for CFI. 1401 // 1402 // Also, WPD has access to more precise information than ICP and can 1403 // devirtualize more effectively, so it should operate on the IR first. 1404 // 1405 // The WPD and LowerTypeTest passes need to run at -O0 to lower type 1406 // metadata and intrinsics. 1407 MPM.addPass(WholeProgramDevirtPass(nullptr, ImportSummary)); 1408 MPM.addPass(LowerTypeTestsPass(nullptr, ImportSummary)); 1409 } 1410 1411 if (Level == OptimizationLevel::O0) { 1412 // Run a second time to clean up any type tests left behind by WPD for use 1413 // in ICP. 1414 MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); 1415 // Drop available_externally and unreferenced globals. This is necessary 1416 // with ThinLTO in order to avoid leaving undefined references to dead 1417 // globals in the object file. 
1418 MPM.addPass(EliminateAvailableExternallyPass()); 1419 MPM.addPass(GlobalDCEPass()); 1420 return MPM; 1421 } 1422 1423 // Force any function attributes we want the rest of the pipeline to observe. 1424 MPM.addPass(ForceFunctionAttrsPass()); 1425 1426 // Add the core simplification pipeline. 1427 MPM.addPass(buildModuleSimplificationPipeline( 1428 Level, ThinOrFullLTOPhase::ThinLTOPostLink)); 1429 1430 // Now add the optimization pipeline. 1431 MPM.addPass(buildModuleOptimizationPipeline(Level)); 1432 1433 // Emit annotation remarks. 1434 addAnnotationRemarksPass(MPM); 1435 1436 return MPM; 1437 } 1438 1439 ModulePassManager 1440 PassBuilder::buildLTOPreLinkDefaultPipeline(OptimizationLevel Level) { 1441 assert(Level != OptimizationLevel::O0 && 1442 "Must request optimizations for the default pipeline!"); 1443 // FIXME: We should use a customized pre-link pipeline! 1444 return buildPerModuleDefaultPipeline(Level, 1445 /* LTOPreLink */ true); 1446 } 1447 1448 ModulePassManager 1449 PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, 1450 ModuleSummaryIndex *ExportSummary) { 1451 ModulePassManager MPM; 1452 1453 // Convert @llvm.global.annotations to !annotation metadata. 1454 MPM.addPass(Annotation2MetadataPass()); 1455 1456 // Create a function that performs CFI checks for cross-DSO calls with targets 1457 // in the current module. 1458 MPM.addPass(CrossDSOCFIPass()); 1459 1460 if (Level == OptimizationLevel::O0) { 1461 // The WPD and LowerTypeTest passes need to run at -O0 to lower type 1462 // metadata and intrinsics. 1463 MPM.addPass(WholeProgramDevirtPass(ExportSummary, nullptr)); 1464 MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr)); 1465 // Run a second time to clean up any type tests left behind by WPD for use 1466 // in ICP. 1467 MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); 1468 1469 // Emit annotation remarks. 
1470 addAnnotationRemarksPass(MPM); 1471 1472 return MPM; 1473 } 1474 1475 if (PGOOpt && PGOOpt->Action == PGOOptions::SampleUse) { 1476 // Load sample profile before running the LTO optimization pipeline. 1477 MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile, 1478 PGOOpt->ProfileRemappingFile, 1479 ThinOrFullLTOPhase::FullLTOPostLink)); 1480 // Cache ProfileSummaryAnalysis once to avoid the potential need to insert 1481 // RequireAnalysisPass for PSI before subsequent non-module passes. 1482 MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); 1483 } 1484 1485 // Try to run OpenMP optimizations, quick no-op if no OpenMP metadata present. 1486 MPM.addPass(OpenMPOptPass()); 1487 1488 // Remove unused virtual tables to improve the quality of code generated by 1489 // whole-program devirtualization and bitset lowering. 1490 MPM.addPass(GlobalDCEPass()); 1491 1492 // Force any function attributes we want the rest of the pipeline to observe. 1493 MPM.addPass(ForceFunctionAttrsPass()); 1494 1495 // Do basic inference of function attributes from known properties of system 1496 // libraries and other oracles. 1497 MPM.addPass(InferFunctionAttrsPass()); 1498 1499 if (Level.getSpeedupLevel() > 1) { 1500 FunctionPassManager EarlyFPM; 1501 EarlyFPM.addPass(CallSiteSplittingPass()); 1502 MPM.addPass(createModuleToFunctionPassAdaptor( 1503 std::move(EarlyFPM), PTO.EagerlyInvalidateAnalyses)); 1504 1505 // Indirect call promotion. This should promote all the targets that are 1506 // left by the earlier promotion pass that promotes intra-module targets. 1507 // This two-step promotion is to save the compile time. For LTO, it should 1508 // produce the same result as if we only do promotion here. 
1509 MPM.addPass(PGOIndirectCallPromotion( 1510 true /* InLTO */, PGOOpt && PGOOpt->Action == PGOOptions::SampleUse)); 1511 1512 if (EnableFunctionSpecialization && Level == OptimizationLevel::O3) 1513 MPM.addPass(FunctionSpecializationPass()); 1514 // Propagate constants at call sites into the functions they call. This 1515 // opens opportunities for globalopt (and inlining) by substituting function 1516 // pointers passed as arguments to direct uses of functions. 1517 MPM.addPass(IPSCCPPass()); 1518 1519 // Attach metadata to indirect call sites indicating the set of functions 1520 // they may target at run-time. This should follow IPSCCP. 1521 MPM.addPass(CalledValuePropagationPass()); 1522 } 1523 1524 // Now deduce any function attributes based in the current code. 1525 MPM.addPass( 1526 createModuleToPostOrderCGSCCPassAdaptor(PostOrderFunctionAttrsPass())); 1527 1528 // Do RPO function attribute inference across the module to forward-propagate 1529 // attributes where applicable. 1530 // FIXME: Is this really an optimization rather than a canonicalization? 1531 MPM.addPass(ReversePostOrderFunctionAttrsPass()); 1532 1533 // Use in-range annotations on GEP indices to split globals where beneficial. 1534 MPM.addPass(GlobalSplitPass()); 1535 1536 // Run whole program optimization of virtual call when the list of callees 1537 // is fixed. 1538 MPM.addPass(WholeProgramDevirtPass(ExportSummary, nullptr)); 1539 1540 // Stop here at -O1. 1541 if (Level == OptimizationLevel::O1) { 1542 // The LowerTypeTestsPass needs to run to lower type metadata and the 1543 // type.test intrinsics. The pass does nothing if CFI is disabled. 1544 MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr)); 1545 // Run a second time to clean up any type tests left behind by WPD for use 1546 // in ICP (which is performed earlier than this in the regular LTO 1547 // pipeline). 1548 MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); 1549 1550 // Emit annotation remarks. 
1551 addAnnotationRemarksPass(MPM); 1552 1553 return MPM; 1554 } 1555 1556 // Optimize globals to try and fold them into constants. 1557 MPM.addPass(GlobalOptPass()); 1558 1559 // Promote any localized globals to SSA registers. 1560 MPM.addPass(createModuleToFunctionPassAdaptor(PromotePass())); 1561 1562 // Linking modules together can lead to duplicate global constant, only 1563 // keep one copy of each constant. 1564 MPM.addPass(ConstantMergePass()); 1565 1566 // Remove unused arguments from functions. 1567 MPM.addPass(DeadArgumentEliminationPass()); 1568 1569 // Reduce the code after globalopt and ipsccp. Both can open up significant 1570 // simplification opportunities, and both can propagate functions through 1571 // function pointers. When this happens, we often have to resolve varargs 1572 // calls, etc, so let instcombine do this. 1573 FunctionPassManager PeepholeFPM; 1574 PeepholeFPM.addPass(InstCombinePass()); 1575 if (Level == OptimizationLevel::O3) 1576 PeepholeFPM.addPass(AggressiveInstCombinePass()); 1577 invokePeepholeEPCallbacks(PeepholeFPM, Level); 1578 1579 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(PeepholeFPM), 1580 PTO.EagerlyInvalidateAnalyses)); 1581 1582 // Note: historically, the PruneEH pass was run first to deduce nounwind and 1583 // generally clean up exception handling overhead. It isn't clear this is 1584 // valuable as the inliner doesn't currently care whether it is inlining an 1585 // invoke or a call. 1586 // Run the inliner now. 1587 MPM.addPass(ModuleInlinerWrapperPass(getInlineParamsFromOptLevel(Level))); 1588 1589 // Optimize globals again after we ran the inliner. 1590 MPM.addPass(GlobalOptPass()); 1591 1592 // Garbage collect dead functions. 1593 MPM.addPass(GlobalDCEPass()); 1594 1595 // If we didn't decide to inline a function, check to see if we can 1596 // transform it to pass arguments by value instead of by reference. 
1597 MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(ArgumentPromotionPass())); 1598 1599 FunctionPassManager FPM; 1600 // The IPO Passes may leave cruft around. Clean up after them. 1601 FPM.addPass(InstCombinePass()); 1602 invokePeepholeEPCallbacks(FPM, Level); 1603 1604 FPM.addPass(JumpThreadingPass(/*InsertFreezeWhenUnfoldingSelect*/ true)); 1605 1606 // Do a post inline PGO instrumentation and use pass. This is a context 1607 // sensitive PGO pass. 1608 if (PGOOpt) { 1609 if (PGOOpt->CSAction == PGOOptions::CSIRInstr) 1610 addPGOInstrPasses(MPM, Level, /* RunProfileGen */ true, 1611 /* IsCS */ true, PGOOpt->CSProfileGenFile, 1612 PGOOpt->ProfileRemappingFile); 1613 else if (PGOOpt->CSAction == PGOOptions::CSIRUse) 1614 addPGOInstrPasses(MPM, Level, /* RunProfileGen */ false, 1615 /* IsCS */ true, PGOOpt->ProfileFile, 1616 PGOOpt->ProfileRemappingFile); 1617 } 1618 1619 // Break up allocas 1620 FPM.addPass(SROAPass()); 1621 1622 // LTO provides additional opportunities for tailcall elimination due to 1623 // link-time inlining, and visibility of nocapture attribute. 1624 FPM.addPass(TailCallElimPass()); 1625 1626 // Run a few AA driver optimizations here and now to cleanup the code. 1627 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM), 1628 PTO.EagerlyInvalidateAnalyses)); 1629 1630 MPM.addPass( 1631 createModuleToPostOrderCGSCCPassAdaptor(PostOrderFunctionAttrsPass())); 1632 1633 // Require the GlobalsAA analysis for the module so we can query it within 1634 // MainFPM. 1635 MPM.addPass(RequireAnalysisPass<GlobalsAA, Module>()); 1636 // Invalidate AAManager so it can be recreated and pick up the newly available 1637 // GlobalsAA. 
1638 MPM.addPass( 1639 createModuleToFunctionPassAdaptor(InvalidateAnalysisPass<AAManager>())); 1640 1641 FunctionPassManager MainFPM; 1642 MainFPM.addPass(createFunctionToLoopPassAdaptor( 1643 LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 1644 /*AllowSpeculation=*/true), 1645 /*USeMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); 1646 1647 if (RunNewGVN) 1648 MainFPM.addPass(NewGVNPass()); 1649 else 1650 MainFPM.addPass(GVNPass()); 1651 1652 // Remove dead memcpy()'s. 1653 MainFPM.addPass(MemCpyOptPass()); 1654 1655 // Nuke dead stores. 1656 MainFPM.addPass(DSEPass()); 1657 MainFPM.addPass(MergedLoadStoreMotionPass()); 1658 1659 1660 if (EnableConstraintElimination) 1661 MainFPM.addPass(ConstraintEliminationPass()); 1662 1663 LoopPassManager LPM; 1664 if (EnableLoopFlatten && Level.getSpeedupLevel() > 1) 1665 LPM.addPass(LoopFlattenPass()); 1666 LPM.addPass(IndVarSimplifyPass()); 1667 LPM.addPass(LoopDeletionPass()); 1668 // FIXME: Add loop interchange. 1669 1670 // Unroll small loops and perform peeling. 1671 LPM.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(), 1672 /* OnlyWhenForced= */ !PTO.LoopUnrolling, 1673 PTO.ForgetAllSCEVInLoopUnroll)); 1674 // The loop passes in LPM (LoopFullUnrollPass) do not preserve MemorySSA. 1675 // *All* loop passes must preserve it, in order to be able to use it. 1676 MainFPM.addPass(createFunctionToLoopPassAdaptor( 1677 std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/true)); 1678 1679 MainFPM.addPass(LoopDistributePass()); 1680 1681 addVectorPasses(Level, MainFPM, /* IsFullLTO */ true); 1682 1683 // Run the OpenMPOpt CGSCC pass again late. 
1684 MPM.addPass( 1685 createModuleToPostOrderCGSCCPassAdaptor(OpenMPOptCGSCCPass())); 1686 1687 invokePeepholeEPCallbacks(MainFPM, Level); 1688 MainFPM.addPass(JumpThreadingPass(/*InsertFreezeWhenUnfoldingSelect*/ true)); 1689 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(MainFPM), 1690 PTO.EagerlyInvalidateAnalyses)); 1691 1692 // Lower type metadata and the type.test intrinsic. This pass supports 1693 // clang's control flow integrity mechanisms (-fsanitize=cfi*) and needs 1694 // to be run at link time if CFI is enabled. This pass does nothing if 1695 // CFI is disabled. 1696 MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr)); 1697 // Run a second time to clean up any type tests left behind by WPD for use 1698 // in ICP (which is performed earlier than this in the regular LTO pipeline). 1699 MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); 1700 1701 // Enable splitting late in the FullLTO post-link pipeline. This is done in 1702 // the same stage in the old pass manager (\ref addLateLTOOptimizationPasses). 1703 if (EnableHotColdSplit) 1704 MPM.addPass(HotColdSplittingPass()); 1705 1706 // Add late LTO optimization passes. 1707 // Delete basic blocks, which optimization passes may have killed. 1708 MPM.addPass(createModuleToFunctionPassAdaptor(SimplifyCFGPass( 1709 SimplifyCFGOptions().convertSwitchRangeToICmp(true).hoistCommonInsts( 1710 true)))); 1711 1712 // Drop bodies of available eternally objects to improve GlobalDCE. 1713 MPM.addPass(EliminateAvailableExternallyPass()); 1714 1715 // Now that we have optimized the program, discard unreachable functions. 1716 MPM.addPass(GlobalDCEPass()); 1717 1718 if (PTO.MergeFunctions) 1719 MPM.addPass(MergeFunctionsPass()); 1720 1721 // Emit annotation remarks. 
1722 addAnnotationRemarksPass(MPM); 1723 1724 return MPM; 1725 } 1726 1727 ModulePassManager PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level, 1728 bool LTOPreLink) { 1729 assert(Level == OptimizationLevel::O0 && 1730 "buildO0DefaultPipeline should only be used with O0"); 1731 1732 ModulePassManager MPM; 1733 1734 // Perform pseudo probe instrumentation in O0 mode. This is for the 1735 // consistency between different build modes. For example, a LTO build can be 1736 // mixed with an O0 prelink and an O2 postlink. Loading a sample profile in 1737 // the postlink will require pseudo probe instrumentation in the prelink. 1738 if (PGOOpt && PGOOpt->PseudoProbeForProfiling) 1739 MPM.addPass(SampleProfileProbePass(TM)); 1740 1741 if (PGOOpt && (PGOOpt->Action == PGOOptions::IRInstr || 1742 PGOOpt->Action == PGOOptions::IRUse)) 1743 addPGOInstrPassesForO0( 1744 MPM, 1745 /* RunProfileGen */ (PGOOpt->Action == PGOOptions::IRInstr), 1746 /* IsCS */ false, PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile); 1747 1748 for (auto &C : PipelineStartEPCallbacks) 1749 C(MPM, Level); 1750 1751 if (PGOOpt && PGOOpt->DebugInfoForProfiling) 1752 MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass())); 1753 1754 for (auto &C : PipelineEarlySimplificationEPCallbacks) 1755 C(MPM, Level); 1756 1757 // Build a minimal pipeline based on the semantics required by LLVM, 1758 // which is just that always inlining occurs. Further, disable generating 1759 // lifetime intrinsics to avoid enabling further optimizations during 1760 // code generation. 
1761 MPM.addPass(AlwaysInlinerPass( 1762 /*InsertLifetimeIntrinsics=*/false)); 1763 1764 if (PTO.MergeFunctions) 1765 MPM.addPass(MergeFunctionsPass()); 1766 1767 if (EnableMatrix) 1768 MPM.addPass( 1769 createModuleToFunctionPassAdaptor(LowerMatrixIntrinsicsPass(true))); 1770 1771 if (!CGSCCOptimizerLateEPCallbacks.empty()) { 1772 CGSCCPassManager CGPM; 1773 for (auto &C : CGSCCOptimizerLateEPCallbacks) 1774 C(CGPM, Level); 1775 if (!CGPM.isEmpty()) 1776 MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM))); 1777 } 1778 if (!LateLoopOptimizationsEPCallbacks.empty()) { 1779 LoopPassManager LPM; 1780 for (auto &C : LateLoopOptimizationsEPCallbacks) 1781 C(LPM, Level); 1782 if (!LPM.isEmpty()) { 1783 MPM.addPass(createModuleToFunctionPassAdaptor( 1784 createFunctionToLoopPassAdaptor(std::move(LPM)))); 1785 } 1786 } 1787 if (!LoopOptimizerEndEPCallbacks.empty()) { 1788 LoopPassManager LPM; 1789 for (auto &C : LoopOptimizerEndEPCallbacks) 1790 C(LPM, Level); 1791 if (!LPM.isEmpty()) { 1792 MPM.addPass(createModuleToFunctionPassAdaptor( 1793 createFunctionToLoopPassAdaptor(std::move(LPM)))); 1794 } 1795 } 1796 if (!ScalarOptimizerLateEPCallbacks.empty()) { 1797 FunctionPassManager FPM; 1798 for (auto &C : ScalarOptimizerLateEPCallbacks) 1799 C(FPM, Level); 1800 if (!FPM.isEmpty()) 1801 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); 1802 } 1803 if (!VectorizerStartEPCallbacks.empty()) { 1804 FunctionPassManager FPM; 1805 for (auto &C : VectorizerStartEPCallbacks) 1806 C(FPM, Level); 1807 if (!FPM.isEmpty()) 1808 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); 1809 } 1810 1811 MPM.addPass(createModuleToFunctionPassAdaptor(CoroEarlyPass())); 1812 CGSCCPassManager CGPM; 1813 CGPM.addPass(CoroSplitPass()); 1814 MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM))); 1815 MPM.addPass(createModuleToFunctionPassAdaptor(CoroCleanupPass())); 1816 1817 for (auto &C : OptimizerLastEPCallbacks) 1818 C(MPM, Level); 1819 
1820 if (LTOPreLink) 1821 addRequiredLTOPreLinkPasses(MPM); 1822 1823 MPM.addPass(createModuleToFunctionPassAdaptor(AnnotationRemarksPass())); 1824 1825 return MPM; 1826 } 1827 1828 AAManager PassBuilder::buildDefaultAAPipeline() { 1829 AAManager AA; 1830 1831 // The order in which these are registered determines their priority when 1832 // being queried. 1833 1834 // First we register the basic alias analysis that provides the majority of 1835 // per-function local AA logic. This is a stateless, on-demand local set of 1836 // AA techniques. 1837 AA.registerFunctionAnalysis<BasicAA>(); 1838 1839 // Next we query fast, specialized alias analyses that wrap IR-embedded 1840 // information about aliasing. 1841 AA.registerFunctionAnalysis<ScopedNoAliasAA>(); 1842 AA.registerFunctionAnalysis<TypeBasedAA>(); 1843 1844 // Add support for querying global aliasing information when available. 1845 // Because the `AAManager` is a function analysis and `GlobalsAA` is a module 1846 // analysis, all that the `AAManager` can do is query for any *cached* 1847 // results from `GlobalsAA` through a readonly proxy. 1848 AA.registerModuleAnalysis<GlobalsAA>(); 1849 1850 // Add target-specific alias analyses. 1851 if (TM) 1852 TM->registerDefaultAliasAnalyses(AA); 1853 1854 return AA; 1855 } 1856