1 //===- Construction of pass pipelines -------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// 10 /// This file provides the implementation of the PassBuilder based on our 11 /// static pass registry as well as related functionality. It also provides 12 /// helpers to aid in analyzing, debugging, and testing passes and pass 13 /// pipelines. 14 /// 15 //===----------------------------------------------------------------------===// 16 17 #include "llvm/Analysis/AliasAnalysis.h" 18 #include "llvm/Analysis/BasicAliasAnalysis.h" 19 #include "llvm/Analysis/CGSCCPassManager.h" 20 #include "llvm/Analysis/GlobalsModRef.h" 21 #include "llvm/Analysis/InlineAdvisor.h" 22 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 23 #include "llvm/Analysis/ProfileSummaryInfo.h" 24 #include "llvm/Analysis/ScopedNoAliasAA.h" 25 #include "llvm/Analysis/TypeBasedAliasAnalysis.h" 26 #include "llvm/IR/PassManager.h" 27 #include "llvm/Passes/OptimizationLevel.h" 28 #include "llvm/Passes/PassBuilder.h" 29 #include "llvm/Support/CommandLine.h" 30 #include "llvm/Support/ErrorHandling.h" 31 #include "llvm/Support/PGOOptions.h" 32 #include "llvm/Target/TargetMachine.h" 33 #include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h" 34 #include "llvm/Transforms/Coroutines/CoroCleanup.h" 35 #include "llvm/Transforms/Coroutines/CoroConditionalWrapper.h" 36 #include "llvm/Transforms/Coroutines/CoroEarly.h" 37 #include "llvm/Transforms/Coroutines/CoroElide.h" 38 #include "llvm/Transforms/Coroutines/CoroSplit.h" 39 #include "llvm/Transforms/IPO/AlwaysInliner.h" 40 #include "llvm/Transforms/IPO/Annotation2Metadata.h" 41 #include "llvm/Transforms/IPO/ArgumentPromotion.h" 42 #include "llvm/Transforms/IPO/Attributor.h" 43 #include "llvm/Transforms/IPO/CalledValuePropagation.h" 44 #include "llvm/Transforms/IPO/ConstantMerge.h" 45 #include "llvm/Transforms/IPO/CrossDSOCFI.h" 46 #include "llvm/Transforms/IPO/DeadArgumentElimination.h" 47 #include "llvm/Transforms/IPO/ElimAvailExtern.h" 48 #include "llvm/Transforms/IPO/ForceFunctionAttrs.h" 49 #include "llvm/Transforms/IPO/FunctionAttrs.h" 50 #include "llvm/Transforms/IPO/GlobalDCE.h" 51 #include "llvm/Transforms/IPO/GlobalOpt.h" 52 #include "llvm/Transforms/IPO/GlobalSplit.h" 53 #include "llvm/Transforms/IPO/HotColdSplitting.h" 54 #include "llvm/Transforms/IPO/IROutliner.h" 55 #include "llvm/Transforms/IPO/InferFunctionAttrs.h" 56 #include "llvm/Transforms/IPO/Inliner.h" 57 #include "llvm/Transforms/IPO/LowerTypeTests.h" 58 #include "llvm/Transforms/IPO/MergeFunctions.h" 59 #include "llvm/Transforms/IPO/ModuleInliner.h" 60 #include "llvm/Transforms/IPO/OpenMPOpt.h" 61 #include "llvm/Transforms/IPO/PartialInlining.h" 62 #include "llvm/Transforms/IPO/SCCP.h" 63 #include "llvm/Transforms/IPO/SampleProfile.h" 64 #include "llvm/Transforms/IPO/SampleProfileProbe.h" 65 #include "llvm/Transforms/IPO/SyntheticCountsPropagation.h" 66 #include "llvm/Transforms/IPO/WholeProgramDevirt.h" 67 #include "llvm/Transforms/InstCombine/InstCombine.h" 68 #include "llvm/Transforms/Instrumentation/CGProfile.h" 69 #include "llvm/Transforms/Instrumentation/ControlHeightReduction.h" 70 #include "llvm/Transforms/Instrumentation/InstrOrderFile.h" 71 #include "llvm/Transforms/Instrumentation/InstrProfiling.h" 72 #include "llvm/Transforms/Instrumentation/MemProfiler.h" 73 #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" 74 #include "llvm/Transforms/Scalar/ADCE.h" 75 #include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h" 76 #include "llvm/Transforms/Scalar/AnnotationRemarks.h" 77 #include "llvm/Transforms/Scalar/BDCE.h" 78 #include "llvm/Transforms/Scalar/CallSiteSplitting.h" 79 #include "llvm/Transforms/Scalar/ConstraintElimination.h" 80 #include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h" 81 #include "llvm/Transforms/Scalar/DFAJumpThreading.h" 82 #include "llvm/Transforms/Scalar/DeadStoreElimination.h" 83 #include "llvm/Transforms/Scalar/DivRemPairs.h" 84 #include "llvm/Transforms/Scalar/EarlyCSE.h" 85 #include "llvm/Transforms/Scalar/Float2Int.h" 86 #include "llvm/Transforms/Scalar/GVN.h" 87 #include "llvm/Transforms/Scalar/IndVarSimplify.h" 88 #include "llvm/Transforms/Scalar/InstSimplifyPass.h" 89 #include "llvm/Transforms/Scalar/JumpThreading.h" 90 #include "llvm/Transforms/Scalar/LICM.h" 91 #include "llvm/Transforms/Scalar/LoopDeletion.h" 92 #include "llvm/Transforms/Scalar/LoopDistribute.h" 93 #include "llvm/Transforms/Scalar/LoopFlatten.h" 94 #include "llvm/Transforms/Scalar/LoopIdiomRecognize.h" 95 #include "llvm/Transforms/Scalar/LoopInstSimplify.h" 96 #include "llvm/Transforms/Scalar/LoopInterchange.h" 97 #include "llvm/Transforms/Scalar/LoopLoadElimination.h" 98 #include "llvm/Transforms/Scalar/LoopPassManager.h" 99 #include "llvm/Transforms/Scalar/LoopRotation.h" 100 #include "llvm/Transforms/Scalar/LoopSimplifyCFG.h" 101 #include "llvm/Transforms/Scalar/LoopSink.h" 102 #include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h" 103 #include "llvm/Transforms/Scalar/LoopUnrollPass.h" 104 #include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h" 105 #include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h" 106 #include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h" 107 #include "llvm/Transforms/Scalar/MemCpyOptimizer.h" 108 #include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h" 109 #include "llvm/Transforms/Scalar/NewGVN.h" 110 #include "llvm/Transforms/Scalar/Reassociate.h" 111 #include "llvm/Transforms/Scalar/SCCP.h" 112 #include "llvm/Transforms/Scalar/SROA.h" 113 #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h" 114 #include "llvm/Transforms/Scalar/SimplifyCFG.h" 115 #include "llvm/Transforms/Scalar/SpeculativeExecution.h" 116 #include "llvm/Transforms/Scalar/TailRecursionElimination.h" 117 #include "llvm/Transforms/Scalar/WarnMissedTransforms.h" 118 #include "llvm/Transforms/Utils/AddDiscriminators.h" 119 #include "llvm/Transforms/Utils/AssumeBundleBuilder.h" 120 #include "llvm/Transforms/Utils/CanonicalizeAliases.h" 121 #include "llvm/Transforms/Utils/InjectTLIMappings.h" 122 #include "llvm/Transforms/Utils/LibCallsShrinkWrap.h" 123 #include "llvm/Transforms/Utils/Mem2Reg.h" 124 #include "llvm/Transforms/Utils/NameAnonGlobals.h" 125 #include "llvm/Transforms/Utils/RelLookupTableConverter.h" 126 #include "llvm/Transforms/Utils/SimplifyCFGOptions.h" 127 #include "llvm/Transforms/Vectorize/LoopVectorize.h" 128 #include "llvm/Transforms/Vectorize/SLPVectorizer.h" 129 #include "llvm/Transforms/Vectorize/VectorCombine.h" 130 131 using namespace llvm; 132 133 static cl::opt<InliningAdvisorMode> UseInlineAdvisor( 134 "enable-ml-inliner", cl::init(InliningAdvisorMode::Default), cl::Hidden, 135 cl::desc("Enable ML policy for inliner. Currently trained for -Oz only"), 136 cl::values(clEnumValN(InliningAdvisorMode::Default, "default", 137 "Heuristics-based inliner version."), 138 clEnumValN(InliningAdvisorMode::Development, "development", 139 "Use development mode (runtime-loadable model)."), 140 clEnumValN(InliningAdvisorMode::Release, "release", 141 "Use release mode (AOT-compiled model)."))); 142 143 static cl::opt<bool> EnableSyntheticCounts( 144 "enable-npm-synthetic-counts", cl::Hidden, 145 cl::desc("Run synthetic function entry count generation " 146 "pass")); 147 148 /// Flag to enable inline deferral during PGO. 149 static cl::opt<bool> 150 EnablePGOInlineDeferral("enable-npm-pgo-inline-deferral", cl::init(true), 151 cl::Hidden, 152 cl::desc("Enable inline deferral during PGO")); 153 154 static cl::opt<bool> EnableMemProfiler("enable-mem-prof", cl::Hidden, 155 cl::desc("Enable memory profiler")); 156 157 static cl::opt<bool> EnableModuleInliner("enable-module-inliner", 158 cl::init(false), cl::Hidden, 159 cl::desc("Enable module inliner")); 160 161 static cl::opt<bool> PerformMandatoryInliningsFirst( 162 "mandatory-inlining-first", cl::init(true), cl::Hidden, 163 cl::desc("Perform mandatory inlinings module-wide, before performing " 164 "inlining.")); 165 166 static cl::opt<bool> EnableO3NonTrivialUnswitching( 167 "enable-npm-O3-nontrivial-unswitch", cl::init(true), cl::Hidden, 168 cl::desc("Enable non-trivial loop unswitching for -O3")); 169 170 static cl::opt<bool> EnableEagerlyInvalidateAnalyses( 171 "eagerly-invalidate-analyses", cl::init(true), cl::Hidden, 172 cl::desc("Eagerly invalidate more analyses in default pipelines")); 173 174 static cl::opt<bool> EnableNoRerunSimplificationPipeline( 175 "enable-no-rerun-simplification-pipeline", cl::init(true), cl::Hidden, 176 cl::desc( 177 "Prevent running the simplification pipeline on a function more " 178 "than once in the case that SCC mutations cause a function to be " 179 "visited multiple times as long as the function has not been changed")); 180 181 static cl::opt<bool> EnableMergeFunctions( 182 "enable-merge-functions", cl::init(false), cl::Hidden, 183 cl::desc("Enable function merging as part of the optimization pipeline")); 184 185 PipelineTuningOptions::PipelineTuningOptions() { 186 LoopInterleaving = true; 187 LoopVectorization = true; 188 SLPVectorization = false; 189 LoopUnrolling = true; 190 ForgetAllSCEVInLoopUnroll = ForgetSCEVInLoopUnroll; 191 LicmMssaOptCap = SetLicmMssaOptCap; 192 LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap; 193 CallGraphProfile = true; 194 MergeFunctions = EnableMergeFunctions; 195 EagerlyInvalidateAnalyses = EnableEagerlyInvalidateAnalyses; 196 } 197 198 namespace llvm { 199 200 extern cl::opt<unsigned> MaxDevirtIterations; 201 extern cl::opt<bool> EnableConstraintElimination; 202 extern cl::opt<bool> EnableFunctionSpecialization; 203 extern cl::opt<bool> EnableGVNHoist; 204 extern cl::opt<bool> EnableGVNSink; 205 extern cl::opt<bool> EnableHotColdSplit; 206 extern cl::opt<bool> EnableIROutliner; 207 extern cl::opt<bool> EnableOrderFileInstrumentation; 208 extern cl::opt<bool> EnableCHR; 209 extern cl::opt<bool> EnableLoopInterchange; 210 extern cl::opt<bool> EnableUnrollAndJam; 211 extern cl::opt<bool> EnableLoopFlatten; 212 extern cl::opt<bool> EnableDFAJumpThreading; 213 extern cl::opt<bool> RunNewGVN; 214 extern cl::opt<bool> RunPartialInlining; 215 extern cl::opt<bool> ExtraVectorizerPasses; 216 217 extern cl::opt<bool> FlattenedProfileUsed; 218 219 extern cl::opt<AttributorRunOption> AttributorRun; 220 extern cl::opt<bool> EnableKnowledgeRetention; 221 222 extern cl::opt<bool> EnableMatrix; 223 224 extern cl::opt<bool> DisablePreInliner; 225 extern cl::opt<int> PreInlineThreshold; 226 } // namespace llvm 227 228 void PassBuilder::invokePeepholeEPCallbacks(FunctionPassManager &FPM, 229 OptimizationLevel Level) { 230 for (auto &C : PeepholeEPCallbacks) 231 C(FPM, Level); 232 } 233 234 // Helper to add AnnotationRemarksPass. 235 static void addAnnotationRemarksPass(ModulePassManager &MPM) { 236 MPM.addPass(createModuleToFunctionPassAdaptor(AnnotationRemarksPass())); 237 } 238 239 // Helper to check if the current compilation phase is preparing for LTO 240 static bool isLTOPreLink(ThinOrFullLTOPhase Phase) { 241 return Phase == ThinOrFullLTOPhase::ThinLTOPreLink || 242 Phase == ThinOrFullLTOPhase::FullLTOPreLink; 243 } 244 245 // TODO: Investigate the cost/benefit of tail call elimination on debugging. 246 FunctionPassManager 247 PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level, 248 ThinOrFullLTOPhase Phase) { 249 250 FunctionPassManager FPM; 251 252 // Form SSA out of local memory accesses after breaking apart aggregates into 253 // scalars. 254 FPM.addPass(SROAPass()); 255 256 // Catch trivial redundancies 257 FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */)); 258 259 // Hoisting of scalars and load expressions. 260 FPM.addPass( 261 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 262 FPM.addPass(InstCombinePass()); 263 264 FPM.addPass(LibCallsShrinkWrapPass()); 265 266 invokePeepholeEPCallbacks(FPM, Level); 267 268 FPM.addPass( 269 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 270 271 // Form canonically associated expression trees, and simplify the trees using 272 // basic mathematical properties. For example, this will form (nearly) 273 // minimal multiplication trees. 274 FPM.addPass(ReassociatePass()); 275 276 // Add the primary loop simplification pipeline. 277 // FIXME: Currently this is split into two loop pass pipelines because we run 278 // some function passes in between them. These can and should be removed 279 // and/or replaced by scheduling the loop pass equivalents in the correct 280 // positions. But those equivalent passes aren't powerful enough yet. 281 // Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still 282 // used. We have `LoopSimplifyCFGPass` which isn't yet powerful enough yet to 283 // fully replace `SimplifyCFGPass`, and the closest to the other we have is 284 // `LoopInstSimplify`. 285 LoopPassManager LPM1, LPM2; 286 287 // Simplify the loop body. We do this initially to clean up after other loop 288 // passes run, either when iterating on a loop or on inner loops with 289 // implications on the outer loop. 290 LPM1.addPass(LoopInstSimplifyPass()); 291 LPM1.addPass(LoopSimplifyCFGPass()); 292 293 // Try to remove as much code from the loop header as possible, 294 // to reduce amount of IR that will have to be duplicated. However, 295 // do not perform speculative hoisting the first time as LICM 296 // will destroy metadata that may not need to be destroyed if run 297 // after loop rotation. 298 // TODO: Investigate promotion cap for O1. 299 LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 300 /*AllowSpeculation=*/false)); 301 302 LPM1.addPass(LoopRotatePass(/* Disable header duplication */ true, 303 isLTOPreLink(Phase))); 304 // TODO: Investigate promotion cap for O1. 305 LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 306 /*AllowSpeculation=*/true)); 307 LPM1.addPass(SimpleLoopUnswitchPass()); 308 if (EnableLoopFlatten) 309 LPM1.addPass(LoopFlattenPass()); 310 311 LPM2.addPass(LoopIdiomRecognizePass()); 312 LPM2.addPass(IndVarSimplifyPass()); 313 314 for (auto &C : LateLoopOptimizationsEPCallbacks) 315 C(LPM2, Level); 316 317 LPM2.addPass(LoopDeletionPass()); 318 319 if (EnableLoopInterchange) 320 LPM2.addPass(LoopInterchangePass()); 321 322 // Do not enable unrolling in PreLinkThinLTO phase during sample PGO 323 // because it changes IR to makes profile annotation in back compile 324 // inaccurate. The normal unroller doesn't pay attention to forced full unroll 325 // attributes so we need to make sure and allow the full unroll pass to pay 326 // attention to it. 327 if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt || 328 PGOOpt->Action != PGOOptions::SampleUse) 329 LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(), 330 /* OnlyWhenForced= */ !PTO.LoopUnrolling, 331 PTO.ForgetAllSCEVInLoopUnroll)); 332 333 for (auto &C : LoopOptimizerEndEPCallbacks) 334 C(LPM2, Level); 335 336 // We provide the opt remark emitter pass for LICM to use. We only need to do 337 // this once as it is immutable. 338 FPM.addPass( 339 RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>()); 340 FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1), 341 /*UseMemorySSA=*/true, 342 /*UseBlockFrequencyInfo=*/true)); 343 FPM.addPass( 344 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 345 FPM.addPass(InstCombinePass()); 346 // The loop passes in LPM2 (LoopFullUnrollPass) do not preserve MemorySSA. 347 // *All* loop passes must preserve it, in order to be able to use it. 348 FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2), 349 /*UseMemorySSA=*/false, 350 /*UseBlockFrequencyInfo=*/false)); 351 352 // Delete small array after loop unroll. 353 FPM.addPass(SROAPass()); 354 355 // Specially optimize memory movement as it doesn't look like dataflow in SSA. 356 FPM.addPass(MemCpyOptPass()); 357 358 // Sparse conditional constant propagation. 359 // FIXME: It isn't clear why we do this *after* loop passes rather than 360 // before... 361 FPM.addPass(SCCPPass()); 362 363 // Delete dead bit computations (instcombine runs after to fold away the dead 364 // computations, and then ADCE will run later to exploit any new DCE 365 // opportunities that creates). 366 FPM.addPass(BDCEPass()); 367 368 // Run instcombine after redundancy and dead bit elimination to exploit 369 // opportunities opened up by them. 370 FPM.addPass(InstCombinePass()); 371 invokePeepholeEPCallbacks(FPM, Level); 372 373 FPM.addPass(CoroElidePass()); 374 375 for (auto &C : ScalarOptimizerLateEPCallbacks) 376 C(FPM, Level); 377 378 // Finally, do an expensive DCE pass to catch all the dead code exposed by 379 // the simplifications and basic cleanup after all the simplifications. 380 // TODO: Investigate if this is too expensive. 381 FPM.addPass(ADCEPass()); 382 FPM.addPass( 383 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 384 FPM.addPass(InstCombinePass()); 385 invokePeepholeEPCallbacks(FPM, Level); 386 387 return FPM; 388 } 389 390 FunctionPassManager 391 PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, 392 ThinOrFullLTOPhase Phase) { 393 assert(Level != OptimizationLevel::O0 && "Must request optimizations!"); 394 395 // The O1 pipeline has a separate pipeline creation function to simplify 396 // construction readability. 397 if (Level.getSpeedupLevel() == 1) 398 return buildO1FunctionSimplificationPipeline(Level, Phase); 399 400 FunctionPassManager FPM; 401 402 // Form SSA out of local memory accesses after breaking apart aggregates into 403 // scalars. 404 FPM.addPass(SROAPass()); 405 406 // Catch trivial redundancies 407 FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */)); 408 if (EnableKnowledgeRetention) 409 FPM.addPass(AssumeSimplifyPass()); 410 411 // Hoisting of scalars and load expressions. 412 if (EnableGVNHoist) 413 FPM.addPass(GVNHoistPass()); 414 415 // Global value numbering based sinking. 416 if (EnableGVNSink) { 417 FPM.addPass(GVNSinkPass()); 418 FPM.addPass( 419 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 420 } 421 422 if (EnableConstraintElimination) 423 FPM.addPass(ConstraintEliminationPass()); 424 425 // Speculative execution if the target has divergent branches; otherwise nop. 426 FPM.addPass(SpeculativeExecutionPass(/* OnlyIfDivergentTarget =*/true)); 427 428 // Optimize based on known information about branches, and cleanup afterward. 429 FPM.addPass(JumpThreadingPass()); 430 FPM.addPass(CorrelatedValuePropagationPass()); 431 432 FPM.addPass( 433 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 434 FPM.addPass(InstCombinePass()); 435 if (Level == OptimizationLevel::O3) 436 FPM.addPass(AggressiveInstCombinePass()); 437 438 if (!Level.isOptimizingForSize()) 439 FPM.addPass(LibCallsShrinkWrapPass()); 440 441 invokePeepholeEPCallbacks(FPM, Level); 442 443 // For PGO use pipeline, try to optimize memory intrinsics such as memcpy 444 // using the size value profile. Don't perform this when optimizing for size. 445 if (PGOOpt && PGOOpt->Action == PGOOptions::IRUse && 446 !Level.isOptimizingForSize()) 447 FPM.addPass(PGOMemOPSizeOpt()); 448 449 FPM.addPass(TailCallElimPass()); 450 FPM.addPass( 451 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 452 453 // Form canonically associated expression trees, and simplify the trees using 454 // basic mathematical properties. For example, this will form (nearly) 455 // minimal multiplication trees. 456 FPM.addPass(ReassociatePass()); 457 458 // Add the primary loop simplification pipeline. 459 // FIXME: Currently this is split into two loop pass pipelines because we run 460 // some function passes in between them. These can and should be removed 461 // and/or replaced by scheduling the loop pass equivalents in the correct 462 // positions. But those equivalent passes aren't powerful enough yet. 463 // Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still 464 // used. We have `LoopSimplifyCFGPass` which isn't yet powerful enough yet to 465 // fully replace `SimplifyCFGPass`, and the closest to the other we have is 466 // `LoopInstSimplify`. 467 LoopPassManager LPM1, LPM2; 468 469 // Simplify the loop body. We do this initially to clean up after other loop 470 // passes run, either when iterating on a loop or on inner loops with 471 // implications on the outer loop. 472 LPM1.addPass(LoopInstSimplifyPass()); 473 LPM1.addPass(LoopSimplifyCFGPass()); 474 475 // Try to remove as much code from the loop header as possible, 476 // to reduce amount of IR that will have to be duplicated. However, 477 // do not perform speculative hoisting the first time as LICM 478 // will destroy metadata that may not need to be destroyed if run 479 // after loop rotation. 480 // TODO: Investigate promotion cap for O1. 481 LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 482 /*AllowSpeculation=*/false)); 483 484 // Disable header duplication in loop rotation at -Oz. 485 LPM1.addPass( 486 LoopRotatePass(Level != OptimizationLevel::Oz, isLTOPreLink(Phase))); 487 // TODO: Investigate promotion cap for O1. 488 LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 489 /*AllowSpeculation=*/true)); 490 LPM1.addPass( 491 SimpleLoopUnswitchPass(/* NonTrivial */ Level == OptimizationLevel::O3 && 492 EnableO3NonTrivialUnswitching)); 493 if (EnableLoopFlatten) 494 LPM1.addPass(LoopFlattenPass()); 495 496 LPM2.addPass(LoopIdiomRecognizePass()); 497 LPM2.addPass(IndVarSimplifyPass()); 498 499 for (auto &C : LateLoopOptimizationsEPCallbacks) 500 C(LPM2, Level); 501 502 LPM2.addPass(LoopDeletionPass()); 503 504 if (EnableLoopInterchange) 505 LPM2.addPass(LoopInterchangePass()); 506 507 // Do not enable unrolling in PreLinkThinLTO phase during sample PGO 508 // because it changes IR to makes profile annotation in back compile 509 // inaccurate. The normal unroller doesn't pay attention to forced full unroll 510 // attributes so we need to make sure and allow the full unroll pass to pay 511 // attention to it. 512 if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink || !PGOOpt || 513 PGOOpt->Action != PGOOptions::SampleUse) 514 LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(), 515 /* OnlyWhenForced= */ !PTO.LoopUnrolling, 516 PTO.ForgetAllSCEVInLoopUnroll)); 517 518 for (auto &C : LoopOptimizerEndEPCallbacks) 519 C(LPM2, Level); 520 521 // We provide the opt remark emitter pass for LICM to use. We only need to do 522 // this once as it is immutable. 523 FPM.addPass( 524 RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>()); 525 FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1), 526 /*UseMemorySSA=*/true, 527 /*UseBlockFrequencyInfo=*/true)); 528 FPM.addPass( 529 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 530 FPM.addPass(InstCombinePass()); 531 // The loop passes in LPM2 (LoopIdiomRecognizePass, IndVarSimplifyPass, 532 // LoopDeletionPass and LoopFullUnrollPass) do not preserve MemorySSA. 533 // *All* loop passes must preserve it, in order to be able to use it. 534 FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2), 535 /*UseMemorySSA=*/false, 536 /*UseBlockFrequencyInfo=*/false)); 537 538 // Delete small array after loop unroll. 539 FPM.addPass(SROAPass()); 540 541 // The matrix extension can introduce large vector operations early, which can 542 // benefit from running vector-combine early on. 543 if (EnableMatrix) 544 FPM.addPass(VectorCombinePass(/*ScalarizationOnly=*/true)); 545 546 // Eliminate redundancies. 547 FPM.addPass(MergedLoadStoreMotionPass()); 548 if (RunNewGVN) 549 FPM.addPass(NewGVNPass()); 550 else 551 FPM.addPass(GVNPass()); 552 553 // Sparse conditional constant propagation. 554 // FIXME: It isn't clear why we do this *after* loop passes rather than 555 // before... 556 FPM.addPass(SCCPPass()); 557 558 // Delete dead bit computations (instcombine runs after to fold away the dead 559 // computations, and then ADCE will run later to exploit any new DCE 560 // opportunities that creates). 561 FPM.addPass(BDCEPass()); 562 563 // Run instcombine after redundancy and dead bit elimination to exploit 564 // opportunities opened up by them. 565 FPM.addPass(InstCombinePass()); 566 invokePeepholeEPCallbacks(FPM, Level); 567 568 // Re-consider control flow based optimizations after redundancy elimination, 569 // redo DCE, etc. 570 if (EnableDFAJumpThreading && Level.getSizeLevel() == 0) 571 FPM.addPass(DFAJumpThreadingPass()); 572 573 FPM.addPass(JumpThreadingPass()); 574 FPM.addPass(CorrelatedValuePropagationPass()); 575 576 // Finally, do an expensive DCE pass to catch all the dead code exposed by 577 // the simplifications and basic cleanup after all the simplifications. 578 // TODO: Investigate if this is too expensive. 579 FPM.addPass(ADCEPass()); 580 581 // Specially optimize memory movement as it doesn't look like dataflow in SSA. 582 FPM.addPass(MemCpyOptPass()); 583 584 FPM.addPass(DSEPass()); 585 FPM.addPass(createFunctionToLoopPassAdaptor( 586 LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 587 /*AllowSpeculation=*/true), 588 /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); 589 590 FPM.addPass(CoroElidePass()); 591 592 for (auto &C : ScalarOptimizerLateEPCallbacks) 593 C(FPM, Level); 594 595 FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions() 596 .convertSwitchRangeToICmp(true) 597 .hoistCommonInsts(true) 598 .sinkCommonInsts(true))); 599 FPM.addPass(InstCombinePass()); 600 invokePeepholeEPCallbacks(FPM, Level); 601 602 if (EnableCHR && Level == OptimizationLevel::O3 && PGOOpt && 603 (PGOOpt->Action == PGOOptions::IRUse || 604 PGOOpt->Action == PGOOptions::SampleUse)) 605 FPM.addPass(ControlHeightReductionPass()); 606 607 return FPM; 608 } 609 610 void PassBuilder::addRequiredLTOPreLinkPasses(ModulePassManager &MPM) { 611 MPM.addPass(CanonicalizeAliasesPass()); 612 MPM.addPass(NameAnonGlobalPass()); 613 } 614 615 void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, 616 OptimizationLevel Level, bool RunProfileGen, 617 bool IsCS, std::string ProfileFile, 618 std::string ProfileRemappingFile, 619 ThinOrFullLTOPhase LTOPhase) { 620 assert(Level != OptimizationLevel::O0 && "Not expecting O0 here!"); 621 if (!IsCS && !DisablePreInliner) { 622 InlineParams IP; 623 624 IP.DefaultThreshold = PreInlineThreshold; 625 626 // FIXME: The hint threshold has the same value used by the regular inliner 627 // when not optimzing for size. This should probably be lowered after 628 // performance testing. 629 // FIXME: this comment is cargo culted from the old pass manager, revisit). 630 IP.HintThreshold = Level.isOptimizingForSize() ? PreInlineThreshold : 325; 631 ModuleInlinerWrapperPass MIWP( 632 IP, /* MandatoryFirst */ true, 633 InlineContext{LTOPhase, InlinePass::EarlyInliner}); 634 CGSCCPassManager &CGPipeline = MIWP.getPM(); 635 636 FunctionPassManager FPM; 637 FPM.addPass(SROAPass()); 638 FPM.addPass(EarlyCSEPass()); // Catch trivial redundancies. 639 FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp( 640 true))); // Merge & remove basic blocks. 641 FPM.addPass(InstCombinePass()); // Combine silly sequences. 642 invokePeepholeEPCallbacks(FPM, Level); 643 644 CGPipeline.addPass(createCGSCCToFunctionPassAdaptor( 645 std::move(FPM), PTO.EagerlyInvalidateAnalyses)); 646 647 MPM.addPass(std::move(MIWP)); 648 649 // Delete anything that is now dead to make sure that we don't instrument 650 // dead code. Instrumentation can end up keeping dead code around and 651 // dramatically increase code size. 652 MPM.addPass(GlobalDCEPass()); 653 } 654 655 if (!RunProfileGen) { 656 assert(!ProfileFile.empty() && "Profile use expecting a profile file!"); 657 MPM.addPass(PGOInstrumentationUse(ProfileFile, ProfileRemappingFile, IsCS)); 658 // Cache ProfileSummaryAnalysis once to avoid the potential need to insert 659 // RequireAnalysisPass for PSI before subsequent non-module passes. 660 MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); 661 return; 662 } 663 664 // Perform PGO instrumentation. 665 MPM.addPass(PGOInstrumentationGen(IsCS)); 666 667 // Disable header duplication in loop rotation at -Oz. 668 MPM.addPass(createModuleToFunctionPassAdaptor( 669 createFunctionToLoopPassAdaptor( 670 LoopRotatePass(Level != OptimizationLevel::Oz), 671 /*UseMemorySSA=*/false, 672 /*UseBlockFrequencyInfo=*/false), 673 PTO.EagerlyInvalidateAnalyses)); 674 675 // Add the profile lowering pass. 676 InstrProfOptions Options; 677 if (!ProfileFile.empty()) 678 Options.InstrProfileOutput = ProfileFile; 679 // Do counter promotion at Level greater than O0. 680 Options.DoCounterPromotion = true; 681 Options.UseBFIInPromotion = IsCS; 682 MPM.addPass(InstrProfiling(Options, IsCS)); 683 } 684 685 void PassBuilder::addPGOInstrPassesForO0(ModulePassManager &MPM, 686 bool RunProfileGen, bool IsCS, 687 std::string ProfileFile, 688 std::string ProfileRemappingFile) { 689 if (!RunProfileGen) { 690 assert(!ProfileFile.empty() && "Profile use expecting a profile file!"); 691 MPM.addPass(PGOInstrumentationUse(ProfileFile, ProfileRemappingFile, IsCS)); 692 // Cache ProfileSummaryAnalysis once to avoid the potential need to insert 693 // RequireAnalysisPass for PSI before subsequent non-module passes. 694 MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); 695 return; 696 } 697 698 // Perform PGO instrumentation. 699 MPM.addPass(PGOInstrumentationGen(IsCS)); 700 // Add the profile lowering pass. 701 InstrProfOptions Options; 702 if (!ProfileFile.empty()) 703 Options.InstrProfileOutput = ProfileFile; 704 // Do not do counter promotion at O0. 705 Options.DoCounterPromotion = false; 706 Options.UseBFIInPromotion = IsCS; 707 MPM.addPass(InstrProfiling(Options, IsCS)); 708 } 709 710 static InlineParams getInlineParamsFromOptLevel(OptimizationLevel Level) { 711 return getInlineParams(Level.getSpeedupLevel(), Level.getSizeLevel()); 712 } 713 714 ModuleInlinerWrapperPass 715 PassBuilder::buildInlinerPipeline(OptimizationLevel Level, 716 ThinOrFullLTOPhase Phase) { 717 InlineParams IP = getInlineParamsFromOptLevel(Level); 718 // For PreLinkThinLTO + SamplePGO, set hot-caller threshold to 0 to 719 // disable hot callsite inline (as much as possible [1]) because it makes 720 // profile annotation in the backend inaccurate. 721 // 722 // [1] Note the cost of a function could be below zero due to erased 723 // prologue / epilogue. 724 if (Phase == ThinOrFullLTOPhase::ThinLTOPreLink && PGOOpt && 725 PGOOpt->Action == PGOOptions::SampleUse) 726 IP.HotCallSiteThreshold = 0; 727 728 if (PGOOpt) 729 IP.EnableDeferral = EnablePGOInlineDeferral; 730 731 ModuleInlinerWrapperPass MIWP( 732 IP, PerformMandatoryInliningsFirst, 733 InlineContext{Phase, InlinePass::CGSCCInliner}, 734 UseInlineAdvisor, MaxDevirtIterations); 735 736 // Require the GlobalsAA analysis for the module so we can query it within 737 // the CGSCC pipeline. 738 MIWP.addModulePass(RequireAnalysisPass<GlobalsAA, Module>()); 739 // Invalidate AAManager so it can be recreated and pick up the newly available 740 // GlobalsAA. 741 MIWP.addModulePass( 742 createModuleToFunctionPassAdaptor(InvalidateAnalysisPass<AAManager>())); 743 744 // Require the ProfileSummaryAnalysis for the module so we can query it within 745 // the inliner pass. 746 MIWP.addModulePass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); 747 748 // Now begin the main postorder CGSCC pipeline. 749 // FIXME: The current CGSCC pipeline has its origins in the legacy pass 750 // manager and trying to emulate its precise behavior. Much of this doesn't 751 // make a lot of sense and we should revisit the core CGSCC structure. 752 CGSCCPassManager &MainCGPipeline = MIWP.getPM(); 753 754 // Note: historically, the PruneEH pass was run first to deduce nounwind and 755 // generally clean up exception handling overhead. It isn't clear this is 756 // valuable as the inliner doesn't currently care whether it is inlining an 757 // invoke or a call. 758 759 if (AttributorRun & AttributorRunOption::CGSCC) 760 MainCGPipeline.addPass(AttributorCGSCCPass()); 761 762 // Now deduce any function attributes based in the current code. 763 MainCGPipeline.addPass(PostOrderFunctionAttrsPass()); 764 765 // When at O3 add argument promotion to the pass pipeline. 766 // FIXME: It isn't at all clear why this should be limited to O3. 767 if (Level == OptimizationLevel::O3) 768 MainCGPipeline.addPass(ArgumentPromotionPass()); 769 770 // Try to perform OpenMP specific optimizations. This is a (quick!) no-op if 771 // there are no OpenMP runtime calls present in the module. 772 if (Level == OptimizationLevel::O2 || Level == OptimizationLevel::O3) 773 MainCGPipeline.addPass(OpenMPOptCGSCCPass()); 774 775 for (auto &C : CGSCCOptimizerLateEPCallbacks) 776 C(MainCGPipeline, Level); 777 778 // Lastly, add the core function simplification pipeline nested inside the 779 // CGSCC walk. 780 MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor( 781 buildFunctionSimplificationPipeline(Level, Phase), 782 PTO.EagerlyInvalidateAnalyses, EnableNoRerunSimplificationPipeline)); 783 784 MainCGPipeline.addPass(CoroSplitPass(Level != OptimizationLevel::O0)); 785 786 if (EnableNoRerunSimplificationPipeline) 787 MIWP.addLateModulePass(createModuleToFunctionPassAdaptor( 788 InvalidateAnalysisPass<ShouldNotRunFunctionPassesAnalysis>())); 789 790 return MIWP; 791 } 792 793 ModulePassManager 794 PassBuilder::buildModuleInlinerPipeline(OptimizationLevel Level, 795 ThinOrFullLTOPhase Phase) { 796 ModulePassManager MPM; 797 798 InlineParams IP = getInlineParamsFromOptLevel(Level); 799 // For PreLinkThinLTO + SamplePGO, set hot-caller threshold to 0 to 800 // disable hot callsite inline (as much as possible [1]) because it makes 801 // profile annotation in the backend inaccurate. 802 // 803 // [1] Note the cost of a function could be below zero due to erased 804 // prologue / epilogue. 805 if (Phase == ThinOrFullLTOPhase::ThinLTOPreLink && PGOOpt && 806 PGOOpt->Action == PGOOptions::SampleUse) 807 IP.HotCallSiteThreshold = 0; 808 809 if (PGOOpt) 810 IP.EnableDeferral = EnablePGOInlineDeferral; 811 812 // The inline deferral logic is used to avoid losing some 813 // inlining chance in future. It is helpful in SCC inliner, in which 814 // inlining is processed in bottom-up order. 815 // While in module inliner, the inlining order is a priority-based order 816 // by default. The inline deferral is unnecessary there. So we disable the 817 // inline deferral logic in module inliner. 818 IP.EnableDeferral = false; 819 820 MPM.addPass(ModuleInlinerPass(IP, UseInlineAdvisor, Phase)); 821 822 MPM.addPass(createModuleToFunctionPassAdaptor( 823 buildFunctionSimplificationPipeline(Level, Phase), 824 PTO.EagerlyInvalidateAnalyses)); 825 826 MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor( 827 CoroSplitPass(Level != OptimizationLevel::O0))); 828 829 return MPM; 830 } 831 832 ModulePassManager 833 PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, 834 ThinOrFullLTOPhase Phase) { 835 ModulePassManager MPM; 836 837 // Place pseudo probe instrumentation as the first pass of the pipeline to 838 // minimize the impact of optimization changes. 839 if (PGOOpt && PGOOpt->PseudoProbeForProfiling && 840 Phase != ThinOrFullLTOPhase::ThinLTOPostLink) 841 MPM.addPass(SampleProfileProbePass(TM)); 842 843 bool HasSampleProfile = PGOOpt && (PGOOpt->Action == PGOOptions::SampleUse); 844 845 // In ThinLTO mode, when flattened profile is used, all the available 846 // profile information will be annotated in PreLink phase so there is 847 // no need to load the profile again in PostLink. 848 bool LoadSampleProfile = 849 HasSampleProfile && 850 !(FlattenedProfileUsed && Phase == ThinOrFullLTOPhase::ThinLTOPostLink); 851 852 // During the ThinLTO backend phase we perform early indirect call promotion 853 // here, before globalopt. Otherwise imported available_externally functions 854 // look unreferenced and are removed. If we are going to load the sample 855 // profile then defer until later. 856 // TODO: See if we can move later and consolidate with the location where 857 // we perform ICP when we are loading a sample profile. 858 // TODO: We pass HasSampleProfile (whether there was a sample profile file 859 // passed to the compile) to the SamplePGO flag of ICP. This is used to 860 // determine whether the new direct calls are annotated with prof metadata. 861 // Ideally this should be determined from whether the IR is annotated with 862 // sample profile, and not whether the a sample profile was provided on the 863 // command line. E.g. for flattened profiles where we will not be reloading 864 // the sample profile in the ThinLTO backend, we ideally shouldn't have to 865 // provide the sample profile file. 866 if (Phase == ThinOrFullLTOPhase::ThinLTOPostLink && !LoadSampleProfile) 867 MPM.addPass(PGOIndirectCallPromotion(true /* InLTO */, HasSampleProfile)); 868 869 // Do basic inference of function attributes from known properties of system 870 // libraries and other oracles. 871 MPM.addPass(InferFunctionAttrsPass()); 872 MPM.addPass(CoroEarlyPass()); 873 874 // Create an early function pass manager to cleanup the output of the 875 // frontend. 876 FunctionPassManager EarlyFPM; 877 // Lower llvm.expect to metadata before attempting transforms. 878 // Compare/branch metadata may alter the behavior of passes like SimplifyCFG. 879 EarlyFPM.addPass(LowerExpectIntrinsicPass()); 880 EarlyFPM.addPass(SimplifyCFGPass()); 881 EarlyFPM.addPass(SROAPass()); 882 EarlyFPM.addPass(EarlyCSEPass()); 883 if (Level == OptimizationLevel::O3) 884 EarlyFPM.addPass(CallSiteSplittingPass()); 885 886 // In SamplePGO ThinLTO backend, we need instcombine before profile annotation 887 // to convert bitcast to direct calls so that they can be inlined during the 888 // profile annotation prepration step. 889 // More details about SamplePGO design can be found in: 890 // https://research.google.com/pubs/pub45290.html 891 // FIXME: revisit how SampleProfileLoad/Inliner/ICP is structured. 892 if (LoadSampleProfile) 893 EarlyFPM.addPass(InstCombinePass()); 894 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(EarlyFPM), 895 PTO.EagerlyInvalidateAnalyses)); 896 897 if (LoadSampleProfile) { 898 // Annotate sample profile right after early FPM to ensure freshness of 899 // the debug info. 900 MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile, 901 PGOOpt->ProfileRemappingFile, Phase)); 902 // Cache ProfileSummaryAnalysis once to avoid the potential need to insert 903 // RequireAnalysisPass for PSI before subsequent non-module passes. 904 MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); 905 // Do not invoke ICP in the LTOPrelink phase as it makes it hard 906 // for the profile annotation to be accurate in the LTO backend. 907 if (Phase != ThinOrFullLTOPhase::ThinLTOPreLink && 908 Phase != ThinOrFullLTOPhase::FullLTOPreLink) 909 // We perform early indirect call promotion here, before globalopt. 910 // This is important for the ThinLTO backend phase because otherwise 911 // imported available_externally functions look unreferenced and are 912 // removed. 913 MPM.addPass( 914 PGOIndirectCallPromotion(true /* IsInLTO */, true /* SamplePGO */)); 915 } 916 917 // Try to perform OpenMP specific optimizations on the module. This is a 918 // (quick!) no-op if there are no OpenMP runtime calls present in the module. 919 if (Level != OptimizationLevel::O0) 920 MPM.addPass(OpenMPOptPass()); 921 922 if (AttributorRun & AttributorRunOption::MODULE) 923 MPM.addPass(AttributorPass()); 924 925 // Lower type metadata and the type.test intrinsic in the ThinLTO 926 // post link pipeline after ICP. This is to enable usage of the type 927 // tests in ICP sequences. 928 if (Phase == ThinOrFullLTOPhase::ThinLTOPostLink) 929 MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); 930 931 for (auto &C : PipelineEarlySimplificationEPCallbacks) 932 C(MPM, Level); 933 934 // Specialize functions with IPSCCP. 935 if (EnableFunctionSpecialization && Level == OptimizationLevel::O3) 936 MPM.addPass(FunctionSpecializationPass()); 937 938 // Interprocedural constant propagation now that basic cleanup has occurred 939 // and prior to optimizing globals. 940 // FIXME: This position in the pipeline hasn't been carefully considered in 941 // years, it should be re-analyzed. 942 MPM.addPass(IPSCCPPass()); 943 944 // Attach metadata to indirect call sites indicating the set of functions 945 // they may target at run-time. This should follow IPSCCP. 946 MPM.addPass(CalledValuePropagationPass()); 947 948 // Optimize globals to try and fold them into constants. 949 MPM.addPass(GlobalOptPass()); 950 951 // Promote any localized globals to SSA registers. 952 // FIXME: Should this instead by a run of SROA? 953 // FIXME: We should probably run instcombine and simplifycfg afterward to 954 // delete control flows that are dead once globals have been folded to 955 // constants. 956 MPM.addPass(createModuleToFunctionPassAdaptor(PromotePass())); 957 958 // Remove any dead arguments exposed by cleanups and constant folding 959 // globals. 960 MPM.addPass(DeadArgumentEliminationPass()); 961 962 // Create a small function pass pipeline to cleanup after all the global 963 // optimizations. 964 FunctionPassManager GlobalCleanupPM; 965 GlobalCleanupPM.addPass(InstCombinePass()); 966 invokePeepholeEPCallbacks(GlobalCleanupPM, Level); 967 968 GlobalCleanupPM.addPass( 969 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 970 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(GlobalCleanupPM), 971 PTO.EagerlyInvalidateAnalyses)); 972 973 // Add all the requested passes for instrumentation PGO, if requested. 974 if (PGOOpt && Phase != ThinOrFullLTOPhase::ThinLTOPostLink && 975 (PGOOpt->Action == PGOOptions::IRInstr || 976 PGOOpt->Action == PGOOptions::IRUse)) { 977 addPGOInstrPasses(MPM, Level, 978 /* RunProfileGen */ PGOOpt->Action == PGOOptions::IRInstr, 979 /* IsCS */ false, PGOOpt->ProfileFile, 980 PGOOpt->ProfileRemappingFile, Phase); 981 MPM.addPass(PGOIndirectCallPromotion(false, false)); 982 } 983 if (PGOOpt && Phase != ThinOrFullLTOPhase::ThinLTOPostLink && 984 PGOOpt->CSAction == PGOOptions::CSIRInstr) 985 MPM.addPass(PGOInstrumentationGenCreateVar(PGOOpt->CSProfileGenFile)); 986 987 // Synthesize function entry counts for non-PGO compilation. 988 if (EnableSyntheticCounts && !PGOOpt) 989 MPM.addPass(SyntheticCountsPropagation()); 990 991 if (EnableModuleInliner) 992 MPM.addPass(buildModuleInlinerPipeline(Level, Phase)); 993 else 994 MPM.addPass(buildInlinerPipeline(Level, Phase)); 995 996 MPM.addPass(CoroCleanupPass()); 997 998 if (EnableMemProfiler && Phase != ThinOrFullLTOPhase::ThinLTOPreLink) { 999 MPM.addPass(createModuleToFunctionPassAdaptor(MemProfilerPass())); 1000 MPM.addPass(ModuleMemProfilerPass()); 1001 } 1002 1003 return MPM; 1004 } 1005 1006 /// TODO: Should LTO cause any differences to this set of passes? 1007 void PassBuilder::addVectorPasses(OptimizationLevel Level, 1008 FunctionPassManager &FPM, bool IsFullLTO) { 1009 FPM.addPass(LoopVectorizePass( 1010 LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization))); 1011 1012 if (IsFullLTO) { 1013 // The vectorizer may have significantly shortened a loop body; unroll 1014 // again. Unroll small loops to hide loop backedge latency and saturate any 1015 // parallel execution resources of an out-of-order processor. We also then 1016 // need to clean up redundancies and loop invariant code. 1017 // FIXME: It would be really good to use a loop-integrated instruction 1018 // combiner for cleanup here so that the unrolling and LICM can be pipelined 1019 // across the loop nests. 1020 // We do UnrollAndJam in a separate LPM to ensure it happens before unroll 1021 if (EnableUnrollAndJam && PTO.LoopUnrolling) 1022 FPM.addPass(createFunctionToLoopPassAdaptor( 1023 LoopUnrollAndJamPass(Level.getSpeedupLevel()))); 1024 FPM.addPass(LoopUnrollPass(LoopUnrollOptions( 1025 Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling, 1026 PTO.ForgetAllSCEVInLoopUnroll))); 1027 FPM.addPass(WarnMissedTransformationsPass()); 1028 } 1029 1030 if (!IsFullLTO) { 1031 // Eliminate loads by forwarding stores from the previous iteration to loads 1032 // of the current iteration. 1033 FPM.addPass(LoopLoadEliminationPass()); 1034 } 1035 // Cleanup after the loop optimization passes. 1036 FPM.addPass(InstCombinePass()); 1037 1038 if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) { 1039 ExtraVectorPassManager ExtraPasses; 1040 // At higher optimization levels, try to clean up any runtime overlap and 1041 // alignment checks inserted by the vectorizer. We want to track correlated 1042 // runtime checks for two inner loops in the same outer loop, fold any 1043 // common computations, hoist loop-invariant aspects out of any outer loop, 1044 // and unswitch the runtime checks if possible. Once hoisted, we may have 1045 // dead (or speculatable) control flows or more combining opportunities. 1046 ExtraPasses.addPass(EarlyCSEPass()); 1047 ExtraPasses.addPass(CorrelatedValuePropagationPass()); 1048 ExtraPasses.addPass(InstCombinePass()); 1049 LoopPassManager LPM; 1050 LPM.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 1051 /*AllowSpeculation=*/true)); 1052 LPM.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level == 1053 OptimizationLevel::O3)); 1054 ExtraPasses.addPass( 1055 RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>()); 1056 ExtraPasses.addPass( 1057 createFunctionToLoopPassAdaptor(std::move(LPM), /*UseMemorySSA=*/true, 1058 /*UseBlockFrequencyInfo=*/true)); 1059 ExtraPasses.addPass( 1060 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 1061 ExtraPasses.addPass(InstCombinePass()); 1062 FPM.addPass(std::move(ExtraPasses)); 1063 } 1064 1065 // Now that we've formed fast to execute loop structures, we do further 1066 // optimizations. These are run afterward as they might block doing complex 1067 // analyses and transforms such as what are needed for loop vectorization. 1068 1069 // Cleanup after loop vectorization, etc. Simplification passes like CVP and 1070 // GVN, loop transforms, and others have already run, so it's now better to 1071 // convert to more optimized IR using more aggressive simplify CFG options. 1072 // The extra sinking transform can create larger basic blocks, so do this 1073 // before SLP vectorization. 1074 FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions() 1075 .forwardSwitchCondToPhi(true) 1076 .convertSwitchRangeToICmp(true) 1077 .convertSwitchToLookupTable(true) 1078 .needCanonicalLoops(false) 1079 .hoistCommonInsts(true) 1080 .sinkCommonInsts(true))); 1081 1082 if (IsFullLTO) { 1083 FPM.addPass(SCCPPass()); 1084 FPM.addPass(InstCombinePass()); 1085 FPM.addPass(BDCEPass()); 1086 } 1087 1088 // Optimize parallel scalar instruction chains into SIMD instructions. 1089 if (PTO.SLPVectorization) { 1090 FPM.addPass(SLPVectorizerPass()); 1091 if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) { 1092 FPM.addPass(EarlyCSEPass()); 1093 } 1094 } 1095 // Enhance/cleanup vector code. 1096 FPM.addPass(VectorCombinePass()); 1097 1098 if (!IsFullLTO) { 1099 FPM.addPass(InstCombinePass()); 1100 // Unroll small loops to hide loop backedge latency and saturate any 1101 // parallel execution resources of an out-of-order processor. We also then 1102 // need to clean up redundancies and loop invariant code. 1103 // FIXME: It would be really good to use a loop-integrated instruction 1104 // combiner for cleanup here so that the unrolling and LICM can be pipelined 1105 // across the loop nests. 1106 // We do UnrollAndJam in a separate LPM to ensure it happens before unroll 1107 if (EnableUnrollAndJam && PTO.LoopUnrolling) { 1108 FPM.addPass(createFunctionToLoopPassAdaptor( 1109 LoopUnrollAndJamPass(Level.getSpeedupLevel()))); 1110 } 1111 FPM.addPass(LoopUnrollPass(LoopUnrollOptions( 1112 Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling, 1113 PTO.ForgetAllSCEVInLoopUnroll))); 1114 FPM.addPass(WarnMissedTransformationsPass()); 1115 FPM.addPass(InstCombinePass()); 1116 FPM.addPass( 1117 RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>()); 1118 FPM.addPass(createFunctionToLoopPassAdaptor( 1119 LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 1120 /*AllowSpeculation=*/true), 1121 /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); 1122 } 1123 1124 // Now that we've vectorized and unrolled loops, we may have more refined 1125 // alignment information, try to re-derive it here. 1126 FPM.addPass(AlignmentFromAssumptionsPass()); 1127 1128 if (IsFullLTO) 1129 FPM.addPass(InstCombinePass()); 1130 } 1131 1132 ModulePassManager 1133 PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, 1134 ThinOrFullLTOPhase LTOPhase) { 1135 const bool LTOPreLink = (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink || 1136 LTOPhase == ThinOrFullLTOPhase::FullLTOPreLink); 1137 ModulePassManager MPM; 1138 1139 // Optimize globals now that the module is fully simplified. 1140 MPM.addPass(GlobalOptPass()); 1141 MPM.addPass(GlobalDCEPass()); 1142 1143 // Run partial inlining pass to partially inline functions that have 1144 // large bodies. 1145 if (RunPartialInlining) 1146 MPM.addPass(PartialInlinerPass()); 1147 1148 // Remove avail extern fns and globals definitions since we aren't compiling 1149 // an object file for later LTO. For LTO we want to preserve these so they 1150 // are eligible for inlining at link-time. Note if they are unreferenced they 1151 // will be removed by GlobalDCE later, so this only impacts referenced 1152 // available externally globals. Eventually they will be suppressed during 1153 // codegen, but eliminating here enables more opportunity for GlobalDCE as it 1154 // may make globals referenced by available external functions dead and saves 1155 // running remaining passes on the eliminated functions. These should be 1156 // preserved during prelinking for link-time inlining decisions. 1157 if (!LTOPreLink) 1158 MPM.addPass(EliminateAvailableExternallyPass()); 1159 1160 if (EnableOrderFileInstrumentation) 1161 MPM.addPass(InstrOrderFilePass()); 1162 1163 // Do RPO function attribute inference across the module to forward-propagate 1164 // attributes where applicable. 1165 // FIXME: Is this really an optimization rather than a canonicalization? 1166 MPM.addPass(ReversePostOrderFunctionAttrsPass()); 1167 1168 // Do a post inline PGO instrumentation and use pass. This is a context 1169 // sensitive PGO pass. We don't want to do this in LTOPreLink phrase as 1170 // cross-module inline has not been done yet. The context sensitive 1171 // instrumentation is after all the inlines are done. 1172 if (!LTOPreLink && PGOOpt) { 1173 if (PGOOpt->CSAction == PGOOptions::CSIRInstr) 1174 addPGOInstrPasses(MPM, Level, /* RunProfileGen */ true, 1175 /* IsCS */ true, PGOOpt->CSProfileGenFile, 1176 PGOOpt->ProfileRemappingFile, LTOPhase); 1177 else if (PGOOpt->CSAction == PGOOptions::CSIRUse) 1178 addPGOInstrPasses(MPM, Level, /* RunProfileGen */ false, 1179 /* IsCS */ true, PGOOpt->ProfileFile, 1180 PGOOpt->ProfileRemappingFile, LTOPhase); 1181 } 1182 1183 // Re-compute GlobalsAA here prior to function passes. This is particularly 1184 // useful as the above will have inlined, DCE'ed, and function-attr 1185 // propagated everything. We should at this point have a reasonably minimal 1186 // and richly annotated call graph. By computing aliasing and mod/ref 1187 // information for all local globals here, the late loop passes and notably 1188 // the vectorizer will be able to use them to help recognize vectorizable 1189 // memory operations. 1190 MPM.addPass(RecomputeGlobalsAAPass()); 1191 1192 for (auto &C : OptimizerEarlyEPCallbacks) 1193 C(MPM, Level); 1194 1195 FunctionPassManager OptimizePM; 1196 OptimizePM.addPass(Float2IntPass()); 1197 OptimizePM.addPass(LowerConstantIntrinsicsPass()); 1198 1199 if (EnableMatrix) { 1200 OptimizePM.addPass(LowerMatrixIntrinsicsPass()); 1201 OptimizePM.addPass(EarlyCSEPass()); 1202 } 1203 1204 // FIXME: We need to run some loop optimizations to re-rotate loops after 1205 // simplifycfg and others undo their rotation. 1206 1207 // Optimize the loop execution. These passes operate on entire loop nests 1208 // rather than on each loop in an inside-out manner, and so they are actually 1209 // function passes. 1210 1211 for (auto &C : VectorizerStartEPCallbacks) 1212 C(OptimizePM, Level); 1213 1214 LoopPassManager LPM; 1215 // First rotate loops that may have been un-rotated by prior passes. 1216 // Disable header duplication at -Oz. 1217 LPM.addPass(LoopRotatePass(Level != OptimizationLevel::Oz, LTOPreLink)); 1218 // Some loops may have become dead by now. Try to delete them. 1219 // FIXME: see discussion in https://reviews.llvm.org/D112851, 1220 // this may need to be revisited once we run GVN before loop deletion 1221 // in the simplification pipeline. 1222 LPM.addPass(LoopDeletionPass()); 1223 OptimizePM.addPass(createFunctionToLoopPassAdaptor( 1224 std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/false)); 1225 1226 // Distribute loops to allow partial vectorization. I.e. isolate dependences 1227 // into separate loop that would otherwise inhibit vectorization. This is 1228 // currently only performed for loops marked with the metadata 1229 // llvm.loop.distribute=true or when -enable-loop-distribute is specified. 1230 OptimizePM.addPass(LoopDistributePass()); 1231 1232 // Populates the VFABI attribute with the scalar-to-vector mappings 1233 // from the TargetLibraryInfo. 1234 OptimizePM.addPass(InjectTLIMappings()); 1235 1236 addVectorPasses(Level, OptimizePM, /* IsFullLTO */ false); 1237 1238 // LoopSink pass sinks instructions hoisted by LICM, which serves as a 1239 // canonicalization pass that enables other optimizations. As a result, 1240 // LoopSink pass needs to be a very late IR pass to avoid undoing LICM 1241 // result too early. 1242 OptimizePM.addPass(LoopSinkPass()); 1243 1244 // And finally clean up LCSSA form before generating code. 1245 OptimizePM.addPass(InstSimplifyPass()); 1246 1247 // This hoists/decomposes div/rem ops. It should run after other sink/hoist 1248 // passes to avoid re-sinking, but before SimplifyCFG because it can allow 1249 // flattening of blocks. 1250 OptimizePM.addPass(DivRemPairsPass()); 1251 1252 // Try to annotate calls that were created during optimization. 1253 OptimizePM.addPass(TailCallElimPass()); 1254 1255 // LoopSink (and other loop passes since the last simplifyCFG) might have 1256 // resulted in single-entry-single-exit or empty blocks. Clean up the CFG. 1257 OptimizePM.addPass( 1258 SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(true))); 1259 1260 // Add the core optimizing pipeline. 1261 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(OptimizePM), 1262 PTO.EagerlyInvalidateAnalyses)); 1263 1264 for (auto &C : OptimizerLastEPCallbacks) 1265 C(MPM, Level); 1266 1267 // Split out cold code. Splitting is done late to avoid hiding context from 1268 // other optimizations and inadvertently regressing performance. The tradeoff 1269 // is that this has a higher code size cost than splitting early. 1270 if (EnableHotColdSplit && !LTOPreLink) 1271 MPM.addPass(HotColdSplittingPass()); 1272 1273 // Search the code for similar regions of code. If enough similar regions can 1274 // be found where extracting the regions into their own function will decrease 1275 // the size of the program, we extract the regions, a deduplicate the 1276 // structurally similar regions. 1277 if (EnableIROutliner) 1278 MPM.addPass(IROutlinerPass()); 1279 1280 // Merge functions if requested. 1281 if (PTO.MergeFunctions) 1282 MPM.addPass(MergeFunctionsPass()); 1283 1284 // Now we need to do some global optimization transforms. 1285 // FIXME: It would seem like these should come first in the optimization 1286 // pipeline and maybe be the bottom of the canonicalization pipeline? Weird 1287 // ordering here. 1288 MPM.addPass(GlobalDCEPass()); 1289 MPM.addPass(ConstantMergePass()); 1290 1291 if (PTO.CallGraphProfile && !LTOPreLink) 1292 MPM.addPass(CGProfilePass()); 1293 1294 // TODO: Relative look table converter pass caused an issue when full lto is 1295 // enabled. See https://reviews.llvm.org/D94355 for more details. 1296 // Until the issue fixed, disable this pass during pre-linking phase. 1297 if (!LTOPreLink) 1298 MPM.addPass(RelLookupTableConverterPass()); 1299 1300 return MPM; 1301 } 1302 1303 ModulePassManager 1304 PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level, 1305 bool LTOPreLink) { 1306 assert(Level != OptimizationLevel::O0 && 1307 "Must request optimizations for the default pipeline!"); 1308 1309 ModulePassManager MPM; 1310 1311 // Convert @llvm.global.annotations to !annotation metadata. 1312 MPM.addPass(Annotation2MetadataPass()); 1313 1314 // Force any function attributes we want the rest of the pipeline to observe. 1315 MPM.addPass(ForceFunctionAttrsPass()); 1316 1317 // Apply module pipeline start EP callback. 1318 for (auto &C : PipelineStartEPCallbacks) 1319 C(MPM, Level); 1320 1321 if (PGOOpt && PGOOpt->DebugInfoForProfiling) 1322 MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass())); 1323 1324 const ThinOrFullLTOPhase LTOPhase = LTOPreLink 1325 ? ThinOrFullLTOPhase::FullLTOPreLink 1326 : ThinOrFullLTOPhase::None; 1327 // Add the core simplification pipeline. 1328 MPM.addPass(buildModuleSimplificationPipeline(Level, LTOPhase)); 1329 1330 // Now add the optimization pipeline. 1331 MPM.addPass(buildModuleOptimizationPipeline(Level, LTOPhase)); 1332 1333 if (PGOOpt && PGOOpt->PseudoProbeForProfiling && 1334 PGOOpt->Action == PGOOptions::SampleUse) 1335 MPM.addPass(PseudoProbeUpdatePass()); 1336 1337 // Emit annotation remarks. 1338 addAnnotationRemarksPass(MPM); 1339 1340 if (LTOPreLink) 1341 addRequiredLTOPreLinkPasses(MPM); 1342 1343 return MPM; 1344 } 1345 1346 ModulePassManager 1347 PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) { 1348 assert(Level != OptimizationLevel::O0 && 1349 "Must request optimizations for the default pipeline!"); 1350 1351 ModulePassManager MPM; 1352 1353 // Convert @llvm.global.annotations to !annotation metadata. 1354 MPM.addPass(Annotation2MetadataPass()); 1355 1356 // Force any function attributes we want the rest of the pipeline to observe. 1357 MPM.addPass(ForceFunctionAttrsPass()); 1358 1359 if (PGOOpt && PGOOpt->DebugInfoForProfiling) 1360 MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass())); 1361 1362 // Apply module pipeline start EP callback. 1363 for (auto &C : PipelineStartEPCallbacks) 1364 C(MPM, Level); 1365 1366 // If we are planning to perform ThinLTO later, we don't bloat the code with 1367 // unrolling/vectorization/... now. Just simplify the module as much as we 1368 // can. 1369 MPM.addPass(buildModuleSimplificationPipeline( 1370 Level, ThinOrFullLTOPhase::ThinLTOPreLink)); 1371 1372 // Run partial inlining pass to partially inline functions that have 1373 // large bodies. 1374 // FIXME: It isn't clear whether this is really the right place to run this 1375 // in ThinLTO. Because there is another canonicalization and simplification 1376 // phase that will run after the thin link, running this here ends up with 1377 // less information than will be available later and it may grow functions in 1378 // ways that aren't beneficial. 1379 if (RunPartialInlining) 1380 MPM.addPass(PartialInlinerPass()); 1381 1382 // Reduce the size of the IR as much as possible. 1383 MPM.addPass(GlobalOptPass()); 1384 1385 if (PGOOpt && PGOOpt->PseudoProbeForProfiling && 1386 PGOOpt->Action == PGOOptions::SampleUse) 1387 MPM.addPass(PseudoProbeUpdatePass()); 1388 1389 // Handle OptimizerLastEPCallbacks added by clang on PreLink. Actual 1390 // optimization is going to be done in PostLink stage, but clang can't 1391 // add callbacks there in case of in-process ThinLTO called by linker. 1392 for (auto &C : OptimizerLastEPCallbacks) 1393 C(MPM, Level); 1394 1395 // Emit annotation remarks. 1396 addAnnotationRemarksPass(MPM); 1397 1398 addRequiredLTOPreLinkPasses(MPM); 1399 1400 return MPM; 1401 } 1402 1403 ModulePassManager PassBuilder::buildThinLTODefaultPipeline( 1404 OptimizationLevel Level, const ModuleSummaryIndex *ImportSummary) { 1405 ModulePassManager MPM; 1406 1407 // Convert @llvm.global.annotations to !annotation metadata. 1408 MPM.addPass(Annotation2MetadataPass()); 1409 1410 if (ImportSummary) { 1411 // These passes import type identifier resolutions for whole-program 1412 // devirtualization and CFI. They must run early because other passes may 1413 // disturb the specific instruction patterns that these passes look for, 1414 // creating dependencies on resolutions that may not appear in the summary. 1415 // 1416 // For example, GVN may transform the pattern assume(type.test) appearing in 1417 // two basic blocks into assume(phi(type.test, type.test)), which would 1418 // transform a dependency on a WPD resolution into a dependency on a type 1419 // identifier resolution for CFI. 1420 // 1421 // Also, WPD has access to more precise information than ICP and can 1422 // devirtualize more effectively, so it should operate on the IR first. 1423 // 1424 // The WPD and LowerTypeTest passes need to run at -O0 to lower type 1425 // metadata and intrinsics. 1426 MPM.addPass(WholeProgramDevirtPass(nullptr, ImportSummary)); 1427 MPM.addPass(LowerTypeTestsPass(nullptr, ImportSummary)); 1428 } 1429 1430 if (Level == OptimizationLevel::O0) { 1431 // Run a second time to clean up any type tests left behind by WPD for use 1432 // in ICP. 1433 MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); 1434 // Drop available_externally and unreferenced globals. This is necessary 1435 // with ThinLTO in order to avoid leaving undefined references to dead 1436 // globals in the object file. 1437 MPM.addPass(EliminateAvailableExternallyPass()); 1438 MPM.addPass(GlobalDCEPass()); 1439 return MPM; 1440 } 1441 1442 // Force any function attributes we want the rest of the pipeline to observe. 1443 MPM.addPass(ForceFunctionAttrsPass()); 1444 1445 // Add the core simplification pipeline. 1446 MPM.addPass(buildModuleSimplificationPipeline( 1447 Level, ThinOrFullLTOPhase::ThinLTOPostLink)); 1448 1449 // Now add the optimization pipeline. 1450 MPM.addPass(buildModuleOptimizationPipeline( 1451 Level, ThinOrFullLTOPhase::ThinLTOPostLink)); 1452 1453 // Emit annotation remarks. 1454 addAnnotationRemarksPass(MPM); 1455 1456 return MPM; 1457 } 1458 1459 ModulePassManager 1460 PassBuilder::buildLTOPreLinkDefaultPipeline(OptimizationLevel Level) { 1461 assert(Level != OptimizationLevel::O0 && 1462 "Must request optimizations for the default pipeline!"); 1463 // FIXME: We should use a customized pre-link pipeline! 1464 return buildPerModuleDefaultPipeline(Level, 1465 /* LTOPreLink */ true); 1466 } 1467 1468 ModulePassManager 1469 PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, 1470 ModuleSummaryIndex *ExportSummary) { 1471 ModulePassManager MPM; 1472 1473 // Convert @llvm.global.annotations to !annotation metadata. 1474 MPM.addPass(Annotation2MetadataPass()); 1475 1476 for (auto &C : FullLinkTimeOptimizationEarlyEPCallbacks) 1477 C(MPM, Level); 1478 1479 // Create a function that performs CFI checks for cross-DSO calls with targets 1480 // in the current module. 1481 MPM.addPass(CrossDSOCFIPass()); 1482 1483 if (Level == OptimizationLevel::O0) { 1484 // The WPD and LowerTypeTest passes need to run at -O0 to lower type 1485 // metadata and intrinsics. 1486 MPM.addPass(WholeProgramDevirtPass(ExportSummary, nullptr)); 1487 MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr)); 1488 // Run a second time to clean up any type tests left behind by WPD for use 1489 // in ICP. 1490 MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); 1491 1492 for (auto &C : FullLinkTimeOptimizationLastEPCallbacks) 1493 C(MPM, Level); 1494 1495 // Emit annotation remarks. 1496 addAnnotationRemarksPass(MPM); 1497 1498 return MPM; 1499 } 1500 1501 if (PGOOpt && PGOOpt->Action == PGOOptions::SampleUse) { 1502 // Load sample profile before running the LTO optimization pipeline. 1503 MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile, 1504 PGOOpt->ProfileRemappingFile, 1505 ThinOrFullLTOPhase::FullLTOPostLink)); 1506 // Cache ProfileSummaryAnalysis once to avoid the potential need to insert 1507 // RequireAnalysisPass for PSI before subsequent non-module passes. 1508 MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>()); 1509 } 1510 1511 // Try to run OpenMP optimizations, quick no-op if no OpenMP metadata present. 1512 MPM.addPass(OpenMPOptPass()); 1513 1514 // Remove unused virtual tables to improve the quality of code generated by 1515 // whole-program devirtualization and bitset lowering. 1516 MPM.addPass(GlobalDCEPass()); 1517 1518 // Force any function attributes we want the rest of the pipeline to observe. 1519 MPM.addPass(ForceFunctionAttrsPass()); 1520 1521 // Do basic inference of function attributes from known properties of system 1522 // libraries and other oracles. 1523 MPM.addPass(InferFunctionAttrsPass()); 1524 1525 if (Level.getSpeedupLevel() > 1) { 1526 MPM.addPass(createModuleToFunctionPassAdaptor( 1527 CallSiteSplittingPass(), PTO.EagerlyInvalidateAnalyses)); 1528 1529 // Indirect call promotion. This should promote all the targets that are 1530 // left by the earlier promotion pass that promotes intra-module targets. 1531 // This two-step promotion is to save the compile time. For LTO, it should 1532 // produce the same result as if we only do promotion here. 1533 MPM.addPass(PGOIndirectCallPromotion( 1534 true /* InLTO */, PGOOpt && PGOOpt->Action == PGOOptions::SampleUse)); 1535 1536 if (EnableFunctionSpecialization && Level == OptimizationLevel::O3) 1537 MPM.addPass(FunctionSpecializationPass()); 1538 // Propagate constants at call sites into the functions they call. This 1539 // opens opportunities for globalopt (and inlining) by substituting function 1540 // pointers passed as arguments to direct uses of functions. 1541 MPM.addPass(IPSCCPPass()); 1542 1543 // Attach metadata to indirect call sites indicating the set of functions 1544 // they may target at run-time. This should follow IPSCCP. 1545 MPM.addPass(CalledValuePropagationPass()); 1546 } 1547 1548 // Now deduce any function attributes based in the current code. 1549 MPM.addPass( 1550 createModuleToPostOrderCGSCCPassAdaptor(PostOrderFunctionAttrsPass())); 1551 1552 // Do RPO function attribute inference across the module to forward-propagate 1553 // attributes where applicable. 1554 // FIXME: Is this really an optimization rather than a canonicalization? 1555 MPM.addPass(ReversePostOrderFunctionAttrsPass()); 1556 1557 // Use in-range annotations on GEP indices to split globals where beneficial. 1558 MPM.addPass(GlobalSplitPass()); 1559 1560 // Run whole program optimization of virtual call when the list of callees 1561 // is fixed. 1562 MPM.addPass(WholeProgramDevirtPass(ExportSummary, nullptr)); 1563 1564 // Stop here at -O1. 1565 if (Level == OptimizationLevel::O1) { 1566 // The LowerTypeTestsPass needs to run to lower type metadata and the 1567 // type.test intrinsics. The pass does nothing if CFI is disabled. 1568 MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr)); 1569 // Run a second time to clean up any type tests left behind by WPD for use 1570 // in ICP (which is performed earlier than this in the regular LTO 1571 // pipeline). 1572 MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); 1573 1574 for (auto &C : FullLinkTimeOptimizationLastEPCallbacks) 1575 C(MPM, Level); 1576 1577 // Emit annotation remarks. 1578 addAnnotationRemarksPass(MPM); 1579 1580 return MPM; 1581 } 1582 1583 // Optimize globals to try and fold them into constants. 1584 MPM.addPass(GlobalOptPass()); 1585 1586 // Promote any localized globals to SSA registers. 1587 MPM.addPass(createModuleToFunctionPassAdaptor(PromotePass())); 1588 1589 // Linking modules together can lead to duplicate global constant, only 1590 // keep one copy of each constant. 1591 MPM.addPass(ConstantMergePass()); 1592 1593 // Remove unused arguments from functions. 1594 MPM.addPass(DeadArgumentEliminationPass()); 1595 1596 // Reduce the code after globalopt and ipsccp. Both can open up significant 1597 // simplification opportunities, and both can propagate functions through 1598 // function pointers. When this happens, we often have to resolve varargs 1599 // calls, etc, so let instcombine do this. 1600 FunctionPassManager PeepholeFPM; 1601 PeepholeFPM.addPass(InstCombinePass()); 1602 if (Level == OptimizationLevel::O3) 1603 PeepholeFPM.addPass(AggressiveInstCombinePass()); 1604 invokePeepholeEPCallbacks(PeepholeFPM, Level); 1605 1606 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(PeepholeFPM), 1607 PTO.EagerlyInvalidateAnalyses)); 1608 1609 // Note: historically, the PruneEH pass was run first to deduce nounwind and 1610 // generally clean up exception handling overhead. It isn't clear this is 1611 // valuable as the inliner doesn't currently care whether it is inlining an 1612 // invoke or a call. 1613 // Run the inliner now. 1614 MPM.addPass(ModuleInlinerWrapperPass( 1615 getInlineParamsFromOptLevel(Level), 1616 /* MandatoryFirst */ true, 1617 InlineContext{ThinOrFullLTOPhase::FullLTOPostLink, 1618 InlinePass::CGSCCInliner})); 1619 1620 // Optimize globals again after we ran the inliner. 1621 MPM.addPass(GlobalOptPass()); 1622 1623 // Garbage collect dead functions. 1624 MPM.addPass(GlobalDCEPass()); 1625 1626 // If we didn't decide to inline a function, check to see if we can 1627 // transform it to pass arguments by value instead of by reference. 1628 MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(ArgumentPromotionPass())); 1629 1630 FunctionPassManager FPM; 1631 // The IPO Passes may leave cruft around. Clean up after them. 1632 FPM.addPass(InstCombinePass()); 1633 invokePeepholeEPCallbacks(FPM, Level); 1634 1635 FPM.addPass(JumpThreadingPass()); 1636 1637 // Do a post inline PGO instrumentation and use pass. This is a context 1638 // sensitive PGO pass. 1639 if (PGOOpt) { 1640 if (PGOOpt->CSAction == PGOOptions::CSIRInstr) 1641 addPGOInstrPasses(MPM, Level, /* RunProfileGen */ true, 1642 /* IsCS */ true, PGOOpt->CSProfileGenFile, 1643 PGOOpt->ProfileRemappingFile, 1644 ThinOrFullLTOPhase::FullLTOPostLink); 1645 else if (PGOOpt->CSAction == PGOOptions::CSIRUse) 1646 addPGOInstrPasses(MPM, Level, /* RunProfileGen */ false, 1647 /* IsCS */ true, PGOOpt->ProfileFile, 1648 PGOOpt->ProfileRemappingFile, 1649 ThinOrFullLTOPhase::FullLTOPostLink); 1650 } 1651 1652 // Break up allocas 1653 FPM.addPass(SROAPass()); 1654 1655 // LTO provides additional opportunities for tailcall elimination due to 1656 // link-time inlining, and visibility of nocapture attribute. 1657 FPM.addPass(TailCallElimPass()); 1658 1659 // Run a few AA driver optimizations here and now to cleanup the code. 1660 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM), 1661 PTO.EagerlyInvalidateAnalyses)); 1662 1663 MPM.addPass( 1664 createModuleToPostOrderCGSCCPassAdaptor(PostOrderFunctionAttrsPass())); 1665 1666 // Require the GlobalsAA analysis for the module so we can query it within 1667 // MainFPM. 1668 MPM.addPass(RequireAnalysisPass<GlobalsAA, Module>()); 1669 // Invalidate AAManager so it can be recreated and pick up the newly available 1670 // GlobalsAA. 1671 MPM.addPass( 1672 createModuleToFunctionPassAdaptor(InvalidateAnalysisPass<AAManager>())); 1673 1674 FunctionPassManager MainFPM; 1675 MainFPM.addPass(createFunctionToLoopPassAdaptor( 1676 LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, 1677 /*AllowSpeculation=*/true), 1678 /*USeMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); 1679 1680 if (RunNewGVN) 1681 MainFPM.addPass(NewGVNPass()); 1682 else 1683 MainFPM.addPass(GVNPass()); 1684 1685 // Remove dead memcpy()'s. 1686 MainFPM.addPass(MemCpyOptPass()); 1687 1688 // Nuke dead stores. 1689 MainFPM.addPass(DSEPass()); 1690 MainFPM.addPass(MergedLoadStoreMotionPass()); 1691 1692 1693 if (EnableConstraintElimination) 1694 MainFPM.addPass(ConstraintEliminationPass()); 1695 1696 LoopPassManager LPM; 1697 if (EnableLoopFlatten && Level.getSpeedupLevel() > 1) 1698 LPM.addPass(LoopFlattenPass()); 1699 LPM.addPass(IndVarSimplifyPass()); 1700 LPM.addPass(LoopDeletionPass()); 1701 // FIXME: Add loop interchange. 1702 1703 // Unroll small loops and perform peeling. 1704 LPM.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(), 1705 /* OnlyWhenForced= */ !PTO.LoopUnrolling, 1706 PTO.ForgetAllSCEVInLoopUnroll)); 1707 // The loop passes in LPM (LoopFullUnrollPass) do not preserve MemorySSA. 1708 // *All* loop passes must preserve it, in order to be able to use it. 1709 MainFPM.addPass(createFunctionToLoopPassAdaptor( 1710 std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/true)); 1711 1712 MainFPM.addPass(LoopDistributePass()); 1713 1714 addVectorPasses(Level, MainFPM, /* IsFullLTO */ true); 1715 1716 // Run the OpenMPOpt CGSCC pass again late. 1717 MPM.addPass( 1718 createModuleToPostOrderCGSCCPassAdaptor(OpenMPOptCGSCCPass())); 1719 1720 invokePeepholeEPCallbacks(MainFPM, Level); 1721 MainFPM.addPass(JumpThreadingPass()); 1722 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(MainFPM), 1723 PTO.EagerlyInvalidateAnalyses)); 1724 1725 // Lower type metadata and the type.test intrinsic. This pass supports 1726 // clang's control flow integrity mechanisms (-fsanitize=cfi*) and needs 1727 // to be run at link time if CFI is enabled. This pass does nothing if 1728 // CFI is disabled. 1729 MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr)); 1730 // Run a second time to clean up any type tests left behind by WPD for use 1731 // in ICP (which is performed earlier than this in the regular LTO pipeline). 1732 MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true)); 1733 1734 // Enable splitting late in the FullLTO post-link pipeline. 1735 if (EnableHotColdSplit) 1736 MPM.addPass(HotColdSplittingPass()); 1737 1738 // Add late LTO optimization passes. 1739 // Delete basic blocks, which optimization passes may have killed. 1740 MPM.addPass(createModuleToFunctionPassAdaptor(SimplifyCFGPass( 1741 SimplifyCFGOptions().convertSwitchRangeToICmp(true).hoistCommonInsts( 1742 true)))); 1743 1744 // Drop bodies of available eternally objects to improve GlobalDCE. 1745 MPM.addPass(EliminateAvailableExternallyPass()); 1746 1747 // Now that we have optimized the program, discard unreachable functions. 1748 MPM.addPass(GlobalDCEPass()); 1749 1750 if (PTO.MergeFunctions) 1751 MPM.addPass(MergeFunctionsPass()); 1752 1753 if (PTO.CallGraphProfile) 1754 MPM.addPass(CGProfilePass()); 1755 1756 for (auto &C : FullLinkTimeOptimizationLastEPCallbacks) 1757 C(MPM, Level); 1758 1759 // Emit annotation remarks. 1760 addAnnotationRemarksPass(MPM); 1761 1762 return MPM; 1763 } 1764 1765 ModulePassManager PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level, 1766 bool LTOPreLink) { 1767 assert(Level == OptimizationLevel::O0 && 1768 "buildO0DefaultPipeline should only be used with O0"); 1769 1770 ModulePassManager MPM; 1771 1772 // Perform pseudo probe instrumentation in O0 mode. This is for the 1773 // consistency between different build modes. For example, a LTO build can be 1774 // mixed with an O0 prelink and an O2 postlink. Loading a sample profile in 1775 // the postlink will require pseudo probe instrumentation in the prelink. 1776 if (PGOOpt && PGOOpt->PseudoProbeForProfiling) 1777 MPM.addPass(SampleProfileProbePass(TM)); 1778 1779 if (PGOOpt && (PGOOpt->Action == PGOOptions::IRInstr || 1780 PGOOpt->Action == PGOOptions::IRUse)) 1781 addPGOInstrPassesForO0( 1782 MPM, 1783 /* RunProfileGen */ (PGOOpt->Action == PGOOptions::IRInstr), 1784 /* IsCS */ false, PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile); 1785 1786 for (auto &C : PipelineStartEPCallbacks) 1787 C(MPM, Level); 1788 1789 if (PGOOpt && PGOOpt->DebugInfoForProfiling) 1790 MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass())); 1791 1792 for (auto &C : PipelineEarlySimplificationEPCallbacks) 1793 C(MPM, Level); 1794 1795 // Build a minimal pipeline based on the semantics required by LLVM, 1796 // which is just that always inlining occurs. Further, disable generating 1797 // lifetime intrinsics to avoid enabling further optimizations during 1798 // code generation. 1799 MPM.addPass(AlwaysInlinerPass( 1800 /*InsertLifetimeIntrinsics=*/false)); 1801 1802 if (PTO.MergeFunctions) 1803 MPM.addPass(MergeFunctionsPass()); 1804 1805 if (EnableMatrix) 1806 MPM.addPass( 1807 createModuleToFunctionPassAdaptor(LowerMatrixIntrinsicsPass(true))); 1808 1809 if (!CGSCCOptimizerLateEPCallbacks.empty()) { 1810 CGSCCPassManager CGPM; 1811 for (auto &C : CGSCCOptimizerLateEPCallbacks) 1812 C(CGPM, Level); 1813 if (!CGPM.isEmpty()) 1814 MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM))); 1815 } 1816 if (!LateLoopOptimizationsEPCallbacks.empty()) { 1817 LoopPassManager LPM; 1818 for (auto &C : LateLoopOptimizationsEPCallbacks) 1819 C(LPM, Level); 1820 if (!LPM.isEmpty()) { 1821 MPM.addPass(createModuleToFunctionPassAdaptor( 1822 createFunctionToLoopPassAdaptor(std::move(LPM)))); 1823 } 1824 } 1825 if (!LoopOptimizerEndEPCallbacks.empty()) { 1826 LoopPassManager LPM; 1827 for (auto &C : LoopOptimizerEndEPCallbacks) 1828 C(LPM, Level); 1829 if (!LPM.isEmpty()) { 1830 MPM.addPass(createModuleToFunctionPassAdaptor( 1831 createFunctionToLoopPassAdaptor(std::move(LPM)))); 1832 } 1833 } 1834 if (!ScalarOptimizerLateEPCallbacks.empty()) { 1835 FunctionPassManager FPM; 1836 for (auto &C : ScalarOptimizerLateEPCallbacks) 1837 C(FPM, Level); 1838 if (!FPM.isEmpty()) 1839 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); 1840 } 1841 1842 for (auto &C : OptimizerEarlyEPCallbacks) 1843 C(MPM, Level); 1844 1845 if (!VectorizerStartEPCallbacks.empty()) { 1846 FunctionPassManager FPM; 1847 for (auto &C : VectorizerStartEPCallbacks) 1848 C(FPM, Level); 1849 if (!FPM.isEmpty()) 1850 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); 1851 } 1852 1853 ModulePassManager CoroPM; 1854 CoroPM.addPass(CoroEarlyPass()); 1855 CGSCCPassManager CGPM; 1856 CGPM.addPass(CoroSplitPass()); 1857 CoroPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM))); 1858 CoroPM.addPass(CoroCleanupPass()); 1859 CoroPM.addPass(GlobalDCEPass()); 1860 MPM.addPass(CoroConditionalWrapper(std::move(CoroPM))); 1861 1862 for (auto &C : OptimizerLastEPCallbacks) 1863 C(MPM, Level); 1864 1865 if (LTOPreLink) 1866 addRequiredLTOPreLinkPasses(MPM); 1867 1868 MPM.addPass(createModuleToFunctionPassAdaptor(AnnotationRemarksPass())); 1869 1870 return MPM; 1871 } 1872 1873 AAManager PassBuilder::buildDefaultAAPipeline() { 1874 AAManager AA; 1875 1876 // The order in which these are registered determines their priority when 1877 // being queried. 1878 1879 // First we register the basic alias analysis that provides the majority of 1880 // per-function local AA logic. This is a stateless, on-demand local set of 1881 // AA techniques. 1882 AA.registerFunctionAnalysis<BasicAA>(); 1883 1884 // Next we query fast, specialized alias analyses that wrap IR-embedded 1885 // information about aliasing. 1886 AA.registerFunctionAnalysis<ScopedNoAliasAA>(); 1887 AA.registerFunctionAnalysis<TypeBasedAA>(); 1888 1889 // Add support for querying global aliasing information when available. 1890 // Because the `AAManager` is a function analysis and `GlobalsAA` is a module 1891 // analysis, all that the `AAManager` can do is query for any *cached* 1892 // results from `GlobalsAA` through a readonly proxy. 1893 AA.registerModuleAnalysis<GlobalsAA>(); 1894 1895 // Add target-specific alias analyses. 1896 if (TM) 1897 TM->registerDefaultAliasAnalyses(AA); 1898 1899 return AA; 1900 } 1901